1 files changed, 912 insertions, 308 deletions
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index dd8929911c6..00f245a05c0 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2018, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -35,7 +35,6 @@ Created 3/26/1996 Heikki Tuuri
 #include "mem0mem.h"
 #include "mtr0mtr.h"
 #include "ut0byte.h"
-#include "mem0mem.h"
 #include "ut0lst.h"
 #include "read0types.h"
 #include "page0types.h"
@@ -47,122 +46,61 @@ Created 3/26/1996 Heikki Tuuri
 
 typedef UT_LIST_BASE_NODE_T(trx_t) trx_ut_list_t;
 
-// Forward declaration
-class MVCC;
-class ReadView;
-
-/** The transaction system */
-extern trx_sys_t*	trx_sys;
-
 /** Checks if a page address is the trx sys header page.
 @param[in]	page_id	page id
 @return true if trx sys header page */
-UNIV_INLINE
+inline
 bool
-trx_sys_hdr_page(
-	const page_id_t&	page_id);
-
-/** Initialize the transaction system main-memory data structures. */
-void trx_sys_init_at_db_start();
+trx_sys_hdr_page(const page_id_t& page_id)
+{
+	return(page_id.space() == TRX_SYS_SPACE
+	       && page_id.page_no() == TRX_SYS_PAGE_NO);
+}
 
 /*****************************************************************//**
-Creates the trx_sys instance and initializes purge_queue and mutex. */
-void
-trx_sys_create(void);
-/*================*/
-/*****************************************************************//**
 Creates and initializes the transaction system at the database creation. */
 void
 trx_sys_create_sys_pages(void);
 /*==========================*/
-/** @return an unallocated rollback segment slot in the TRX_SYS header
+/** Find an available rollback segment.
+@param[in]	sys_header
+@return an unallocated rollback segment slot in the TRX_SYS header
 @retval ULINT_UNDEFINED if not found */
 ulint
-trx_sysf_rseg_find_free(mtr_t* mtr);
-/**********************************************************************//**
-Gets a pointer to the transaction system file copy and x-locks its page.
-@return pointer to system file copy, page x-locked */
-UNIV_INLINE
-trx_sysf_t*
-trx_sysf_get(
-/*=========*/
-	mtr_t*	mtr);	/*!< in: mtr */
-/*****************************************************************//**
-Gets the space of the nth rollback segment slot in the trx system
-file copy.
-@return space id */
-UNIV_INLINE
-ulint
-trx_sysf_rseg_get_space(
-/*====================*/
-	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
-	ulint		i,		/*!< in: slot index == rseg id */
-	mtr_t*		mtr);		/*!< in: mtr */
-/*****************************************************************//**
-Gets the page number of the nth rollback segment slot in the trx system
-file copy.
-@return page number, FIL_NULL if slot unused */
-UNIV_INLINE
-ulint
-trx_sysf_rseg_get_page_no(
-/*======================*/
-	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
-	ulint		i,		/*!< in: slot index == rseg id */
-	mtr_t*		mtr);		/*!< in: mtr */
-/*****************************************************************//**
-Sets the space id of the nth rollback segment slot in the trx system
-file copy. */
-UNIV_INLINE
-void
-trx_sysf_rseg_set_space(
-/*====================*/
-	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
-	ulint		i,		/*!< in: slot index == rseg id */
-	ulint		space,		/*!< in: space id */
-	mtr_t*		mtr);		/*!< in: mtr */
-/*****************************************************************//**
-Sets the page number of the nth rollback segment slot in the trx system
-file copy. */
-UNIV_INLINE
-void
-trx_sysf_rseg_set_page_no(
-/*======================*/
-	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
-	ulint		i,		/*!< in: slot index == rseg id */
-	ulint		page_no,	/*!< in: page number, FIL_NULL if
-					the slot is reset to unused */
-	mtr_t*		mtr);		/*!< in: mtr */
-/*****************************************************************//**
-Allocates a new transaction id.
-@return new, allocated trx id */
-UNIV_INLINE
-trx_id_t
-trx_sys_get_new_trx_id();
-/*===================*/
-/*****************************************************************//**
-Determines the maximum transaction id.
-@return maximum currently allocated trx id; will be stale after the
-next call to trx_sys_get_new_trx_id() */
-UNIV_INLINE
-trx_id_t
-trx_sys_get_max_trx_id(void);
-/*========================*/
+trx_sys_rseg_find_free(const buf_block_t* sys_header);
+/** Request the TRX_SYS page.
+@param[in]	rw	whether to lock the page for writing
+@return the TRX_SYS page
+@retval	NULL	if the page cannot be read */
+inline
+buf_block_t*
+trx_sysf_get(mtr_t* mtr, bool rw = true)
+{
+	buf_block_t* block = buf_page_get(
+		page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+		univ_page_size, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
+	if (block) {
+		buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+	}
+	return block;
+}
 
 #ifdef UNIV_DEBUG
 /* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
 extern uint			trx_rseg_n_slots_debug;
 #endif
 
-/*****************************************************************//**
-Writes a trx id to an index page. In case that the id size changes in
-some future version, this function should be used instead of
-mach_write_... */
+/** Write DB_TRX_ID.
+@param[out]	db_trx_id	the DB_TRX_ID field to be written to
+@param[in]	id		transaction ID */
 UNIV_INLINE
 void
-trx_write_trx_id(
-/*=============*/
-	byte*		ptr,	/*!< in: pointer to memory where written */
-	trx_id_t	id);	/*!< in: id */
+trx_write_trx_id(byte* db_trx_id, trx_id_t id)
+{
+	compile_time_assert(DATA_TRX_ID_LEN == 6);
+	ut_ad(id);
+	mach_write_to_6(db_trx_id, id);
+}
 
 /** Read a transaction identifier.
 @return id */
@@ -170,9 +108,7 @@ inline
 trx_id_t
 trx_read_trx_id(const byte* ptr)
 {
-#if DATA_TRX_ID_LEN != 6
-# error "DATA_TRX_ID_LEN != 6"
-#endif
+	compile_time_assert(DATA_TRX_ID_LEN == 6);
 	return(mach_read_from_6(ptr));
 }
 
@@ -188,59 +124,6 @@ inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
 }
 #endif
 
-/****************************************************************//**
-Looks for the trx instance with the given id in the rw trx_list.
-@return	the trx handle or NULL if not found */
-UNIV_INLINE
-trx_t*
-trx_get_rw_trx_by_id(
-/*=================*/
-	trx_id_t	trx_id);/*!< in: trx id to search for */
-/****************************************************************//**
-Returns the minimum trx id in rw trx list. This is the smallest id for which
-the trx can possibly be active. (But, you must look at the trx->state to
-find out if the minimum trx id transaction itself is active, or already
-committed.)
-@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
-UNIV_INLINE
-trx_id_t
-trx_rw_min_trx_id(void);
-/*===================*/
-/****************************************************************//**
-Checks if a rw transaction with the given id is active.
-@return transaction instance if active, or NULL */
-UNIV_INLINE
-trx_t*
-trx_rw_is_active_low(
-/*=================*/
-	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
-	ibool*		corrupt);	/*!< in: NULL or pointer to a flag
-					that will be set if corrupt */
-/****************************************************************//**
-Checks if a rw transaction with the given id is active. If the caller is
-not holding trx_sys->mutex, the transaction may already have been
-committed.
-@return transaction instance if active, or NULL; */
-UNIV_INLINE
-trx_t*
-trx_rw_is_active(
-/*=============*/
-	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
-	ibool*		corrupt,	/*!< in: NULL or pointer to a flag
-					that will be set if corrupt */
-	bool		do_ref_count);	/*!< in: if true then increment the
-					trx_t::n_ref_count */
-#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
-/***********************************************************//**
-Assert that a transaction has been recovered.
-@return TRUE */
-UNIV_INLINE
-ibool
-trx_assert_recovered(
-/*=================*/
-	trx_id_t	trx_id)		/*!< in: transaction identifier */
-	MY_ATTRIBUTE((warn_unused_result));
-#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 /*****************************************************************//**
 Updates the offset information about the end of the MySQL binlog entry
 which corresponds to the transaction just being committed. In a MySQL
@@ -251,72 +134,17 @@ trx_sys_update_mysql_binlog_offset(
 /*===============================*/
 	const char*	file_name,/*!< in: MySQL log file name */
 	int64_t		offset,	/*!< in: position in that log file */
-        trx_sysf_t*     sys_header, /*!< in: trx sys header */
-	mtr_t*		mtr);	/*!< in: mtr */
+	buf_block_t*	sys_header, /*!< in,out: trx sys header */
+	mtr_t*		mtr);	/*!< in,out: mini-transaction */
 /** Display the MySQL binlog offset info if it is present in the trx
 system header. */
 void
 trx_sys_print_mysql_binlog_offset();
-#ifdef WITH_WSREP
 
-/** Update WSREP XID info in sys_header of TRX_SYS_PAGE_NO = 5.
-@param[in]	xid		Transaction XID
-@param[in,out]	sys_header	sys_header
-@param[in]	mtr		minitransaction */
-UNIV_INTERN
-void
-trx_sys_update_wsrep_checkpoint(
-	const XID*	xid,
-	trx_sysf_t*	sys_header,
-	mtr_t*		mtr);
-
-/** Read WSREP checkpoint XID from sys header.
-@param[out]	xid	WSREP XID
-@return	whether the checkpoint was present */
-UNIV_INTERN
-bool
-trx_sys_read_wsrep_checkpoint(XID* xid);
-#endif /* WITH_WSREP */
-
-/*****************************************************************//**
-Shutdown/Close the transaction system. */
-void
-trx_sys_close(void);
-/*===============*/
 /** Create the rollback segments.
 @return	whether the creation succeeded */
 bool
 trx_sys_create_rsegs();
-/*****************************************************************//**
-Get the number of transaction in the system, independent of their state.
-@return count of transactions in trx_sys_t::trx_list */
-UNIV_INLINE
-ulint
-trx_sys_get_n_rw_trx(void);
-/*======================*/
-
-/*********************************************************************
-Check if there are any active (non-prepared) transactions.
-@return total number of active transactions or 0 if none */
-ulint
-trx_sys_any_active_transactions(void);
-/*=================================*/
-
-/**
-Add the transaction to the RW transaction set
-@param trx		transaction instance to add */
-UNIV_INLINE
-void
-trx_sys_rw_trx_add(trx_t* trx);
-
-#ifdef UNIV_DEBUG
-/*************************************************************//**
-Validate the trx_sys_t::rw_trx_list.
-@return true if the list is valid */
-bool
-trx_sys_validate_trx_list();
-/*========================*/
-#endif /* UNIV_DEBUG */
 
 /** The automatically created system rollback segment has this id */
 #define TRX_SYS_SYSTEM_RSEG_ID	0
@@ -326,18 +154,13 @@ trx_sys_validate_trx_list();
 
 /** Transaction system header */
 /*------------------------------------------------------------- @{ */
-#define	TRX_SYS_TRX_ID_STORE	0	/*!< the maximum trx id or trx
-					number modulo
-					TRX_SYS_TRX_ID_UPDATE_MARGIN
-					written to a file page by any
-					transaction; the assignment of
-					transaction ids continues from
-					this number rounded up by
-					TRX_SYS_TRX_ID_UPDATE_MARGIN
-					plus
-					TRX_SYS_TRX_ID_UPDATE_MARGIN
-					when the database is
-					started */
+/** In old versions of InnoDB, this persisted the value of
+trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
+the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
+and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
+are used instead. The field only exists for the purpose of upgrading
+from older MySQL or MariaDB versions. */
+#define	TRX_SYS_TRX_ID_STORE	0
 #define TRX_SYS_FSEG_HEADER	8	/*!< segment header for the
 					tablespace segment the trx
 					system is created into */
@@ -347,16 +170,52 @@ trx_sys_validate_trx_list();
 					slots */
 /*------------------------------------------------------------- @} */
 
-/* Max number of rollback segments: the number of segment specification slots
-in the transaction system array; rollback segment id must fit in one (signed)
-byte, therefore 128; each slot is currently 8 bytes in size. If you want
-to raise the level to 256 then you will need to fix some assertions that
-impose the 7 bit restriction. e.g., mach_write_to_3() */
+/** The number of rollback segments; rollback segment id must fit in
+the 7 bits reserved for it in DB_ROLL_PTR. */
 #define	TRX_SYS_N_RSEGS			128
 /** Maximum number of undo tablespaces (not counting the system tablespace) */
 #define TRX_SYS_MAX_UNDO_SPACES		(TRX_SYS_N_RSEGS - 1)
 
-/** Maximum length of MySQL binlog file name, in bytes. */
+/* Rollback segment specification slot offsets */
+
+/** the tablespace ID of an undo log header; starting with
+MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
+#define	TRX_SYS_RSEG_SPACE	0
+/** the page number of an undo log header, or FIL_NULL if unused */
+#define	TRX_SYS_RSEG_PAGE_NO	4
+/** Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE	8
+
+/** Read the tablespace ID of a rollback segment slot.
+@param[in]	sys_header	TRX_SYS page
+@param[in]	rseg_id		rollback segment identifier
+@return	undo tablespace id */
+inline
+uint32_t
+trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
+{
+	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+	return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+				+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+				+ sys_header->frame);
+}
+
+/** Read the page number of a rollback segment slot.
+@param[in]	sys_header	TRX_SYS page
+@param[in]	rseg_id		rollback segment identifier
+@return	undo page number */
+inline
+uint32_t
+trx_sysf_rseg_get_page_no(const buf_block_t* sys_header, ulint rseg_id)
+{
+	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+	return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO
+				+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+				+ sys_header->frame);
+}
+
+/** Maximum length of MySQL binlog file name, in bytes.
+(Used before MariaDB 10.3.5.) */
 #define TRX_SYS_MYSQL_LOG_NAME_LEN	512
 /** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
 #define TRX_SYS_MYSQL_LOG_MAGIC_N	873422344
@@ -433,7 +292,7 @@ FIXED WSREP XID info offsets for 4k page size 10.0.32-galera
 
 */
 #ifdef WITH_WSREP
-/** The offset to WSREP XID headers */
+/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */
 #define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL)
 #define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0
 #define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265
@@ -497,42 +356,466 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
 #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE	FSP_EXTENT_SIZE
 /* @} */
 
+trx_t* current_trx();
+
+struct rw_trx_hash_element_t
+{
+  rw_trx_hash_element_t(): trx(0)
+  {
+    mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex);
+  }
+
+
+  ~rw_trx_hash_element_t()
+  {
+    mutex_free(&mutex);
+  }
+
+
+  trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
+  trx_id_t no;
+  trx_t *trx;
+  ib_mutex_t mutex;
+};
+
+
+/**
+  Wrapper around LF_HASH to store set of in memory read-write transactions.
+*/
+
+class rw_trx_hash_t
+{
+  LF_HASH hash;
+
+
+  /**
+    Constructor callback for lock-free allocator.
+
+    Object is just allocated and is not yet accessible via rw_trx_hash by
+    concurrent threads. Object can be reused multiple times before it is freed.
+    Every time object is being reused initializer() callback is called.
+  */
+
+  static void rw_trx_hash_constructor(uchar *arg)
+  {
+    new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Destructor callback for lock-free allocator.
+
+    Object is about to be freed and is not accessible via rw_trx_hash by
+    concurrent threads.
+  */
+
+  static void rw_trx_hash_destructor(uchar *arg)
+  {
+    reinterpret_cast<rw_trx_hash_element_t*>
+      (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Destructor callback for lock-free allocator.
+
+    This destructor is used at shutdown. It frees remaining transaction
+    objects.
+
+    XA PREPARED transactions may remain if they haven't been committed or
+    rolled back. ACTIVE transactions may remain if startup was interrupted or
+    server is running in read-only mode or for certain srv_force_recovery
+    levels.
+  */
+
+  static void rw_trx_hash_shutdown_destructor(uchar *arg)
+  {
+    rw_trx_hash_element_t *element=
+      reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
+    if (trx_t *trx= element->trx)
+    {
+      ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
+            (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
+             (!srv_was_started ||
+              srv_read_only_mode ||
+              srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
+      trx_free_at_shutdown(trx);
+    }
+    element->~rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Initializer callback for lock-free hash.
+
+    Object is not yet accessible via rw_trx_hash by concurrent threads, but is
+    about to become such. Object id can be changed only by this callback and
+    remains the same until all pins to this object are released.
+
+    Object trx can be changed to 0 by erase() under object mutex protection,
+    which indicates it is about to be removed from lock-free hash and become
+    not accessible by concurrent threads.
+  */
+
+  static void rw_trx_hash_initializer(LF_HASH *,
+                                      rw_trx_hash_element_t *element,
+                                      trx_t *trx)
+  {
+    ut_ad(element->trx == 0);
+    element->trx= trx;
+    element->id= trx->id;
+    element->no= TRX_ID_MAX;
+    trx->rw_trx_hash_element= element;
+  }
+
+
+  /**
+    Gets LF_HASH pins.
+
+    Pins are used to protect object from being destroyed or reused. They are
+    normally stored in trx object for quick access. If caller doesn't have trx
+    available, we try to get it using currnet_trx(). If caller doesn't have trx
+    at all, temporary pins are allocated.
+  */
+
+  LF_PINS *get_pins(trx_t *trx)
+  {
+    if (!trx->rw_trx_hash_pins)
+    {
+      trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
+      ut_a(trx->rw_trx_hash_pins);
+    }
+    return trx->rw_trx_hash_pins;
+  }
+
+
+  struct eliminate_duplicates_arg
+  {
+    trx_ids_t ids;
+    my_hash_walk_action action;
+    void *argument;
+    eliminate_duplicates_arg(size_t size, my_hash_walk_action act, void* arg):
+      action(act), argument(arg) { ids.reserve(size); }
+  };
+
+
+  static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
+                                      eliminate_duplicates_arg *arg)
+  {
+    for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
+    {
+      if (*it == element->id)
+        return 0;
+    }
+    arg->ids.push_back(element->id);
+    return arg->action(element, arg->argument);
+  }
+
+
+#ifdef UNIV_DEBUG
+  static void validate_element(trx_t *trx)
+  {
+    ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
+    ut_ad(!trx_is_autocommit_non_locking(trx));
+    mutex_enter(&trx->mutex);
+    ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+          trx_state_eq(trx, TRX_STATE_PREPARED));
+    mutex_exit(&trx->mutex);
+  }
+
+
+  struct debug_iterator_arg
+  {
+    my_hash_walk_action action;
+    void *argument;
+  };
+
+
+  static my_bool debug_iterator(rw_trx_hash_element_t *element,
+                                debug_iterator_arg *arg)
+  {
+    mutex_enter(&element->mutex);
+    if (element->trx)
+      validate_element(element->trx);
+    mutex_exit(&element->mutex);
+    return arg->action(element, arg->argument);
+  }
+#endif
+
+
+public:
+  void init()
+  {
+    lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
+                 sizeof(trx_id_t), 0, &my_charset_bin);
+    hash.alloc.constructor= rw_trx_hash_constructor;
+    hash.alloc.destructor= rw_trx_hash_destructor;
+    hash.initializer=
+      reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
+  }
+
+
+  void destroy()
+  {
+    hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
+    lf_hash_destroy(&hash);
+  }
+
+
+  /**
+    Releases LF_HASH pins.
+
+    Must be called by thread that owns trx_t object when the latter is being
+    "detached" from thread (e.g. released to the pool by trx_free()). Can be
+    called earlier if thread is expected not to use rw_trx_hash.
+
+    Since pins are not allowed to be transferred to another thread,
+    initialisation thread calls this for recovered transactions.
+  */
+
+  void put_pins(trx_t *trx)
+  {
+    if (trx->rw_trx_hash_pins)
+    {
+      lf_hash_put_pins(trx->rw_trx_hash_pins);
+      trx->rw_trx_hash_pins= 0;
+    }
+  }
+
+
+  /**
+    Finds trx object in lock-free hash with given id.
+
+    Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
+    the transaction may get committed before this method returns.
+
+    With do_ref_count == false the caller may dereference returned trx pointer
+    only if lock_sys.mutex was acquired before calling find().
+
+    With do_ref_count == true caller may dereference trx even if it is not
+    holding lock_sys.mutex. Caller is responsible for calling
+    trx->release_reference() when it is done playing with trx.
+
+    Ideally this method should get caller rw_trx_hash_pins along with trx
+    object as a parameter, similar to insert() and erase(). However most
+    callers lose trx early in their call chains and it is not that easy to pass
+    them through.
+
+    So we take more expensive approach: get trx through current_thd()->ha_data.
+    Some threads don't have trx attached to THD, and at least server
+    initialisation thread, fts_optimize_thread, srv_master_thread,
+    dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
+    have THD at all. For such cases we allocate pins only for duration of
+    search and free them immediately.
+
+    This has negative performance impact and should be fixed eventually (by
+    passing caller_trx as a parameter). Still stream of DML is more or less Ok.
+
+    @return
+      @retval 0 not found
+      @retval pointer to trx
+  */
+
+  trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count= false)
+  {
+    /*
+      In MariaDB 10.3, purge will reset DB_TRX_ID to 0
+      when the history is lost. Read/write transactions will
+      always have a nonzero trx_t::id; there the value 0 is
+      reserved for transactions that did not write or lock
+      anything yet.
+    */
+    if (!trx_id)
+      return NULL;
+
+    trx_t *trx= 0;
+    LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
+    ut_a(pins);
+
+    rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
+      (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
+                      sizeof(trx_id_t)));
+    if (element)
+    {
+      mutex_enter(&element->mutex);
+      lf_hash_search_unpin(pins);
+      if ((trx= element->trx))
+      {
+        if (do_ref_count)
+          trx->reference();
+        ut_d(validate_element(trx));
+      }
+      mutex_exit(&element->mutex);
+    }
+    if (!caller_trx)
+      lf_hash_put_pins(pins);
+    return trx;
+  }
+
+
+  /**
+    Inserts trx to lock-free hash.
+
+    Object becomes accessible via rw_trx_hash.
+  */
+
+  void insert(trx_t *trx)
+  {
+    ut_d(validate_element(trx));
+    int res= lf_hash_insert(&hash, get_pins(trx),
+                            reinterpret_cast<void*>(trx));
+    ut_a(res == 0);
+  }
+
+
+  /**
+    Removes trx from lock-free hash.
+
+    Object becomes not accessible via rw_trx_hash. But it still can be pinned
+    by concurrent find(), which is supposed to release it immediately after
+    it sees object trx is 0.
+  */
+
+  void erase(trx_t *trx)
+  {
+    ut_d(validate_element(trx));
+    mutex_enter(&trx->rw_trx_hash_element->mutex);
+    trx->rw_trx_hash_element->trx= 0;
+    mutex_exit(&trx->rw_trx_hash_element->mutex);
+    int res= lf_hash_delete(&hash, get_pins(trx),
+                            reinterpret_cast<const void*>(&trx->id),
+                            sizeof(trx_id_t));
+    ut_a(res == 0);
+  }
+
+
+  /**
+    Returns the number of elements in the hash.
+
+    The number is exact only if hash is protected against concurrent
+    modifications (e.g. single threaded startup or hash is protected
+    by some mutex). Otherwise the number may be used as a hint only,
+    because it may change even before this method returns.
+  */
+
+  int32_t size()
+  {
+    return my_atomic_load32_explicit(&hash.count, MY_MEMORY_ORDER_RELAXED);
+  }
+
+
+  /**
+    Iterates the hash.
+
+    @param caller_trx  used to get/set pins
+    @param action      called for every element in hash
+    @param argument    opque argument passed to action
+
+    May return the same element multiple times if hash is under contention.
+    If caller doesn't like to see the same transaction multiple times, it has
+    to call iterate_no_dups() instead.
+
+    May return element with committed transaction. If caller doesn't like to
+    see committed transactions, it has to skip those under element mutex:
+
+      mutex_enter(&element->mutex);
+      if (trx_t trx= element->trx)
+      {
+        // trx is protected against commit in this branch
+      }
+      mutex_exit(&element->mutex);
+
+    May miss concurrently inserted transactions.
+
+    @return
+      @retval 0 iteration completed successfully
+      @retval 1 iteration was interrupted (action returned 1)
+  */
+
+  int iterate(trx_t *caller_trx, my_hash_walk_action action, void *argument)
+  {
+    LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
+    ut_a(pins);
+#ifdef UNIV_DEBUG
+    debug_iterator_arg debug_arg= { action, argument };
+    action= reinterpret_cast<my_hash_walk_action>(debug_iterator);
+    argument= &debug_arg;
+#endif
+    int res= lf_hash_iterate(&hash, pins, action, argument);
+    if (!caller_trx)
+      lf_hash_put_pins(pins);
+    return res;
+  }
+
+
+  int iterate(my_hash_walk_action action, void *argument)
+  {
+    return iterate(current_trx(), action, argument);
+  }
+
+
+  /**
+    Iterates the hash and eliminates duplicate elements.
+
+    @sa iterate()
+  */
+
+  int iterate_no_dups(trx_t *caller_trx, my_hash_walk_action action,
+                      void *argument)
+  {
+    eliminate_duplicates_arg arg(size() + 32, action, argument);
+    return iterate(caller_trx, reinterpret_cast<my_hash_walk_action>
+                   (eliminate_duplicates), &arg);
+  }
+
+
+  int iterate_no_dups(my_hash_walk_action action, void *argument)
+  {
+    return iterate_no_dups(current_trx(), action, argument);
+  }
+};
+
+
 /** The transaction system central memory data structure. */
-struct trx_sys_t {
+class trx_sys_t
+{
+  /**
+    The smallest number not yet assigned as a transaction id or transaction
+    number. Accessed and updated with atomic operations.
+  */
+  MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_max_trx_id;
+
+
+  /**
+    Solves race conditions between register_rw() and snapshot_ids() as well as
+    race condition between assign_new_trx_no() and snapshot_ids().
+
+    @sa register_rw()
+    @sa assign_new_trx_no()
+    @sa snapshot_ids()
+  */
+  MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_rw_trx_hash_version;
+
+
+  /**
+    TRX_RSEG_HISTORY list length (number of committed transactions to purge)
+  */
+  MY_ALIGNED(CACHE_LINE_SIZE) int32 rseg_history_len;
+
+  /** Active views. */
+  MY_ALIGNED(CACHE_LINE_SIZE) UT_LIST_BASE_NODE_T(ReadView) m_views;
 
+  bool m_initialised;
+
+public:
+	MY_ALIGNED(CACHE_LINE_SIZE) mutable
 	TrxSysMutex	mutex;		/*!< mutex protecting most fields in
 					this structure except when noted
 					otherwise */
-
-	MVCC*		mvcc;		/*!< Multi version concurrency control
-					manager */
-	volatile trx_id_t
-			max_trx_id;	/*!< The smallest number not yet
-					assigned as a transaction id or
-					transaction number. This is declared
-					volatile because it can be accessed
-					without holding any mutex during
-					AC-NL-RO view creation. */
-	trx_ut_list_t	serialisation_list;
-					/*!< Ordered on trx_t::no of all the
-					currenrtly active RW transactions */
-#ifdef UNIV_DEBUG
-	trx_id_t	rw_max_trx_id;	/*!< Max trx id of read-write
-					transactions which exist or existed */
-#endif /* UNIV_DEBUG */
-
-	/** Avoid false sharing */
-	const char	pad1[CACHE_LINE_SIZE];
-	trx_ut_list_t	rw_trx_list;	/*!< List of active and committed in
-					memory read-write transactions, sorted
-					on trx id, biggest first. Recovered
-					transactions are always on this list. */
-
-	/** Avoid false sharing */
-	const char	pad2[CACHE_LINE_SIZE];
+	MY_ALIGNED(CACHE_LINE_SIZE)
 	trx_ut_list_t	mysql_trx_list;	/*!< List of transactions created
 					for MySQL. All user transactions are
-					on mysql_trx_list. The rw_trx_list
+					on mysql_trx_list. The rw_trx_hash
 					can contain system transactions and
 					recovered transactions that will not
 					be in the mysql_trx_list.
@@ -540,22 +823,11 @@ struct trx_sys_t {
 					transactions that have not yet been
 					started in InnoDB. */
 
-	trx_ids_t	rw_trx_ids;	/*!< Array of Read write transaction IDs
-					for MVCC snapshot. A ReadView would take
-					a snapshot of these transactions whose
-					changes are not visible to it. We should
-					remove transactions from the list before
-					committing in memory and releasing locks
-					to ensure right order of removal and
-					consistent snapshot. */
-
-	/** Avoid false sharing */
-	const char	pad3[CACHE_LINE_SIZE];
+	MY_ALIGNED(CACHE_LINE_SIZE)
 	/** Temporary rollback segments */
 	trx_rseg_t*	temp_rsegs[TRX_SYS_N_RSEGS];
-	/** Avoid false sharing */
-	const char	pad4[CACHE_LINE_SIZE];
 
+	MY_ALIGNED(CACHE_LINE_SIZE)
 	trx_rseg_t*	rseg_array[TRX_SYS_N_RSEGS];
 					/*!< Pointer array to rollback
 					segments; NULL if slot not in use;
@@ -563,46 +835,378 @@ struct trx_sys_t {
 					single-threaded mode; not protected
 					by any mutex, because it is read-only
 					during multi-threaded operation */
-	ulint		rseg_history_len;
-					/*!< Length of the TRX_RSEG_HISTORY
-					list (update undo logs for committed
-					transactions), protected by
-					rseg->mutex */
-
-	TrxIdSet	rw_trx_set;	/*!< Mapping from transaction id
-					to transaction instance */
-
-	ulint		n_prepared_trx;	/*!< Number of transactions currently
-					in the XA PREPARED state */
-
-	ulint		n_prepared_recovered_trx; /*!< Number of transactions
-					currently in XA PREPARED state that are
-					also recovered. Such transactions cannot
-					be added during runtime. They can only
-					occur after recovery if mysqld crashed
-					while there were XA PREPARED
-					transactions. We disable query cache
-					if such transactions exist. */
-};
 
-/** When a trx id which is zero modulo this number (which must be a power of
-two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system
-page is updated */
-#define TRX_SYS_TRX_ID_WRITE_MARGIN	((trx_id_t) 256)
+  /**
+    Lock-free hash of in memory read-write transactions.
+    Works faster when it is on it's own cache line (tested).
+  */
+
+  MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash;
+
+
+#ifdef WITH_WSREP
+  /** Latest recovered XID during startup */
+  XID recovered_wsrep_xid;
+#endif
+  /** Latest recovered binlog offset */
+  int64_t recovered_binlog_offset;
+  /** Latest recovred binlog file name */
+  char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
+
+
+  /**
+    Constructor.
+
+    Some members may require late initialisation, thus we just mark object as
+    uninitialised. Real initialisation happens in create().
+  */
+
+  trx_sys_t(): m_initialised(false) {}
+
+
+  /**
+    Returns the minimum trx id in rw trx list.
+
+    This is the smallest id for which the trx can possibly be active. (But, you
+    must look at the trx->state to find out if the minimum trx id transaction
+    itself is active, or already committed.)
+
+    @return the minimum trx id, or m_max_trx_id if the trx list is empty
+  */
+
+  trx_id_t get_min_trx_id()
+  {
+    trx_id_t id= get_max_trx_id();
+    rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
+                        (get_min_trx_id_callback), &id);
+    return id;
+  }
+
+
+  /**
+    Determines the maximum transaction id.
+
+    @return maximum currently allocated trx id; will be stale after the
+            next call to trx_sys.get_new_trx_id()
+  */
+
+  trx_id_t get_max_trx_id()
+  {
+    return static_cast<trx_id_t>
+           (my_atomic_load64_explicit(reinterpret_cast<int64*>(&m_max_trx_id),
+                                      MY_MEMORY_ORDER_RELAXED));
+  }
+
+
+  /**
+    Allocates a new transaction id.
+    @return new, allocated trx id
+  */
+
+  trx_id_t get_new_trx_id()
+  {
+    trx_id_t id= get_new_trx_id_no_refresh();
+    refresh_rw_trx_hash_version();
+    return id;
+  }
 
-/** Test if trx_sys->mutex is owned. */
-#define trx_sys_mutex_own() (trx_sys->mutex.is_owned())
 
-/** Acquire the trx_sys->mutex. */
-#define trx_sys_mutex_enter() do {			\
-	mutex_enter(&trx_sys->mutex);			\
-} while (0)
+  /**
+    Allocates and assigns new transaction serialisation number.
 
-/** Release the trx_sys->mutex. */
-#define trx_sys_mutex_exit() do {			\
-	trx_sys->mutex.exit();				\
-} while (0)
+    There's a gap between m_max_trx_id increment and transaction serialisation
+    number becoming visible through rw_trx_hash. While we're in this gap
+    concurrent thread may come and do MVCC snapshot without seeing allocated
+    but not yet assigned serialisation number. Then at some point purge thread
+    may clone this view. As a result it won't see newly allocated serialisation
+    number and may remove "unnecessary" history data of this transaction from
+    rollback segments.
 
-#include "trx0sys.ic"
+    m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
+    to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
+    means that all transaction serialisation numbers up to m_max_trx_id are
+    available through rw_trx_hash.
+
+    We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
+    that m_rw_trx_hash_version increment happens after
+    trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
+
+    @param trx transaction
+  */
+  void assign_new_trx_no(trx_t *trx)
+  {
+    trx->no= get_new_trx_id_no_refresh();
+    my_atomic_store64_explicit(reinterpret_cast<int64*>
+                               (&trx->rw_trx_hash_element->no),
+                               trx->no, MY_MEMORY_ORDER_RELAXED);
+    refresh_rw_trx_hash_version();
+  }
+
+
+  /**
+    Takes MVCC snapshot.
+
+    To reduce malloc probablility we reserver rw_trx_hash.size() + 32 elements
+    in ids.
+
+    For details about get_rw_trx_hash_version() != get_max_trx_id() spin
+    @sa register_rw() and @sa assign_new_trx_no().
+
+    We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
+    that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.
+
+    To optimise snapshot creation rw_trx_hash.iterate() is being used instead
+    of rw_trx_hash.iterate_no_dups(). It means that some transaction
+    identifiers may appear multiple times in ids.
+
+    @param[in,out] caller_trx used to get access to rw_trx_hash_pins
+    @param[out]    ids        array to store registered transaction identifiers
+    @param[out]    max_trx_id variable to store m_max_trx_id value
+    @param[out]    mix_trx_no variable to store min(trx->no) value
+  */
+
+  void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
+                    trx_id_t *min_trx_no)
+  {
+    ut_ad(!mutex_own(&mutex));
+    snapshot_ids_arg arg(ids);
+
+    while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
+      ut_delay(1);
+    arg.m_no= arg.m_id;
+
+    ids->clear();
+    ids->reserve(rw_trx_hash.size() + 32);
+    rw_trx_hash.iterate(caller_trx,
+                        reinterpret_cast<my_hash_walk_action>(copy_one_id),
+                        &arg);
+
+    *max_trx_id= arg.m_id;
+    *min_trx_no= arg.m_no;
+  }
+
+
+  /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
+  void init_max_trx_id(trx_id_t value)
+  {
+    m_max_trx_id= m_rw_trx_hash_version= value;
+  }
+
+
+  bool is_initialised() { return m_initialised; }
+
+
+  /** Initialise the purge subsystem. */
+  void create();
+
+  /** Close the purge subsystem on shutdown. */
+  void close();
+
+  /** @return total number of active (non-prepared) transactions */
+  ulint any_active_transactions();
+
+
+  /**
+    Registers read-write transaction.
+
+    Transaction becomes visible to MVCC.
+
+    There's a gap between m_max_trx_id increment and transaction becoming
+    visible through rw_trx_hash. While we're in this gap concurrent thread may
+    come and do MVCC snapshot. As a result concurrent read view will be able to
+    observe records owned by this transaction even before it was committed.
+
+    m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
+    to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
+    means that all transactions up to m_max_trx_id are available through
+    rw_trx_hash.
+
+    We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
+    that m_rw_trx_hash_version increment happens after transaction becomes
+    visible through rw_trx_hash.
+  */
+
+  void register_rw(trx_t *trx)
+  {
+    trx->id= get_new_trx_id_no_refresh();
+    rw_trx_hash.insert(trx);
+    refresh_rw_trx_hash_version();
+  }
+
+
+  /**
+    Deregisters read-write transaction.
+
+    Transaction is removed from rw_trx_hash, which releases all implicit locks.
+    MVCC snapshot won't see this transaction anymore.
+  */
+
+  void deregister_rw(trx_t *trx)
+  {
+    rw_trx_hash.erase(trx);
+  }
+
+
+  bool is_registered(trx_t *caller_trx, trx_id_t id)
+  {
+    return rw_trx_hash.find(caller_trx, id);
+  }
+
+
+  trx_t *find(trx_t *caller_trx, trx_id_t id)
+  {
+    return rw_trx_hash.find(caller_trx, id, true);
+  }
+
+
+  /**
+    Registers view in MVCC.
+
+    @param view    view owned by the caller
+  */
+  void register_view(ReadView *view)
+  {
+    mutex_enter(&mutex);
+    UT_LIST_ADD_FIRST(m_views, view);
+    mutex_exit(&mutex);
+  }
+
+
+  /**
+    Deregisters view in MVCC.
+
+    @param view    view owned by the caller
+  */
+  void deregister_view(ReadView *view)
+  {
+    mutex_enter(&mutex);
+    UT_LIST_REMOVE(m_views, view);
+    mutex_exit(&mutex);
+  }
+
+
+  /**
+    Clones the oldest view and stores it in view.
+
+    No need to call ReadView::close(). The caller owns the view that is passed
+    in. This function is called by purge thread to determine whether it should
+    purge the delete marked record or not.
+  */
+  void clone_oldest_view();
+
+
+  /** @return the number of active views */
+  size_t view_count() const
+  {
+    size_t count= 0;
+
+    mutex_enter(&mutex);
+    for (const ReadView* view= UT_LIST_GET_FIRST(m_views); view;
+         view= UT_LIST_GET_NEXT(m_view_list, view))
+    {
+      if (view->get_state() == READ_VIEW_STATE_OPEN)
+        ++count;
+    }
+    mutex_exit(&mutex);
+    return count;
+  }
+
+  /** @return number of committed transactions waiting for purge */
+  ulint history_size() const
+  {
+    return uint32(my_atomic_load32(&const_cast<trx_sys_t*>(this)
+                                   ->rseg_history_len));
+  }
+  /** Add to the TRX_RSEG_HISTORY length (on database startup). */
+  void history_add(int32 len)
+  {
+    my_atomic_add32(&rseg_history_len, len);
+  }
+  /** Register a committed transaction. */
+  void history_insert() { history_add(1); }
+  /** Note that a committed transaction was purged. */
+  void history_remove() { history_add(-1); }
+
+private:
+  static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element,
+                                         trx_id_t *id)
+  {
+    if (element->id < *id)
+    {
+      mutex_enter(&element->mutex);
+      /* We don't care about read-only transactions here. */
+      if (element->trx && element->trx->rsegs.m_redo.rseg)
+        *id= element->id;
+      mutex_exit(&element->mutex);
+    }
+    return 0;
+  }
+
+
+  struct snapshot_ids_arg
+  {
+    snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
+    trx_ids_t *m_ids;
+    trx_id_t m_id;
+    trx_id_t m_no;
+  };
+
+
+  static my_bool copy_one_id(rw_trx_hash_element_t *element,
+                             snapshot_ids_arg *arg)
+  {
+    if (element->id < arg->m_id)
+    {
+      trx_id_t no= static_cast<trx_id_t>(my_atomic_load64_explicit(
+        reinterpret_cast<int64*>(&element->no), MY_MEMORY_ORDER_RELAXED));
+      arg->m_ids->push_back(element->id);
+      if (no < arg->m_no)
+        arg->m_no= no;
+    }
+    return 0;
+  }
+
+
+  /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
+  trx_id_t get_rw_trx_hash_version()
+  {
+    return static_cast<trx_id_t>
+           (my_atomic_load64_explicit(reinterpret_cast<int64*>
+                                      (&m_rw_trx_hash_version),
+                                      MY_MEMORY_ORDER_ACQUIRE));
+  }
+
+
+  /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
+  void refresh_rw_trx_hash_version()
+  {
+    my_atomic_add64_explicit(reinterpret_cast<int64*>(&m_rw_trx_hash_version),
+                             1, MY_MEMORY_ORDER_RELEASE);
+  }
+
+
+  /**
+    Allocates new transaction id without refreshing rw_trx_hash version.
+
+    This method is extracted for exclusive use by register_rw() and
+    assign_new_trx_no() where new id must be allocated atomically with
+    payload of these methods from MVCC snapshot point of view.
+
+    @sa get_new_trx_id()
+    @sa assign_new_trx_no()
+
+    @return new transaction id
+  */
+
+  trx_id_t get_new_trx_id_no_refresh()
+  {
+    return static_cast<trx_id_t>(my_atomic_add64_explicit(
+      reinterpret_cast<int64*>(&m_max_trx_id), 1, MY_MEMORY_ORDER_RELAXED));
+  }
+};
+
+
+/** The transaction system */
+extern trx_sys_t trx_sys;
 
 #endif