path: root/storage/xtradb/trx
author Sergei Golubchik <serg@mariadb.org> 2015-04-27 23:37:51 +0200
committer Sergei Golubchik <serg@mariadb.org> 2015-04-27 23:37:51 +0200
commit fd39c56effd5b56aae2ebe7709a1fbf73503edcd (patch)
tree 17d03ddb4e6a2acbb6d90ba76753c7690653f7b4 /storage/xtradb/trx
parent 13927f878e02c33d118cac43b14bd06d2382eb26 (diff)
download mariadb-git-fd39c56effd5b56aae2ebe7709a1fbf73503edcd.tar.gz
move to storage/xtradb/
Diffstat (limited to 'storage/xtradb/trx')
-rw-r--r--  storage/xtradb/trx/trx0i_s.c    1607
-rw-r--r--  storage/xtradb/trx/trx0purge.c  1254
-rw-r--r--  storage/xtradb/trx/trx0rec.c    1698
-rw-r--r--  storage/xtradb/trx/trx0roll.c   1357
-rw-r--r--  storage/xtradb/trx/trx0rseg.c    374
-rw-r--r--  storage/xtradb/trx/trx0sys.c    2049
-rw-r--r--  storage/xtradb/trx/trx0trx.c    2449
-rw-r--r--  storage/xtradb/trx/trx0undo.c   2000
8 files changed, 12788 insertions, 0 deletions
diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.c
new file mode 100644
index 00000000000..8b3a83585cc
--- /dev/null
+++ b/storage/xtradb/trx/trx0i_s.c
@@ -0,0 +1,1607 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.c
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+/* Found during the build of 5.5.3 on Linux 2.4 and early 2.6 kernels:
+ The include chain "univ.i" -> "my_global.h" causes a different path
+ to be taken further down with pthread functions and types,
+ so these includes must come first.
+ From the symptoms, this is related to bug#46587 in the MySQL bug DB.
+*/
+#include "univ.i"
+
+#include <mysql/plugin.h>
+
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "ha_prototypes.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "sync0sync.h"
+#include "sync0types.h"
+#include "trx0i_s.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "ut0mem.h"
+#include "ut0ut.h"
+
+/** Initial number of rows in the table cache */
+#define TABLE_CACHE_INITIAL_ROWSNUM 1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent one is N/2 where N is the number of rows we have allocated
+till now, the 39th chunk would accommodate 1677416425 rows and all
+chunks would accommodate 3354832851 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE 39
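+
+#if 0
+/** A sketch (not compiled; the helper name is ours) illustrating the
+growth rule above: the first chunk holds TABLE_CACHE_INITIAL_ROWSNUM
+rows and each subsequent chunk holds half of the rows allocated so far,
+so the totals grow by a factor of 1.5 per chunk:
+1024, 1536, 2304, 3456, 5184, ... */
+static void
+table_cache_print_chunk_growth(void)
+{
+	unsigned long long total = 0;
+	unsigned long long chunk;
+	ulint i;
+
+	for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+		chunk = (i == 0)
+			? TABLE_CACHE_INITIAL_ROWSNUM
+			: total / 2;
+		total += chunk;
+		printf("chunk %2lu: %llu rows, %llu in total\n",
+		       (ulong) (i + 1), chunk, total);
+	}
+}
+#endif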
+
+/** The following are some testing auxiliary macros. Do not enable them
+in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different,
+resulting in equal rows being put in different cells of the hash
+table. Checking for duplicates will be flawed because a different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that the lock is not present and insertion will be
+performed in the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be a noop or may result in lots of rows
+being added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table; duplicates may appear
+if this is enabled. Also, if this is enabled, searching the hash is a
+noop because it will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_STORAGE(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_ALLOC(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd \
+ - ha_storage_get_size((cache)->storage))
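+
+/* A note on the two limits above (a sketch of the accounting, assuming
+the ha_storage_put_*_memlim() functions check the resulting storage
+size against the limit passed to them): both keep mem_allocd plus the
+string storage size under TRX_I_S_MEM_LIMIT; the first is applied when
+strings are copied into the cache storage, the second when a new chunk
+of rows is allocated. */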
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. */
+typedef struct i_s_mem_chunk_struct {
+ ulint offset; /*!< offset, in number of rows */
+ ulint rows_allocd; /*!< the size of this chunk, in number
+ of rows */
+ void* base; /*!< start of the chunk */
+} i_s_mem_chunk_t;
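+
+/* Example of the flat-array view (hypothetical sizes): if chunk 0 has
+offset 0 and 1024 rows and chunk 1 was allocated with 512 rows, then
+chunk 1 gets offset 1024, and flat row number n lives in the chunk for
+which offset <= n < offset + rows_allocd, at address
+base + (n - offset) * row_size. */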
+
+/** This represents one table's cache. */
+typedef struct i_s_table_cache_struct {
+ ulint rows_used; /*!< number of used rows */
+ ulint rows_allocd; /*!< number of allocated rows */
+ ulint row_size; /*!< size of a single row */
+ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+ memory chunks that stores the
+ rows */
+} i_s_table_cache_t;
+
+/** This structure describes the intermediate buffer */
+struct trx_i_s_cache_struct {
+ rw_lock_t rw_lock; /*!< read-write lock protecting
+ the rest of this structure */
+ ullint last_read; /*!< last time the cache was read;
+ measured in microseconds since
+ epoch */
+ mutex_t last_read_mutex;/*!< mutex protecting the
+ last_read member - it is updated
+ inside a shared lock of the
+ rw_lock member */
+ i_s_table_cache_t innodb_trx; /*!< innodb_trx table */
+ i_s_table_cache_t innodb_locks; /*!< innodb_locks table */
+ i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM 10000
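+/* e.g. with 8-byte pointers that is roughly 80 KB for 10000 cells */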
+ hash_table_t* locks_hash; /*!< hash table used to eliminate
+ duplicate entries in the
+ innodb_locks table */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE 1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS 2048
+ ha_storage_t* storage; /*!< storage for external volatile
+ data that can possibly not be
+ available later, when we release
+ the kernel mutex */
+ ulint mem_allocd; /*!< the amount of memory
+ allocated with mem_alloc*() */
+ ibool is_truncated; /*!< this is TRUE if the memory
+ limit was hit and thus the data
+ in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+static trx_i_s_cache_t trx_i_s_cache_static;
+/** Pointer through which the intermediate buffer above is accessed by
+the C++ code in handler/i_s.cc. */
+UNIV_INTERN trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+
+/* Key to register the lock/mutex with performance schema */
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t trx_i_s_cache_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t cache_last_read_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/*******************************************************************//**
+For a record lock that is in waiting state this retrieves the only bit
+that is set; for a table lock it returns ULINT_UNDEFINED.
+@return record number within the heap, or ULINT_UNDEFINED */
+static
+ulint
+wait_lock_get_heap_no(
+/*==================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ret = lock_rec_find_set_bit(lock);
+ ut_a(ret != ULINT_UNDEFINED);
+ break;
+ case LOCK_TABLE:
+ ret = ULINT_UNDEFINED;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Initializes the members of a table cache. */
+static
+void
+table_cache_init(
+/*=============*/
+ i_s_table_cache_t* table_cache, /*!< out: table cache */
+ size_t row_size) /*!< in: the size of a
+ row */
+{
+ ulint i;
+
+ table_cache->rows_used = 0;
+ table_cache->rows_allocd = 0;
+ table_cache->row_size = row_size;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ table_cache->chunks[i].base = NULL;
+ }
+}
+
+/*******************************************************************//**
+Frees a table cache. */
+static
+void
+table_cache_free(
+/*=============*/
+ i_s_table_cache_t* table_cache) /*!< in/out: table cache */
+{
+ ulint i;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ if (table_cache->chunks[i].base) {
+ mem_free(table_cache->chunks[i].base);
+ table_cache->chunks[i].base = NULL;
+ }
+ }
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. The row is allocated if no more
+empty rows are available. The number of used rows is incremented.
+If the memory limit is hit then NULL is returned and nothing is
+allocated.
+@return empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+ i_s_table_cache_t* table_cache, /*!< in/out: table cache */
+ trx_i_s_cache_t* cache) /*!< in/out: cache to record
+ how many bytes are
+ allocated */
+{
+ ulint i;
+ void* row;
+
+ ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+ if (table_cache->rows_used == table_cache->rows_allocd) {
+
+ /* rows_used == rows_allocd means that new chunk needs
+ to be allocated: either no more empty rows in the
+ last allocated chunk or nothing has been allocated yet
+		(rows_used == rows_allocd == 0); */
+
+ i_s_mem_chunk_t* chunk;
+ ulint req_bytes;
+ ulint got_bytes;
+ ulint req_rows;
+ ulint got_rows;
+
+ /* find the first not allocated chunk */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].base == NULL) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ have been allocated :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* allocate the chunk we just found */
+
+ if (i == 0) {
+
+ /* first chunk, nothing is allocated yet */
+ req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+ } else {
+
+			/* Memory is increased by the formula
+			new = old + old / 2. We are deliberately less
+			aggressive here (the common choice would be
+			new = old * 2) because the allocated memory is
+			not freed until InnoDB exits (it is reused). So
+			it is better to allocate the memory in more,
+			smaller steps and have less unused/wasted
+			memory, than to use fewer allocation steps
+			(allocation is done only once in a lifetime
+			anyway) but end up with lots of wasted memory. */
+ req_rows = table_cache->rows_allocd / 2;
+ }
+ req_bytes = req_rows * table_cache->row_size;
+
+ if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+
+ return(NULL);
+ }
+
+ chunk = &table_cache->chunks[i];
+
+ chunk->base = mem_alloc2(req_bytes, &got_bytes);
+
+ got_rows = got_bytes / table_cache->row_size;
+
+ cache->mem_allocd += got_bytes;
+
+#if 0
+ printf("allocating chunk %d req bytes=%lu, got bytes=%lu, "
+ "row size=%lu, "
+ "req rows=%lu, got rows=%lu\n",
+ i, req_bytes, got_bytes,
+ table_cache->row_size,
+ req_rows, got_rows);
+#endif
+
+ chunk->rows_allocd = got_rows;
+
+ table_cache->rows_allocd += got_rows;
+
+ /* adjust the offset of the next chunk */
+ if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+ table_cache->chunks[i + 1].offset
+ = chunk->offset + chunk->rows_allocd;
+ }
+
+ /* return the first empty row in the newly allocated
+ chunk */
+ row = chunk->base;
+ } else {
+
+ char* chunk_start;
+ ulint offset;
+
+ /* there is an empty row, no need to allocate new
+ chunks */
+
+ /* find the first chunk that contains allocated but
+ empty/unused rows */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd
+ > table_cache->rows_used) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ are full, but
+ table_cache->rows_used != table_cache->rows_allocd means
+ exactly the opposite - there are allocated but
+ empty/unused rows :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ chunk_start = (char*) table_cache->chunks[i].base;
+ offset = table_cache->rows_used
+ - table_cache->chunks[i].offset;
+
+ row = chunk_start + offset * table_cache->row_size;
+ }
+
+ table_cache->rows_used++;
+
+ return(row);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a row in the locks cache.
+@return TRUE if valid */
+static
+ibool
+i_s_locks_row_validate(
+/*===================*/
+ const i_s_locks_row_t* row) /*!< in: row to validate */
+{
+ ut_ad(row->lock_trx_id != 0);
+ ut_ad(row->lock_mode != NULL);
+ ut_ad(row->lock_type != NULL);
+ ut_ad(row->lock_table != NULL);
+ ut_ad(row->lock_table_id != 0);
+
+ if (row->lock_space == ULINT_UNDEFINED) {
+ /* table lock */
+ ut_ad(!strcmp("TABLE", row->lock_type));
+ ut_ad(row->lock_index == NULL);
+ ut_ad(row->lock_data == NULL);
+ ut_ad(row->lock_page == ULINT_UNDEFINED);
+ ut_ad(row->lock_rec == ULINT_UNDEFINED);
+ } else {
+ /* record lock */
+ ut_ad(!strcmp("RECORD", row->lock_type));
+ ut_ad(row->lock_index != NULL);
+ /* row->lock_data == NULL if buf_page_try_get() == NULL */
+ ut_ad(row->lock_page != ULINT_UNDEFINED);
+ ut_ad(row->lock_rec != ULINT_UNDEFINED);
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Fills an i_s_trx_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+ i_s_trx_row_t* row, /*!< out: result object
+ that's filled */
+ const trx_t* trx, /*!< in: transaction to
+ get data from */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ corresponding row in
+ innodb_locks if trx is
+ waiting or NULL if trx
+ is not waiting */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into
+ which to copy volatile
+ strings */
+{
+ const char* stmt;
+ size_t stmt_len;
+ const char* s;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ row->trx_id = trx->id;
+ row->trx_started = (ib_time_t) trx->start_time;
+ row->trx_state = trx_get_que_state_str(trx);
+ row->requested_lock_row = requested_lock_row;
+ ut_ad(requested_lock_row == NULL
+ || i_s_locks_row_validate(requested_lock_row));
+
+ if (trx->wait_lock != NULL) {
+ ut_a(requested_lock_row != NULL);
+ row->trx_wait_started = (ib_time_t) trx->wait_started;
+ } else {
+ ut_a(requested_lock_row == NULL);
+ row->trx_wait_started = 0;
+ }
+
+ row->trx_weight = (ullint) TRX_WEIGHT(trx);
+
+ if (trx->mysql_thd == NULL) {
+		/* For internal transactions, e.g. purge, and for
+		transactions being recovered at startup, there is no
+		associated MySQL thread data structure. */
+ row->trx_mysql_thread_id = 0;
+ row->trx_query = NULL;
+ goto thd_done;
+ }
+
+ row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+ stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
+
+ if (stmt != NULL) {
+ char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+
+ if (stmt_len > TRX_I_S_TRX_QUERY_MAX_LEN) {
+ stmt_len = TRX_I_S_TRX_QUERY_MAX_LEN;
+ }
+
+ memcpy(query, stmt, stmt_len);
+ query[stmt_len] = '\0';
+
+ row->trx_query = ha_storage_put_memlim(
+ cache->storage, query, stmt_len + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ row->trx_query_cs = innobase_get_charset(trx->mysql_thd);
+
+ if (row->trx_query == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_query = NULL;
+ }
+
+thd_done:
+ s = trx->op_info;
+
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s, row->trx_operation_state,
+ TRX_I_S_TRX_OP_STATE_MAX_LEN, cache);
+
+ if (row->trx_operation_state == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_operation_state = NULL;
+ }
+
+ row->trx_tables_in_use = trx->n_mysql_tables_in_use;
+
+ row->trx_tables_locked = trx->mysql_n_tables_locked;
+
+ row->trx_lock_structs = UT_LIST_GET_LEN(trx->trx_locks);
+
+ row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock_heap);
+
+ row->trx_rows_locked = lock_number_of_rows_locked(trx);
+
+ row->trx_rows_modified = trx->undo_no;
+
+ row->trx_concurrency_tickets = trx->n_tickets_to_enter_innodb;
+
+ switch (trx->isolation_level) {
+ case TRX_ISO_READ_UNCOMMITTED:
+ row->trx_isolation_level = "READ UNCOMMITTED";
+ break;
+ case TRX_ISO_READ_COMMITTED:
+ row->trx_isolation_level = "READ COMMITTED";
+ break;
+ case TRX_ISO_REPEATABLE_READ:
+ row->trx_isolation_level = "REPEATABLE READ";
+ break;
+ case TRX_ISO_SERIALIZABLE:
+ row->trx_isolation_level = "SERIALIZABLE";
+ break;
+	/* should not happen, the isolation level is always set to one
+	of the values above */
+ default:
+ row->trx_isolation_level = "UNKNOWN";
+ }
+
+ row->trx_unique_checks = (ibool) trx->check_unique_secondary;
+
+ row->trx_foreign_key_checks = (ibool) trx->check_foreigns;
+
+ s = trx->detailed_error;
+
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s,
+ row->trx_foreign_key_error,
+ TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache);
+
+ if (row->trx_foreign_key_error == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+ row->trx_foreign_key_error = NULL;
+ }
+
+ row->trx_has_search_latch = (ibool) trx->has_search_latch;
+
+ row->trx_search_latch_timeout = trx->search_latch_timeout;
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Format the nth field of "rec" and put it in "buf". The result is always
+NUL-terminated.
+@return number of bytes written to "buf", including the terminating
+NUL */
+static
+ulint
+put_nth_field(
+/*==========*/
+ char* buf, /*!< out: buffer */
+ ulint buf_size,/*!< in: buffer size in bytes */
+ ulint n, /*!< in: number of field */
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets)/*!< in: record offsets, returned
+ by rec_get_offsets() */
+{
+ const byte* data;
+ ulint data_len;
+ dict_field_t* dict_field;
+ ulint ret;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ ret = 0;
+
+ if (n > 0) {
+ /* we must append ", " before the actual data */
+
+ if (buf_size < 3) {
+
+ buf[0] = '\0';
+ return(1);
+ }
+
+ memcpy(buf, ", ", 3);
+
+ buf += 2;
+ buf_size -= 2;
+ ret += 2;
+ }
+
+ /* now buf_size >= 1 */
+
+ data = rec_get_nth_field(rec, offsets, n, &data_len);
+
+ dict_field = dict_index_get_nth_field(index, n);
+
+ ret += row_raw_format((const char*) data, data_len,
+ dict_field, buf, buf_size);
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of an i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+ const char** lock_data,/*!< out: "lock_data" to fill */
+ const lock_t* lock, /*!< in: lock used to find the data */
+ ulint heap_no,/*!< in: rec num used to find the data */
+ trx_i_s_cache_t* cache) /*!< in/out: cache where to store
+ volatile data */
+{
+ mtr_t mtr;
+
+ const buf_block_t* block;
+ const page_t* page;
+ const rec_t* rec;
+
+ ut_a(lock_get_type(lock) == LOCK_REC);
+
+ mtr_start(&mtr);
+
+ block = buf_page_try_get(lock_rec_get_space_id(lock),
+ lock_rec_get_page_no(lock),
+ &mtr);
+
+ if (block == NULL) {
+
+ *lock_data = NULL;
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ page = (const page_t*) buf_block_get_frame(block);
+
+ rec = page_find_rec_with_heap_no(page, heap_no);
+
+ if (page_rec_is_infimum(rec)) {
+
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage, "infimum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ } else if (page_rec_is_supremum(rec)) {
+
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage, "supremum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ } else {
+
+ const dict_index_t* index;
+ ulint n_fields;
+ mem_heap_t* heap;
+ ulint offsets_onstack[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ char buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+ ulint buf_used;
+ ulint i;
+
+ rec_offs_init(offsets_onstack);
+ offsets = offsets_onstack;
+
+ index = lock_rec_get_index(lock);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ ut_a(n_fields > 0);
+
+ heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, n_fields,
+ &heap);
+
+ /* format and store the data */
+
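+		/* put_nth_field() reports the number of bytes it
+		wrote, including the terminating NUL; subtracting 1
+		below makes each subsequent field start on top of that
+		NUL, so the fields end up concatenated into a single
+		NUL-terminated string */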
+ buf_used = 0;
+ for (i = 0; i < n_fields; i++) {
+
+ buf_used += put_nth_field(
+ buf + buf_used, sizeof(buf) - buf_used,
+ i, index, rec, offsets) - 1;
+ }
+
+ *lock_data = (const char*) ha_storage_put_memlim(
+ cache->storage, buf, buf_used + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (UNIV_UNLIKELY(heap != NULL)) {
+
+ /* this means that rec_get_offsets() has created a new
+ heap and has stored offsets in it; check that this is
+ really the case and free the heap */
+ ut_a(offsets != offsets_onstack);
+ mem_heap_free(heap);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ if (*lock_data == NULL) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills an i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_locks_row(
+/*===========*/
+ i_s_locks_row_t* row, /*!< out: result object that's filled */
+ const lock_t* lock, /*!< in: lock to get data from */
+ ulint heap_no,/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy
+ volatile strings */
+{
+ row->lock_trx_id = lock_get_trx_id(lock);
+ row->lock_mode = lock_get_mode_str(lock);
+ row->lock_type = lock_get_type_str(lock);
+
+ row->lock_table = ha_storage_put_str_memlim(
+ cache->storage, lock_get_table_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_table == NULL) {
+
+ return(FALSE);
+ }
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ row->lock_index = ha_storage_put_str_memlim(
+ cache->storage, lock_rec_get_index_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_index == NULL) {
+
+ return(FALSE);
+ }
+
+ row->lock_space = lock_rec_get_space_id(lock);
+ row->lock_page = lock_rec_get_page_no(lock);
+ row->lock_rec = heap_no;
+
+ if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+
+ break;
+ case LOCK_TABLE:
+ row->lock_index = NULL;
+
+ row->lock_space = ULINT_UNDEFINED;
+ row->lock_page = ULINT_UNDEFINED;
+ row->lock_rec = ULINT_UNDEFINED;
+
+ row->lock_data = NULL;
+
+ break;
+ default:
+ ut_error;
+ }
+
+ row->lock_table_id = lock_get_table_id(lock);
+
+ row->hash_chain.value = row;
+ ut_ad(i_s_locks_row_validate(row));
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills an i_s_lock_waits_row_t object. Returns its first argument.
+@return result object that's filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+ i_s_lock_waits_row_t* row, /*!< out: result object
+ that's filled */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ ut_ad(i_s_locks_row_validate(requested_lock_row));
+ ut_ad(i_s_locks_row_validate(blocking_lock_row));
+
+ row->requested_lock_row = requested_lock_row;
+ row->blocking_lock_row = blocking_lock_row;
+
+ return(row);
+}
+
+/*******************************************************************//**
+Calculates a hash fold for a lock. For a record lock the fold is
+calculated from 4 elements, which uniquely identify a lock at a given
+point in time: transaction id, space id, page number, record number.
+For a table lock the fold is the table's id.
+@return fold */
+static
+ulint
+fold_lock(
+/*======*/
+ const lock_t* lock, /*!< in: lock object to fold */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+ static ulint fold = 0;
+
+ return(fold++);
+#else
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != ULINT_UNDEFINED);
+
+ ret = ut_fold_ulint_pair((ulint) lock_get_trx_id(lock),
+ lock_rec_get_space_id(lock));
+
+ ret = ut_fold_ulint_pair(ret,
+ lock_rec_get_page_no(lock));
+
+ ret = ut_fold_ulint_pair(ret, heap_no);
+
+ break;
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == ULINT_UNDEFINED);
+
+ ret = (ulint) lock_get_table_id(lock);
+
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+#endif
+}
+
+/*******************************************************************//**
+Checks whether an i_s_locks_row_t object represents a lock_t object.
+@return TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ const lock_t* lock, /*!< in: lock object */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+ ut_ad(i_s_locks_row_validate(row));
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+ return(0);
+#else
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != ULINT_UNDEFINED);
+
+ return(row->lock_trx_id == lock_get_trx_id(lock)
+ && row->lock_space == lock_rec_get_space_id(lock)
+ && row->lock_page == lock_rec_get_page_no(lock)
+ && row->lock_rec == heap_no);
+
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == ULINT_UNDEFINED);
+
+ return(row->lock_trx_id == lock_get_trx_id(lock)
+ && row->lock_table_id == lock_get_table_id(lock));
+
+ default:
+ ut_error;
+ return(FALSE);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Searches the innodb_locks cache for a row that matches a given lock.
+This happens in O(1) time since a hash table is used. Returns a pointer
+to the row or NULL if none is found.
+@return row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ const lock_t* lock, /*!< in: lock to search for */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+ i_s_hash_chain_t* hash_chain;
+
+ HASH_SEARCH(
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* the type of the next variable */
+ i_s_hash_chain_t*,
+ /* auxiliary variable */
+ hash_chain,
+ /* assertion on every traversed item */
+ ut_ad(i_s_locks_row_validate(hash_chain->value)),
+ /* this determines if we have found the lock */
+ locks_row_eq_lock(hash_chain->value, lock, heap_no));
+
+ if (hash_chain == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(hash_chain->value);
+}
+
+/*******************************************************************//**
+Adds a new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If the row can not be allocated then NULL is returned.
+@return row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const lock_t* lock, /*!< in: the element to add */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+ i_s_locks_row_t* dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ ulint i;
+ for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+ /* quit if this lock is already present */
+ dst_row = search_innodb_locks(cache, lock, heap_no);
+ if (dst_row != NULL) {
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+ }
+#endif
+
+ dst_row = (i_s_locks_row_t*)
+ table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(NULL);
+ }
+
+ if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_locks.rows_used--;
+ return(NULL);
+ }
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+ HASH_INSERT(
+ /* the type used in the hash chain */
+ i_s_hash_chain_t,
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* add this data to the hash */
+ &dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ } /* for()-loop */
+#endif
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+}
+
+/*******************************************************************//**
+Adds a new pair of locks to the lock waits cache.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ i_s_lock_waits_row_t* dst_row;
+
+ dst_row = (i_s_lock_waits_row_t*)
+ table_cache_create_empty_row(&cache->innodb_lock_waits,
+ cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(FALSE);
+ }
+
+ fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Adds a transaction's relevant (important) locks to the cache.
+If the transaction is waiting, then the wait lock is added to
+innodb_locks and a pointer to the added row is returned in
+requested_lock_row, otherwise requested_lock_row is set to NULL.
+If rows can not be allocated then FALSE is returned and the value of
+requested_lock_row is undefined.
+@return FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const trx_t* trx, /*!< in: transaction */
+ i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the
+ requested lock row, or NULL or
+ undefined */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+	/* If the transaction is waiting, we add the wait lock and all
+	locks from other transactions that are blocking the wait lock. */
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+ const lock_t* curr_lock;
+ ulint wait_lock_heap_no;
+ i_s_locks_row_t* blocking_lock_row;
+ lock_queue_iterator_t iter;
+
+ ut_a(trx->wait_lock != NULL);
+
+ wait_lock_heap_no
+ = wait_lock_get_heap_no(trx->wait_lock);
+
+ /* add the requested lock */
+ *requested_lock_row
+ = add_lock_to_cache(cache, trx->wait_lock,
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (*requested_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* then iterate over the locks before the wait lock and
+ add the ones that are blocking it */
+
+ lock_queue_iterator_reset(&iter, trx->wait_lock,
+ ULINT_UNDEFINED);
+
+ curr_lock = lock_queue_iterator_get_prev(&iter);
+ while (curr_lock != NULL) {
+
+ if (lock_has_to_wait(trx->wait_lock,
+ curr_lock)) {
+
+ /* add the lock that is
+ blocking trx->wait_lock */
+ blocking_lock_row
+ = add_lock_to_cache(
+ cache, curr_lock,
+ /* heap_no is the same
+ for the wait and waited
+ locks */
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (blocking_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* add the relation between both locks
+ to innodb_lock_waits */
+ if (!add_lock_wait_to_cache(
+ cache, *requested_lock_row,
+ blocking_lock_row)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+ }
+
+ curr_lock = lock_queue_iterator_get_prev(&iter);
+ }
+ } else {
+
+ *requested_lock_row = NULL;
+ }
+
+ return(TRUE);
+}
+
+/** The minimum time for which the cache must not be updated after it
+was last read; measured in microseconds. We use this technique to
+ensure that SELECTs which join several INFORMATION SCHEMA tables read
+the same version of the cache. */
+#define CACHE_MIN_IDLE_TIME_US 100000 /* 0.1 sec */
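+
+/* For example: trx_i_s_cache_end_read() stamps last_read when a read
+completes at some time t; until t + 0.1 s can_cache_be_updated() below
+returns FALSE, so a SELECT that reads innodb_trx and then joins
+innodb_locks within that window sees one consistent snapshot of the
+cache. */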
+
+/*******************************************************************//**
+Checks if the cache can safely be updated.
+@return TRUE if can be updated */
+static
+ibool
+can_cache_be_updated(
+/*=================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ullint now;
+
+ /* Here we read cache->last_read without acquiring its mutex
+ because last_read is only updated when a shared rw lock on the
+ whole cache is being held (see trx_i_s_cache_end_read()) and
+ we are currently holding an exclusive rw lock on the cache.
+ So it is not possible for last_read to be updated while we are
+ reading it. */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ now = ut_time_us(NULL);
+ if (now - cache->last_read > CACHE_MIN_IDLE_TIME_US) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Declare a cache empty, preparing it to be filled up. Not all resources
+are freed because they can be reused. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+ trx_i_s_cache_t* cache) /*!< out: cache to clear */
+{
+ cache->innodb_trx.rows_used = 0;
+ cache->innodb_locks.rows_used = 0;
+ cache->innodb_lock_waits.rows_used = 0;
+
+ hash_table_clear(cache->locks_hash);
+
+ ha_storage_empty(&cache->storage);
+}
+
+/*******************************************************************//**
+Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+table cache buffer. Cache must be locked for write. */
+static
+void
+fetch_data_into_cache(
+/*==================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ trx_t* trx;
+ i_s_trx_row_t* trx_row;
+ i_s_locks_row_t* requested_lock_row;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx_i_s_cache_clear(cache);
+
+ /* We iterate over the list of all transactions and add each one
+ to innodb_trx's cache. We also add all locks that are relevant
+ to each transaction into innodb_locks' and innodb_lock_waits'
+ caches. */
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ if (!add_trx_relevant_locks_to_cache(cache, trx,
+ &requested_lock_row)) {
+
+ cache->is_truncated = TRUE;
+ return;
+ }
+
+ trx_row = (i_s_trx_row_t*)
+ table_cache_create_empty_row(&cache->innodb_trx,
+ cache);
+
+ /* memory could not be allocated */
+ if (trx_row == NULL) {
+
+ cache->is_truncated = TRUE;
+ return;
+ }
+
+ if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_trx.rows_used--;
+ cache->is_truncated = TRUE;
+ return;
+ }
+ }
+
+ cache->is_truncated = FALSE;
+}
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+Called from handler/i_s.cc.
+@return 0 - fetched, 1 - not */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ if (!can_cache_be_updated(cache)) {
+
+ return(1);
+ }
+
+ /* We need to read trx_sys and record/table lock queues */
+ mutex_enter(&kernel_mutex);
+
+ fetch_data_into_cache(cache);
+
+ mutex_exit(&kernel_mutex);
+
+ return(0);
+}
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit imposed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ return(cache->is_truncated);
+}
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< out: cache to init */
+{
+ /* The latching is done in the following order:
+ acquire trx_i_s_cache_t::rw_lock, X
+ acquire kernel_mutex
+ release kernel_mutex
+ release trx_i_s_cache_t::rw_lock
+ acquire trx_i_s_cache_t::rw_lock, S
+ acquire trx_i_s_cache_t::last_read_mutex
+ release trx_i_s_cache_t::last_read_mutex
+ release trx_i_s_cache_t::rw_lock */
+
+ rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock,
+ SYNC_TRX_I_S_RWLOCK);
+
+ cache->last_read = 0;
+
+ mutex_create(cache_last_read_mutex_key,
+ &cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ);
+
+ table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+ table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+ table_cache_init(&cache->innodb_lock_waits,
+ sizeof(i_s_lock_waits_row_t));
+
+ cache->locks_hash = hash_create(LOCKS_HASH_CELLS_NUM);
+
+ cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+ CACHE_STORAGE_HASH_CELLS);
+
+ cache->mem_allocd = 0;
+
+ cache->is_truncated = FALSE;
+}
+
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< in, own: cache to free */
+{
+ hash_table_free(cache->locks_hash);
+ ha_storage_free(cache->storage);
+ table_cache_free(&cache->innodb_trx);
+ table_cache_free(&cache->innodb_locks);
+ table_cache_free(&cache->innodb_lock_waits);
+ memset(cache, 0, sizeof *cache);
+}
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_s_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ullint now;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED));
+#endif
+
+ /* update cache last read time */
+ now = ut_time_us(NULL);
+ mutex_enter(&cache->last_read_mutex);
+ cache->last_read = now;
+ mutex_exit(&cache->last_read_mutex);
+
+ rw_lock_s_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_x_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ rw_lock_x_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Selects an INFORMATION SCHEMA table cache from the whole cache.
+@return table cache */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+ trx_i_s_cache_t* cache, /*!< in: whole cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED)
+ || rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ switch (table) {
+ case I_S_INNODB_TRX:
+ table_cache = &cache->innodb_trx;
+ break;
+ case I_S_INNODB_LOCKS:
+ table_cache = &cache->innodb_locks;
+ break;
+ case I_S_INNODB_LOCK_WAITS:
+ table_cache = &cache->innodb_lock_waits;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(table_cache);
+}
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+ table_cache = cache_select_table(cache, table);
+
+ return(table_cache->rows_used);
+}
+
+/*******************************************************************//**
+Retrieves the nth row (zero-based) in the cache for a given
+INFORMATION SCHEMA table.
+@return row */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n) /*!< in: row number */
+{
+ i_s_table_cache_t* table_cache;
+ ulint i;
+ void* row;
+
+ table_cache = cache_select_table(cache, table);
+
+ ut_a(n < table_cache->rows_used);
+
+ row = NULL;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd > n) {
+
+ row = (char*) table_cache->chunks[i].base
+ + (n - table_cache->chunks[i].offset)
+ * table_cache->row_size;
+ break;
+ }
+ }
+
+ ut_a(row != NULL);
+
+ return(row);
+}
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 bytes
+if you want to be 100% sure that it will not abort.
+@return resulting lock id */
+UNIV_INTERN
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size)/*!< in: size of the lock id
+ buffer */
+{
+ int res_len;
+
+ /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
+ if (row->lock_space != ULINT_UNDEFINED) {
+ /* record lock */
+ res_len = ut_snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT ":%lu:%lu:%lu",
+ row->lock_trx_id, row->lock_space,
+ row->lock_page, row->lock_rec);
+ } else {
+ /* table lock */
+ res_len = ut_snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT ":%llu",
+ row->lock_trx_id,
+ row->lock_table_id);
+ }
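+	/* for example (hypothetical values): a record lock of trx 1234
+	on space 5, page 3, heap no 28 yields "1234:5:3:28", while a
+	table lock of trx 1234 on table id 16 yields "1234:16" */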
+
+	/* the typecast is safe because snprintf(3) never returns
+	a negative result */
+ ut_a(res_len >= 0);
+ ut_a((ulint) res_len < lock_id_size);
+
+ return(lock_id);
+}
diff --git a/storage/xtradb/trx/trx0purge.c b/storage/xtradb/trx/trx0purge.c
new file mode 100644
index 00000000000..d343a73c9d8
--- /dev/null
+++ b/storage/xtradb/trx/trx0purge.c
@@ -0,0 +1,1254 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.c
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+
+#ifdef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "read0read.h"
+#include "fut0fut.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "trx0rec.h"
+#include "srv0srv.h"
+#include "os0thread.h"
+
+/** The global data structure coordinating a purge */
+UNIV_INTERN trx_purge_t* purge_sys = NULL;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+UNIV_INTERN trx_undo_rec_t trx_purge_dummy_rec;
+
+#ifdef UNIV_PFS_RWLOCK
+/* Key to register trx_purge_latch with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register purge_sys_bh_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN my_bool srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+/*****************************************************************//**
+Checks if trx_id is >= purge_view: then it is guaranteed that its update
+undo log still exists in the system.
+@return TRUE if it is certain that the undo log is preserved; even if
+the function returns FALSE, it is possible that the undo log still
+exists in the system */
+UNIV_INTERN
+ibool
+trx_purge_update_undo_must_exist(
+/*=============================*/
+ trx_id_t trx_id) /*!< in: transaction id */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
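+	/* if the purge view cannot see trx_id, purge has not yet
+	processed that transaction, so its update undo log must still
+	exist in the system */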
+ if (!read_view_sees_trx_id(purge_sys->view, trx_id)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*=================== PURGE RECORD ARRAY =============================*/
+
+/*******************************************************************//**
+Stores info of an undo log record during a purge.
+@return pointer to the storage cell */
+static
+trx_undo_inf_t*
+trx_purge_arr_store_info(
+/*=====================*/
+ trx_id_t trx_no, /*!< in: transaction number */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_inf_t* cell;
+ trx_undo_arr_t* arr;
+ ulint i;
+
+ arr = purge_sys->arr;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (!(cell->in_use)) {
+ /* Not in use, we may store here */
+ cell->undo_no = undo_no;
+ cell->trx_no = trx_no;
+ cell->in_use = TRUE;
+
+ arr->n_used++;
+
+ return(cell);
+ }
+ }
+}
+
+/*******************************************************************//**
+Removes info of an undo log record during a purge. */
+UNIV_INLINE
+void
+trx_purge_arr_remove_info(
+/*======================*/
+ trx_undo_inf_t* cell) /*!< in: pointer to the storage cell */
+{
+ trx_undo_arr_t* arr;
+
+ arr = purge_sys->arr;
+
+ cell->in_use = FALSE;
+
+ ut_ad(arr->n_used > 0);
+
+ arr->n_used--;
+}
+
+/*******************************************************************//**
+Gets the biggest (trx number, undo number) pair in a purge array. */
+static
+void
+trx_purge_arr_get_biggest(
+/*======================*/
+ trx_undo_arr_t* arr, /*!< in: purge array */
+ trx_id_t* trx_no, /*!< out: transaction number: 0
+ if array is empty */
+ undo_no_t* undo_no)/*!< out: undo number */
+{
+ trx_undo_inf_t* cell;
+ trx_id_t pair_trx_no;
+ undo_no_t pair_undo_no;
+ ulint i;
+ ulint n;
+
+ n = arr->n_used;
+ pair_trx_no = 0;
+ pair_undo_no = 0;
+
+ if (n) {
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (!cell->in_use) {
+ continue;
+ }
+
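+			/* keep the lexicographically largest
+			(trx_no, undo_no) pair seen so far */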
+ if ((cell->trx_no > pair_trx_no)
+ || ((cell->trx_no == pair_trx_no)
+ && cell->undo_no >= pair_undo_no)) {
+
+ pair_trx_no = cell->trx_no;
+ pair_undo_no = cell->undo_no;
+ }
+
+ if (!--n) {
+ break;
+ }
+ }
+ }
+
+ *trx_no = pair_trx_no;
+ *undo_no = pair_undo_no;
+}
+
+/****************************************************************//**
+Builds a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+@return own: the query graph */
+static
+que_t*
+trx_purge_graph_build(void)
+/*=======================*/
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ /* que_thr_t* thr2; */
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap);
+ fork->trx = purge_sys->trx;
+
+ thr = que_thr_create(fork, heap);
+
+ thr->child = row_purge_node_create(thr, heap);
+
+ /* thr2 = que_thr_create(fork, fork, heap);
+
+ thr2->child = row_purge_node_create(fork, thr2, heap); */
+
+ return(fork);
+}
+
+/********************************************************************//**
+Creates the global purge system control structure and inits the history
+mutex. */
+UNIV_INTERN
+void
+trx_purge_sys_create(
+/*=================*/
+ ib_bh_t* ib_bh) /*!< in, own: UNDO log min binary heap */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ purge_sys = mem_zalloc(sizeof(trx_purge_t));
+
+ /* Take ownership of ib_bh, we are responsible for freeing it. */
+ purge_sys->ib_bh = ib_bh;
+ purge_sys->state = TRX_STOP_PURGE;
+
+ purge_sys->n_pages_handled = 0;
+
+ purge_sys->purge_trx_no = 0;
+ purge_sys->purge_undo_no = 0;
+ purge_sys->next_stored = FALSE;
+ ut_d(purge_sys->done_trx_no = 0);
+
+ rw_lock_create(trx_purge_latch_key,
+ &purge_sys->latch, SYNC_PURGE_LATCH);
+
+ mutex_create(
+ purge_sys_bh_mutex_key, &purge_sys->bh_mutex,
+ SYNC_PURGE_QUEUE);
+
+ purge_sys->heap = mem_heap_create(256);
+
+ purge_sys->arr = trx_undo_arr_create();
+
+ purge_sys->sess = sess_open();
+
+ purge_sys->trx = purge_sys->sess->trx;
+
+ purge_sys->trx->is_purge = 1;
+
+ ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED));
+
+ purge_sys->query = trx_purge_graph_build();
+
+ purge_sys->prebuilt_view =
+ read_view_oldest_copy_or_open_new(0, NULL);
+ purge_sys->view = purge_sys->prebuilt_view;
+}
+
+/********************************************************************//**
+Frees the global purge system control structure. */
+UNIV_INTERN
+void
+trx_purge_sys_close(void)
+/*======================*/
+{
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ que_graph_free(purge_sys->query);
+
+ ut_a(purge_sys->sess->trx->is_purge);
+ purge_sys->sess->trx->state = TRX_NOT_STARTED;
+
+ mutex_enter(&kernel_mutex);
+ trx_release_descriptor(purge_sys->sess->trx);
+ mutex_exit(&kernel_mutex);
+
+ sess_close(purge_sys->sess);
+ purge_sys->sess = NULL;
+
+ if (purge_sys->view != NULL) {
+		/* We acquire the kernel mutex here only because it is
+		a pre-condition of read_view_close(); we don't really
+		need it otherwise. */
+ mutex_enter(&kernel_mutex);
+
+ read_view_close(purge_sys->view);
+ read_view_free(purge_sys->prebuilt_view);
+ purge_sys->prebuilt_view = NULL;
+ purge_sys->view = NULL;
+
+ mutex_exit(&kernel_mutex);
+ }
+
+ trx_undo_arr_free(purge_sys->arr);
+
+ rw_lock_free(&purge_sys->latch);
+ mutex_free(&purge_sys->bh_mutex);
+
+ mem_heap_free(purge_sys->heap);
+
+ ib_bh_free(purge_sys->ib_bh);
+
+ mem_free(purge_sys);
+
+ purge_sys = NULL;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/********************************************************************//**
+Adds the update undo log as the first log in the history list. Removes the
+update undo log segment from the rseg slot if it is too big for reuse. */
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+ trx_t* trx, /*!< in: transaction */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_t* undo;
+ trx_rsegf_t* rseg_header;
+ trx_ulogf_t* undo_header;
+
+ undo = trx->update_undo;
+
+ ut_ad(undo);
+
+ ut_ad(mutex_own(&undo->rseg->mutex));
+
+ rseg_header = trx_rsegf_get(
+ undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no,
+ mtr);
+
+ undo_header = undo_page + undo->hdr_offset;
+ /* Add the log as the first in the history list */
+
+ if (undo->state != TRX_UNDO_CACHED) {
+ ulint hist_size;
+#ifdef UNIV_DEBUG
+ trx_usegf_t* seg_header = undo_page + TRX_UNDO_SEG_HDR;
+#endif /* UNIV_DEBUG */
+
+ /* The undo log segment will not be reused */
+
+ if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ ut_error;
+ }
+
+ trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
+
+ hist_size = mtr_read_ulint(
+ rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr);
+
+ ut_ad(undo->size == flst_get_len(
+ seg_header + TRX_UNDO_PAGE_LIST, mtr));
+
+ mlog_write_ulint(
+ rseg_header + TRX_RSEG_HISTORY_SIZE,
+ hist_size + undo->size, MLOG_4BYTES, mtr);
+ }
+
+ flst_add_first(
+ rseg_header + TRX_RSEG_HISTORY,
+ undo_header + TRX_UNDO_HISTORY_NODE, mtr);
+
+ /* Write the trx number to the undo log header */
+
+ mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
+
+ /* Write information about delete markings to the undo log header */
+
+ if (!undo->del_marks) {
+ mlog_write_ulint(
+ undo_header + TRX_UNDO_DEL_MARKS, FALSE,
+ MLOG_2BYTES, mtr);
+ }
+
+ if (undo->rseg->last_page_no == FIL_NULL) {
+ undo->rseg->last_trx_no = trx->no;
+ undo->rseg->last_offset = undo->hdr_offset;
+ undo->rseg->last_page_no = undo->hdr_page_no;
+ undo->rseg->last_del_marks = undo->del_marks;
+
+ /* FIXME: Add a bin heap validate function to check that
+ the rseg exists. */
+ }
+
+ mutex_enter(&kernel_mutex);
+ trx_sys->rseg_history_len++;
+ mutex_exit(&kernel_mutex);
+
+// if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/
+ /* Inform the purge thread that there is work to do. */
+ srv_wake_purge_thread_if_not_active();
+// }
+}
+
+/**********************************************************************//**
+Frees an undo log segment which is in the history list. Cuts the end of the
+history list at the youngest undo log in this segment. */
+static
+void
+trx_purge_free_segment(
+/*===================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ fil_addr_t hdr_addr, /*!< in: the file address of log_hdr */
+ ulint n_removed_logs) /*!< in: count of how many undo logs we
+ will cut off from the end of the
+ history list */
+{
+ page_t* undo_page;
+ trx_rsegf_t* rseg_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ ibool freed;
+ ulint seg_size;
+ ulint hist_size;
+ ibool marked = FALSE;
+ mtr_t mtr;
+
+ /* fputs("Freeing an update undo log segment\n", stderr); */
+
+loop:
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ hdr_addr.page, &mtr);
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ log_hdr = undo_page + hdr_addr.boffset;
+
+ /* Mark the last undo log totally purged, so that if the system
+ crashes, the tail of the undo log will not get accessed again. The
+ list of pages in the undo log tail gets inconsistent during the
+ freeing of the segment, and therefore purge should not try to access
+ them again. */
+
+ if (!marked) {
+ mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
+ MLOG_2BYTES, &mtr);
+ marked = TRUE;
+ }
+
+ freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER,
+ &mtr);
+ if (!freed) {
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ goto loop;
+ }
+
+ /* The page list may now be inconsistent, but the length field
+ stored in the list base node tells us how big it was before we
+ started the freeing. */
+
+ seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr);
+
+ /* We may free the undo log segment header page; it must be freed
+ within the same mtr as the undo log header is removed from the
+ history list: otherwise, in case of a database crash, the segment
+ could become inaccessible garbage in the file space. */
+
+ flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY,
+ log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr);
+
+ mutex_enter(&kernel_mutex);
+ ut_ad(trx_sys->rseg_history_len >= n_removed_logs);
+ trx_sys->rseg_history_len -= n_removed_logs;
+ mutex_exit(&kernel_mutex);
+
+ freed = FALSE;
+
+ while (!freed) {
+		/* Here we assume that a file segment with just the header
+		page can be freed in a few steps, so that the buffer pool
+		is not flooded with buffer-fixed pages: see the note in
+		fsp0fsp.c. */
+
+ freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER,
+ &mtr);
+ }
+
+ hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, &mtr);
+ ut_ad(hist_size >= seg_size);
+
+ mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+ hist_size - seg_size, MLOG_4BYTES, &mtr);
+
+ ut_ad(rseg->curr_size >= seg_size);
+
+ rseg->curr_size -= seg_size;
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Removes unnecessary history data from a rollback segment. */
+static
+void
+trx_purge_truncate_rseg_history(
+/*============================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ trx_id_t limit_trx_no, /*!< in: remove update undo logs whose
+ trx number is < limit_trx_no */
+ undo_no_t limit_undo_no) /*!< in: if transaction number is equal
+ to limit_trx_no, truncate undo records
+ with undo number < limit_undo_no */
+{
+ fil_addr_t hdr_addr;
+ fil_addr_t prev_hdr_addr;
+ trx_rsegf_t* rseg_hdr;
+ page_t* undo_page;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ ulint n_removed_logs = 0;
+ mtr_t mtr;
+ trx_id_t undo_trx_no;
+
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ hdr_addr = trx_purge_get_log_from_hist(
+ flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr));
+loop:
+ if (hdr_addr.page == FIL_NULL) {
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ hdr_addr.page, &mtr);
+
+ log_hdr = undo_page + hdr_addr.boffset;
+ undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+
+ if (undo_trx_no >= limit_trx_no) {
+ if (undo_trx_no == limit_trx_no) {
+ trx_undo_truncate_start(rseg, rseg->space,
+ hdr_addr.page,
+ hdr_addr.boffset,
+ limit_undo_no);
+ }
+
+ mutex_enter(&kernel_mutex);
+ ut_a(trx_sys->rseg_history_len >= n_removed_logs);
+ trx_sys->rseg_history_len -= n_removed_logs;
+ mutex_exit(&kernel_mutex);
+
+ flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
+ log_hdr + TRX_UNDO_HISTORY_NODE,
+ n_removed_logs, &mtr);
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ prev_hdr_addr = trx_purge_get_log_from_hist(
+ flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+ n_removed_logs++;
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE)
+ && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) {
+
+ /* We can free the whole log segment */
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ trx_purge_free_segment(rseg, hdr_addr, n_removed_logs);
+
+ n_removed_logs = 0;
+ } else {
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+ }
+
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ hdr_addr = prev_hdr_addr;
+
+ goto loop;
+}
+
+/********************************************************************//**
+Removes unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller must not have any latches on undo log pages! */
+static
+void
+trx_purge_truncate_history(void)
+/*============================*/
+{
+ trx_rseg_t* rseg;
+ trx_id_t limit_trx_no;
+ undo_no_t limit_undo_no;
+
+ trx_purge_arr_get_biggest(
+ purge_sys->arr, &limit_trx_no, &limit_undo_no);
+
+ if (limit_trx_no == 0) {
+
+ limit_trx_no = purge_sys->purge_trx_no;
+ limit_undo_no = purge_sys->purge_undo_no;
+ }
+
+	/* We play it safe and set the truncate limit at most to the
+	purge view low_limit number, though this is not necessary */
+
+ if (limit_trx_no >= purge_sys->view->low_limit_no) {
+ limit_trx_no = purge_sys->view->low_limit_no;
+ limit_undo_no = 0;
+ }
+
+ ut_ad(limit_trx_no <= purge_sys->view->low_limit_no);
+
+ for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+ rseg != NULL;
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) {
+
+ trx_purge_truncate_rseg_history(
+ rseg, limit_trx_no, limit_undo_no);
+ }
+}
+
+/********************************************************************//**
+Does a truncate if the purge array is empty. NOTE that when this function is
+called, the caller must not have any latches on undo log pages! */
+UNIV_INLINE
+void
+trx_purge_truncate_if_arr_empty(void)
+/*=================================*/
+{
+ static ulint count;
+
+#ifdef UNIV_DEBUG
+ if (purge_sys->arr->n_used == 0) {
+ purge_sys->done_trx_no = purge_sys->purge_trx_no;
+ }
+#endif /* UNIV_DEBUG */
+
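+	/* Truncating the history is relatively expensive, since it
+	scans all rollback segments; do it only on every
+	TRX_SYS_N_RSEGS-th call, and only when no purge operations are
+	pending in the array. */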
+ if (!(++count % TRX_SYS_N_RSEGS) && purge_sys->arr->n_used == 0) {
+
+ trx_purge_truncate_history();
+ }
+}
+
+/***********************************************************************//**
+Updates the last not yet purged history log info in rseg when we have purged
+a whole undo log. Also advances purge_sys->purge_trx_no past the purged log. */
+static
+void
+trx_purge_rseg_get_next_history_log(
+/*================================*/
+ trx_rseg_t* rseg) /*!< in: rollback segment */
+{
+ page_t* undo_page;
+ trx_ulogf_t* log_hdr;
+ fil_addr_t prev_log_addr;
+ trx_id_t trx_no;
+ ibool del_marks;
+ mtr_t mtr;
+ rseg_queue_t rseg_queue;
+ const void* ptr;
+
+ mutex_enter(&(rseg->mutex));
+
+ ut_a(rseg->last_page_no != FIL_NULL);
+
+ purge_sys->purge_trx_no = rseg->last_trx_no + 1;
+ purge_sys->purge_undo_no = 0;
+ purge_sys->next_stored = FALSE;
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(
+ rseg->space, rseg->zip_size, rseg->last_page_no, &mtr);
+
+ log_hdr = undo_page + rseg->last_offset;
+
+ /* Increase the purge page count by one for every handled log */
+
+ purge_sys->n_pages_handled++;
+
+ prev_log_addr = trx_purge_get_log_from_hist(
+ flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+
+ if (prev_log_addr.page == FIL_NULL) {
+ /* No logs left in the history list */
+
+ rseg->last_page_no = FIL_NULL;
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ mutex_enter(&kernel_mutex);
+
+		/* Debug code to track history list corruption reported
+		on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
+		file-based list was corrupt. The prev node pointer was
+		FIL_NULL, even though the list length was over 8 million
+		nodes! We assume that purge truncates the history list in
+		large pieces, so that if we reach the head of the list
+		here, the list cannot be longer than 2 000 000 undo logs
+		now. */
+
+ if (trx_sys->rseg_history_len > 2000000) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: purge reached the"
+ " head of the history list,\n"
+ "InnoDB: but its length is still"
+ " reported as %lu! Make a detailed bug\n"
+ "InnoDB: report, and submit it"
+ " to http://bugs.mysql.com\n",
+ (ulong) trx_sys->rseg_history_len);
+ ut_ad(0);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return;
+ }
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ /* Read the trx number and del marks from the previous log header */
+ mtr_start(&mtr);
+
+ log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+ prev_log_addr.page, &mtr)
+ + prev_log_addr.boffset;
+
+ trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+
+ del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS);
+
+ mtr_commit(&mtr);
+
+ mutex_enter(&(rseg->mutex));
+
+ rseg->last_page_no = prev_log_addr.page;
+ rseg->last_offset = prev_log_addr.boffset;
+ rseg->last_trx_no = trx_no;
+ rseg->last_del_marks = del_marks;
+
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = rseg->last_trx_no;
+
+	/* Purge can also produce events; however, these are already
+	ordered in the rollback segment, and any user-generated event
+	will be greater than the events that purge produces, i.e. purge
+	can never produce events from an empty rollback segment. */
+
+ mutex_enter(&purge_sys->bh_mutex);
+
+ ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
+ ut_a(ptr != NULL);
+
+ mutex_exit(&purge_sys->bh_mutex);
+
+ mutex_exit(&(rseg->mutex));
+}
+
+/***********************************************************************//**
+Chooses the rollback segment with the smallest trx_id.
+@return zip_size if the log is for a compressed table, 0 for
+uncompressed tables, ULINT_UNDEFINED if there are no rollback
+segments to purge */
+static
+ulint
+trx_purge_get_rseg_with_min_trx_id(
+/*===============================*/
+ trx_purge_t* purge_sys) /*!< in/out: purge instance */
+{
+ ulint zip_size = 0;
+
+ mutex_enter(&purge_sys->bh_mutex);
+
+	/* Only purge consumes events from the binary heap; user
+	threads only produce them. */
+
+ if (!ib_bh_is_empty(purge_sys->ib_bh)) {
+ trx_rseg_t* rseg;
+
+ rseg = ((rseg_queue_t*) ib_bh_first(purge_sys->ib_bh))->rseg;
+ ib_bh_pop(purge_sys->ib_bh);
+
+ mutex_exit(&purge_sys->bh_mutex);
+
+ purge_sys->rseg = rseg;
+ } else {
+ mutex_exit(&purge_sys->bh_mutex);
+
+ purge_sys->rseg = NULL;
+
+ return(ULINT_UNDEFINED);
+ }
+
+ ut_a(purge_sys->rseg != NULL);
+
+ mutex_enter(&purge_sys->rseg->mutex);
+
+ ut_a(purge_sys->rseg->last_page_no != FIL_NULL);
+
+ /* We assume in purge of externally stored fields
+ that space id == 0 */
+ ut_a(purge_sys->rseg->space == 0);
+
+ zip_size = purge_sys->rseg->zip_size;
+
+ ut_a(purge_sys->purge_trx_no <= purge_sys->rseg->last_trx_no);
+
+ purge_sys->purge_trx_no = purge_sys->rseg->last_trx_no;
+
+ purge_sys->hdr_offset = purge_sys->rseg->last_offset;
+
+ purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
+
+ mutex_exit(&purge_sys->rseg->mutex);
+
+ return(zip_size);
+}
+
+/***********************************************************************//**
+Position the purge sys "iterator" on the undo record to use for purging. */
+static
+void
+trx_purge_read_undo_rec(
+/*====================*/
+ trx_purge_t* purge_sys, /*!< in/out: purge instance */
+ ulint zip_size) /*!< in: block size or 0 */
+{
+ ulint page_no;
+ ulint offset = 0;
+ ib_uint64_t undo_no = 0;
+
+ purge_sys->hdr_offset = purge_sys->rseg->last_offset;
+ page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
+
+ if (purge_sys->rseg->last_del_marks) {
+ mtr_t mtr;
+ trx_undo_rec_t* undo_rec;
+
+ mtr_start(&mtr);
+
+ undo_rec = trx_undo_get_first_rec(
+ 0 /* System space id */, zip_size,
+ purge_sys->hdr_page_no,
+ purge_sys->hdr_offset, RW_S_LATCH, &mtr);
+
+ if (undo_rec != NULL) {
+ offset = page_offset(undo_rec);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ page_no = page_get_page_no(page_align(undo_rec));
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ purge_sys->offset = offset;
+ purge_sys->page_no = page_no;
+ purge_sys->purge_undo_no = undo_no;
+
+ purge_sys->next_stored = TRUE;
+}
+
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+ ulint zip_size;
+
+ ut_ad(purge_sys->next_stored == FALSE);
+
+ zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys);
+
+ if (purge_sys->rseg != NULL) {
+
+ trx_purge_read_undo_rec(purge_sys, zip_size);
+ } else {
+ /* There is nothing to do yet. */
+ os_thread_yield();
+ }
+}
+
+/***********************************************************************//**
+Gets the next record to purge and updates the info in the purge system.
+@return copy of an undo log record or pointer to the dummy undo log record */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* rec_copy;
+ trx_undo_rec_t* rec2;
+ trx_undo_rec_t* next_rec;
+ page_t* undo_page;
+ page_t* page;
+ ulint offset;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ ulint type;
+ ulint cmpl_info;
+ mtr_t mtr;
+
+ ut_ad(purge_sys->next_stored);
+
+ space = purge_sys->rseg->space;
+ zip_size = purge_sys->rseg->zip_size;
+ page_no = purge_sys->page_no;
+ offset = purge_sys->offset;
+
+ if (offset == 0) {
+ /* It is the dummy undo log record, which means that there is
+ no need to purge this undo log */
+
+ trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ return(&trx_purge_dummy_rec);
+ }
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(space, zip_size, page_no, &mtr);
+
+ rec = undo_page + offset;
+
+ rec2 = rec;
+
+ for (;;) {
+ /* Try first to find the next record which requires a purge
+ operation from the same page of the same undo log */
+
+ next_rec = trx_undo_page_get_next_rec(
+ rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset);
+
+ if (next_rec == NULL) {
+ rec2 = trx_undo_get_next_rec(
+ rec2, purge_sys->hdr_page_no,
+ purge_sys->hdr_offset, &mtr);
+ break;
+ }
+
+ rec2 = next_rec;
+
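+		/* Records that leave no work for purge are skipped:
+		only delete-marked records, records with externally
+		stored columns, and updates that change an ordering
+		field need to be processed. */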
+ type = trx_undo_rec_get_type(rec2);
+
+ if (type == TRX_UNDO_DEL_MARK_REC) {
+
+ break;
+ }
+
+ cmpl_info = trx_undo_rec_get_cmpl_info(rec2);
+
+ if (trx_undo_rec_get_extern_storage(rec2)) {
+ break;
+ }
+
+ if ((type == TRX_UNDO_UPD_EXIST_REC)
+ && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ break;
+ }
+ }
+
+ if (rec2 == NULL) {
+ mtr_commit(&mtr);
+
+ trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(space, zip_size,
+ page_no, &mtr);
+
+ rec = undo_page + offset;
+ } else {
+ page = page_align(rec2);
+
+ purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2);
+ purge_sys->page_no = page_get_page_no(page);
+ purge_sys->offset = rec2 - page;
+
+ if (undo_page != page) {
+ /* We advance to a new page of the undo log: */
+ purge_sys->n_pages_handled++;
+ }
+ }
+
+ rec_copy = trx_undo_rec_copy(rec, heap);
+
+ mtr_commit(&mtr);
+
+ return(rec_copy);
+}
+
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge. It must be
+released with the corresponding release function.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ trx_undo_inf_t** cell, /*!< out: storage cell for the record in the
+ purge array */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_rec_t* undo_rec;
+
+ if (purge_sys->state == TRX_STOP_PURGE) {
+ trx_purge_truncate_if_arr_empty();
+
+ return(NULL);
+ } else if (!purge_sys->next_stored) {
+ trx_purge_choose_next_log();
+
+ if (!purge_sys->next_stored) {
+ purge_sys->state = TRX_STOP_PURGE;
+
+ trx_purge_truncate_if_arr_empty();
+
+ if (srv_print_thread_releases) {
+ fprintf(stderr,
+ "Purge: No logs left in the"
+ " history list; pages handled %lu\n",
+ (ulong) purge_sys->n_pages_handled);
+ }
+
+ return(NULL);
+ }
+ }
+
+ if (purge_sys->n_pages_handled >= purge_sys->handle_limit) {
+
+ purge_sys->state = TRX_STOP_PURGE;
+
+ trx_purge_truncate_if_arr_empty();
+
+ return(NULL);
+ } else if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) {
+ purge_sys->state = TRX_STOP_PURGE;
+
+ trx_purge_truncate_if_arr_empty();
+
+ return(NULL);
+ }
+
+ /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+ os_thread_get_curr_id(),
+ (ullint) purge_sys->purge_trx_no,
+ (ullint) purge_sys->purge_undo_no); */
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ FALSE, (purge_sys->rseg)->id, purge_sys->page_no,
+ purge_sys->offset);
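+	/* The 7-byte roll pointer packs the insert flag, the rollback
+	segment id, the undo page number and the page offset. The flag
+	is FALSE here because purge only processes update undo logs;
+	insert undo logs are already freed at transaction commit. */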
+
+ *cell = trx_purge_arr_store_info(
+ purge_sys->purge_trx_no, purge_sys->purge_undo_no);
+
+ ut_ad(purge_sys->purge_trx_no < purge_sys->view->low_limit_no);
+
+ /* The following call will advance the stored values of purge_trx_no
+ and purge_undo_no, therefore we had to store them first */
+
+ undo_rec = trx_purge_get_next_rec(heap);
+
+ return(undo_rec);
+}
+
+/*******************************************************************//**
+Releases a reserved purge undo record. */
+UNIV_INTERN
+void
+trx_purge_rec_release(
+/*==================*/
+ trx_undo_inf_t* cell) /*!< in: storage cell */
+{
+ trx_purge_arr_remove_info(cell);
+}
+
+/*******************************************************************//**
+This function runs a purge batch.
+@return number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(
+/*======*/
+ ulint limit) /*!< in: the maximum number of records to
+ purge in one batch */
+{
+ que_thr_t* thr;
+ ulint old_pages_handled;
+
+ ut_a(purge_sys->trx->n_active_thrs == 0);
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ mutex_enter(&kernel_mutex);
+
+ /* Close and free the old purge view */
+
+ read_view_close(purge_sys->view);
+ purge_sys->view = NULL;
+ mem_heap_empty(purge_sys->heap);
+
+	/* Determine for how long data manipulation language (DML)
+	statements need to be delayed in order to reduce the lag of the
+	purge thread. */
+ srv_dml_needed_delay = 0; /* in microseconds; default: no delay */
+
+ /* If we cannot advance the 'purge view' because of an old
+ 'consistent read view', then the DML statements cannot be delayed.
+ Also, srv_max_purge_lag <= 0 means 'infinity'. */
+ if (srv_max_purge_lag > 0) {
+ float ratio = (float) trx_sys->rseg_history_len
+ / srv_max_purge_lag;
+ if (ratio > ULINT_MAX / 10000) {
+ /* Avoid overflow: maximum delay is 4295 seconds */
+ srv_dml_needed_delay = ULINT_MAX;
+ } else if (ratio > 1) {
+			/* If the history list length exceeds
+			innodb_max_purge_lag, the data manipulation
+			statements are delayed by at least 5000
+			microseconds. */
+ srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
+ }
+ }
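+	/* A worked example with illustrative values: if
+	srv_max_purge_lag is 100000 and rseg_history_len is 200000,
+	then ratio = 2.0 and each affected DML statement is delayed by
+	(2.0 - 0.5) * 10000 = 15000 microseconds. */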
+
+ purge_sys->view = read_view_oldest_copy_or_open_new(
+ 0, purge_sys->prebuilt_view);
+
+ mutex_exit(&kernel_mutex);
+
+ rw_lock_x_unlock(&(purge_sys->latch));
+
+#ifdef UNIV_DEBUG
+ if (srv_purge_view_update_only_debug) {
+ return(0);
+ }
+#endif
+
+ purge_sys->state = TRX_PURGE_ON;
+
+ purge_sys->handle_limit = purge_sys->n_pages_handled + limit;
+
+ old_pages_handled = purge_sys->n_pages_handled;
+
+ mutex_enter(&kernel_mutex);
+
+ thr = que_fork_start_command(purge_sys->query);
+
+ ut_ad(thr);
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_print_thread_releases) {
+
+ fputs("Starting purge\n", stderr);
+ }
+
+ que_run_threads(thr);
+
+ if (srv_print_thread_releases) {
+
+ fprintf(stderr,
+ "Purge ends; pages handled %lu\n",
+ (ulong) purge_sys->n_pages_handled);
+ }
+
+ return((ulint) (purge_sys->n_pages_handled - old_pages_handled));
+}
+
+/******************************************************************//**
+Prints information of the purge system to stderr. */
+UNIV_INTERN
+void
+trx_purge_sys_print(void)
+/*=====================*/
+{
+ fprintf(stderr, "InnoDB: Purge system view:\n");
+ read_view_print(stderr, purge_sys->view);
+
+ fprintf(stderr, "InnoDB: Purge trx n:o " TRX_ID_FMT
+ ", undo n:o " TRX_ID_FMT "\n",
+ (ullint) purge_sys->purge_trx_no,
+ (ullint) purge_sys->purge_undo_no);
+ fprintf(stderr,
+ "InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n"
+ "InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n",
+ (ulong) purge_sys->next_stored,
+ (ulong) purge_sys->page_no,
+ (ulong) purge_sys->offset,
+ (ulong) purge_sys->hdr_page_no,
+ (ulong) purge_sys->hdr_offset);
+}
diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.c
new file mode 100644
index 00000000000..ef42152aeb7
--- /dev/null
+++ b/storage/xtradb/trx/trx0rec.c
@@ -0,0 +1,1698 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.c
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "read0read.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/**********************************************************************//**
+Writes the mtr log entry of the inserted undo log record on the undo log
+page. */
+UNIV_INLINE
+void
+trx_undof_page_add_undo_rec_log(
+/*============================*/
+ page_t* undo_page, /*!< in: undo log page */
+ ulint old_free, /*!< in: start offset of the inserted entry */
+ ulint new_free, /*!< in: end offset of the entry */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ const byte* log_end;
+ ulint len;
+
+ log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN);
+
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN];
+ log_ptr = mlog_write_initial_log_record_fast(
+ undo_page, MLOG_UNDO_INSERT, log_ptr, mtr);
+ len = new_free - old_free - 4;
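+	/* The 4 bytes subtracted are the two 2-byte next/prev record
+	pointers, which are not copied to the log; the parse function
+	reconstructs them. */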
+
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+
+ if (log_ptr + len <= log_end) {
+ memcpy(log_ptr, undo_page + old_free + 2, len);
+ mlog_close(mtr, log_ptr + len);
+ } else {
+ mlog_close(mtr, log_ptr);
+ mlog_catenate_string(mtr, undo_page + old_free + 2, len);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of adding an undo log record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page) /*!< in: page or NULL */
+{
+ ulint len;
+ byte* rec;
+ ulint first_free;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ len = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ }
+
+ if (page == NULL) {
+
+ return(ptr + len);
+ }
+
+ first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ rec = page + first_free;
+
+ mach_write_to_2(rec, first_free + 4 + len);
+ mach_write_to_2(rec + 2 + len, first_free);
+
+ mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+ first_free + 4 + len);
+ ut_memcpy(rec + 2, ptr, len);
+
+ return(ptr + len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Calculates the free space left for extending an undo log record.
+@return bytes left */
+UNIV_INLINE
+ulint
+trx_undo_left(
+/*==========*/
+ const page_t* page, /*!< in: undo log page */
+ const byte* ptr) /*!< in: pointer to page */
+{
+ /* The '- 10' is a safety margin, in case we have some small
+ calculation error below */
+
+ return(UNIV_PAGE_SIZE - (ptr - page) - 10 - FIL_PAGE_DATA_END);
+}
+
+/**********************************************************************//**
+Set the next and previous pointers in the undo page for the undo record
+that was written to ptr. Update the first free value by the number of bytes
+written for this undo record.
+@return offset of the inserted entry on the page if succeeded, 0 if fail */
+static
+ulint
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+ page_t* undo_page, /*!< in/out: undo log page */
+ byte* ptr, /*!< in: ptr up to where data has been
+ written on this undo page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint first_free; /*!< offset within undo_page */
+ ulint end_of_rec; /*!< offset within undo_page */
+ byte* ptr_to_first_free;
+ /* pointer within undo_page
+ that points to the next free
+					offset value within undo_page. */
+
+ ut_ad(ptr > undo_page);
+ ut_ad(ptr < undo_page + UNIV_PAGE_SIZE);
+
+ if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) {
+
+ return(0);
+ }
+
+ ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE;
+
+ first_free = mach_read_from_2(ptr_to_first_free);
+
+ /* Write offset of the previous undo log record */
+ mach_write_to_2(ptr, first_free);
+ ptr += 2;
+
+ end_of_rec = ptr - undo_page;
+
+ /* Write offset of the next undo log record */
+ mach_write_to_2(undo_page + first_free, end_of_rec);
+
+ /* Update the offset to first free undo record */
+ mach_write_to_2(ptr_to_first_free, end_of_rec);
+
+ /* Write this log entry to the UNDO log */
+ trx_undof_page_add_undo_rec_log(undo_page, first_free,
+ end_of_rec, mtr);
+
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reports in the undo log of an insert of a clustered index record.
+@return offset of the inserted entry on the page if succeed, 0 if fail */
+static
+ulint
+trx_undo_page_report_insert(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: index entry which will be
+ inserted to the clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint first_free;
+ byte* ptr;
+ ulint i;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT);
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ ptr = undo_page + first_free;
+
+ ut_ad(first_free <= UNIV_PAGE_SIZE);
+
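+	/* The space check below reserves 2 bytes for the next-record
+	pointer, 1 byte for the type, and 11 bytes as a conservative
+	upper bound for each of the two much-compressed 64-bit values
+	(the undo number and the table id). */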
+ if (trx_undo_left(undo_page, ptr) < 2 + 1 + 11 + 11) {
+
+ /* Not enough space for writing the general parameters */
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+ *ptr++ = TRX_UNDO_INSERT_REC;
+ ptr += mach_ull_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_ull_write_much_compressed(ptr, index->table->id);
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the record
+ to be inserted in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
+ ulint flen = dfield_get_len(field);
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, dfield_get_data(field), flen);
+ ptr += flen;
+ }
+ }
+
+ return(trx_undo_page_set_next_prev_and_add(undo_page, ptr, mtr));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ ibool* updated_extern, /*!< out: TRUE if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+{
+ byte* ptr;
+ ulint type_cmpl;
+
+ ptr = undo_rec + 2;
+
+ type_cmpl = mach_read_from_1(ptr);
+ ptr++;
+
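+	/* The type_cmpl byte packs the record type in its low bits,
+	the compiler info multiplied by TRX_UNDO_CMPL_INFO_MULT in its
+	high bits, and the TRX_UNDO_UPD_EXTERN flag bit. */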
+ if (type_cmpl & TRX_UNDO_UPD_EXTERN) {
+ *updated_extern = TRUE;
+ type_cmpl -= TRX_UNDO_UPD_EXTERN;
+ } else {
+ *updated_extern = FALSE;
+ }
+
+ *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+ *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+
+ *undo_no = mach_ull_read_much_compressed(ptr);
+ ptr += mach_ull_get_much_compressed_size(*undo_no);
+
+ *table_id = mach_ull_read_much_compressed(ptr);
+ ptr += mach_ull_get_much_compressed_size(*table_id);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reads from an undo log record a stored column value.
+@return remaining part of undo log record after reading these values */
+static
+byte*
+trx_undo_rec_get_col_val(
+/*=====================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ byte** field, /*!< out: pointer to stored field */
+ ulint* len, /*!< out: length of the field, or UNIV_SQL_NULL */
+ ulint* orig_len)/*!< out: original length of the locally
+ stored part of an externally stored column, or 0 */
+{
+ *len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*len);
+
+ *orig_len = 0;
+
+ switch (*len) {
+ case UNIV_SQL_NULL:
+ *field = NULL;
+ break;
+ case UNIV_EXTERN_STORAGE_FIELD:
+ *orig_len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*orig_len);
+ *len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*len);
+ *field = ptr;
+ ptr += *len;
+
+ ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(*len > *orig_len);
+ /* @see dtuple_convert_big_rec() */
+ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+ /* we do not have access to index->table here
+ ut_ad(dict_table_get_format(index->table) >= DICT_TF_FORMAT_ZIP
+ || *len >= col->max_prefix
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ */
+
+ *len += UNIV_EXTERN_STORAGE_FIELD;
+ break;
+ default:
+ *field = ptr;
+ if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+ ptr += *len - UNIV_EXTERN_STORAGE_FIELD;
+ } else {
+ ptr += *len;
+ }
+ }
+
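+	/* On return, a *len value of at least UNIV_EXTERN_STORAGE_FIELD
+	marks an externally stored column; the stored length is then
+	*len - UNIV_EXTERN_STORAGE_FIELD. */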
+ return(ptr);
+}
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** ref, /*!< out, own: row reference */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr && ref && heap);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ *ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(*ref, index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint orig_len;
+
+ dfield = dtuple_get_nth_field(*ref, i);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index) /*!< in: clustered index */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ for (i = 0; i < ref_len; i++) {
+ byte* field;
+ ulint len;
+ ulint orig_len;
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Fetch a prefix of an externally stored column, for writing to the undo log
+of an update or delete marking of a clustered index record.
+@return ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+/*====================*/
+ byte* ext_buf, /*!< in: buffer to hold the prefix
+ data and BLOB pointer */
+ ulint prefix_len, /*!< in: prefix size to store
+ in the undo log */
+	ulint		zip_size,	/*!< in: compressed page size in bytes,
+					or 0 for uncompressed BLOB */
+ const byte* field, /*!< in: an externally stored column */
+ ulint* len) /*!< in: length of field;
+ out: used length of ext_buf */
+{
+ /* Fetch the BLOB. */
+ ulint ext_len = btr_copy_externally_stored_field_prefix(
+ ext_buf, prefix_len, zip_size, field, *len);
+ /* BLOBs should always be nonempty. */
+ ut_a(ext_len);
+ /* Append the BLOB pointer to the prefix. */
+ memcpy(ext_buf + ext_len,
+ field + *len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE;
+ return(ext_buf);
+}
+
+/**********************************************************************//**
+Writes to the undo log a prefix of an externally stored column.
+@return undo log position */
+static
+byte*
+trx_undo_page_report_modify_ext(
+/*============================*/
+ byte* ptr, /*!< in: undo log position,
+ at least 15 bytes must be available */
+ byte* ext_buf, /*!< in: a buffer of
+ DICT_MAX_FIELD_LEN_BY_FORMAT() size,
+ or NULL when should not fetch
+ a longer prefix */
+	ulint		prefix_len,	/*!< in: prefix size to store in the
+					undo log */
+	ulint		zip_size,	/*!< in: compressed page size in bytes,
+					or 0 for uncompressed BLOB */
+ const byte** field, /*!< in/out: the locally stored part of
+ the externally stored column */
+ ulint* len) /*!< in/out: length of field, in bytes */
+{
+ if (ext_buf) {
+ ut_a(prefix_len > 0);
+
+ /* If an ordering column is externally stored, we will
+ have to store a longer prefix of the field. In this
+ case, write to the log a marker followed by the
+ original length and the real length of the field. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+
+ ptr += mach_write_compressed(ptr, *len);
+
+ *field = trx_undo_page_fetch_ext(ext_buf, prefix_len, zip_size,
+ *field, len);
+
+ ptr += mach_write_compressed(ptr, *len);
+ } else {
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + *len);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reports in the undo log of an update or delete marking of a clustered index
+record.
+@return byte offset of the inserted undo log entry on the page if
+succeed, 0 if fail */
+static
+ulint
+trx_undo_page_report_modify(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index where update or
+ delete marking is done */
+ const rec_t* rec, /*!< in: clustered index record which
+ has NOT yet been modified */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector which tells the
+ columns to be updated; in the case of
+ a delete, this should be set to NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_table_t* table;
+ ulint first_free;
+ byte* ptr;
+ const byte* field;
+ ulint flen;
+ ulint col_no;
+ ulint type_cmpl;
+ byte* type_cmpl_ptr;
+ ulint i;
+ trx_id_t trx_id;
+ ibool ignore_prefix = FALSE;
+ byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE];
+
+ ut_a(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE);
+ table = index->table;
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ ptr = undo_page + first_free;
+
+ ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+ if (trx_undo_left(undo_page, ptr) < 50) {
+
+ /* NOTE: the value 50 must be big enough so that the general
+ fields written below fit on the undo log page */
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+
+ if (!update) {
+ type_cmpl = TRX_UNDO_DEL_MARK_REC;
+ } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
+ type_cmpl = TRX_UNDO_UPD_DEL_REC;
+ /* We are about to update a delete marked record.
+ We don't typically need the prefix in this case unless
+ the delete marking is done by the same transaction
+ (which we check below). */
+ ignore_prefix = TRUE;
+ } else {
+ type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+ }
+
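+	/* cmpl_info is packed into the high bits of the type byte.
+	Keep a pointer to this byte: TRX_UNDO_UPD_EXTERN may be OR'ed
+	into it later, if an externally stored column is updated. */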
+ type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+ type_cmpl_ptr = ptr;
+
+ *ptr++ = (byte) type_cmpl;
+ ptr += mach_ull_write_much_compressed(ptr, trx->undo_no);
+
+ ptr += mach_ull_write_much_compressed(ptr, table->id);
+
+ /*----------------------------------------*/
+ /* Store the state of the info bits */
+
+ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+ /* Store the values of the system columns */
+ field = rec_get_nth_field(rec, offsets,
+ dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID), &flen);
+ ut_ad(flen == DATA_TRX_ID_LEN);
+
+ trx_id = trx_read_trx_id(field);
+
+	/* If it is an update of a delete marked record, then we are
+	allowed to ignore blob prefixes if the delete marking was done
+	by some other trx, as it must have committed by now for us to
+	allow an overwrite. */
+ if (ignore_prefix) {
+ ignore_prefix = (trx_id != trx->id);
+ }
+ ptr += mach_ull_write_compressed(ptr, trx_id);
+
+ field = rec_get_nth_field(rec, offsets,
+ dict_index_get_sys_col_pos(
+ index, DATA_ROLL_PTR), &flen);
+ ut_ad(flen == DATA_ROLL_PTR_LEN);
+
+ ptr += mach_ull_write_compressed(ptr, trx_read_roll_ptr(field));
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the
+ record which will be modified in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ field = rec_get_nth_field(rec, offsets, i, &flen);
+
+ /* The ordering columns must not be stored externally. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Save to the undo log the old values of the columns to be updated. */
+
+ if (update) {
+ ulint extended = 0;
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ if (srv_use_sys_stats_table
+ && index == UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes)) {
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ ulint pos = upd_get_nth_field(update, i)->field_no;
+
+ if (pos >= rec_offs_n_fields(offsets)) {
+ extended++;
+ }
+ }
+ }
+
+ ptr += mach_write_compressed(ptr, upd_get_n_fields(update) - extended);
+
+ for (i = 0; i < upd_get_n_fields(update) - extended; i++) {
+
+ ulint pos = upd_get_nth_field(update, i)->field_no;
+
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_field(rec, offsets, pos, &flen);
+
+ if (trx_undo_left(undo_page, ptr) < 15) {
+
+ return(0);
+ }
+
+ if (rec_offs_nth_extern(offsets, pos)) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, pos);
+ ulint prefix_len
+ = dict_max_field_len_store_undo(
+ table, col);
+
+ ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE
+ <= sizeof ext_buf);
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ col->ord_part
+ && !ignore_prefix
+ && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ ? ext_buf : NULL, prefix_len,
+ dict_table_zip_size(table),
+ &field, &flen);
+
+ /* Notify purge that it eventually has to
+ free the old externally stored field */
+
+ trx->update_undo->del_marks = TRUE;
+
+ *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+ } else {
+ ptr += mach_write_compressed(ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ /*----------------------------------------*/
+ /* In the case of a delete marking, and also in the case of an update
+ where any ordering field of any index changes, store the values of all
+ columns which occur as ordering fields in any index. This info is used
+ in the purge of old versions where we use it to build and search the
+ delete marked index records, to look if we can remove them from the
+	index tree. Note that starting from 4.0.14 externally stored
+	fields can also be ordering fields in some index. Starting from
+	5.2, we no longer store the first REC_MAX_INDEX_COL_LEN bytes in
+	the undo log record,
+ but we can construct the column prefix fields in the index by
+ fetching the first page of the BLOB that is pointed to by the
+ clustered index. This works also in crash recovery, because all pages
+ (including BLOBs) are recovered before anything is rolled back. */
+
+ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ byte* old_ptr = ptr;
+
+ trx->update_undo->del_marks = TRUE;
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes to write the number of bytes the stored
+ fields take in this undo record */
+
+ ptr += 2;
+
+ for (col_no = 0; col_no < dict_table_get_n_cols(table);
+ col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+
+ if (col->ord_part) {
+ ulint pos;
+
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_page, ptr) < 5 + 15) {
+
+ return(0);
+ }
+
+ pos = dict_index_get_nth_col_pos(index,
+ col_no);
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_field(rec, offsets, pos,
+ &flen);
+
+ if (rec_offs_nth_extern(offsets, pos)) {
+ const dict_col_t* col =
+ dict_index_get_nth_col(
+ index, pos);
+ ulint prefix_len =
+ dict_max_field_len_store_undo(
+ table, col);
+
+ ut_a(prefix_len < sizeof ext_buf);
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ && !ignore_prefix
+ ? ext_buf : NULL, prefix_len,
+ dict_table_zip_size(table),
+ &field, &flen);
+ } else {
+ ptr += mach_write_compressed(
+ ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr)
+ < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ mach_write_to_2(old_ptr, ptr - old_ptr);
+ }
+
+ /*----------------------------------------*/
+ /* Write pointers to the previous and the next undo log records */
+ if (trx_undo_left(undo_page, ptr) < 2) {
+
+ return(0);
+ }
+
+ mach_write_to_2(ptr, first_free);
+ ptr += 2;
+ mach_write_to_2(undo_page + first_free, ptr - undo_page);
+
+ mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+ ptr - undo_page);
+
+ /* Write to the REDO log about this change in the UNDO log */
+
+ trx_undof_page_add_undo_rec_log(undo_page, first_free,
+ ptr - undo_page, mtr);
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ ulint* info_bits) /*!< out: info bits state */
+{
+ /* Read the state of the info bits */
+ *info_bits = mach_read_from_1(ptr);
+ ptr += 1;
+
+ /* Read the values of the system columns */
+
+ *trx_id = mach_ull_read_compressed(ptr);
+ ptr += mach_ull_get_compressed_size(*trx_id);
+
+ *roll_ptr = mach_ull_read_compressed(ptr);
+ ptr += mach_ull_get_compressed_size(*roll_ptr);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reads from an update undo log record the number of updated fields.
+@return remaining part of undo log record after reading this value */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_n_upd_fields(
+/*=================================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ ulint* n) /*!< out: number of fields */
+{
+ *n = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*n);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reads from an update undo log record a stored field number.
+@return remaining part of undo log record after reading this value */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_field_no(
+/*=============================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ ulint* field_no)/*!< out: field number */
+{
+ *field_no = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*field_no);
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error is detected, which
+means that the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ ulint info_bits,/*!< in: info bits from this undo record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd) /*!< out, own: update vector */
+{
+ upd_field_t* upd_field;
+ upd_t* update;
+ ulint n_fields;
+ byte* buf;
+ ulint i;
+
+ ut_a(dict_index_is_clust(index));
+
+ if (type != TRX_UNDO_DEL_MARK_REC) {
+ ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields);
+ } else {
+ n_fields = 0;
+ }
+
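+	/* Two extra slots are reserved for the new values of the
+	DB_TRX_ID and DB_ROLL_PTR system columns, which are stored
+	below. */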
+ update = upd_create(n_fields + 2, heap);
+
+ update->info_bits = info_bits;
+
+ /* Store first trx id and roll ptr to update vector */
+
+ upd_field = upd_get_nth_field(update, n_fields);
+ buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+ trx_write_trx_id(buf, trx_id);
+
+ upd_field_set_field_no(upd_field,
+ dict_index_get_sys_col_pos(index, DATA_TRX_ID),
+ index, trx);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+ upd_field = upd_get_nth_field(update, n_fields + 1);
+ buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(buf, roll_ptr);
+
+ upd_field_set_field_no(
+ upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR),
+ index, trx);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+ /* Store then the updated ordinary columns to the update vector */
+
+ for (i = 0; i < n_fields; i++) {
+
+ byte* field;
+ ulint len;
+ ulint field_no;
+ ulint orig_len;
+
+ ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+ if (field_no >= dict_index_get_n_fields(index)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to access"
+ " update undo rec field %lu in ",
+ (ulong) field_no);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: but index has only %lu fields\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n"
+ "InnoDB: Run also CHECK TABLE ",
+ (ulong) dict_index_get_n_fields(index));
+ ut_print_name(stderr, trx, TRUE, index->table_name);
+ fprintf(stderr, "\n"
+ "InnoDB: n_fields = %lu, i = %lu, ptr %p\n",
+ (ulong) n_fields, (ulong) i, ptr);
+ ut_ad(0);
+ *upd = NULL;
+ return(NULL);
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ upd_field_set_field_no(upd_field, field_no, index, trx);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ upd_field->orig_len = orig_len;
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(&upd_field->new_val, field, len);
+ } else {
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+
+ dfield_set_data(&upd_field->new_val, field, len);
+ dfield_set_ext(&upd_field->new_val);
+ }
+ }
+
+ *upd = update;
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Builds a partial row from an update undo log record. It contains the
+columns which occur as ordering in any index of the table.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ const byte* end_ptr;
+ ulint row_len;
+
+ ut_ad(index);
+ ut_ad(ptr);
+ ut_ad(row);
+ ut_ad(heap);
+ ut_ad(dict_index_is_clust(index));
+
+ row_len = dict_table_get_n_cols(index->table);
+
+ *row = dtuple_create(heap, row_len);
+
+ dict_table_copy_types(*row, index->table);
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+
+ while (ptr != end_ptr) {
+ dfield_t* dfield;
+ byte* field;
+ ulint field_no;
+ const dict_col_t* col;
+ ulint col_no;
+ ulint len;
+ ulint orig_len;
+
+ ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+ col = dict_index_get_nth_col(index, field_no);
+ col_no = dict_col_get_no(col);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield = dtuple_get_nth_field(*row, col_no);
+
+ dfield_set_data(dfield, field, len);
+
+ if (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_len(dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ dfield_set_ext(dfield);
+ /* If the prefix of this column is indexed,
+ ensure that enough prefix is stored in the
+ undo log record. */
+ if (!ignore_prefix && col->ord_part) {
+ ut_a(dfield_get_len(dfield)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(dict_table_get_format(index->table)
+ >= DICT_TF_FORMAT_ZIP
+ || dfield_get_len(dfield)
+ >= REC_ANTELOPE_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Erases the unused undo log page end.
+@return TRUE if the page contained something, FALSE if it was empty */
+static __attribute__((nonnull))
+ibool
+trx_undo_erase_page_end(
+/*====================*/
+ page_t* undo_page, /*!< in/out: undo page whose end to erase */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint first_free;
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ memset(undo_page + first_free, 0xff,
+ (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free);
+
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr);
+ return(first_free != TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+}
+
+/***********************************************************//**
+Parses a redo log record of erasing of an undo page end.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ if (page == NULL) {
+
+ return(ptr);
+ }
+
+ trx_undo_erase_page_end(page, mtr);
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: in case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the
+ inserted undo log record,
+ 0 if BTR_NO_UNDO_LOG
+ flag was specified */
+{
+ trx_t* trx;
+ trx_undo_t* undo;
+ ulint page_no;
+ buf_block_t* undo_block;
+ trx_rseg_t* rseg;
+ mtr_t mtr;
+ ulint err = DB_SUCCESS;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+#ifdef UNIV_DEBUG
+ int loop_count = 0;
+#endif /* UNIV_DEBUG */
+ rec_offs_init(offsets_);
+
+ ut_a(dict_index_is_clust(index));
+
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+
+ *roll_ptr = 0;
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(thr);
+ ut_ad((op_type != TRX_UNDO_INSERT_OP)
+ || (clust_entry && !update && !rec));
+
+ trx = thr_get_trx(thr);
+ rseg = trx->rseg;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ /* If the undo log is not assigned yet, assign one */
+
+ if (op_type == TRX_UNDO_INSERT_OP) {
+
+ if (trx->insert_undo == NULL) {
+
+ err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT);
+ }
+
+ undo = trx->insert_undo;
+
+ if (UNIV_UNLIKELY(!undo)) {
+ /* Did not succeed */
+ ut_ad(err != DB_SUCCESS);
+ mutex_exit(&(trx->undo_mutex));
+
+ return(err);
+ }
+
+ ut_ad(err == DB_SUCCESS);
+ } else {
+ ut_ad(op_type == TRX_UNDO_MODIFY_OP);
+
+ if (trx->update_undo == NULL) {
+
+ err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+ }
+
+ undo = trx->update_undo;
+
+ if (UNIV_UNLIKELY(!undo)) {
+ /* Did not succeed */
+ ut_ad(err != DB_SUCCESS);
+ mutex_exit(&(trx->undo_mutex));
+ return(err);
+ }
+
+ ut_ad(err == DB_SUCCESS);
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ }
+
+ mtr_start(&mtr);
+
+ page_no = undo->last_page_no;
+ undo_block = buf_page_get_gen(
+ undo->space, undo->zip_size, page_no, RW_X_LATCH,
+ undo->guess_block, BUF_GET, __FILE__, __LINE__, &mtr);
+ buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
+
+ do {
+ page_t* undo_page;
+ ulint offset;
+
+ undo_page = buf_block_get_frame(undo_block);
+ ut_ad(page_no == buf_block_get_page_no(undo_block));
+
+ if (op_type == TRX_UNDO_INSERT_OP) {
+ offset = trx_undo_page_report_insert(
+ undo_page, trx, index, clust_entry, &mtr);
+ } else {
+ offset = trx_undo_page_report_modify(
+ undo_page, trx, index, rec, offsets, update,
+ cmpl_info, &mtr);
+ }
+
+ if (UNIV_UNLIKELY(offset == 0)) {
+ /* The record did not fit on the page. We erase the
+ end segment of the undo log page and write a log
+ record of it: this is to ensure that in the debug
+ version the replica page constructed from the log
+ records stays identical to the original page */
+
+ if (!trx_undo_erase_page_end(undo_page, &mtr)) {
+ /* The record did not fit on an empty
+ undo page. Discard the freshly allocated
+ page and return an error. */
+
+ /* When we remove a page from an undo
+ log, this is analogous to a
+ pessimistic insert in a B-tree, and we
+ must reserve the counterpart of the
+ tree latch, which is the rseg
+ mutex. We must commit the mini-transaction
+ first, because it may be holding lower-level
+ latches, such as SYNC_FSP and SYNC_FSP_PAGE. */
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_free_last_page(trx, undo, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ err = DB_UNDO_RECORD_TOO_BIG;
+ goto err_exit;
+ }
+
+ mtr_commit(&mtr);
+ } else {
+ /* Success */
+
+ mtr_commit(&mtr);
+
+ undo->empty = FALSE;
+ undo->top_page_no = page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no;
+ undo->guess_block = undo_block;
+
+ trx->undo_no++;
+
+ mutex_exit(&trx->undo_mutex);
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ op_type == TRX_UNDO_INSERT_OP,
+ rseg->id, page_no, offset);
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ ut_ad(page_no == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ ut_ad(++loop_count < 2);
+ mtr_start(&mtr);
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ mutex_enter(&rseg->mutex);
+ undo_block = trx_undo_add_page(trx, undo, &mtr);
+ mutex_exit(&rseg->mutex);
+ page_no = undo->last_page_no;
+ } while (undo_block != NULL);
+
+ /* Did not succeed: out of space */
+ err = DB_OUT_OF_FILE_SPACE;
+
+err_exit:
+ mutex_exit(&trx->undo_mutex);
+ mtr_commit(&mtr);
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
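+
+/* A hedged usage sketch (it mirrors the clustered index insert path in
+btr0cur.c; the identifiers outside this file are assumptions): the
+operation is reported before the index is modified, and on success the
+returned roll pointer is stamped into the record's DB_ROLL_PTR system
+column. A non-success return, e.g. DB_OUT_OF_FILE_SPACE, is passed
+upward unchanged.
+
+ roll_ptr_t roll_ptr;
+ ulint err = trx_undo_report_row_operation(
+ 0, TRX_UNDO_INSERT_OP, thr, index,
+ entry, NULL, 0, NULL, &roll_ptr);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ row_upd_index_entry_sys_field(entry, index, DATA_ROLL_PTR, roll_ptr);
+*/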
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/******************************************************************//**
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists.
+@return own: copy of the record */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_rec_t* undo_rec;
+ ulint rseg_id;
+ ulint page_no;
+ ulint offset;
+ const page_t* undo_page;
+ trx_rseg_t* rseg;
+ ibool is_insert;
+ mtr_t mtr;
+
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
+ &offset);
+ rseg = trx_rseg_get_on_id(rseg_id);
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+ page_no, &mtr);
+
+ undo_rec = trx_undo_rec_copy(undo_page + offset, heap);
+
+ mtr_commit(&mtr);
+
+ return(undo_rec);
+}
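+
+/* Sketch of the roll pointer encoding decoded above (a reading of
+trx_undo_build_roll_ptr() in trx0undo.ic; the field widths below are
+that reading, 56 bits in total inside the 7-byte DB_ROLL_PTR):
+
+ 1 bit is_insert (insert vs. update undo log)
+ 7 bits rollback segment id
+ 32 bits undo log page number
+ 16 bits byte offset within that page
+
+ static ib_uint64_t pack_roll_ptr(int is_insert, unsigned rseg_id,
+ unsigned page_no, unsigned offset)
+ {
+ return ((ib_uint64_t) (is_insert != 0) << 55)
+ | ((ib_uint64_t) (rseg_id & 0x7F) << 48)
+ | ((ib_uint64_t) page_no << 16)
+ | offset;
+ }
+*/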
+
+/******************************************************************//**
+Copies an undo record to heap.
+
+NOTE: the caller must have latches on the clustered index page and
+purge_view.
+
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
+truncated and we cannot fetch the old version */
+UNIV_INTERN
+ulint
+trx_undo_get_undo_rec(
+/*==================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ trx_id_t trx_id, /*!< in: id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction */
+ trx_undo_rec_t** undo_rec, /*!< out, own: copy of the record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!trx_purge_update_undo_must_exist(trx_id)) {
+
+ /* It may be that the necessary undo log has already been
+ deleted */
+
+ return(DB_MISSING_HISTORY);
+ }
+
+ *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Build a previous version of a clustered index record. This function checks
+that the caller has a latch on the index page of the clustered index record
+and an s-latch on the purge_view. This guarantees that the stack of versions
+is locked all the way down to the purge_view.
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
+earlier than purge_view, which means that it may have been removed,
+DB_ERROR if corrupted record */
+UNIV_INTERN
+ulint
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec,/*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr __attribute__((unused)),
+ /*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers)/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted (an error),
+ or if the purge COULD have removed the version
+ though it has not yet done so */
+{
+ trx_undo_rec_t* undo_rec = NULL;
+ dtuple_t* entry;
+ trx_id_t rec_trx_id;
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ roll_ptr_t old_roll_ptr;
+ upd_t* update;
+ byte* ptr;
+ ulint info_bits;
+ ulint cmpl_info;
+ ibool dummy_extern;
+ byte* buf;
+ ulint err;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains_page(index_mtr, index_rec,
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!dict_index_is_clust(index)) {
+ fprintf(stderr, "InnoDB: Error: trying to access"
+ " update undo rec for non-clustered index %s\n"
+ "InnoDB: Submit a detailed bug report to"
+ " http://bugs.mysql.com\n"
+ "InnoDB: index record ", index->name);
+ rec_print(stderr, index_rec, index);
+ fputs("\n"
+ "InnoDB: record version ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ ut_ad(0);
+ return(DB_ERROR);
+ }
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+ old_roll_ptr = roll_ptr;
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ /* The record rec is the first inserted version */
+
+ return(DB_SUCCESS);
+ }
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ /* The undo record may already have been purged.
+ This should never happen in InnoDB. */
+
+ return(err);
+ }
+
+ ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ /* (a) If a clustered index record version is such that the
+ trx id stamp in it is bigger than purge_sys->view, then the
+ BLOBs in that version are known to exist (the purge has not
+ progressed that far);
+
+ (b) if the version is the first version such that trx id in it
+ is less than purge_sys->view, and it is not delete-marked,
+ then the BLOBs in that version are known to exist (the purge
+ cannot have purged the BLOBs referenced by that version
+ yet).
+
+ This function does not fetch any BLOBs. The callers might, by
+ possibly invoking row_ext_create() via row_build(). However,
+ they should have all needed information in the *old_vers
+ returned by this function. This is because *old_vers is based
+ on the transaction undo log records. The function
+ trx_undo_page_fetch_ext() will write BLOB prefixes to the
+ transaction undo log that are at least as long as the longest
+ possible column prefix in a secondary index. Thus, secondary
+ index entries for *old_vers can be constructed without
+ dereferencing any BLOB pointers. */
+
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits,
+ NULL, heap, &update);
+
+ if (UNIV_UNLIKELY(table_id != index->table->id)) {
+ ptr = NULL;
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to access update undo rec"
+ " for table %s\n"
+ "InnoDB: but the table id in the"
+ " undo record is wrong\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n"
+ "InnoDB: Run also CHECK TABLE %s\n",
+ index->table_name, index->table_name);
+ }
+
+ if (ptr == NULL) {
+ /* The record was corrupted, return an error; these printfs
+ should catch an elusive bug in row_vers_old_has_index_entry */
+
+ fprintf(stderr,
+ "InnoDB: table %s, index %s, n_uniq %lu\n"
+ "InnoDB: undo rec address %p, type %lu cmpl_info %lu\n"
+ "InnoDB: undo rec table id %llu,"
+ " index table id %llu\n"
+ "InnoDB: dump of 150 bytes in undo rec: ",
+ index->table_name, index->name,
+ (ulong) dict_index_get_n_unique(index),
+ undo_rec, (ulong) type, (ulong) cmpl_info,
+ (ullint) table_id,
+ (ullint) index->table->id);
+ ut_print_buf(stderr, undo_rec, 150);
+ fputs("\n"
+ "InnoDB: index record ", stderr);
+ rec_print(stderr, index_rec, index);
+ fputs("\n"
+ "InnoDB: record version ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ fprintf(stderr, "\n"
+ "InnoDB: Record trx id " TRX_ID_FMT
+ ", update rec trx id " TRX_ID_FMT "\n"
+ "InnoDB: Roll ptr in rec " TRX_ID_FMT
+ ", in update rec" TRX_ID_FMT "\n",
+ (ullint) rec_trx_id, (ullint) trx_id,
+ (ullint) old_roll_ptr, (ullint) roll_ptr);
+
+ trx_purge_sys_print();
+ ut_ad(0);
+ return(DB_ERROR);
+ }
+
+# ifdef UNIV_BLOB_NULL_DEBUG
+ ut_a(!rec_offs_any_null_extern(rec, offsets));
+# endif /* UNIV_BLOB_NULL_DEBUG */
+
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ ulint n_ext;
+
+ /* We should confirm the existence of disowned external data,
+ if the previous version record is delete marked. If the trx_id
+ of the previous record is seen by purge view, we should treat
+ it as missing history, because the disowned external data
+ might be purged already.
+
+ The inherited external data (BLOBs) can be freed (purged)
+ after trx_id was committed, provided that no view was started
+ before trx_id. If the purge view can see the committed
+ delete-marked record by trx_id, no transactions need to access
+ the BLOB. */
+
+ if ((update->info_bits & REC_INFO_DELETED_FLAG)
+ && read_view_sees_trx_id(purge_sys->view, trx_id)) {
+ /* treat as a fresh insert, not to
+ cause assertion error at the caller. */
+ return(DB_SUCCESS);
+ }
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info: */
+
+ entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index,
+ offsets, &n_ext, heap);
+ n_ext += btr_push_update_extern_fields(entry, update, heap);
+ /* The page containing the clustered index record
+ corresponding to entry is latched in mtr. Thus the
+ following call is safe. */
+ row_upd_index_replace_new_col_vals(entry, index, update, heap);
+
+ buf = mem_heap_alloc(heap, rec_get_converted_size(index, entry,
+ n_ext));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index,
+ entry, n_ext);
+ } else {
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, offsets);
+ row_upd_rec_in_place(*old_vers, index, offsets, update, NULL);
+ }
+
+ return(DB_SUCCESS);
+}
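+
+/* A hedged sketch of how a consistent read is assumed to walk the
+version chain with this function (the real loop lives in row0vers.c;
+there the offsets are recomputed for each version, which is omitted
+here for brevity):
+
+ version = rec;
+ for (;;) {
+ rec_t* prev_version;
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ offsets, heap, &prev_version);
+ if (err != DB_SUCCESS || prev_version == NULL) {
+ break;
+ }
+ version = prev_version;
+ if (read_view_sees_trx_id(view, row_get_rec_trx_id(
+ version, index, offsets))) {
+ break;
+ }
+ }
+*/
+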
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0roll.c b/storage/xtradb/trx/trx0roll.c
new file mode 100644
index 00000000000..25c1d5d4692
--- /dev/null
+++ b/storage/xtradb/trx/trx0roll.c
@@ -0,0 +1,1357 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.c
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#ifdef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "usr0sess.h"
+#include "srv0start.h"
+#include "row0undo.h"
+#include "row0mysql.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+
+/** This many pages must be undone before a truncate is tried within
+rollback */
+#define TRX_ROLL_TRUNC_THRESHOLD 1
+
+/** In crash recovery, the current trx to be rolled back; NULL otherwise */
+static const trx_t* trx_roll_crash_recv_trx = NULL;
+
+/** In crash recovery we set this to the undo n:o of the current trx to be
+rolled back. Then we can print how many % the rollback has progressed. */
+static undo_no_t trx_roll_max_undo_no;
+
+/** Auxiliary variable which tells the previous progress % we printed */
+static ulint trx_roll_progress_printed_pct;
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if
+ partial rollback requested, or NULL for
+ complete rollback */
+{
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+
+ /* Tell Innobase server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ trx_start_if_not_started(trx);
+
+ heap = mem_heap_create(512);
+
+ roll_node = roll_node_create(heap);
+
+ if (savept) {
+ roll_node->partial = TRUE;
+ roll_node->savept = *savept;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ thr = pars_complete_graph_for_exec(roll_node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ mutex_enter(&kernel_mutex);
+
+ while (trx->que_state != TRX_QUE_RUNNING) {
+
+ mutex_exit(&kernel_mutex);
+
+ os_thread_sleep(100000);
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ mem_heap_free(heap);
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ /* Tell Innobase server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ return((int) trx->error_state);
+}
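+
+/* Usage sketch (hypothetical caller; do_some_row_operations() is a
+placeholder, not a real function): a NULL savept requests a complete
+rollback, while a non-NULL savept undoes only the work done after the
+savepoint was taken, i.e. undo records with undo numbers greater than
+or equal to savept->least_undo_no.
+
+ trx_savept_t savept = trx_savept_take(trx);
+ do_some_row_operations(trx);
+ err = trx_general_rollback_for_mysql(trx, &savept);
+ err = trx_general_rollback_for_mysql(trx, NULL);
+*/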
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_for_mysql(
+/*===================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ int err;
+
+ if (trx->state == TRX_NOT_STARTED) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "rollback";
+
+ /* If we are doing the XA recovery of prepared transactions, then
+ the transaction object does not have an InnoDB session object, and we
+ set a dummy session that we use for all MySQL transactions. */
+
+ err = trx_general_rollback_for_mysql(trx, NULL);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ int err;
+
+ if (trx->state == TRX_NOT_STARTED) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "rollback of SQL statement";
+
+ err = trx_general_rollback_for_mysql(trx, &trx->last_sql_stat_start);
+ /* The following call should not be needed, but we play safe: */
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+UNIV_INTERN
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: savepoint to free */
+{
+ ut_a(savep != NULL);
+ ut_a(UT_LIST_GET_LEN(trx->trx_savepoints) > 0);
+
+ UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
+ mem_free(savep->name);
+ mem_free(savep);
+}
+
+/*******************************************************************//**
+Frees the savepoint structs set after savep; if savep == NULL, frees
+all savepoints of trx.
+UNIV_INTERN
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: free all savepoints > this one;
+ if this is NULL, free all savepoints
+ of trx */
+{
+ trx_named_savept_t* next_savep;
+
+ if (savep == NULL) {
+ savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ } else {
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+ }
+
+ while (savep != NULL) {
+ next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+
+ trx_roll_savepoint_free(trx, savep);
+
+ savep = next_savep;
+ }
+}
+
+/*******************************************************************//**
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a newly
+inserted row holds a lock whose information is carried by the trx id
+stored in the row, such locks are naturally released in the rollback.
+Savepoints which were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ trx_named_savept_t* savep;
+ ulint err;
+
+ savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+ while (savep != NULL) {
+ if (0 == ut_strcmp(savep->name, savepoint_name)) {
+ /* Found */
+ break;
+ }
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+ }
+
+ if (savep == NULL) {
+
+ return(DB_NO_SAVEPOINT);
+ }
+
+ if (trx->state == TRX_NOT_STARTED) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: transaction has a savepoint ", stderr);
+ ut_print_name(stderr, trx, FALSE, savep->name);
+ fputs(" though it is not started\n", stderr);
+ return(DB_ERROR);
+ }
+
+ /* We can now free all savepoints strictly later than this one */
+
+ trx_roll_savepoints_free(trx, savep);
+
+ *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+ trx->op_info = "rollback to a savepoint";
+
+ err = trx_general_rollback_for_mysql(trx, &savep->savept);
+
+ /* Store the current undo_no of the transaction so that we know where
+ to roll back if we have to roll back the next SQL statement: */
+
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+{
+ trx_named_savept_t* savep;
+
+ ut_a(trx);
+ ut_a(savepoint_name);
+
+ trx_start_if_not_started(trx);
+
+ savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+ while (savep != NULL) {
+ if (0 == ut_strcmp(savep->name, savepoint_name)) {
+ /* Found */
+ break;
+ }
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+ }
+
+ if (savep) {
+ /* There is a savepoint with the same name: free that */
+
+ UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
+
+ mem_free(savep->name);
+ mem_free(savep);
+ }
+
+ /* Create a new savepoint and add it as the last in the list */
+
+ savep = mem_alloc(sizeof(trx_named_savept_t));
+
+ savep->name = mem_strdup(savepoint_name);
+
+ savep->savept = trx_savept_take(trx);
+
+ savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+ UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Releases only the named savepoint. Savepoints which were set after this
+savepoint are left as is.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+ /* Search for the savepoint by name and free if found. */
+ while (savep != NULL) {
+ if (0 == ut_strcmp(savep->name, savepoint_name)) {
+ trx_roll_savepoint_free(trx, savep);
+ return(DB_SUCCESS);
+ }
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+ }
+
+ return(DB_NO_SAVEPOINT);
+}
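+
+/* These three functions are assumed to back the SQL savepoint
+statements through the handler glue in ha_innodb.cc, roughly:
+
+ SAVEPOINT s1 -> trx_savepoint_for_mysql(trx, "s1", binlog_pos)
+ ROLLBACK TO SAVEPOINT s1 -> trx_rollback_to_savepoint_for_mysql(
+ trx, "s1", &binlog_pos)
+ RELEASE SAVEPOINT s1 -> trx_release_savepoint_for_mysql(trx, "s1")
+*/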
+
+/*******************************************************************//**
+Determines if this transaction is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if trx is an incomplete transaction that is being rolled
+back in crash recovery */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ return(trx == trx_roll_crash_recv_trx);
+}
+
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_savept_t savept;
+
+ savept.least_undo_no = trx->undo_no;
+
+ return(savept);
+}
+
+/*******************************************************************//**
+Roll back an active transaction. */
+static
+void
+trx_rollback_active(
+/*================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ dict_table_t* table;
+ ib_int64_t rows_to_undo;
+ const char* unit = "";
+ ibool dictionary_locked = FALSE;
+
+ heap = mem_heap_create(512);
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ mutex_enter(&kernel_mutex);
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ trx_roll_crash_recv_trx = trx;
+ trx_roll_max_undo_no = trx->undo_no;
+ trx_roll_progress_printed_pct = 0;
+ rows_to_undo = trx_roll_max_undo_no;
+
+ if (rows_to_undo > 1000000000) {
+ rows_to_undo = rows_to_undo / 1000000;
+ unit = "M";
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s"
+ " rows to undo\n",
+ (ullint) trx->id,
+ (ulong) rows_to_undo, unit);
+ mutex_exit(&kernel_mutex);
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+ row_mysql_lock_data_dictionary(trx);
+ dictionary_locked = TRUE;
+ }
+
+ que_run_threads(thr);
+
+ mutex_enter(&kernel_mutex);
+
+ while (trx->que_state != TRX_QUE_RUNNING) {
+
+ mutex_exit(&kernel_mutex);
+
+ fprintf(stderr,
+ "InnoDB: Waiting for rollback of trx id "
+ TRX_ID_FMT " to end\n",
+ (ullint) trx->id);
+ os_thread_sleep(100000);
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
+ && trx->table_id != 0) {
+
+ /* If the transaction was for a dictionary operation, we
+ drop the relevant table, if it still exists */
+
+ fprintf(stderr,
+ "InnoDB: Dropping table with id %llu"
+ " in recovery if it exists\n",
+ (ullint) trx->table_id);
+
+ table = dict_table_get_on_id_low(trx->table_id);
+
+ if (table) {
+ ulint err;
+
+ fputs("InnoDB: Table found: dropping table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" in recovery\n", stderr);
+
+ err = row_drop_table_for_mysql(table->name, trx, TRUE);
+ trx_commit_for_mysql(trx);
+
+ ut_a(err == (int) DB_SUCCESS);
+ }
+ }
+
+ if (dictionary_locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT
+ " completed\n",
+ (ullint) trx->id);
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+}
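+
+/* Worked example of the scaling above: with 2,500,000,000 rows to
+undo, the count exceeds 1,000,000,000 and is shown in millions, so the
+startup message reads "... 2500M rows to undo". */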
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction was already
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back. */
+UNIV_INTERN
+void
+trx_rollback_or_clean_recovered(
+/*============================*/
+ ibool all) /*!< in: FALSE=roll back dictionary transactions;
+ TRUE=roll back all non-PREPARED transactions */
+{
+ trx_t* trx;
+
+ mutex_enter(&kernel_mutex);
+
+ if (!UT_LIST_GET_FIRST(trx_sys->trx_list)) {
+ goto leave_function;
+ }
+
+ if (all) {
+ fprintf(stderr,
+ "InnoDB: Starting in background the rollback"
+ " of uncommitted transactions\n");
+ }
+
+ mutex_exit(&kernel_mutex);
+
+loop:
+ mutex_enter(&kernel_mutex);
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+ if (!trx->is_recovered) {
+ continue;
+ }
+
+ switch (trx->state) {
+ case TRX_NOT_STARTED:
+ case TRX_PREPARED:
+ continue;
+
+ case TRX_COMMITTED_IN_MEMORY:
+ mutex_exit(&kernel_mutex);
+ fprintf(stderr,
+ "InnoDB: Cleaning up trx with id "
+ TRX_ID_FMT "\n",
+ (ullint) trx->id);
+ trx_cleanup_at_db_startup(trx);
+ goto loop;
+
+ case TRX_ACTIVE:
+ if (all || trx_get_dict_operation(trx)
+ != TRX_DICT_OP_NONE) {
+ mutex_exit(&kernel_mutex);
+ trx_rollback_active(trx);
+ goto loop;
+ }
+ }
+ }
+
+ if (all) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Rollback of non-prepared"
+ " transactions completed\n");
+ }
+
+leave_function:
+ mutex_exit(&kernel_mutex);
+}
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction was already
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+trx_rollback_or_clean_all_recovered(
+/*================================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(trx_rollback_clean_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ trx_rollback_or_clean_recovered(TRUE);
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit instead of returning. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
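+
+/* Startup sketch (the real call site is in srv0start.c and its exact
+arguments may differ): recovery rollback runs in its own thread so it
+does not block server startup; the argument is ignored here.
+
+ os_thread_create(trx_rollback_or_clean_all_recovered, NULL, NULL);
+*/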
+
+/*******************************************************************//**
+Creates an undo number array.
+@return own: undo number array */
+UNIV_INTERN
+trx_undo_arr_t*
+trx_undo_arr_create(void)
+/*=====================*/
+{
+ trx_undo_arr_t* arr;
+ mem_heap_t* heap;
+ ulint i;
+
+ heap = mem_heap_create(1024);
+
+ arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t));
+
+ arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t)
+ * UNIV_MAX_PARALLELISM);
+ arr->n_cells = UNIV_MAX_PARALLELISM;
+ arr->n_used = 0;
+
+ arr->heap = heap;
+
+ for (i = 0; i < UNIV_MAX_PARALLELISM; i++) {
+
+ (trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE;
+ }
+
+ return(arr);
+}
+
+/*******************************************************************//**
+Frees an undo number array. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+ trx_undo_arr_t* arr) /*!< in: undo number array */
+{
+ ut_ad(arr->n_used == 0);
+
+ mem_heap_free(arr->heap);
+}
+
+/*******************************************************************//**
+Stores info of an undo log record to the array if it is not stored yet.
+@return FALSE if the record already existed in the array */
+static
+ibool
+trx_undo_arr_store_info(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_inf_t* cell;
+ trx_undo_inf_t* stored_here;
+ trx_undo_arr_t* arr;
+ ulint n_used;
+ ulint n;
+ ulint i;
+
+ n = 0;
+ arr = trx->undo_no_arr;
+ n_used = arr->n_used;
+ stored_here = NULL;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (!cell->in_use) {
+ if (!stored_here) {
+ /* Not in use, we may store here */
+ cell->undo_no = undo_no;
+ cell->in_use = TRUE;
+
+ arr->n_used++;
+
+ stored_here = cell;
+ }
+ } else {
+ n++;
+
+ if (cell->undo_no == undo_no) {
+
+ if (stored_here) {
+ stored_here->in_use = FALSE;
+ ut_ad(arr->n_used > 0);
+ arr->n_used--;
+ }
+
+ ut_ad(arr->n_used == n_used);
+
+ return(FALSE);
+ }
+ }
+
+ if (n == n_used && stored_here) {
+
+ ut_ad(arr->n_used == 1 + n_used);
+
+ return(TRUE);
+ }
+ }
+}
+
+/*******************************************************************//**
+Removes an undo number from the array. */
+static
+void
+trx_undo_arr_remove_info(
+/*=====================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_inf_t* cell;
+ ulint i;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (cell->in_use
+ && cell->undo_no == undo_no) {
+
+ cell->in_use = FALSE;
+
+ ut_ad(arr->n_used > 0);
+
+ arr->n_used--;
+
+ return;
+ }
+ }
+}
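+
+/* Behavior sketch for the pair above, assuming an initially empty
+array:
+
+ trx_undo_arr_store_info(trx, 7) returns TRUE, 7 is now reserved
+ trx_undo_arr_store_info(trx, 7) returns FALSE, already reserved
+ trx_undo_arr_remove_info(arr, 7) releases the reservation
+ trx_undo_arr_store_info(trx, 7) returns TRUE again
+*/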
+
+/*******************************************************************//**
+Gets the biggest undo number in an array.
+@return biggest value, 0 if the array is empty */
+static
+undo_no_t
+trx_undo_arr_get_biggest(
+/*=====================*/
+ trx_undo_arr_t* arr) /*!< in: undo number array */
+{
+ trx_undo_inf_t* cell;
+ ulint n_used;
+ undo_no_t biggest;
+ ulint n;
+ ulint i;
+
+ n = 0;
+ n_used = arr->n_used;
+ biggest = 0;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (cell->in_use) {
+ n++;
+ if (cell->undo_no > biggest) {
+
+ biggest = cell->undo_no;
+ }
+ }
+
+ if (n == n_used) {
+ return(biggest);
+ }
+ }
+}
+
+/***********************************************************************//**
+Tries to truncate the undo logs. */
+UNIV_INTERN
+void
+trx_roll_try_truncate(
+/*==================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx_undo_arr_t* arr;
+ undo_no_t limit;
+ undo_no_t biggest;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(mutex_own(&((trx->rseg)->mutex)));
+
+ trx->pages_undone = 0;
+
+ arr = trx->undo_no_arr;
+
+ limit = trx->undo_no;
+
+ if (arr->n_used > 0) {
+ biggest = trx_undo_arr_get_biggest(arr);
+
+ if (biggest >= limit) {
+
+ limit = biggest + 1;
+ }
+ }
+
+ if (trx->insert_undo) {
+ trx_undo_truncate_end(trx, trx->insert_undo, limit);
+ }
+
+ if (trx->update_undo) {
+ trx_undo_truncate_end(trx, trx->update_undo, limit);
+ }
+}
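+
+/* Worked example of the limit computed above (values assumed):
+trx->undo_no == 10 while the array still holds undo numbers 12 and 15;
+then biggest == 15 >= 10, the limit becomes 16, and the truncation
+keeps every undo record with an undo number below 16. */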
+
+/***********************************************************************//**
+Pops the topmost undo log record in a single undo log and updates the info
+about the topmost record in the undo log memory struct.
+@return undo log record, the page s-latched */
+static
+trx_undo_rec_t*
+trx_roll_pop_top_rec(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ ulint offset;
+ trx_undo_rec_t* prev_rec;
+ page_t* prev_rec_page;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+
+ undo_page = trx_undo_page_get_s_latched(undo->space, undo->zip_size,
+ undo->top_page_no, mtr);
+ offset = undo->top_offset;
+
+ /* fprintf(stderr, "Thread %lu undoing trx " TRX_ID_FMT
+ " undo record " TRX_ID_FMT "\n",
+ os_thread_get_curr_id(), trx->id, undo->top_undo_no); */
+
+ prev_rec = trx_undo_get_prev_rec(undo_page + offset,
+ undo->hdr_page_no, undo->hdr_offset,
+ mtr);
+ if (prev_rec == NULL) {
+
+ undo->empty = TRUE;
+ } else {
+ prev_rec_page = page_align(prev_rec);
+
+ if (prev_rec_page != undo_page) {
+
+ trx->pages_undone++;
+ }
+
+ undo->top_page_no = page_get_page_no(prev_rec_page);
+ undo->top_offset = prev_rec - prev_rec_page;
+ undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec);
+ }
+
+ return(undo_page + offset);
+}
+
+/********************************************************************//**
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record to the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release.
+@return undo log record copied to heap, NULL if none left, or if the
+undo number of the top record would be less than the limit */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t limit, /*!< in: least undo number we need */
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_t* undo;
+ trx_undo_t* ins_undo;
+ trx_undo_t* upd_undo;
+ trx_undo_rec_t* undo_rec;
+ trx_undo_rec_t* undo_rec_copy;
+ undo_no_t undo_no;
+ ibool is_insert;
+ trx_rseg_t* rseg;
+ ulint progress_pct;
+ mtr_t mtr;
+
+ rseg = trx->rseg;
+try_again:
+ mutex_enter(&(trx->undo_mutex));
+
+ if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) {
+ mutex_enter(&(rseg->mutex));
+
+ trx_roll_try_truncate(trx);
+
+ mutex_exit(&(rseg->mutex));
+ }
+
+ ins_undo = trx->insert_undo;
+ upd_undo = trx->update_undo;
+
+ if (!ins_undo || ins_undo->empty) {
+ undo = upd_undo;
+ } else if (!upd_undo || upd_undo->empty) {
+ undo = ins_undo;
+ } else if (upd_undo->top_undo_no > ins_undo->top_undo_no) {
+ undo = upd_undo;
+ } else {
+ undo = ins_undo;
+ }
+
+ if (!undo || undo->empty
+ || limit > undo->top_undo_no) {
+
+ if ((trx->undo_no_arr)->n_used == 0) {
+ /* Rollback is ending */
+
+ mutex_enter(&(rseg->mutex));
+
+ trx_roll_try_truncate(trx);
+
+ mutex_exit(&(rseg->mutex));
+ }
+
+ mutex_exit(&(trx->undo_mutex));
+
+ return(NULL);
+ }
+
+ if (undo == ins_undo) {
+ is_insert = TRUE;
+ } else {
+ is_insert = FALSE;
+ }
+
+ *roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id,
+ undo->top_page_no,
+ undo->top_offset);
+ mtr_start(&mtr);
+
+ undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr);
+
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ ut_ad(undo_no + 1 == trx->undo_no);
+
+ /* We print rollback progress info if we are in crash recovery
+ and the transaction has more than 1000 row operations to undo. */
+
+ if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) {
+
+ progress_pct = 100 - (ulint)
+ ((undo_no * 100) / trx_roll_max_undo_no);
+ if (progress_pct != trx_roll_progress_printed_pct) {
+ if (trx_roll_progress_printed_pct == 0) {
+ fprintf(stderr,
+ "\nInnoDB: Progress in percents:"
+ " %lu", (ulong) progress_pct);
+ } else {
+ fprintf(stderr,
+ " %lu", (ulong) progress_pct);
+ }
+ fflush(stderr);
+ trx_roll_progress_printed_pct = progress_pct;
+ }
+ }
+
+ trx->undo_no = undo_no;
+
+ if (!trx_undo_arr_store_info(trx, undo_no)) {
+ /* A query thread is already processing this undo log record */
+
+ mutex_exit(&(trx->undo_mutex));
+
+ mtr_commit(&mtr);
+
+ goto try_again;
+ }
+
+ undo_rec_copy = trx_undo_rec_copy(undo_rec, heap);
+
+ mutex_exit(&(trx->undo_mutex));
+
+ mtr_commit(&mtr);
+
+ return(undo_rec_copy);
+}
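+
+/* Ordering sketch: if the insert undo top is undo number 8 and the
+update undo top is 11, the update log is popped first; after each pop
+the larger of the two tops is chosen again, so records come out in
+descending undo number order across both logs. With limit == 5 the
+pops stop, returning NULL, once the larger top falls below 5. */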
+
+/********************************************************************//**
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread obtains the undo log record by some means
+other than the pop function above.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no)/*!< in: undo number of the record */
+{
+ ibool ret;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ ret = trx_undo_arr_store_info(trx, undo_no);
+
+ mutex_exit(&(trx->undo_mutex));
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Releases a reserved undo record. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_arr_t* arr;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ arr = trx->undo_no_arr;
+
+ trx_undo_arr_remove_info(arr, undo_no);
+
+ mutex_exit(&(trx->undo_mutex));
+}
+
+/*********************************************************************//**
+Starts a rollback operation. */
+UNIV_INTERN
+void
+trx_rollback(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ trx_sig_t* sig, /*!< in: signal starting the rollback */
+ que_thr_t** next_thr)/*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the passed value is
+ NULL, the parameter is ignored */
+{
+ que_t* roll_graph;
+ que_thr_t* thr;
+ /* que_thr_t* thr2; */
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0));
+
+ /* Initialize the rollback field in the transaction */
+
+ switch (sig->type) {
+ case TRX_SIG_TOTAL_ROLLBACK:
+ trx->roll_limit = 0;
+ break;
+ case TRX_SIG_ROLLBACK_TO_SAVEPT:
+ trx->roll_limit = (sig->savept).least_undo_no;
+ break;
+ case TRX_SIG_ERROR_OCCURRED:
+ trx->roll_limit = trx->last_sql_stat_start.least_undo_no;
+ break;
+ default:
+ ut_error;
+ }
+
+ ut_a(trx->roll_limit <= trx->undo_no);
+
+ trx->pages_undone = 0;
+
+ if (trx->undo_no_arr == NULL) {
+ trx->undo_no_arr = trx_undo_arr_create();
+ }
+
+ /* Build a 'query' graph which will perform the undo operations */
+
+ roll_graph = trx_roll_graph_build(trx);
+
+ trx->graph = roll_graph;
+ trx->que_state = TRX_QUE_ROLLING_BACK;
+
+ thr = que_fork_start_command(roll_graph);
+
+ ut_ad(thr);
+
+ /* thr2 = que_fork_start_command(roll_graph);
+
+ ut_ad(thr2); */
+
+ if (next_thr && (*next_thr == NULL)) {
+ *next_thr = thr;
+ /* srv_que_task_enqueue_low(thr2); */
+ } else {
+ srv_que_task_enqueue_low(thr);
+ /* srv_que_task_enqueue_low(thr2); */
+ }
+}
+
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ /* que_thr_t* thr2; */
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+ /* thr2 = que_thr_create(fork, heap); */
+
+ thr->child = row_undo_node_create(trx, thr, heap);
+ /* thr2->child = row_undo_node_create(trx, thr2, heap); */
+
+ return(fork);
+}
+
+/*********************************************************************//**
+Finishes error processing after the necessary partial rollback has been
+done. */
+static
+void
+trx_finish_error_processing(
+/*========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_sig_t* sig;
+ trx_sig_t* next_sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ while (sig != NULL) {
+ next_sig = UT_LIST_GET_NEXT(signals, sig);
+
+ if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+
+ trx_sig_remove(trx, sig);
+ }
+
+ sig = next_sig;
+ }
+
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/*********************************************************************//**
+Finishes a partial rollback operation. */
+static
+void
+trx_finish_partial_rollback_off_kernel(
+/*===================================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t** next_thr)/*!< in/out: next query thread to run;
+ if the value which is passed in is a pointer
+ to a NULL pointer, then the calling function
+ can start running a new query thread; if this
+ parameter is NULL, it is ignored */
+{
+ trx_sig_t* sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ /* Remove the signal from the signal queue and send reply message
+ to it */
+
+ trx_sig_reply(sig, next_thr);
+ trx_sig_remove(trx, sig);
+
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/****************************************************************//**
+Finishes a transaction rollback. */
+UNIV_INTERN
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+ que_t* graph, /*!< in: undo graph which can now be freed */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t** next_thr)/*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if this parameter is
+ NULL, it is ignored */
+{
+ trx_sig_t* sig;
+ trx_sig_t* next_sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
+
+ /* Free the memory reserved by the undo graph */
+ que_graph_free(graph);
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) {
+
+ trx_finish_partial_rollback_off_kernel(trx, next_thr);
+
+ return;
+
+ } else if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+
+ trx_finish_error_processing(trx);
+
+ return;
+ }
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fprintf(stderr, "Trx " TRX_ID_FMT " rollback finished\n",
+ (ullint) trx->id);
+ }
+#endif /* UNIV_DEBUG */
+
+ trx_commit_off_kernel(trx);
+
+ /* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and
+ send reply messages to them */
+
+ trx->que_state = TRX_QUE_RUNNING;
+
+ while (sig != NULL) {
+ next_sig = UT_LIST_GET_NEXT(signals, sig);
+
+ if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+
+ trx_sig_reply(sig, next_thr);
+
+ trx_sig_remove(trx, sig);
+ }
+
+ sig = next_sig;
+ }
+}
+
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ roll_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(roll_node_t));
+ node->common.type = QUE_NODE_ROLLBACK;
+ node->state = ROLL_NODE_SEND;
+
+ node->partial = FALSE;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_node_t* node;
+ ulint sig_no;
+ trx_savept_t* savept;
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = ROLL_NODE_SEND;
+ }
+
+ if (node->state == ROLL_NODE_SEND) {
+ mutex_enter(&kernel_mutex);
+
+ node->state = ROLL_NODE_WAIT;
+
+ if (node->partial) {
+ sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT;
+ savept = &(node->savept);
+ } else {
+ sig_no = TRX_SIG_TOTAL_ROLLBACK;
+ savept = NULL;
+ }
+
+ /* Send a rollback signal to the transaction */
+
+ trx_sig_send(thr_get_trx(thr), sig_no, TRX_SIG_SELF, thr,
+ savept, NULL);
+
+ thr->state = QUE_THR_SIG_REPLY_WAIT;
+
+ mutex_exit(&kernel_mutex);
+
+ return(NULL);
+ }
+
+ ut_ad(node->state == ROLL_NODE_WAIT);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/storage/xtradb/trx/trx0rseg.c b/storage/xtradb/trx/trx0rseg.c
new file mode 100644
index 00000000000..ed3c27326d4
--- /dev/null
+++ b/storage/xtradb/trx/trx0rseg.c
@@ -0,0 +1,374 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.c
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register rseg_mutex_key with performance schema */
+UNIV_INTERN mysql_pfs_key_t rseg_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/******************************************************************//**
+Looks for a rollback segment, based on the rollback segment id.
+@return rollback segment */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+ ulint id) /*!< in: rollback segment id */
+{
+ trx_rseg_t* rseg;
+
+ ut_a(id < TRX_SYS_N_RSEGS);
+
+ rseg = trx_sys->rseg_array[id];
+
+ ut_a(rseg == NULL || id == rseg->id);
+
+ return(rseg);
+}
+
+/****************************************************************//**
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database.
+@return page number of the created segment, or FIL_NULL on failure */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint max_size, /*!< in: max size in pages */
+ ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_no;
+ trx_rsegf_t* rsegf;
+ trx_sysf_t* sys_header;
+ ulint i;
+ buf_block_t* block;
+
+ ut_ad(mtr);
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+ MTR_MEMO_X_LOCK));
+
+ /* Allocate a new file segment for the rollback segment */
+ block = fseg_create(space, 0,
+ TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+
+ if (block == NULL) {
+ /* No space left */
+
+ return(FIL_NULL);
+ }
+
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ page_no = buf_block_get_page_no(block);
+
+ /* Get the rollback segment file page */
+ rsegf = trx_rsegf_get_new(space, zip_size, page_no, mtr);
+
+ /* Initialize max size field */
+ mlog_write_ulint(rsegf + TRX_RSEG_MAX_SIZE, max_size,
+ MLOG_4BYTES, mtr);
+
+ /* Initialize the history list */
+
+ mlog_write_ulint(rsegf + TRX_RSEG_HISTORY_SIZE, 0, MLOG_4BYTES, mtr);
+ flst_init(rsegf + TRX_RSEG_HISTORY, mtr);
+
+ /* Reset the undo log slots */
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+
+ trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr);
+ }
+
+ /* Add the rollback segment info to the free slot in
+ the trx system header */
+
+ sys_header = trx_sysf_get(mtr);
+
+ trx_sysf_rseg_set_space(sys_header, rseg_slot_no, space, mtr);
+ trx_sysf_rseg_set_page_no(sys_header, rseg_slot_no, page_no, mtr);
+
+ return(page_no);
+}
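+
+/* The header fields initialized above (a reading of trx0rseg.h; the
+byte offsets are omitted on purpose):
+
+ TRX_RSEG_MAX_SIZE max size of the segment in pages
+ TRX_RSEG_HISTORY_SIZE number of pages in the history list
+ TRX_RSEG_HISTORY base node of the history list
+ TRX_RSEG_FSEG_HEADER file segment header of the segment
+ TRX_RSEG_UNDO_SLOTS TRX_RSEG_N_SLOTS undo log page numbers,
+ all reset to FIL_NULL here
+*/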
+
+/***********************************************************************//**
+Frees an instance of the rollback segment in memory. */
+UNIV_INTERN
+void
+trx_rseg_mem_free(
+/*==============*/
+ trx_rseg_t* rseg) /* in, own: instance to free */
+{
+ trx_undo_t* undo;
+
+ mutex_free(&rseg->mutex);
+
+ /* There can't be any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0);
+ ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0);
+
+ undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+
+ while (undo != NULL) {
+ trx_undo_t* prev_undo = undo;
+
+ undo = UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, prev_undo);
+
+ trx_undo_mem_free(prev_undo);
+ }
+
+ undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+
+ while (undo != NULL) {
+ trx_undo_t* prev_undo = undo;
+
+ undo = UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, prev_undo);
+
+ trx_undo_mem_free(prev_undo);
+ }
+
+ trx_sys_set_nth_rseg(trx_sys, rseg->id, NULL);
+
+ mem_free(rseg);
+}
+
+/***************************************************************************//**
+Creates and initializes a rollback segment object. The values for the
+fields are read from the header. The object is inserted into the rseg
+list of the trx system object and a pointer is stored in the rseg
+array of the trx system object.
+@return own: rollback segment object */
+static
+trx_rseg_t*
+trx_rseg_mem_create(
+/*================*/
+ ulint id, /*!< in: rollback segment id */
+ ulint space, /*!< in: space where the segment
+ placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the segment
+ header */
+ ib_bh_t* ib_bh, /*!< in/out: rseg queue */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint len;
+ trx_rseg_t* rseg;
+ fil_addr_t node_addr;
+ trx_rsegf_t* rseg_header;
+ trx_ulogf_t* undo_log_hdr;
+ ulint sum_of_undo_sizes;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ rseg = mem_zalloc(sizeof(trx_rseg_t));
+
+ rseg->id = id;
+ rseg->space = space;
+ rseg->zip_size = zip_size;
+ rseg->page_no = page_no;
+
+ mutex_create(rseg_mutex_key, &rseg->mutex, SYNC_RSEG);
+
+ UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg);
+
+ trx_sys_set_nth_rseg(trx_sys, id, rseg);
+
+ rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr);
+
+ rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE,
+ MLOG_4BYTES, mtr);
+
+ /* Initialize the undo log lists according to the rseg header */
+
+ sum_of_undo_sizes = trx_undo_lists_init(rseg);
+
+ rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, mtr)
+ + 1 + sum_of_undo_sizes;
+
+ len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr);
+ if (len > 0) {
+ const void* ptr;
+ rseg_queue_t rseg_queue;
+
+ trx_sys->rseg_history_len += len;
+
+ node_addr = trx_purge_get_log_from_hist(
+ flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr));
+ rseg->last_page_no = node_addr.page;
+ rseg->last_offset = node_addr.boffset;
+
+ undo_log_hdr = trx_undo_page_get(rseg->space, rseg->zip_size,
+ node_addr.page,
+ mtr) + node_addr.boffset;
+
+ rseg->last_trx_no = mach_read_from_8(
+ undo_log_hdr + TRX_UNDO_TRX_NO);
+ rseg->last_del_marks = mtr_read_ulint(
+ undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr);
+
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = rseg->last_trx_no;
+
+ if (rseg->last_page_no != FIL_NULL) {
+ /* There is no need to protect this operation with the
+ purge mutex because we are still bootstrapping. */
+
+ ptr = ib_bh_push(ib_bh, &rseg_queue);
+ ut_a(ptr != NULL);
+ }
+ } else {
+ rseg->last_page_no = FIL_NULL;
+ }
+
+ return(rseg);
+}
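+
+/* Worked example of the size bookkeeping above (values assumed): with
+TRX_RSEG_HISTORY_SIZE reading 40 pages and the undo lists summing to 3
+pages, curr_size = 40 + 1 + 3 = 44; the extra 1 counts the rollback
+segment header page itself. */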
+
+/********************************************************************//**
+Creates the memory copies of the rollback segments and initializes the
+rseg list and array in trx_sys at database startup. */
+static
+void
+trx_rseg_create_instance(
+/*=====================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ ib_bh_t* ib_bh, /*!< in/out: rseg queue */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ ulint page_no;
+
+ page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
+
+ if (page_no == FIL_NULL) {
+ trx_sys_set_nth_rseg(trx_sys, i, NULL);
+ } else {
+ ulint space;
+ ulint zip_size;
+ trx_rseg_t* rseg = NULL;
+
+ ut_a(!trx_rseg_get_on_id(i));
+
+ space = trx_sysf_rseg_get_space(sys_header, i, mtr);
+
+ zip_size = space ? fil_space_get_zip_size(space) : 0;
+
+ rseg = trx_rseg_mem_create(
+ i, space, zip_size, page_no, ib_bh, mtr);
+
+ ut_a(rseg->id == i);
+ }
+ }
+}
+
+/*********************************************************************//**
+Creates a rollback segment.
+@return pointer to the new rollback segment, or NULL on failure */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(void)
+/*=================*/
+{
+ mtr_t mtr;
+ ulint slot_no;
+ trx_rseg_t* rseg = NULL;
+
+ mtr_start(&mtr);
+
+ /* To obey the latching order, acquire the file space
+ x-latch before the kernel mutex. */
+ mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr);
+
+ mutex_enter(&kernel_mutex);
+
+ slot_no = trx_sysf_rseg_find_free(&mtr);
+
+ if (slot_no != ULINT_UNDEFINED) {
+ ulint space;
+ ulint page_no;
+ ulint zip_size;
+ trx_sysf_t* sys_header;
+
+ page_no = trx_rseg_header_create(
+ TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, &mtr);
+
+ ut_a(page_no != FIL_NULL);
+
+ ut_ad(!trx_rseg_get_on_id(slot_no));
+
+ sys_header = trx_sysf_get(&mtr);
+
+ space = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);
+
+ zip_size = space ? fil_space_get_zip_size(space) : 0;
+
+ rseg = trx_rseg_mem_create(
+ slot_no, space, zip_size, page_no,
+ purge_sys->ib_bh, &mtr);
+ }
+
+ mutex_exit(&kernel_mutex);
+ mtr_commit(&mtr);
+
+ return(rseg);
+}
+
+/********************************************************************//**
+Initializes the rollback segment list and array. */
+UNIV_INTERN
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ ib_bh_t* ib_bh, /*!< in: rseg queue */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ UT_LIST_INIT(trx_sys->rseg_list);
+
+ trx_sys->rseg_history_len = 0;
+
+ trx_rseg_create_instance(sys_header, ib_bh, mtr);
+}
+
diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c
new file mode 100644
index 00000000000..6b230a296c0
--- /dev/null
+++ b/storage/xtradb/trx/trx0sys.c
@@ -0,0 +1,2049 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.c
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+
+#ifdef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+#include "read0read.h"
+
+/** The file format tag structure with id and name. */
+struct file_format_struct {
+ ulint id; /*!< id of the file format */
+ const char* name; /*!< text representation of the
+ file format */
+ mutex_t mutex; /*!< covers changes to the above
+ fields */
+};
+
+/** The file format tag */
+typedef struct file_format_struct file_format_t;
+
+/** The transaction system */
+UNIV_INTERN trx_sys_t* trx_sys = NULL;
+/** The doublewrite buffer */
+UNIV_INTERN trx_doublewrite_t* trx_doublewrite = NULL;
+
+/** The following is set to TRUE when we are upgrading from pre-4.1
+format data files to the multiple tablespaces format data files */
+UNIV_INTERN ibool trx_doublewrite_must_reset_space_ids = FALSE;
+/** Set to TRUE when the doublewrite buffer is being created */
+UNIV_INTERN ibool trx_doublewrite_buf_is_being_created = FALSE;
+
+/** The following is TRUE when we are using the database in the
+post-4.1 format, i.e., we have successfully upgraded, or have created
+a new database installation */
+UNIV_INTERN ibool trx_sys_multiple_tablespace_format = FALSE;
+
+/** In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. */
+/* @{ */
+/** Master binlog file name */
+UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
+/** Master binlog file position. We have successfully got the updates
+up to this position. -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB.*/
+UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1;
+/* @} */
+
+UNIV_INTERN char trx_sys_mysql_relay_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
+UNIV_INTERN ib_int64_t trx_sys_mysql_relay_log_pos = -1;
+
+/** If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. */
+/* @{ */
+/** Binlog file name */
+UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+/** Binlog file position, or -1 if unknown */
+UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1;
+/* @} */
+#endif /* !UNIV_HOTBACKUP */
+
+/** List of animal names representing file format. */
+static const char* file_format_name_map[] = {
+ "Antelope",
+ "Barracuda",
+ "Cheetah",
+ "Dragon",
+ "Elk",
+ "Fox",
+ "Gazelle",
+ "Hornet",
+ "Impala",
+ "Jaguar",
+ "Kangaroo",
+ "Leopard",
+ "Moose",
+ "Nautilus",
+ "Ocelot",
+ "Porpoise",
+ "Quail",
+ "Rabbit",
+ "Shark",
+ "Tiger",
+ "Urchin",
+ "Viper",
+ "Whale",
+ "Xenops",
+ "Yak",
+ "Zebra"
+};
+
+/** The number of elements in the file format name array. */
+static const ulint FILE_FORMAT_NAME_N
+ = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_doublewrite_mutex_key;
+UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+UNIV_INTERN uint trx_rseg_n_slots_debug = 0;
+#endif
+
+/** This is used to track the maximum file format id known to InnoDB. It's
+updated via SET GLOBAL innodb_file_format_max = 'x' or when we open
+or create a table. */
+static file_format_t file_format_max;
+
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+trx_doublewrite_page_inside(
+/*========================*/
+ ulint page_no) /*!< in: page number */
+{
+ if (trx_doublewrite == NULL) {
+
+ return(FALSE);
+ }
+
+ if (page_no >= trx_doublewrite->block1
+ && page_no < trx_doublewrite->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ if (page_no >= trx_doublewrite->block2
+ && page_no < trx_doublewrite->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
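+
+/* The check above reduces to membership in one of two contiguous page
+ranges, each TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages long, starting at
+block1 and block2. A minimal standalone sketch of the same predicate
+(illustrative only, not part of the original source; plain C99 types
+stand in for InnoDB's ulint/ibool):
+
+	#include <stdint.h>
+
+	static int
+	page_in_block(uint32_t page_no, uint32_t block_start, uint32_t len)
+	{
+		// half-open interval [block_start, block_start + len)
+		return page_no >= block_start && page_no < block_start + len;
+	}
+
+	// inside == page_in_block(p, block1, N) || page_in_block(p, block2, N)
+*/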
+
+/****************************************************************//**
+Creates or initializes the doublewrite buffer at a database start. */
+static
+void
+trx_doublewrite_init(
+/*=================*/
+ byte* doublewrite) /*!< in: pointer to the doublewrite buf
+ header on trx sys page */
+{
+ trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t));
+
+ /* Since we now start to use the doublewrite buffer, no need to call
+ fsync() after every write to a data file */
+#ifdef UNIV_DO_FLUSH
+ os_do_not_call_flush_at_each_write = TRUE;
+#endif /* UNIV_DO_FLUSH */
+
+ mutex_create(trx_doublewrite_mutex_key,
+ &trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
+
+ trx_doublewrite->first_free = 0;
+
+ trx_doublewrite->block1 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+ trx_doublewrite->block2 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+ trx_doublewrite->write_buf_unaligned = ut_malloc(
+ (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE);
+
+ trx_doublewrite->write_buf = ut_align(
+ trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE);
+ trx_doublewrite->buf_block_arr = mem_alloc(
+ 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*));
+}
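+
+/* trx_doublewrite_init() above over-allocates the write buffer by one
+page and then aligns the working pointer to UNIV_PAGE_SIZE, presumably
+so the batch can be written with page-aligned file i/o. A standalone
+sketch of that align-up idiom (illustrative only; ut_malloc/ut_align
+are the InnoDB equivalents, PAGE_SIZE is an assumed constant, error
+handling omitted for brevity):
+
+	#include <stdint.h>
+	#include <stdlib.h>
+
+	#define PAGE_SIZE 16384UL
+
+	static void*
+	alloc_page_aligned(size_t n_pages, void** unaligned)
+	{
+		// one extra page guarantees an aligned pointer fits inside
+		*unaligned = malloc((n_pages + 1) * PAGE_SIZE);
+		uintptr_t p = (uintptr_t) *unaligned;
+		return (void*) ((p + PAGE_SIZE - 1) & ~(uintptr_t) (PAGE_SIZE - 1));
+	}
+*/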
+
+/****************************************************************//**
+Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
+multiple tablespace format. */
+UNIV_INTERN
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void)
+/*===============================================*/
+{
+ buf_block_t* block;
+ byte* doublewrite;
+ mtr_t mtr;
+
+ /* We upgraded to 4.1.x and reset the space id fields in the
+ doublewrite buffer. Let us mark to the trx_sys header that the upgrade
+ has been done. */
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ trx_sys_multiple_tablespace_format = TRUE;
+}
+
+/****************************************************************//**
+Creates the doublewrite buffer for a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+trx_sys_create_doublewrite_buf(void)
+/*================================*/
+{
+ buf_block_t* block;
+ buf_block_t* block2;
+ buf_block_t* new_block;
+ byte* doublewrite;
+ byte* fseg_header;
+ ulint page_no;
+ ulint prev_page_no;
+ ulint i;
+ mtr_t mtr;
+
+ if (trx_doublewrite) {
+ /* Already inited */
+
+ return;
+ }
+
+start_again:
+ mtr_start(&mtr);
+ trx_doublewrite_buf_is_being_created = TRUE;
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has already been created:
+ just read in some numbers */
+
+ trx_doublewrite_init(doublewrite);
+
+ mtr_commit(&mtr);
+ trx_doublewrite_buf_is_being_created = FALSE;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Doublewrite buffer not found:"
+ " creating new\n");
+
+ if (buf_pool_get_curr_size()
+ < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2 + 100)
+ * UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite buffer:"
+ " you must\n"
+ "InnoDB: increase your buffer pool size.\n"
+ "InnoDB: Cannot continue operation.\n");
+
+ exit(1);
+ }
+
+ block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+ TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+
+ buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+ if (block2 == NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite buffer:"
+ " you must\n"
+ "InnoDB: increase your tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n");
+
+ /* We exit without committing the mtr to prevent
+ its modifications to the database getting to disk */
+
+ exit(1);
+ }
+
+ fseg_header = buf_block_get_frame(block)
+ + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
+ prev_page_no = 0;
+
+ for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2; i++) {
+ new_block = fseg_alloc_free_page(
+ fseg_header, prev_page_no + 1, FSP_UP, &mtr);
+ if (new_block == NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite"
+ " buffer: you must\n"
+ "InnoDB: increase your"
+ " tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n"
+ );
+
+ exit(1);
+ }
+
+ /* We read the allocated pages to the buffer pool;
+ when they are written to disk in a flush, the space
+ id and page number fields are also written to the
+ pages. When we at database startup read pages
+ from the doublewrite buffer, we know that if the
+ space id and page number in them are the same as
+ the page position in the tablespace, then the page
+ has not been written to in doublewrite. */
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ page_no = buf_block_get_page_no(new_block);
+
+ if (i == FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i == FSP_EXTENT_SIZE / 2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i > FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == prev_page_no + 1);
+ }
+
+ prev_page_no = page_no;
+ }
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+ + TRX_SYS_DOUBLEWRITE_REPEAT,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
+
+ trx_sys_multiple_tablespace_format = TRUE;
+
+ goto start_again;
+ }
+
+ if (srv_doublewrite_file) {
+		/* A doublewrite buffer equivalent to the one in
+		TRX_SYS_SPACE should also exist in the dedicated
+		doublewrite file: check for it, and create it if it
+		does not exist. */
+
+ mtr_start(&mtr);
+ trx_doublewrite_buf_is_being_created = TRUE;
+
+ block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has already been created:
+ just read in some numbers */
+
+ trx_doublewrite_init(doublewrite);
+
+ mtr_commit(&mtr);
+ trx_doublewrite_buf_is_being_created = FALSE;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Doublewrite buffer not found in the doublewrite file:"
+ " creating new doublewrite buffer.\n");
+
+ if (buf_pool_get_curr_size()
+ < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2 + 100)
+ * UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Cannot create the doublewrite buffer:"
+ " You must\n"
+ "InnoDB: increase your buffer pool size.\n"
+ "InnoDB: Cannot continue processing.\n");
+
+ exit(1);
+ }
+
+ block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO,
+ TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+
+ buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+ if (block2 == NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create the doublewrite buffer:"
+ " You must\n"
+ "InnoDB: increase your tablespace size.\n"
+ "InnoDB: Cannot continue processing.\n");
+
+ /* We exit without committing the mtr to prevent
+ its modifications to the database getting to disk */
+
+ exit(1);
+ }
+
+ fseg_header = buf_block_get_frame(block)
+ + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
+ prev_page_no = 0;
+
+ for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2; i++) {
+ new_block = fseg_alloc_free_page(
+ fseg_header, prev_page_no + 1, FSP_UP, &mtr);
+ if (new_block == NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite"
+ " buffer: you must\n"
+ "InnoDB: increase your"
+ " tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n"
+ );
+
+ exit(1);
+ }
+
+ /* We read the allocated pages to the buffer pool;
+ when they are written to disk in a flush, the space
+ id and page number fields are also written to the
+ pages. When we at database startup read pages
+ from the doublewrite buffer, we know that if the
+ space id and page number in them are the same as
+ the page position in the tablespace, then the page
+ has not been written to in doublewrite. */
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ page_no = buf_block_get_page_no(new_block);
+
+ if (i == FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i == FSP_EXTENT_SIZE / 2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i > FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == prev_page_no + 1);
+ }
+
+ prev_page_no = page_no;
+ }
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+ + TRX_SYS_DOUBLEWRITE_REPEAT,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n");
+ trx_sys_multiple_tablespace_format = TRUE;
+ }
+ trx_doublewrite_buf_is_being_created = FALSE;
+ }
+}
+
+/****************************************************************//**
+At database startup, initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+ ibool restore_corrupt_pages) /*!< in: TRUE=restore pages */
+{
+ byte* buf;
+ byte* read_buf;
+ byte* unaligned_read_buf;
+ ulint block1;
+ ulint block2;
+ ulint source_page_no;
+ byte* page;
+ byte* doublewrite;
+ ulint doublewrite_space_id;
+ ulint space_id;
+ ulint page_no;
+ ulint i;
+
+ doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
+
+ if (srv_doublewrite_file) {
+ fprintf(stderr,
+ "InnoDB: doublewrite file '%s' is used.\n",
+ srv_doublewrite_file);
+ }
+
+ /* We do the file i/o past the buffer pool */
+
+ unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+ read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
+
+ /* Read the trx sys header to check if we are using the doublewrite
+ buffer */
+
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0,
+ UNIV_PAGE_SIZE, read_buf, NULL);
+ doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has been created */
+
+ trx_doublewrite_init(doublewrite);
+
+ block1 = trx_doublewrite->block1;
+ block2 = trx_doublewrite->block2;
+
+ buf = trx_doublewrite->write_buf;
+ } else {
+ goto leave_func;
+ }
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+ != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+ /* We are upgrading from a version < 4.1.x to a version where
+ multiple tablespaces are supported. We must reset the space id
+ field in the pages in the doublewrite buffer because starting
+ from this version the space id is stored to
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+ trx_doublewrite_must_reset_space_ids = TRUE;
+
+ fprintf(stderr,
+ "InnoDB: Resetting space id's in the"
+ " doublewrite buffer\n");
+ } else {
+ trx_sys_multiple_tablespace_format = TRUE;
+ }
+
+ /* Read the pages from the doublewrite buffer to memory */
+
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf, NULL);
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ NULL);
+ /* Check if any of these pages is half-written in data files, in the
+ intended position */
+
+ page = buf;
+
+ for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+ page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ if (trx_doublewrite_must_reset_space_ids) {
+
+ space_id = 0;
+ mach_write_to_4(page
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
+ /* We do not need to calculate new checksums for the
+ pages because the field .._SPACE_ID does not affect
+ them. Write the page back to where we read it from. */
+
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ source_page_no = block1 + i;
+ } else {
+ source_page_no = block2
+ + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0,
+ UNIV_PAGE_SIZE, page, NULL);
+ /* printf("Resetting space id in page %lu\n",
+ source_page_no); */
+ } else {
+ space_id = mach_read_from_4(
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+
+ if (!restore_corrupt_pages) {
+ /* The database was shut down gracefully: no need to
+ restore pages */
+
+ } else if (!fil_tablespace_exists_in_mem(space_id)) {
+ /* Maybe we have dropped the single-table tablespace
+ and this page once belonged to it: do nothing */
+
+ } else if (!fil_check_adress_in_tablespace(space_id,
+ page_no)) {
+ fprintf(stderr,
+ "InnoDB: Warning: a page in the"
+ " doublewrite buffer is not within space\n"
+ "InnoDB: bounds; space id %lu"
+ " page number %lu, page %lu in"
+ " doublewrite buf.\n",
+ (ulong) space_id, (ulong) page_no, (ulong) i);
+
+ } else if ((space_id == TRX_SYS_SPACE
+ || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE))
+ && ((page_no >= block1
+ && page_no
+ < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (page_no >= block2
+ && page_no
+ < (block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
+
+ /* It is an unwritten doublewrite buffer page:
+ do nothing */
+ } else {
+ ulint zip_size = fil_space_get_zip_size(space_id);
+
+ /* Read in the actual page from the file */
+ fil_io(OS_FILE_READ, TRUE, space_id, zip_size,
+ page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ read_buf, NULL);
+
+ if (srv_recovery_stats && recv_recovery_is_on()) {
+ mutex_enter(&(recv_sys->mutex));
+ recv_sys->stats_doublewrite_check_pages++;
+ mutex_exit(&(recv_sys->mutex));
+ }
+
+ /* Check if the page is corrupt */
+
+ if (UNIV_UNLIKELY
+ (buf_page_is_corrupted(
+ TRUE, read_buf, zip_size))) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: database page"
+ " corruption or a failed\n"
+ "InnoDB: file read of"
+ " space %lu page %lu.\n"
+ "InnoDB: Trying to recover it from"
+ " the doublewrite buffer.\n",
+ (ulong) space_id, (ulong) page_no);
+
+ if (buf_page_is_corrupted(
+ TRUE, page, zip_size)) {
+ fprintf(stderr,
+ "InnoDB: Dump of the page:\n");
+ buf_page_print(
+ read_buf, zip_size,
+ BUF_PAGE_PRINT_NO_CRASH);
+ fprintf(stderr,
+ "InnoDB: Dump of"
+ " corresponding page"
+ " in doublewrite buffer:\n");
+ buf_page_print(
+ page, zip_size,
+ BUF_PAGE_PRINT_NO_CRASH);
+
+ fprintf(stderr,
+ "InnoDB: Also the page in the"
+ " doublewrite buffer"
+ " is corrupt.\n"
+ "InnoDB: Cannot continue"
+ " operation.\n"
+ "InnoDB: You can try to"
+ " recover the database"
+ " with the my.cnf\n"
+ "InnoDB: option:\n"
+ "InnoDB:"
+ " innodb_force_recovery=6\n");
+ ut_error;
+ }
+
+ /* Write the good page from the
+ doublewrite buffer to the intended
+ position */
+
+ fil_io(OS_FILE_WRITE, TRUE, space_id,
+ zip_size, page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ page, NULL);
+
+ if (srv_recovery_stats && recv_recovery_is_on()) {
+ mutex_enter(&(recv_sys->mutex));
+ recv_sys->stats_doublewrite_overwrite_pages++;
+ mutex_exit(&(recv_sys->mutex));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Recovered the page from"
+ " the doublewrite buffer.\n");
+ }
+ }
+
+ page += UNIV_PAGE_SIZE;
+ }
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+leave_func:
+ ut_free(unaligned_read_buf);
+}
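+
+/* Summary of the per-page decision above (descriptive note, not part
+of the original source). For each of the 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+page images read from the doublewrite area:
+
+	1. skip it if the shutdown was clean (!restore_corrupt_pages);
+	2. skip it if its tablespace no longer exists (e.g. a dropped table);
+	3. warn and skip it if its page number lies outside the tablespace;
+	4. skip it if it is itself an unwritten doublewrite buffer page;
+	5. otherwise read the page from its real location and, only if that
+	   copy is corrupt, overwrite it with the doublewrite copy; if both
+	   copies are corrupt, abort and suggest innodb_force_recovery=6.
+*/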
+
+/****************************************************************//**
+Checks that trx is in the trx list.
+@return TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+ trx_t* in_trx) /*!< in: trx */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx != NULL) {
+
+ if (trx == in_trx) {
+
+ return(TRUE);
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ return(FALSE);
+}
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void)
+/*==========================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ mlog_write_ull(sys_header + TRX_SYS_TRX_ID_STORE,
+ trx_sys->max_trx_id, &mtr);
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave, this also updates the latest master binlog position
+up to which replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ trx_sysf_t* sys_header,
+ const char* file_name_in,/*!< in: MySQL log file name */
+ ib_int64_t offset, /*!< in: position in that log file */
+ ulint field, /*!< in: offset of the MySQL log info field in
+ the trx sys header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const char* file_name;
+
+ if (ut_strlen(file_name_in) >= TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN) {
+
+		/* We cannot fit the name into the 512 bytes we have reserved */
+		/* -> to also store the relay log file information, file_name must fit into 480 bytes */
+
+ file_name = "";
+ } else {
+ file_name = file_name_in;
+ }
+
+ if (mach_read_from_4(sys_header + field
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
+ TRX_SYS_MYSQL_LOG_MAGIC_N,
+ MLOG_4BYTES, mtr);
+ }
+
+ if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME),
+ file_name)) {
+
+ mlog_write_string(sys_header + field
+ + TRX_SYS_MYSQL_LOG_NAME,
+ (byte*) file_name, 1 + ut_strlen(file_name),
+ mtr);
+ }
+
+ if (mach_read_from_4(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
+ || (offset >> 32) > 0) {
+
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
+ (ulint)(offset >> 32),
+ MLOG_4BYTES, mtr);
+ }
+
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ MLOG_4BYTES, mtr);
+}
+
+/*****************************************************************//**
+Reads the MySQL binlog offset info from the trx system header if the
+magic number shows it valid, stores it in global variables, and prints
+the info to stderr. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void)
+/*===================================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+ ulint trx_sys_mysql_bin_log_pos_high;
+ ulint trx_sys_mysql_bin_log_pos_low;
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
+ trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+
+ trx_sys_mysql_bin_log_pos
+ = (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32)
+ + (ib_int64_t)trx_sys_mysql_bin_log_pos_low;
+
+ ut_memcpy(trx_sys_mysql_bin_log_name,
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+ fprintf(stderr,
+ "InnoDB: Last MySQL binlog file position %lu %lu,"
+ " file name %s\n",
+ trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low,
+ trx_sys_mysql_bin_log_name);
+
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Reads the log coordinates at the given offset in the trx sys header. */
+static
+void
+trx_sys_read_log_pos(
+/*=================*/
+ const trx_sysf_t* sys_header, /*!< in: the trx sys header */
+ uint header_offset, /*!< in: coord offset in the
+ header */
+ char* log_fn, /*!< out: the log file name */
+	ib_int64_t*		log_pos)	/*!< out: the log position */
+{
+ ut_memcpy(log_fn, sys_header + header_offset + TRX_SYS_MYSQL_LOG_NAME,
+ TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+
+ *log_pos =
+ (((ib_int64_t)mach_read_from_4(sys_header + header_offset
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
+ + mach_read_from_4(sys_header + header_offset
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+}
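+
+/* trx_sys_update_mysql_binlog_offset() and trx_sys_read_log_pos() store
+and load a 64-bit log position as two 4-byte header fields
+(..._OFFSET_HIGH and ..._OFFSET_LOW). A standalone sketch of the round
+trip (illustrative only, not part of the original source):
+
+	#include <stdint.h>
+
+	static void
+	split_pos(int64_t pos, uint32_t* high, uint32_t* low)
+	{
+		*high = (uint32_t) (pos >> 32);
+		*low = (uint32_t) (pos & 0xFFFFFFFFUL);
+	}
+
+	static int64_t
+	join_pos(uint32_t high, uint32_t low)
+	{
+		return ((int64_t) high << 32) + (int64_t) low;
+	}
+
+	// join_pos() inverts split_pos() for any non-negative pos
+*/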
+
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info stored in the PREPARE
+set of fields of the trx system header, if the magic number shows it
+valid, and stores it in global variables. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void)
+/*====================================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ /* Copy the master log position info to global variables we can
+ use in ha_innobase.cc to initialize glob_mi to right values */
+ trx_sys_read_log_pos(sys_header, TRX_SYS_MYSQL_MASTER_LOG_INFO,
+ trx_sys_mysql_master_log_name,
+ &trx_sys_mysql_master_log_pos);
+
+ trx_sys_read_log_pos(sys_header, TRX_SYS_MYSQL_RELAY_LOG_INFO,
+ trx_sys_mysql_relay_log_name,
+ &trx_sys_mysql_relay_log_pos);
+
+ mtr_commit(&mtr);
+
+ fprintf(stderr,
+ "InnoDB: In a MySQL replication slave the last"
+ " master binlog file\n"
+ "InnoDB: position %llu, file name %s\n",
+ trx_sys_mysql_master_log_pos,
+ trx_sys_mysql_master_log_name);
+
+ fprintf(stderr,
+ "InnoDB: and relay log file\n"
+ "InnoDB: position %llu, file name %s\n",
+ trx_sys_mysql_relay_log_pos,
+ trx_sys_mysql_relay_log_name);
+}
+
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info stored in the COMMIT
+set of fields of the trx system header, if the magic number shows it
+valid, and stores it in global variables. */
+UNIV_INTERN
+void
+trx_sys_print_committed_mysql_master_log_pos(void)
+/*==============================================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if (mach_read_from_4(sys_header + TRX_SYS_COMMIT_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ /* Copy the master log position info to global variables we can
+ use in ha_innobase.cc to initialize glob_mi to right values */
+ trx_sys_read_log_pos(sys_header, TRX_SYS_COMMIT_MASTER_LOG_INFO,
+ trx_sys_mysql_master_log_name,
+ &trx_sys_mysql_master_log_pos);
+
+ trx_sys_read_log_pos(sys_header, TRX_SYS_COMMIT_RELAY_LOG_INFO,
+ trx_sys_mysql_relay_log_name,
+ &trx_sys_mysql_relay_log_pos);
+
+ mtr_commit(&mtr);
+
+ fprintf(stderr,
+ "InnoDB: In a MySQL replication slave the last"
+ " master binlog file\n"
+ "InnoDB: position %llu, file name %s\n",
+ trx_sys_mysql_master_log_pos, trx_sys_mysql_master_log_name);
+
+ fprintf(stderr,
+ "InnoDB: and relay log file\n"
+ "InnoDB: position %llu, file name %s\n",
+ trx_sys_mysql_relay_log_pos, trx_sys_mysql_relay_log_name);
+}
+
+/****************************************************************//**
+Looks for a free slot for a rollback segment in the trx system file copy.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_sysf_t* sys_header;
+ ulint page_no;
+ ulint i;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ sys_header = trx_sysf_get(mtr);
+
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+ page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/*****************************************************************//**
+Creates the file page for the transaction system. This function is called only
+at the database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_sysf_t* sys_header;
+ ulint slot_no;
+ buf_block_t* block;
+ page_t* page;
+ ulint page_no;
+ byte* ptr;
+ ulint len;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
+ mutex_enter(&kernel_mutex);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+ MLOG_2BYTES, mtr);
+
+ /* Reset the doublewrite buffer magic number to zero so that we
+ know that the doublewrite buffer has not yet been created (this
+ suppresses a Valgrind warning) */
+
+ mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+ sys_header = trx_sysf_get(mtr);
+
+ /* Start counting transaction ids from number 1 up */
+ mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1);
+
+ /* Reset the rollback segment slots. Old versions of InnoDB
+ define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect
+ that the whole array is initialized. */
+ ptr = TRX_SYS_RSEGS + sys_header;
+ len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS)
+ * TRX_SYS_RSEG_SLOT_SIZE;
+ memset(ptr, 0xff, len);
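+	/* Filling the array with 0xff bytes makes every slot's 4-byte
+	page number read back as 0xFFFFFFFF == FIL_NULL, i.e. "slot
+	free", regardless of byte order. */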
+ ptr += len;
+ ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END));
+
+ /* Initialize all of the page. This part used to be uninitialized. */
+ memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr);
+
+ mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+ + page - sys_header, mtr);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ slot_no = trx_sysf_rseg_find_free(mtr);
+ page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no,
+ mtr);
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*****************************************************************//**
+Compares two rseg_queue_t instances on trx_no. */
+static
+int
+trx_rseg_compare_last_trx_no(
+/*=========================*/
+ const void* p1, /*!< in: elem to compare */
+ const void* p2) /*!< in: elem to compare */
+{
+ ib_int64_t cmp;
+
+ const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1;
+ const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2;
+
+ cmp = rseg_q1->trx_no - rseg_q2->trx_no;
+
+ if (cmp < 0) {
+ return(-1);
+ } else if (cmp > 0) {
+ return(1);
+ }
+
+ return(0);
+}
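+
+/* This comparator defines the ordering of the purge subsystem's
+min-heap (the ib_bh created in trx_sys_init_at_db_start() below): the
+rollback segment whose oldest committed transaction has the smallest
+trx_no sorts first, so purge always dequeues the oldest undo work. A
+standalone usage sketch with qsort (illustrative only, not part of the
+original source; members other than trx_no are zero-initialized):
+
+	#include <stdlib.h>
+
+	rseg_queue_t q[3] = { {.trx_no = 7}, {.trx_no = 3}, {.trx_no = 5} };
+	qsort(q, 3, sizeof(q[0]), trx_rseg_compare_last_trx_no);
+	// q[0].trx_no == 3: the element purge would process first
+*/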
+
+/*****************************************************************//**
+Creates a dummy of the file page for the transaction system. */
+static
+void
+trx_sysf_dummy_create(
+/*==================*/
+ ulint space,
+ mtr_t* mtr)
+{
+ buf_block_t* block;
+ page_t* page;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock(fil_space_get_latch(space, NULL), mtr);
+ mutex_enter(&kernel_mutex);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ fprintf(stderr, "%lu\n", buf_block_get_page_no(block));
+ ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+ MLOG_2BYTES, mtr);
+
+ /* Reset the doublewrite buffer magic number to zero so that we
+ know that the doublewrite buffer has not yet been created (this
+ suppresses a Valgrind warning) */
+
+ mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+#ifdef UNDEFINED
+	/* TODO: REMOVE IT: The below is not needed, I think */
+ sys_header = trx_sysf_get(mtr);
+
+ /* Start counting transaction ids from number 1 up */
+ mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
+ ut_dulint_create(0, 1), mtr);
+
+ /* Reset the rollback segment slots */
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+ trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
+ trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
+ }
+
+ /* The remaining area (up to the page trailer) is uninitialized.
+ Silence Valgrind warnings about it. */
+ UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
+ + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE),
+ (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+ - (TRX_SYS_RSEGS
+ + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE))
+ + page - sys_header);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no,
+ mtr);
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(page_no != FIL_NULL);
+#endif
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started. */
+UNIV_INTERN
+void
+trx_sys_init_at_db_start(void)
+/*==========================*/
+{
+ trx_sysf_t* sys_header;
+ ib_uint64_t rows_to_undo = 0;
+ const char* unit = "";
+ trx_t* trx;
+ mtr_t mtr;
+ ib_bh_t* ib_bh;
+
+ mtr_start(&mtr);
+
+ ut_ad(trx_sys == NULL);
+
+ mutex_enter(&kernel_mutex);
+
+ /* We create the min binary heap here and pass ownership to
+ purge when we init the purge sub-system. Purge is responsible
+ for freeing the binary heap. */
+
+ ib_bh = ib_bh_create(
+ trx_rseg_compare_last_trx_no,
+ sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);
+
+ trx_sys = mem_zalloc(sizeof(*trx_sys));
+
+ /* Allocate the trx descriptors array */
+ trx_sys->descriptors = ut_malloc(sizeof(trx_id_t) *
+ TRX_DESCR_ARRAY_INITIAL_SIZE);
+ trx_sys->descr_n_max = TRX_DESCR_ARRAY_INITIAL_SIZE;
+ trx_sys->descr_n_used = 0;
+ srv_descriptors_memory = TRX_DESCR_ARRAY_INITIAL_SIZE *
+ sizeof(trx_id_t);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr);
+
+ trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ /* VERY important: after the database is started, max_trx_id value is
+ divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
+ trx_sys_get_new_trx_id will evaluate to TRUE when the function
+ is first time called, and the value for trx id will be written
+ to the disk-based header! Thus trx id values will not overlap when
+ the database is repeatedly started! */
+
+ trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
+ + ut_uint64_align_up(mach_read_from_8(sys_header
+ + TRX_SYS_TRX_ID_STORE),
+ TRX_SYS_TRX_ID_WRITE_MARGIN);
+
+ UT_LIST_INIT(trx_sys->mysql_trx_list);
+ trx_dummy_sess = sess_open();
+ trx_lists_init_at_db_start();
+
+ if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ for (;;) {
+
+ if (trx->state != TRX_PREPARED) {
+ rows_to_undo += trx->undo_no;
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+
+ if (!trx) {
+ break;
+ }
+ }
+
+ if (rows_to_undo > 1000000000) {
+ unit = "M";
+ rows_to_undo = rows_to_undo / 1000000;
+ }
+
+ fprintf(stderr,
+ "InnoDB: %lu transaction(s) which must be"
+ " rolled back or cleaned up\n"
+ "InnoDB: in total %lu%s row operations to undo\n",
+ (ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
+ (ulong) rows_to_undo, unit);
+
+ fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
+ (ullint) trx_sys->max_trx_id);
+ }
+
+ UT_LIST_INIT(trx_sys->view_list);
+
+ /* Transfer ownership to purge. */
+ trx_purge_sys_create(ib_bh);
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_commit(&mtr);
+}
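+
+/* Worked example of the max_trx_id formula above (descriptive note,
+not part of the original source): with TRX_SYS_TRX_ID_WRITE_MARGIN at
+its usual value of 256 and a stored TRX_SYS_TRX_ID_STORE of 1000,
+ut_uint64_align_up(1000, 256) = 1024, so max_trx_id starts at
+2 * 256 + 1024 = 1536. The extra two margins skip past any ids that
+were handed out after the last flush of the stored counter but never
+written back, so ids from the previous server run cannot recur. */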
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create(void)
+/*================*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ trx_sysf_create(&mtr);
+
+ mtr_commit(&mtr);
+
+ trx_sys_init_at_db_start();
+}
+
+/*****************************************************************//**
+Update the file format tag.
+@return always TRUE */
+static
+ibool
+trx_sys_file_format_max_write(
+/*==========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name) /*!< out: max file format name, can
+ be NULL */
+{
+ mtr_t mtr;
+ byte* ptr;
+ buf_block_t* block;
+ ib_uint64_t tag_value;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(
+ TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+ file_format_max.id = format_id;
+ file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+ ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+ tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+ if (name) {
+ *name = file_format_max.name;
+ }
+
+ mlog_write_ull(ptr, tag_value, &mtr);
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+}
+
+/*****************************************************************//**
+Read the file format tag.
+@return the file format or ULINT_UNDEFINED if not set. */
+static
+ulint
+trx_sys_file_format_max_read(void)
+/*==============================*/
+{
+ mtr_t mtr;
+ const byte* ptr;
+ const buf_block_t* block;
+ ib_id_t file_format_id;
+
+ /* Since this is called during the startup phase it's safe to
+ read the value without a covering mutex. */
+ mtr_start(&mtr);
+
+ block = buf_page_get(
+ TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+ ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+ file_format_id = mach_read_from_8(ptr);
+
+ mtr_commit(&mtr);
+
+ file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+ if (file_format_id >= FILE_FORMAT_NAME_N) {
+
+ /* Either it has never been tagged, or garbage in it. */
+ return(ULINT_UNDEFINED);
+ }
+
+ return((ulint) file_format_id);
+}
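+
+/* The tag round trip: writing adds a large random magic to the raw
+format id, reading subtracts it again, and any result that lands
+outside [0, FILE_FORMAT_NAME_N) is treated as "never tagged" - this is
+what makes stale garbage from older versions unlikely to decode to a
+valid id. A standalone sketch (illustrative only; TAG_MAGIC is a
+hypothetical stand-in for TRX_SYS_FILE_FORMAT_TAG_MAGIC_N):
+
+	#include <stdint.h>
+
+	#define TAG_MAGIC 0x123456789ABCDEF0ULL	// hypothetical value
+
+	static uint64_t
+	encode_tag(uint64_t format_id)
+	{
+		return format_id + TAG_MAGIC;
+	}
+
+	static uint64_t
+	decode_tag(uint64_t tag, uint64_t n_names)
+	{
+		uint64_t id = tag - TAG_MAGIC;	// unsigned wrap is well defined
+		return id < n_names ? id : UINT64_MAX;	// UINT64_MAX: not tagged
+	}
+*/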
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id) /*!< in: id of the file format */
+{
+ ut_a(id < FILE_FORMAT_NAME_N);
+
+ return(file_format_name_map[id]);
+}
+
+/*****************************************************************//**
+Check for the max file format tag stored on disk. Note: if max_format_id
+equals DICT_TF_FORMAT_MAX + 1, we only print a warning.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_sys_file_format_max_check(
+/*==========================*/
+ ulint max_format_id) /*!< in: max format id to check */
+{
+ ulint format_id;
+
+ /* Check the file format in the tablespace. Do not try to
+ recover if the file format is not supported by the engine
+ unless forced by the user. */
+ format_id = trx_sys_file_format_max_read();
+ if (format_id == ULINT_UNDEFINED) {
+ /* Format ID was not set. Set it to minimum possible
+ value. */
+ format_id = DICT_TF_FORMAT_MIN;
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: highest supported file format is %s.\n",
+ trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX));
+
+ if (format_id > DICT_TF_FORMAT_MAX) {
+
+ ut_a(format_id < FILE_FORMAT_NAME_N);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: %s: the system tablespace is in a file "
+ "format that this version doesn't support - %s\n",
+ ((max_format_id <= DICT_TF_FORMAT_MAX)
+ ? "Error" : "Warning"),
+ trx_sys_file_format_id_to_name(format_id));
+
+ if (max_format_id <= DICT_TF_FORMAT_MAX) {
+ return(DB_ERROR);
+ }
+ }
+
+ format_id = (format_id > max_format_id) ? format_id : max_format_id;
+
+ /* We don't need a mutex here, as this function should only
+ be called once at start up. */
+ file_format_max.id = format_id;
+ file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Set the file format id, unless it already has the same value.
+@return TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name) /*!< out: max file format name or
+ NULL if not needed. */
+{
+ ibool ret = FALSE;
+
+ ut_a(format_id <= DICT_TF_FORMAT_MAX);
+
+ mutex_enter(&file_format_max.mutex);
+
+ /* Only update if not already same value. */
+ if (format_id != file_format_max.id) {
+
+ ret = trx_sys_file_format_max_write(format_id, name);
+ }
+
+ mutex_exit(&file_format_max.mutex);
+
+ return(ret);
+}
+
+/********************************************************************//**
+Tags the system table space with minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during the startup and AFTER the
+redo log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void)
+/*==============================*/
+{
+ ulint format_id;
+
+ format_id = trx_sys_file_format_max_read();
+
+ /* If format_id is not set then set it to the minimum. */
+ if (format_id == ULINT_UNDEFINED) {
+ trx_sys_file_format_max_set(DICT_TF_FORMAT_MIN, NULL);
+ }
+}
+
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+ const char** name, /*!< out: max file format name */
+ ulint format_id) /*!< in: file format identifier */
+{
+ ibool ret = FALSE;
+
+ ut_a(name);
+ ut_a(file_format_max.name != NULL);
+ ut_a(format_id <= DICT_TF_FORMAT_MAX);
+
+ mutex_enter(&file_format_max.mutex);
+
+ if (format_id > file_format_max.id) {
+
+ ret = trx_sys_file_format_max_write(format_id, name);
+ }
+
+ mutex_exit(&file_format_max.mutex);
+
+ return(ret);
+}
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void)
+/*=============================*/
+{
+ return(file_format_max.name);
+}
+
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void)
+/*==========================*/
+{
+ mutex_create(file_format_max_mutex_key,
+ &file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
+
+ /* We don't need a mutex here, as this function should only
+ be called once at start up. */
+ file_format_max.id = DICT_TF_FORMAT_MIN;
+
+ file_format_max.name = trx_sys_file_format_id_to_name(
+ file_format_max.id);
+}
+
+/*****************************************************************//**
+Closes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void)
+/*===========================*/
+{
+ /* Does nothing at the moment */
+}
+
+/*****************************************************************//**
+Creates and initializes the dummy transaction system page for tablespace. */
+UNIV_INTERN
+void
+trx_sys_dummy_create(
+/*=================*/
+ ulint space)
+{
+ mtr_t mtr;
+
+ /* This function is only for doublewrite file for now */
+ ut_a(space == TRX_DOUBLEWRITE_SPACE);
+
+ mtr_start(&mtr);
+
+ trx_sysf_dummy_create(space, &mtr);
+
+ mtr_commit(&mtr);
+}
+
+/*********************************************************************
+Creates the rollback segments */
+UNIV_INTERN
+void
+trx_sys_create_rsegs(
+/*=================*/
+	ulint	n_rsegs)	/*!< in: number of rollback segments to create */
+{
+ ulint new_rsegs = 0;
+
+ /* Do not create additional rollback segments if
+ innodb_force_recovery has been set and the database
+ was not shutdown cleanly. */
+ if (!srv_force_recovery && !recv_needed_recovery) {
+ ulint i;
+
+ for (i = 0; i < n_rsegs; ++i) {
+
+ if (trx_rseg_create() != NULL) {
+ ++new_rsegs;
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (new_rsegs > 0) {
+ fprintf(stderr,
+ "InnoDB: %lu rollback segment(s) active.\n",
+ new_rsegs);
+ }
+}
+
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+ const byte* page) /*!< in: buffer containing the trx
+ system header page, i.e., page number
+ TRX_SYS_PAGE_NO in the tablespace */
+{
+ const trx_sysf_t* sys_header;
+
+ sys_header = page + TRX_SYS;
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ fprintf(stderr,
+ "ibbackup: Last MySQL binlog file position %lu %lu,"
+ " file name %s\n",
+ (ulong) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+ (ulong) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME);
+ }
+}
+
+
+/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE
+   (This code duplication should be fixed at some point!)
+*/
+
+#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
+/* The offset of the file format tag on the trx system header page */
+#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16)
+/* We use these random constants to reduce the probability of reading
+garbage (from previous versions) that maps to an actual format id. We
+use these as bit masks at the time of reading and writing from/to disk. */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL
+
+/* END OF COPIED DEFINITIONS */
+
+
+/*****************************************************************//**
+Reads the file format id from the first system table space file.
+Even if the call succeeds and returns TRUE, the returned format id
+may be ULINT_UNDEFINED signalling that the format id was not present
+in the data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+ const char *pathname, /*!< in: pathname of the first system
+ table space file */
+ ulint *format_id) /*!< out: file format of the system table
+ space */
+{
+ os_file_t file;
+ ibool success;
+ byte buf[UNIV_PAGE_SIZE * 2];
+ page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
+ const byte* ptr;
+ ib_id_t file_format_id;
+
+ *format_id = ULINT_UNDEFINED;
+
+ file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key,
+ pathname,
+ OS_FILE_OPEN,
+ OS_FILE_READ_ONLY,
+ &success
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" ibbackup: Error: trying to read system tablespace file format,\n"
+" ibbackup: but could not open the tablespace file %s!\n",
+ pathname
+ );
+ return(FALSE);
+ }
+
+ /* Read the page on which file format is stored */
+
+ success = os_file_read_no_error_handling(
+ file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" ibbackup: Error: trying to read system table space file format,\n"
+" ibbackup: but failed to read the tablespace file %s!\n",
+ pathname
+ );
+ os_file_close(file);
+ return(FALSE);
+ }
+ os_file_close(file);
+
+ /* get the file format from the page */
+ ptr = page + TRX_SYS_FILE_FORMAT_TAG;
+ file_format_id = mach_read_from_8(ptr);
+ file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+ if (file_format_id >= FILE_FORMAT_NAME_N) {
+
+ /* Either it has never been tagged, or garbage in it. */
+ return(TRUE);
+ }
+
+ *format_id = (ulint) file_format_id;
+
+ return(TRUE);
+}
+
+
+/*****************************************************************//**
+Reads the file format id from the given per-table data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+ const char *pathname, /*!< in: pathname of a per-table
+ datafile */
+ ulint *format_id) /*!< out: file format of the per-table
+ data file */
+{
+ os_file_t file;
+ ibool success;
+ byte buf[UNIV_PAGE_SIZE * 2];
+ page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
+ const byte* ptr;
+ ib_uint32_t flags;
+
+ *format_id = ULINT_UNDEFINED;
+
+ file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key,
+ pathname,
+ OS_FILE_OPEN,
+ OS_FILE_READ_ONLY,
+ &success
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" ibbackup: Error: trying to read per-table tablespace format,\n"
+" ibbackup: but could not open the tablespace file %s!\n",
+ pathname
+ );
+ return(FALSE);
+ }
+
+ /* Read the first page of the per-table datafile */
+
+ success = os_file_read_no_error_handling(
+ file, page, 0, 0, UNIV_PAGE_SIZE
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" ibbackup: Error: trying to per-table data file format,\n"
+" ibbackup: but failed to read the tablespace file %s!\n",
+ pathname
+ );
+ os_file_close(file);
+ return(FALSE);
+ }
+ os_file_close(file);
+
+ /* get the file format from the page */
+ ptr = page + 54;
+ flags = mach_read_from_4(ptr);
+ if (flags == 0) {
+ /* file format is Antelope */
+ *format_id = 0;
+ return (TRUE);
+ } else if (flags & 1) {
+ /* tablespace flags are ok */
+ *format_id = (flags / 32) % 128;
+ return (TRUE);
+ } else {
+ /* bad tablespace flags */
+ return(FALSE);
+ }
+}
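+
+/* The arithmetic above is a bit-field extraction written with integer
+division: (flags / 32) % 128 equals (flags >> 5) & 0x7F, i.e. bits
+5..11 of the tablespace flags word read from offset 54 (the FSP header
+flags field), while bit 0 merely marks the flags as valid. A standalone
+sketch (illustrative only, not part of the original source):
+
+	#include <stdint.h>
+
+	static uint32_t
+	flags_to_format_id(uint32_t flags)
+	{
+		return (flags >> 5) & 0x7F;	// == (flags / 32) % 128
+	}
+*/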
+
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id) /*!< in: id of the file format */
+{
+ if (!(id < FILE_FORMAT_NAME_N)) {
+ /* unknown id */
+ return ("Unknown");
+ }
+
+ return(file_format_name_map[id]);
+}
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************
+Shutdown/Close the transaction system. */
+UNIV_INTERN
+void
+trx_sys_close(void)
+/*===============*/
+{
+ trx_t* trx;
+ trx_rseg_t* rseg;
+ read_view_t* view;
+
+ ut_ad(trx_sys != NULL);
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+
+ /* Check that all read views are closed except read view owned
+ by a purge. */
+
+ if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
+ fprintf(stderr,
+ "InnoDB: Error: all read views were not closed"
+ " before shutdown:\n"
+ "InnoDB: %lu read views open \n",
+ UT_LIST_GET_LEN(trx_sys->view_list) - 1);
+ }
+
+ sess_close(trx_dummy_sess);
+ trx_dummy_sess = NULL;
+
+ trx_purge_sys_close();
+
+ mutex_enter(&kernel_mutex);
+
+ /* Free the double write data structures. */
+ ut_a(trx_doublewrite != NULL);
+ ut_free(trx_doublewrite->write_buf_unaligned);
+ trx_doublewrite->write_buf_unaligned = NULL;
+
+ mem_free(trx_doublewrite->buf_block_arr);
+ trx_doublewrite->buf_block_arr = NULL;
+
+ mutex_free(&trx_doublewrite->mutex);
+ mem_free(trx_doublewrite);
+ trx_doublewrite = NULL;
+
+ /* Only prepared transactions may be left in the system. Free them. */
+ ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == trx_n_prepared);
+
+ while ((trx = UT_LIST_GET_FIRST(trx_sys->trx_list)) != NULL) {
+ trx_free_prepared(trx);
+ }
+
+ /* There can't be any active transactions. */
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ while (rseg != NULL) {
+ trx_rseg_t* prev_rseg = rseg;
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg);
+ UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg);
+
+ trx_rseg_mem_free(prev_rseg);
+ }
+
+ view = UT_LIST_GET_FIRST(trx_sys->view_list);
+
+ while (view != NULL) {
+ read_view_t* prev_view = view;
+
+ view = UT_LIST_GET_NEXT(view_list, prev_view);
+
+ /* Views are allocated from the trx_sys->global_read_view_heap.
+ So, we simply remove the element here. */
+ UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
+
+ ut_ad(trx_sys->descr_n_used == 0);
+ ut_free(trx_sys->descriptors);
+
+ mem_free(trx_sys);
+
+ trx_sys = NULL;
+ mutex_exit(&kernel_mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c
new file mode 100644
index 00000000000..09f425cfa55
--- /dev/null
+++ b/storage/xtradb/trx/trx0trx.c
@@ -0,0 +1,2449 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.c
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#include "trx0undo.h"
+#include "trx0rseg.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "read0read.h"
+#include "srv0srv.h"
+#include "btr0sea.h"
+#include "os0proc.h"
+#include "trx0xa.h"
+#include "trx0purge.h"
+#include "ha_prototypes.h"
+
+/** Dummy session used currently in MySQL interface */
+UNIV_INTERN sess_t* trx_dummy_sess = NULL;
+
+/** Number of transactions currently allocated for MySQL: protected by
+the kernel mutex */
+UNIV_INTERN ulint trx_n_mysql_transactions = 0;
+/** Number of transactions currently in the XA PREPARED state: protected by
+the kernel mutex */
+UNIV_INTERN ulint trx_n_prepared = 0;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg) /*!< in: detailed error message */
+{
+ ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file) /*!< in: file to read message from */
+{
+ os_file_read_string(file, trx->detailed_error,
+ sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Callback function for trx_find_descriptor() to compare trx IDs. */
+UNIV_INTERN
+int
+trx_descr_cmp(
+/*==========*/
+ const void *a, /*!< in: pointer to first comparison argument */
+ const void *b) /*!< in: pointer to second comparison argument */
+{
+ const trx_id_t* da = (const trx_id_t*) a;
+ const trx_id_t* db = (const trx_id_t*) b;
+
+ if (*da < *db) {
+ return -1;
+ } else if (*da > *db) {
+ return 1;
+ }
+
+ return 0;
+}
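+
+/* A minimal sketch (illustrative only; the real definition lives in the
+trx0trx headers) of how trx_find_descriptor() can locate a slot with
+bsearch() using the comparator above. trx_reserve_descriptor() keeps the
+descriptors array sorted in ascending trx id order, which is what makes
+the binary search valid. */
+#if 0
+static trx_id_t*
+trx_find_descriptor_sketch(
+	const trx_id_t*	descriptors,	/* in: sorted array of trx ids */
+	ulint		n_descr,	/* in: number of used slots */
+	trx_id_t	trx_id)		/* in: trx id to look for */
+{
+	return((trx_id_t*) bsearch(&trx_id, descriptors, n_descr,
+				   sizeof(trx_id_t), trx_descr_cmp));
+}
+#endif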
+
+/*************************************************************//**
+Reserve a slot for a given trx in the global descriptors array. */
+UNIV_INLINE
+void
+trx_reserve_descriptor(
+/*===================*/
+ const trx_t* trx) /*!< in: trx pointer */
+{
+ ulint n_used;
+ ulint n_max;
+ trx_id_t* descr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(!trx_find_descriptor(trx_sys->descriptors,
+ trx_sys->descr_n_used,
+ trx->id));
+
+ n_used = trx_sys->descr_n_used + 1;
+ n_max = trx_sys->descr_n_max;
+
+ if (UNIV_UNLIKELY(n_used > n_max)) {
+
+ n_max = n_max * 2;
+
+ trx_sys->descriptors =
+ ut_realloc(trx_sys->descriptors,
+ n_max * sizeof(trx_id_t));
+
+ trx_sys->descr_n_max = n_max;
+ srv_descriptors_memory = n_max * sizeof(trx_id_t);
+ }
+
+ descr = trx_sys->descriptors + n_used - 1;
+
+ if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) {
+
+ /* Find the slot where it should be inserted. We could use a
+ binary search, but in reality linear search should be faster,
+ because the slot we are looking for is near the array end. */
+
+ trx_id_t* tdescr;
+
+ for (tdescr = descr - 1;
+ tdescr >= trx_sys->descriptors && *tdescr > trx->id;
+ tdescr--) {
+ }
+
+ tdescr++;
+
+ ut_memmove(tdescr + 1, tdescr, (descr - tdescr) *
+ sizeof(trx_id_t));
+
+ descr = tdescr;
+ }
+
+ *descr = trx->id;
+
+ trx_sys->descr_n_used = n_used;
+}
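+
+/* Worked example (illustrative only): with descriptors = {2, 5, 9} and
+descr_n_used = 3, reserving a slot for trx id 7 scans back from the end,
+ut_memmove()s the tail {9} one slot to the right and stores 7 in the gap,
+leaving {2, 5, 7, 9} with descr_n_used = 4. */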
+
+/*************************************************************//**
+Release a slot for a given trx in the global descriptors array. */
+UNIV_INTERN
+void
+trx_release_descriptor(
+/*===================*/
+ trx_t* trx) /*!< in: trx pointer */
+{
+ ulint size;
+ trx_id_t* descr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (UNIV_LIKELY(trx->is_in_trx_serial_list)) {
+
+ UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list,
+ trx);
+ trx->is_in_trx_serial_list = 0;
+ }
+
+ descr = trx_find_descriptor(trx_sys->descriptors,
+ trx_sys->descr_n_used,
+ trx->id);
+
+ if (UNIV_UNLIKELY(descr == NULL)) {
+
+ return;
+ }
+
+ size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) *
+ sizeof(trx_id_t);
+
+ if (UNIV_LIKELY(size > 0)) {
+
+ ut_memmove(descr, descr + 1, size);
+ }
+
+ trx_sys->descr_n_used--;
+}
+
+/****************************************************************//**
+Creates and initializes a transaction object.
+@return own: the transaction */
+UNIV_INTERN
+trx_t*
+trx_create(
+/*=======*/
+ sess_t* sess) /*!< in: session */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(sess);
+
+ trx = mem_alloc(sizeof(trx_t));
+
+ trx->magic_n = TRX_MAGIC_N;
+
+ trx->op_info = "";
+
+ trx->is_purge = 0;
+ trx->is_recovered = 0;
+ trx->state = TRX_NOT_STARTED;
+
+ trx->is_registered = 0;
+ trx->owns_prepare_mutex = 0;
+ trx->called_commit_ordered = 0;
+
+ trx->start_time = ut_time();
+
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ trx->id = 0;
+ trx->no = IB_ULONGLONG_MAX;
+ trx->is_in_trx_serial_list = 0;
+
+ trx->support_xa = TRUE;
+
+ trx->fake_changes = FALSE;
+
+ trx->check_foreigns = TRUE;
+ trx->check_unique_secondary = TRUE;
+
+ trx->flush_log_later = FALSE;
+ trx->must_flush_log_later = FALSE;
+
+ trx->dict_operation = TRX_DICT_OP_NONE;
+ trx->table_id = 0;
+
+ trx->mysql_thd = NULL;
+ trx->duplicates = 0;
+
+ trx->n_mysql_tables_in_use = 0;
+ trx->mysql_n_tables_locked = 0;
+
+ trx->mysql_log_file_name = NULL;
+ trx->mysql_log_offset = 0;
+ trx->mysql_master_log_file_name = "";
+ trx->mysql_master_log_pos = 0;
+ trx->mysql_relay_log_file_name = "";
+ trx->mysql_relay_log_pos = 0;
+
+ trx->idle_start = 0;
+ trx->last_stmt_start = 0;
+
+ mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
+
+ trx->rseg = NULL;
+
+ trx->undo_no = 0;
+ trx->last_sql_stat_start.least_undo_no = 0;
+ trx->insert_undo = NULL;
+ trx->update_undo = NULL;
+ trx->undo_no_arr = NULL;
+
+ trx->error_state = DB_SUCCESS;
+ trx->error_key_num = 0;
+ trx->detailed_error[0] = '\0';
+
+ trx->sess = sess;
+ trx->que_state = TRX_QUE_RUNNING;
+ trx->n_active_thrs = 0;
+
+ trx->handling_signals = FALSE;
+
+ UT_LIST_INIT(trx->signals);
+ UT_LIST_INIT(trx->reply_signals);
+
+ trx->graph = NULL;
+
+ trx->wait_lock = NULL;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ UT_LIST_INIT(trx->wait_thrs);
+
+ trx->lock_heap = mem_heap_create_in_buffer(256);
+ UT_LIST_INIT(trx->trx_locks);
+
+ UT_LIST_INIT(trx->trx_savepoints);
+
+ trx->dict_operation_lock_mode = 0;
+ trx->has_search_latch = FALSE;
+ trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+
+ trx->declared_to_be_inside_innodb = FALSE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ trx->global_read_view = NULL;
+ trx->read_view = NULL;
+ trx->prebuilt_view = NULL;
+
+ trx->io_reads = 0;
+ trx->io_read = 0;
+ trx->io_reads_wait_timer = 0;
+ trx->lock_que_wait_timer = 0;
+ trx->innodb_que_wait_timer = 0;
+ trx->distinct_page_access = 0;
+ trx->distinct_page_access_hash = NULL;
+ trx->take_stats = FALSE;
+
+ /* Set X/Open XA transaction identification to NULL */
+ memset(&trx->xid, 0, sizeof(trx->xid));
+ trx->xid.formatID = -1;
+
+ trx->n_autoinc_rows = 0;
+
+ /* Remember to free the vector explicitly. */
+ trx->autoinc_locks = ib_vector_create(
+ mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4);
+
+ return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void)
+/*========================*/
+{
+ trx_t* trx;
+
+ mutex_enter(&kernel_mutex);
+
+ trx = trx_create(trx_dummy_sess);
+
+ trx_n_mysql_transactions++;
+
+ UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+ mutex_exit(&kernel_mutex);
+
+ if (UNIV_UNLIKELY(trx->take_stats)) {
+ trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
+ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
+ }
+
+ return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void)
+/*=============================*/
+{
+ trx_t* trx;
+
+ mutex_enter(&kernel_mutex);
+
+ trx = trx_create(trx_dummy_sess);
+
+ mutex_exit(&kernel_mutex);
+
+ return(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object. */
+UNIV_INTERN
+void
+trx_free(
+/*=====*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (trx->declared_to_be_inside_innodb) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: Freeing a trx which is declared"
+ " to be processing\n"
+ "InnoDB: inside InnoDB.\n", stderr);
+ trx_print(stderr, trx, 600);
+ putc('\n', stderr);
+
+ /* This is an error but not a fatal error. We must keep
+ the counters like srv_conc_n_threads accurate. */
+ srv_conc_force_exit_innodb(trx);
+ }
+
+ if (trx->n_mysql_tables_in_use != 0
+ || trx->mysql_n_tables_locked != 0) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: MySQL is freeing a thd\n"
+ "InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
+ "InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
+ (ulong)trx->n_mysql_tables_in_use,
+ (ulong)trx->mysql_n_tables_locked);
+
+ trx_print(stderr, trx, 600);
+
+ ut_print_buf(stderr, trx, sizeof(trx_t));
+ putc('\n', stderr);
+ }
+
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ trx->magic_n = 11112222;
+
+ ut_a(trx->state == TRX_NOT_STARTED);
+
+ mutex_free(&(trx->undo_mutex));
+
+ ut_a(trx->insert_undo == NULL);
+ ut_a(trx->update_undo == NULL);
+
+ if (trx->undo_no_arr) {
+ trx_undo_arr_free(trx->undo_no_arr);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+ ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
+
+ ut_a(trx->wait_lock == NULL);
+ ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+ ut_a(!trx->has_search_latch);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!btr_search_own_any());
+#endif
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ if (trx->lock_heap) {
+ mem_heap_free(trx->lock_heap);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
+
+ if (trx->prebuilt_view != NULL) {
+ read_view_free(trx->prebuilt_view);
+ }
+
+ ut_a(trx->read_view == NULL);
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(trx->autoinc_locks);
+
+ trx_release_descriptor(trx);
+
+ mem_free(trx);
+}
+
+/********************************************************************//**
+At shutdown, frees a transaction object that is in the PREPARED state. */
+UNIV_INTERN
+void
+trx_free_prepared(
+/*==============*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_a(trx->state == TRX_PREPARED);
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ /* Prepared transactions are sort of active; they allow
+	ROLLBACK and COMMIT operations. Because the system contains
+	no transactions other than prepared transactions at
+ the shutdown stage and because a transaction cannot become
+ PREPARED while holding locks, it is safe to release the locks
+ held by PREPARED transactions here at shutdown.*/
+ lock_release_off_kernel(trx);
+
+ trx_undo_free_prepared(trx);
+
+ mutex_free(&trx->undo_mutex);
+
+ if (trx->undo_no_arr) {
+ trx_undo_arr_free(trx->undo_no_arr);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+ ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
+
+ ut_a(trx->wait_lock == NULL);
+ ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+ ut_a(!trx->has_search_latch);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!btr_search_own_any());
+#endif
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ if (trx->lock_heap) {
+ mem_heap_free(trx->lock_heap);
+ }
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ib_vector_free(trx->autoinc_locks);
+
+ trx_release_descriptor(trx);
+
+ if (trx->prebuilt_view != NULL) {
+ read_view_free(trx->prebuilt_view);
+ }
+
+ UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+
+ ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list));
+
+ mem_free(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ if (trx->distinct_page_access_hash)
+ {
+ mem_free(trx->distinct_page_access_hash);
+ trx->distinct_page_access_hash= NULL;
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+ trx_free(trx);
+
+ ut_a(trx_n_mysql_transactions > 0);
+
+ trx_n_mysql_transactions--;
+
+ mutex_exit(&kernel_mutex);
+}
+
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ if (trx->distinct_page_access_hash)
+ {
+ mem_free(trx->distinct_page_access_hash);
+ trx->distinct_page_access_hash= NULL;
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ trx_free(trx);
+
+ mutex_exit(&kernel_mutex);
+}
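+
+/* Illustrative sketch (not a caller from the source tree; error handling
+omitted) of the allocate/start/commit/free life cycle of a background
+transaction using the functions in this file. */
+#if 0
+static void
+trx_background_lifecycle_sketch(void)
+{
+	trx_t*	trx;
+
+	trx = trx_allocate_for_background();
+
+	trx_start(trx, ULINT_UNDEFINED);	/* rseg chosen round-robin */
+
+	/* ... do work on behalf of the master thread ... */
+
+	trx_commit_for_mysql(trx);
+
+	trx_free_for_background(trx);
+}
+#endif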
+
+/****************************************************************//**
+Inserts the trx handle in the trx system trx list in the right position.
+The list is sorted on the trx id so that the biggest id is at the list
+start. This function is used at the database startup to insert incomplete
+transactions to the list. */
+static
+void
+trx_list_insert_ordered(
+/*====================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ trx_t* trx2;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx2 != NULL) {
+ if (trx->id >= trx2->id) {
+
+ ut_ad(trx->id > trx2->id);
+ break;
+ }
+ trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
+ }
+
+ if (trx2 != NULL) {
+ trx2 = UT_LIST_GET_PREV(trx_list, trx2);
+
+ if (trx2 == NULL) {
+ UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
+ } else {
+ UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
+ trx2, trx);
+ }
+ } else {
+ UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
+ }
+}
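+
+/* Worked example (illustrative only): with trx_list = (9, 5, 2), inserting
+a trx with id 7 stops the scan at 5 (the first id <= 7), steps back to 9
+and inserts after it, giving (9, 7, 5, 2): the list stays sorted in
+descending id order. */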
+
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void)
+/*============================*/
+{
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+ trx_t* trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ UT_LIST_INIT(trx_sys->trx_list);
+ UT_LIST_INIT(trx_sys->trx_serial_list);
+
+	/* Look through the rollback segments to see if there exist
+	undo logs for transactions. */
+
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ while (rseg != NULL) {
+ undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
+
+ while (undo != NULL) {
+
+ trx = trx_create(trx_dummy_sess);
+
+ trx->is_recovered = TRUE;
+ trx->id = undo->trx_id;
+ trx->xid = undo->xid;
+ trx->insert_undo = undo;
+ trx->rseg = rseg;
+
+ if (undo->state != TRX_UNDO_ACTIVE) {
+
+ /* Prepared transactions are left in
+ the prepared state waiting for a
+ commit or abort decision from MySQL */
+
+ if (undo->state == TRX_UNDO_PREPARED) {
+
+ fprintf(stderr,
+ "InnoDB: Transaction "
+ TRX_ID_FMT
+ " was in the"
+ " XA prepared state.\n",
+ (ullint) trx->id);
+
+ if (srv_force_recovery == 0) {
+
+ trx->state = TRX_PREPARED;
+ trx_n_prepared++;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Since"
+ " innodb_force_recovery"
+ " > 0, we will"
+						" roll it back"
+ " anyway.\n");
+
+ trx->state = TRX_ACTIVE;
+ }
+
+ trx_reserve_descriptor(trx);
+ } else {
+ trx->state = TRX_COMMITTED_IN_MEMORY;
+ }
+
+ /* We give a dummy value for the trx no;
+ this should have no relevance since purge
+ is not interested in committed transaction
+ numbers, unless they are in the history
+			list, in which case it looks up the number
+			from the disk-based undo log structure */
+
+ trx->no = trx->id;
+ } else {
+ trx->state = TRX_ACTIVE;
+
+ /* A running transaction always has the number
+			field initialized to IB_ULONGLONG_MAX */
+
+ trx->no = IB_ULONGLONG_MAX;
+
+ trx_reserve_descriptor(trx);
+
+ }
+
+ if (undo->dict_operation) {
+ trx_set_dict_operation(
+ trx, TRX_DICT_OP_TABLE);
+ trx->table_id = undo->table_id;
+ }
+
+ if (!undo->empty) {
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+
+ trx_list_insert_ordered(trx);
+
+ undo = UT_LIST_GET_NEXT(undo_list, undo);
+ }
+
+ undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
+
+ while (undo != NULL) {
+ trx = trx_get_on_id(undo->trx_id);
+
+ if (NULL == trx) {
+ trx = trx_create(trx_dummy_sess);
+
+ trx->is_recovered = TRUE;
+ trx->id = undo->trx_id;
+ trx->xid = undo->xid;
+
+ if (undo->state != TRX_UNDO_ACTIVE) {
+
+ /* Prepared transactions are left in
+ the prepared state waiting for a
+ commit or abort decision from MySQL */
+
+ if (undo->state == TRX_UNDO_PREPARED) {
+ fprintf(stderr,
+ "InnoDB: Transaction "
+ TRX_ID_FMT " was in the"
+ " XA prepared state.\n",
+ (ullint) trx->id);
+
+ if (srv_force_recovery == 0) {
+
+ trx->state
+ = TRX_PREPARED;
+ trx_n_prepared++;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Since"
+ " innodb_force_recovery"
+ " > 0, we will"
+							" roll it back"
+ " anyway.\n");
+
+ trx->state = TRX_ACTIVE;
+ trx_reserve_descriptor(
+ trx);
+ }
+ } else {
+ trx->state
+ = TRX_COMMITTED_IN_MEMORY;
+ }
+
+ /* We give a dummy value for the trx
+ number */
+
+ trx->no = trx->id;
+ } else {
+ trx->state = TRX_ACTIVE;
+ /* A running transaction always has
+				the number field initialized to
+ IB_ULONGLONG_MAX */
+
+ trx->no = IB_ULONGLONG_MAX;
+
+ trx_reserve_descriptor(trx);
+ }
+
+ trx->rseg = rseg;
+ trx_list_insert_ordered(trx);
+
+ if (undo->dict_operation) {
+ trx_set_dict_operation(
+ trx, TRX_DICT_OP_TABLE);
+ trx->table_id = undo->table_id;
+ }
+ }
+
+ trx->update_undo = undo;
+
+ if ((!undo->empty)
+ && undo->top_undo_no >= trx->undo_no) {
+
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+
+ undo = UT_LIST_GET_NEXT(undo_list, undo);
+ }
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+ }
+}
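+
+/* Summary of the undo state -> trx state mapping applied above:
+TRX_UNDO_ACTIVE becomes TRX_ACTIVE (to be rolled back); TRX_UNDO_PREPARED
+becomes TRX_PREPARED, or TRX_ACTIVE when innodb_force_recovery > 0 so that
+the trx is rolled back anyway; any other undo state becomes
+TRX_COMMITTED_IN_MEMORY, where only cleanup remains to be done. */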
+
+/******************************************************************//**
+Assigns a rollback segment to a transaction in a round-robin fashion.
+@return assigned rollback segment instance */
+UNIV_INLINE
+trx_rseg_t*
+trx_assign_rseg(
+/*============*/
+ ulint max_undo_logs) /*!< in: maximum number of UNDO logs to use */
+{
+ trx_rseg_t* rseg = trx_sys->latest_rseg;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+
+ if (rseg == NULL || rseg->id == max_undo_logs - 1) {
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+ }
+
+ trx_sys->latest_rseg = rseg;
+
+ return(rseg);
+}
+
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE */
+UNIV_INTERN
+ibool
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+{
+ trx_rseg_t* rseg;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->rseg == NULL);
+
+ if (trx->is_purge) {
+ trx->id = 0;
+ /* Don't reserve a descriptor, since this trx is not added to
+ trx_list. */
+ trx->state = TRX_ACTIVE;
+ trx->start_time = time(NULL);
+
+ return(TRUE);
+ }
+
+ ut_ad(trx->state != TRX_ACTIVE);
+
+ ut_a(rseg_id == ULINT_UNDEFINED);
+
+ rseg = trx_assign_rseg(srv_rollback_segments);
+
+ trx->id = trx_sys_get_new_trx_id();
+
+	/* The initial value IB_ULONGLONG_MAX for trx->no is used in
+	read_view_open_now(). */
+
+ trx->no = IB_ULONGLONG_MAX;
+
+ trx->rseg = rseg;
+
+ trx->state = TRX_ACTIVE;
+
+ trx_reserve_descriptor(trx);
+
+ trx->start_time = time(NULL);
+
+ UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
+
+ return(TRUE);
+}
+
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE */
+UNIV_INTERN
+ibool
+trx_start(
+/*======*/
+ trx_t* trx, /*!< in: transaction */
+ ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+{
+ ibool ret;
+
+	/* Update the info on whether we should skip XA steps that eat
+	CPU time. For the duration of the transaction trx->support_xa is
+	not reread
+ from thd so any changes in the value take effect in the next
+ transaction. This is to avoid a scenario where some undo
+ generated by a transaction, has XA stuff, and other undo,
+ generated by the same transaction, doesn't. */
+ trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+ mutex_enter(&kernel_mutex);
+
+ ret = trx_start_low(trx, rseg_id);
+
+ mutex_exit(&kernel_mutex);
+
+ return(ret);
+}
+
+/****************************************************************//**
+Set the transaction serialisation number. */
+static
+void
+trx_serialisation_number_get(
+/*=========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_rseg_t* rseg;
+
+ rseg = trx->rseg;
+
+ ut_ad(mutex_own(&rseg->mutex));
+
+ mutex_enter(&kernel_mutex);
+
+ trx->no = trx_sys_get_new_trx_id();
+
+ if (UNIV_LIKELY(trx->is_in_trx_serial_list == 0)) {
+
+ UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list,
+ trx);
+
+ trx->is_in_trx_serial_list = 1;
+ }
+
+	/* If the rollback segment is not empty then the
+ new trx_t::no can't be less than any trx_t::no
+ already in the rollback segment. User threads only
+ produce events when a rollback segment is empty. */
+
+ if (rseg->last_page_no == FIL_NULL) {
+ void* ptr;
+ rseg_queue_t rseg_queue;
+
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = trx->no;
+
+ mutex_enter(&purge_sys->bh_mutex);
+
+ /* This is to reduce the pressure on the kernel mutex,
+ though in reality it should make very little (read no)
+ difference because this code path is only taken when the
+	rollback segment is empty. */
+
+ mutex_exit(&kernel_mutex);
+
+ ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
+ ut_a(ptr);
+
+ mutex_exit(&purge_sys->bh_mutex);
+ } else {
+ mutex_exit(&kernel_mutex);
+ }
+}
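+
+/* Note (summary, not new behaviour): the (rseg, trx_no) pair pushed above
+feeds the purge min binary heap purge_sys->ib_bh, which lets purge process
+undo logs in ascending trx_no order across rollback segments. A pair is
+pushed only when the rseg was empty, because otherwise the new trx_no
+cannot be the minimum for that rseg. */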
+
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment.
+@return the LSN of the UNDO log write. */
+static
+ib_uint64_t
+trx_write_serialisation_history(
+/*============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ mtr_t mtr;
+ trx_rseg_t* rseg;
+ trx_sysf_t* sys_header = NULL;
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ rseg = trx->rseg;
+
+ mtr_start(&mtr);
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to some other state: these modifications to the file data
+ structure define the transaction as committed in the file
+ based domain, at the serialization point of the log sequence
+ number lsn obtained below. */
+
+ if (trx->update_undo != NULL) {
+ page_t* undo_hdr_page;
+ trx_undo_t* undo = trx->update_undo;
+
+ /* We have to hold the rseg mutex because update
+ log headers have to be put to the history list in the
+ (serialisation) order of the UNDO trx number. This is
+ required for the purge in-memory data structures too. */
+
+ mutex_enter(&rseg->mutex);
+
+ /* Assign the transaction serialisation number and also
+ update the purge min binary heap if this is the first
+ UNDO log being written to the assigned rollback segment. */
+
+ trx_serialisation_number_get(trx);
+
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction commit for this transaction. */
+
+ undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr);
+
+ trx_undo_update_cleanup(trx, undo_hdr_page, &mtr);
+ } else {
+ mutex_enter(&rseg->mutex);
+ }
+
+ if (trx->insert_undo != NULL) {
+ trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ /* Update the latest MySQL binlog name and offset info
+ in trx sys header if MySQL binlogging is on or the database
+ server is a MySQL replication slave */
+
+ if (trx->mysql_log_file_name
+ && trx->mysql_log_file_name[0] != '\0') {
+ if (!sys_header) {
+ sys_header = trx_sysf_get(&mtr);
+ }
+
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_log_file_name,
+ trx->mysql_log_offset,
+ TRX_SYS_MYSQL_LOG_INFO, &mtr);
+
+ trx->mysql_log_file_name = NULL;
+ }
+
+ if (trx->mysql_master_log_file_name[0] != '\0') {
+ /* This database server is a MySQL replication slave */
+ if (!sys_header) {
+ sys_header = trx_sysf_get(&mtr);
+ }
+
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_relay_log_file_name,
+ trx->mysql_relay_log_pos,
+ TRX_SYS_COMMIT_RELAY_LOG_INFO, &mtr);
+
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_master_log_file_name,
+ trx->mysql_master_log_pos,
+ TRX_SYS_COMMIT_MASTER_LOG_INFO, &mtr);
+
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_relay_log_file_name,
+ trx->mysql_relay_log_pos,
+ TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr);
+
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_master_log_file_name,
+ trx->mysql_master_log_pos,
+ TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
+
+ trx->mysql_master_log_file_name = "";
+ }
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this
+ log sequence number. The transaction becomes 'durable' when
+ we write the log to disk, but in the logical sense the commit
+ in the file-based data structures (undo logs etc.) happens
+ here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come
+ in exactly the same order as commit lsn's, if the transactions
+ have different rollback segments. To get exactly the same
+ order we should hold the kernel mutex up to this point,
+ adding to the contention of the kernel mutex. However, if
+ a transaction T2 is able to see modifications made by
+ a transaction T1, T2 will always get a bigger transaction
+ number and a bigger commit lsn than T1. */
+
+ /*--------------*/
+ mtr_commit(&mtr);
+ /*--------------*/
+
+ return(mtr.end_lsn);
+}
+
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit_off_kernel(
+/*==================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ib_uint64_t lsn;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx->must_flush_log_later = FALSE;
+
+ /* If the transaction made any updates then we need to write the
+ UNDO logs for the updates to the assigned rollback segment. */
+
+ if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+ mutex_exit(&kernel_mutex);
+
+ lsn = trx_write_serialisation_history(trx);
+
+ mutex_enter(&kernel_mutex);
+ } else {
+ lsn = 0;
+ }
+
+ ut_ad(trx->state == TRX_ACTIVE || trx->state == TRX_PREPARED);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (UNIV_UNLIKELY(trx->state == TRX_PREPARED)) {
+ ut_a(trx_n_prepared > 0);
+ trx_n_prepared--;
+ }
+
+ /* The following assignment makes the transaction committed in memory
+ and makes its changes to data visible to other transactions.
+ NOTE that there is a small discrepancy from the strict formal
+ visibility rules here: a human user of the database can see
+ modifications made by another transaction T even before the necessary
+ log segment has been flushed to the disk. If the database happens to
+ crash before the flush, the user has seen modifications from T which
+ will never be a committed transaction. However, any transaction T2
+ which sees the modifications of the committing transaction T, and
+ which also itself makes modifications to the database, will get an lsn
+ larger than the committing transaction T. In the case where the log
+ flush fails, and T never gets committed, also T2 will never get
+ committed. */
+
+ /*--------------------------------------*/
+ trx->state = TRX_COMMITTED_IN_MEMORY;
+ /* The following also removes trx from trx_serial_list */
+ trx_release_descriptor(trx);
+ /*--------------------------------------*/
+
+ /* If we release kernel_mutex below and we are still doing
+	recovery, i.e. the background rollback thread is still active,
+	then there is a chance that the rollback thread may see
+	this trx as COMMITTED_IN_MEMORY and go ahead to clean it
+	up by calling trx_cleanup_at_db_startup(). This can happen
+	when we are committing a trx here that was left in the
+	PREPARED state during the crash. Note that the commit or
+	rollback of a PREPARED trx happens in the recovery thread
+	while the rollback of other transactions happens in the
+	background thread. To avoid this race we unconditionally
+	clear the is_recovered flag of the trx. */
+
+ trx->is_recovered = FALSE;
+
+ lock_release_off_kernel(trx);
+
+ if (trx->global_read_view) {
+ read_view_close(trx->global_read_view);
+ trx->global_read_view = NULL;
+ }
+
+ trx->read_view = NULL;
+
+ if (lsn) {
+ ulint flush_log_at_trx_commit;
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx->insert_undo != NULL) {
+
+ trx_undo_insert_cleanup(trx);
+ }
+
+ if (srv_use_global_flush_log_at_trx_commit) {
+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
+ } else {
+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
+ }
+
+		/* NOTE that we could possibly make a group commit more
+		efficient here: call os_thread_yield here to allow other
+		trxs to reach their commit as well! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if
+ the OS does not crash. We may also flush the log files to
+ disk, making the transaction durable also at an OS crash or a
+ power outage.
+
+ The idea in InnoDB's group commit is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which commits the whole
+ group. Note that this group commit will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ If we are calling trx_commit() under prepare_commit_mutex, we
+ will delay possible log write and flush to a separate function
+ trx_commit_complete_for_mysql(), which is only called when the
+ thread has released the mutex. This is to make the
+		group commit algorithm work. Otherwise, the prepare_commit
+ mutex would serialize all commits and prevent a group of
+ transactions from gathering. */
+
+ if (trx->flush_log_later) {
+ /* Do nothing yet */
+ trx->must_flush_log_later = TRUE;
+ } else if (flush_log_at_trx_commit == 0) {
+ /* Do nothing */
+ } else if (flush_log_at_trx_commit == 1) {
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+ FALSE);
+ } else {
+ /* Write the log to the log files AND flush
+ them to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ }
+ } else if (flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ ut_error;
+ }
+
+ trx->commit_lsn = lsn;
+
+ /*-------------------------------------*/
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ /* Free all savepoints */
+ trx_roll_free_all_savepoints(trx);
+
+ trx->state = TRX_NOT_STARTED;
+ trx->rseg = NULL;
+ trx->undo_no = 0;
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
+
+ UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+
+ ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list));
+
+ trx->error_state = DB_SUCCESS;
+}
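+
+/* A minimal sketch (hypothetical helper, not part of this file) of the
+innodb_flush_log_at_trx_commit handling that is repeated above and in
+trx_commit_complete_for_mysql() and trx_prepare_off_kernel(). */
+#if 0
+static void
+trx_flush_log_sketch(
+	ib_uint64_t	lsn,	/* in: lsn up to which to write the log */
+	ulint		flush_log_at_trx_commit)
+{
+	if (flush_log_at_trx_commit == 0) {
+		/* Do nothing: rely on the periodic background flush */
+	} else if (flush_log_at_trx_commit == 1) {
+		/* Write the log, and flush it to disk unless fsync
+		is disabled by the file flush method */
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+				srv_unix_file_flush_method
+				!= SRV_UNIX_NOSYNC);
+	} else if (flush_log_at_trx_commit == 2) {
+		/* Write the log but do not flush it to disk */
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+	} else {
+		ut_error;
+	}
+}
+#endif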
+
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ if (trx->insert_undo != NULL) {
+
+ trx_undo_insert_cleanup(trx);
+ }
+
+ trx->state = TRX_NOT_STARTED;
+
+ /* This code is executed in a single threaded context, but we acquire
+ kernel_mutex to satisfy a debug assertion in
+ trx_release_descriptor(). */
+
+ mutex_enter(&kernel_mutex);
+ trx_release_descriptor(trx);
+ mutex_exit(&kernel_mutex);
+
+ trx->rseg = NULL;
+ trx->undo_no = 0;
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+
+ ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list));
+}
+
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a new started transaction.
+@return consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+ trx_t* trx) /*!< in: active transaction */
+{
+ ut_ad(trx->state == TRX_ACTIVE);
+
+ if (trx->read_view) {
+ return(trx->read_view);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ trx->read_view = read_view_open_now(trx->id, trx->prebuilt_view, TRUE);
+ trx->prebuilt_view = trx->read_view;
+ trx->global_read_view = trx->read_view;
+
+ mutex_exit(&kernel_mutex);
+
+ return(trx->read_view);
+}
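+
+/* Illustrative note: a second call within the same transaction returns the
+cached trx->read_view, so trx_assign_read_view(trx) ==
+trx_assign_read_view(trx) holds; this is what gives REPEATABLE READ its
+stable snapshot. */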
+
+/****************************************************************//**
+Commits a transaction. NOTE that the kernel mutex is temporarily released. */
+static
+void
+trx_handle_commit_sig_off_kernel(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+{
+ trx_sig_t* sig;
+ trx_sig_t* next_sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx->que_state = TRX_QUE_COMMITTING;
+
+ trx_commit_off_kernel(trx);
+
+ ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+ /* Remove all TRX_SIG_COMMIT signals from the signal queue and send
+ reply messages to them */
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ while (sig != NULL) {
+ next_sig = UT_LIST_GET_NEXT(signals, sig);
+
+ if (sig->type == TRX_SIG_COMMIT) {
+
+ trx_sig_reply(sig, next_thr);
+ trx_sig_remove(trx, sig);
+ }
+
+ sig = next_sig;
+ }
+
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
+the TRX_QUE_RUNNING state and releases query threads which were
+waiting for a lock in the wait_thrs list. */
+UNIV_INTERN
+void
+trx_end_lock_wait(
+/*==============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ que_thr_t* thr;
+ ulint sec;
+ ulint ms;
+ ib_uint64_t now;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
+
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+
+ while (thr != NULL) {
+ que_thr_end_wait_no_next_thr(thr);
+
+ UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
+
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+ }
+
+ if (UNIV_UNLIKELY(trx->take_stats)) {
+ ut_usectime(&sec, &ms);
+ now = (ib_uint64_t)sec * 1000000 + ms;
+ trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted);
+ }
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+Moves the query threads in the lock wait list to the SUSPENDED state and puts
+the transaction to the TRX_QUE_RUNNING state. */
+static
+void
+trx_lock_wait_to_suspended(
+/*=======================*/
+ trx_t* trx) /*!< in: transaction in the TRX_QUE_LOCK_WAIT state */
+{
+ que_thr_t* thr;
+ ulint sec;
+ ulint ms;
+ ib_uint64_t now;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
+
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+
+ while (thr != NULL) {
+ thr->state = QUE_THR_SUSPENDED;
+
+ UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
+
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+ }
+
+ if (UNIV_UNLIKELY(trx->take_stats)) {
+ ut_usectime(&sec, &ms);
+ now = (ib_uint64_t)sec * 1000000 + ms;
+ trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted);
+ }
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+Moves the query threads in the sig reply wait list of trx to the SUSPENDED
+state. */
+static
+void
+trx_sig_reply_wait_to_suspended(
+/*============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_sig_t* sig;
+ que_thr_t* thr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ sig = UT_LIST_GET_FIRST(trx->reply_signals);
+
+ while (sig != NULL) {
+ thr = sig->receiver;
+
+ ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
+
+ thr->state = QUE_THR_SUSPENDED;
+
+ sig->receiver = NULL;
+
+ UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
+
+ sig = UT_LIST_GET_FIRST(trx->reply_signals);
+ }
+}
+
+/*****************************************************************//**
+Checks the compatibility of a new signal with the other signals in the
+queue.
+@return TRUE if the signal can be queued */
+static
+ibool
+trx_sig_is_compatible(
+/*==================*/
+ trx_t* trx, /*!< in: trx handle */
+ ulint type, /*!< in: signal type */
+ ulint sender) /*!< in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
+{
+ trx_sig_t* sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (UT_LIST_GET_LEN(trx->signals) == 0) {
+
+ return(TRUE);
+ }
+
+ if (sender == TRX_SIG_SELF) {
+ if (type == TRX_SIG_ERROR_OCCURRED) {
+
+ return(TRUE);
+
+ } else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+ return(TRUE);
+ } else {
+ return(FALSE);
+ }
+ }
+
+ ut_ad(sender == TRX_SIG_OTHER_SESS);
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ if (type == TRX_SIG_COMMIT) {
+ while (sig != NULL) {
+
+ if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+
+ return(FALSE);
+ }
+
+ sig = UT_LIST_GET_NEXT(signals, sig);
+ }
+
+ return(TRUE);
+
+ } else if (type == TRX_SIG_TOTAL_ROLLBACK) {
+ while (sig != NULL) {
+
+ if (sig->type == TRX_SIG_COMMIT) {
+
+ return(FALSE);
+ }
+
+ sig = UT_LIST_GET_NEXT(signals, sig);
+ }
+
+ return(TRUE);
+
+ } else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+ return(TRUE);
+ } else {
+ ut_error;
+
+ return(FALSE);
+ }
+}
+
+/****************************************************************//**
+Sends a signal to a trx object. */
+UNIV_INTERN
+void
+trx_sig_send(
+/*=========*/
+ trx_t* trx, /*!< in: trx handle */
+ ulint type, /*!< in: signal type */
+ ulint sender, /*!< in: TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ que_thr_t* receiver_thr, /*!< in: query thread which wants the
+ reply, or NULL; if type is
+ TRX_SIG_END_WAIT, this must be NULL */
+ trx_savept_t* savept, /*!< in: possible rollback savepoint, or
+ NULL */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+{
+ trx_sig_t* sig;
+ trx_t* receiver_trx;
+
+ ut_ad(trx);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (!trx_sig_is_compatible(trx, type, sender)) {
+ /* The signal is not compatible with the other signals in
+ the queue: die */
+
+ ut_error;
+ }
+
+ /* Queue the signal object */
+
+ if (UT_LIST_GET_LEN(trx->signals) == 0) {
+
+ /* The signal list is empty: the 'sig' slot must be unused
+ (we improve performance a bit by avoiding mem_alloc) */
+ sig = &(trx->sig);
+ } else {
+ /* It might be that the 'sig' slot is unused also in this
+ case, but we choose the easy way of using mem_alloc */
+
+ sig = mem_alloc(sizeof(trx_sig_t));
+ }
+
+ UT_LIST_ADD_LAST(signals, trx->signals, sig);
+
+ sig->type = type;
+ sig->sender = sender;
+ sig->receiver = receiver_thr;
+
+ if (savept) {
+ sig->savept = *savept;
+ }
+
+ if (receiver_thr) {
+ receiver_trx = thr_get_trx(receiver_thr);
+
+ UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
+ sig);
+ }
+
+ if (trx->sess->state == SESS_ERROR) {
+
+ trx_sig_reply_wait_to_suspended(trx);
+ }
+
+ if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
+ ut_error;
+ }
+
+ /* If there were no other signals ahead in the queue, try to start
+ handling of the signal */
+
+ if (UT_LIST_GET_FIRST(trx->signals) == sig) {
+
+ trx_sig_start_handle(trx, next_thr);
+ }
+}
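+
+/* For a typical use of trx_sig_send(), see trx_commit_step() below, which
+queues a TRX_SIG_COMMIT signal from the committing query thread itself. */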
+
+/****************************************************************//**
+Ends signal handling. If the session is in the error state, and
+trx->graph_before_signal_handling != NULL, then returns control to the error
+handling routine of the graph (currently just returns the control to the
+graph root which then will send an error message to the client). */
+UNIV_INTERN
+void
+trx_end_signal_handling(
+/*====================*/
+ trx_t* trx) /*!< in: trx */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->handling_signals == TRUE);
+
+ trx->handling_signals = FALSE;
+
+ trx->graph = trx->graph_before_signal_handling;
+
+ if (trx->graph && (trx->sess->state == SESS_ERROR)) {
+
+ que_fork_error_handle(trx, trx->graph);
+ }
+}
+
+/****************************************************************//**
+Starts handling of a trx signal. */
+UNIV_INTERN
+void
+trx_sig_start_handle(
+/*=================*/
+ trx_t* trx, /*!< in: trx handle */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+{
+ trx_sig_t* sig;
+ ulint type;
+loop:
+ /* We loop in this function body as long as there are queued signals
+ we can process immediately */
+
+ ut_ad(trx);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
+
+ trx_end_signal_handling(trx);
+
+ return;
+ }
+
+ if (trx->state == TRX_NOT_STARTED) {
+
+ trx_start_low(trx, ULINT_UNDEFINED);
+ }
+
+ /* If the trx is in a lock wait state, moves the waiting query threads
+ to the suspended state */
+
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+ trx_lock_wait_to_suspended(trx);
+ }
+
+ /* If the session is in the error state and this trx has threads
+ waiting for reply from signals, moves these threads to the suspended
+ state, canceling wait reservations; note that if the transaction has
+ sent a commit or rollback signal to itself, and its session is not in
+ the error state, then nothing is done here. */
+
+ if (trx->sess->state == SESS_ERROR) {
+ trx_sig_reply_wait_to_suspended(trx);
+ }
+
+ /* If there are no running query threads, we can start processing of a
+ signal, otherwise we have to wait until all query threads of this
+ transaction are aware of the arrival of the signal. */
+
+ if (trx->n_active_thrs > 0) {
+
+ return;
+ }
+
+ if (trx->handling_signals == FALSE) {
+ trx->graph_before_signal_handling = trx->graph;
+
+ trx->handling_signals = TRUE;
+ }
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+ type = sig->type;
+
+ if (type == TRX_SIG_COMMIT) {
+
+ trx_handle_commit_sig_off_kernel(trx, next_thr);
+
+ } else if ((type == TRX_SIG_TOTAL_ROLLBACK)
+ || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
+
+ trx_rollback(trx, sig, next_thr);
+
+ /* No further signals can be handled until the rollback
+ completes, therefore we return */
+
+ return;
+
+ } else if (type == TRX_SIG_ERROR_OCCURRED) {
+
+ trx_rollback(trx, sig, next_thr);
+
+ /* No further signals can be handled until the rollback
+ completes, therefore we return */
+
+ return;
+
+ } else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+ trx_sig_reply(sig, next_thr);
+ trx_sig_remove(trx, sig);
+ } else {
+ ut_error;
+ }
+
+ goto loop;
+}
+
+/****************************************************************//**
+Send the reply message when a signal in the queue of the trx has been
+handled. */
+UNIV_INTERN
+void
+trx_sig_reply(
+/*==========*/
+ trx_sig_t* sig, /*!< in: signal */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+{
+ trx_t* receiver_trx;
+
+ ut_ad(sig);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (sig->receiver != NULL) {
+ ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
+
+ receiver_trx = thr_get_trx(sig->receiver);
+
+ UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
+ sig);
+ ut_ad(receiver_trx->sess->state != SESS_ERROR);
+
+ que_thr_end_wait(sig->receiver, next_thr);
+
+ sig->receiver = NULL;
+
+ }
+}
+
+/****************************************************************//**
+Removes a signal object from the trx signal queue. */
+UNIV_INTERN
+void
+trx_sig_remove(
+/*===========*/
+ trx_t* trx, /*!< in: trx handle */
+ trx_sig_t* sig) /*!< in, own: signal */
+{
+ ut_ad(trx && sig);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ ut_ad(sig->receiver == NULL);
+
+ UT_LIST_REMOVE(signals, trx->signals, sig);
+ sig->type = 0; /* reset the field to catch possible bugs */
+
+ if (sig != &(trx->sig)) {
+ mem_free(sig);
+ }
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+commit_node_create(
+/*===============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ commit_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(commit_node_t));
+ node->common.type = QUE_NODE_COMMIT;
+ node->state = COMMIT_NODE_SEND;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ commit_node_t* node;
+ que_thr_t* next_thr;
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = COMMIT_NODE_SEND;
+ }
+
+ if (node->state == COMMIT_NODE_SEND) {
+ mutex_enter(&kernel_mutex);
+
+ node->state = COMMIT_NODE_WAIT;
+
+ next_thr = NULL;
+
+ thr->state = QUE_THR_SIG_REPLY_WAIT;
+
+ /* Send the commit signal to the transaction */
+
+ trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
+ thr, NULL, &next_thr);
+
+ mutex_exit(&kernel_mutex);
+
+ return(next_thr);
+ }
+
+ ut_ad(node->state == COMMIT_NODE_WAIT);
+
+ node->state = COMMIT_NODE_SEND;
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ /* Because we do not do the commit by sending an Innobase
+	sig to the transaction, we must make sure here that trx has been
+ started. */
+
+ ut_a(trx);
+
+ trx_start_if_not_started(trx);
+
+ trx->op_info = "committing";
+
+ mutex_enter(&kernel_mutex);
+
+ trx_commit_off_kernel(trx);
+
+ mutex_exit(&kernel_mutex);
+
+ trx->op_info = "";
+
+ return(DB_SUCCESS);
+}
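+
+/* Illustrative sketch (not a real caller) of the deferred-flush commit
+described in trx_commit_off_kernel(): with flush_log_later set, the commit
+skips the log write, and the flush happens only after the caller has
+released the mutex that serializes the commits. */
+#if 0
+	trx->flush_log_later = TRUE;
+	trx_commit_for_mysql(trx);	/* sets trx->must_flush_log_later */
+	trx->flush_log_later = FALSE;
+	/* ... release e.g. the prepare_commit mutex ... */
+	trx_commit_complete_for_mysql(trx); /* flush up to trx->commit_lsn */
+#endif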
+
+/**********************************************************************//**
+If required, flushes the log to disk when trx_commit_for_mysql() was
+called with trx->flush_log_later == TRUE.
+@return 0 or error number */
+UNIV_INTERN
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ib_uint64_t lsn = trx->commit_lsn;
+ ulint flush_log_at_trx_commit;
+
+ ut_a(trx);
+
+ trx->op_info = "flushing log";
+
+ if (srv_use_global_flush_log_at_trx_commit) {
+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
+ } else {
+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
+ }
+
+ if (!trx->must_flush_log_later) {
+ /* Do nothing */
+ } else if (flush_log_at_trx_commit == 0) {
+ /* Do nothing */
+ } else if (flush_log_at_trx_commit == 1) {
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ /* Write the log to the log files AND flush them to
+ disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ }
+ } else if (flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ ut_error;
+ }
+
+ trx->must_flush_log_later = FALSE;
+
+ trx->op_info = "";
+
+ return(0);
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ut_a(trx);
+
+ if (trx->state == TRX_NOT_STARTED) {
+ trx->undo_no = 0;
+ }
+
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+}
+
+/**********************************************************************//**
+Prints info about a transaction to the given file. The caller must own the
+kernel mutex. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print, or 0 to
+ use the default max length */
+{
+ ibool newline;
+
+ fprintf(f, "TRANSACTION " TRX_ID_FMT, (ullint) trx->id);
+
+ switch (trx->state) {
+ case TRX_NOT_STARTED:
+ fputs(", not started", f);
+ break;
+ case TRX_ACTIVE:
+ fprintf(f, ", ACTIVE %lu sec",
+ (ulong)difftime(time(NULL), trx->start_time));
+ break;
+ case TRX_PREPARED:
+ fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+ (ulong)difftime(time(NULL), trx->start_time));
+ break;
+ case TRX_COMMITTED_IN_MEMORY:
+ fputs(", COMMITTED IN MEMORY", f);
+ break;
+ default:
+ fprintf(f, " state %lu", (ulong) trx->state);
+ }
+
+ if (*trx->op_info) {
+ putc(' ', f);
+ fputs(trx->op_info, f);
+ }
+
+ if (trx->is_recovered) {
+ fputs(" recovered trx", f);
+ }
+
+ if (trx->is_purge) {
+ fputs(" purge trx", f);
+ }
+
+ if (trx->declared_to_be_inside_innodb) {
+ fprintf(f, ", thread declared inside InnoDB %lu",
+ (ulong) trx->n_tickets_to_enter_innodb);
+ }
+
+ putc('\n', f);
+
+ if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+ fprintf(f, "mysql tables in use %lu, locked %lu\n",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+ }
+
+ newline = TRUE;
+
+ switch (trx->que_state) {
+ case TRX_QUE_RUNNING:
+ newline = FALSE; break;
+ case TRX_QUE_LOCK_WAIT:
+ fputs("LOCK WAIT ", f); break;
+ case TRX_QUE_ROLLING_BACK:
+ fputs("ROLLING BACK ", f); break;
+ case TRX_QUE_COMMITTING:
+ fputs("COMMITTING ", f); break;
+ default:
+ fprintf(f, "que state %lu ", (ulong) trx->que_state);
+ }
+
+ if (0 < UT_LIST_GET_LEN(trx->trx_locks)
+ || mem_heap_get_size(trx->lock_heap) > 400) {
+ newline = TRUE;
+
+ fprintf(f, "%lu lock struct(s), heap size %lu,"
+ " %lu row lock(s)",
+ (ulong) UT_LIST_GET_LEN(trx->trx_locks),
+ (ulong) mem_heap_get_size(trx->lock_heap),
+ (ulong) lock_number_of_rows_locked(trx));
+ }
+
+ if (trx->has_search_latch) {
+ newline = TRUE;
+ fputs(", holds adaptive hash latch", f);
+ }
+
+ if (trx->undo_no != 0) {
+ newline = TRUE;
+ fprintf(f, ", undo log entries %llu",
+ (ullint) trx->undo_no);
+ }
+
+ if (newline) {
+ putc('\n', f);
+ }
+
+ if (trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
+ }
+}
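+
+/* Illustrative only: trx_print() must be called with the kernel mutex
+held, as in the diagnostic calls elsewhere in this file: */
+#if 0
+	mutex_enter(&kernel_mutex);
+	trx_print(stderr, trx, 600);	/* limit the query dump to 600 bytes */
+	mutex_exit(&kernel_mutex);
+#endif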
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return TRUE if weight(a) >= weight(b) */
+UNIV_INTERN
+ibool
+trx_weight_ge(
+/*==========*/
+ const trx_t* a, /*!< in: the first transaction to be compared */
+ const trx_t* b) /*!< in: the second transaction to be compared */
+{
+ ibool a_notrans_edit;
+ ibool b_notrans_edit;
+
+ /* If mysql_thd is NULL for a transaction we assume that it has
+ not edited non-transactional tables. */
+
+ a_notrans_edit = a->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(a->mysql_thd);
+
+ b_notrans_edit = b->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(b->mysql_thd);
+
+ if (a_notrans_edit != b_notrans_edit) {
+
+ return(a_notrans_edit);
+ }
+
+	/* Either both have edited non-transactional tables or neither
+	has; in either case we fall back to comparing the number of
+	altered/locked rows. */
+
+#if 0
+ fprintf(stderr,
+ "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
+ __func__,
+ a->undo_no, UT_LIST_GET_LEN(a->trx_locks),
+ b->undo_no, UT_LIST_GET_LEN(b->trx_locks));
+#endif
+
+ return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
+}
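+
+/* Note: TRX_WEIGHT is defined in the trx0trx header; as the disabled
+printout in trx_weight_ge() above suggests, it combines the undo log size
+with the number of lock structs held (roughly trx->undo_no +
+UT_LIST_GET_LEN(trx->trx_locks)). Illustrative use in deadlock resolution,
+where the lighter trx is chosen as the victim: */
+#if 0
+	victim = trx_weight_ge(a, b) ? b : a;
+#endif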
+
+/****************************************************************//**
+Prepares a transaction. */
+UNIV_INTERN
+void
+trx_prepare_off_kernel(
+/*===================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_rseg_t* rseg;
+ ib_uint64_t lsn = 0;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ rseg = trx->rseg;
+
+ if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to TRX_UNDO_PREPARED: these modifications to the file data
+ structure define the transaction as prepared in the
+ file-based world, at the serialization point of lsn. */
+
+ mutex_enter(&(rseg->mutex));
+
+ if (trx->insert_undo != NULL) {
+
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction prepare for this transaction. */
+
+ trx_undo_set_state_at_prepare(trx, trx->insert_undo,
+ &mtr);
+ }
+
+ if (trx->update_undo) {
+ trx_undo_set_state_at_prepare(
+ trx, trx->update_undo, &mtr);
+ }
+
+ mutex_exit(&(rseg->mutex));
+
+ if (trx->mysql_master_log_file_name[0] != '\0') {
+ /* This database server is a MySQL replication slave */
+ trx_sysf_t* sys_header = trx_sysf_get(&mtr);
+
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_relay_log_file_name,
+ trx->mysql_relay_log_pos,
+ TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr);
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_master_log_file_name,
+ trx->mysql_master_log_pos,
+ TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
+ trx->mysql_master_log_file_name = "";
+ }
+
+ /*--------------*/
+ mtr_commit(&mtr); /* This mtr commit makes the
+ transaction prepared in the file-based
+ world */
+ /*--------------*/
+ lsn = mtr.end_lsn;
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ /*--------------------------------------*/
+ if (UNIV_UNLIKELY(trx->state != TRX_ACTIVE)) {
+
+ trx_reserve_descriptor(trx);
+ }
+ trx->state = TRX_PREPARED;
+ trx_n_prepared++;
+ /*--------------------------------------*/
+
+ if (lsn) {
+ ulint flush_log_at_trx_commit;
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the prepared state of the
+ transaction durable if the OS does not crash. We may also
+ flush the log files to disk, making the prepared state of the
+ transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group prepare is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which prepares the whole
+ group. Note that this group prepare will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ TODO: find out if MySQL holds some mutex when calling this.
+ That would spoil our group prepare algorithm. */
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_use_global_flush_log_at_trx_commit) {
+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
+ } else {
+ flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
+ }
+
+ if (flush_log_at_trx_commit == 0) {
+ /* Do nothing */
+ } else if (flush_log_at_trx_commit == 1) {
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+ FALSE);
+ } else {
+ /* Write the log to the log files AND flush
+ them to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ }
+ } else if (flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ ut_error;
+ }
+
+ mutex_enter(&kernel_mutex);
+ }
+}
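+
+/* The durability decision above can be summarized by the following
+sketch. This is illustrative only and not part of the original source
+(hence the #if 0 guard); it assumes the same log_write_up_to()
+semantics, and the helper name is hypothetical. */
+#if 0
+static void
+trx_prepare_flush_log_sketch(
+/*=========================*/
+ ib_uint64_t lsn, /*!< in: end lsn of the prepare mtr */
+ ulint flush_log_at_trx_commit) /*!< in: 0, 1 or 2 */
+{
+ if (flush_log_at_trx_commit == 1) {
+ /* Write the log and, unless fsync is disabled, also
+ flush it to disk */
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+ srv_unix_file_flush_method != SRV_UNIX_NOSYNC);
+ } else if (flush_log_at_trx_commit == 2) {
+ /* Write the log but do not flush it to disk */
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ }
+ /* flush_log_at_trx_commit == 0: do nothing */
+}
+#endif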
+
+/**********************************************************************//**
+Does the transaction prepare for MySQL.
+@return 0 or error number */
+UNIV_INTERN
+ulint
+trx_prepare_for_mysql(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ /* Because we do not do the prepare by sending an Innobase
+ sig to the transaction, we must make sure here that trx has
+ been started. */
+
+ ut_a(trx);
+
+ trx->op_info = "preparing";
+
+ trx_start_if_not_started(trx);
+
+ mutex_enter(&kernel_mutex);
+
+ trx_prepare_off_kernel(trx);
+
+ mutex_exit(&kernel_mutex);
+
+ trx->op_info = "";
+
+ return(0);
+}
+
+/**********************************************************************//**
+This function is used to find the number of prepared transactions and
+their transaction objects for recovery.
+@return number of prepared transactions stored in xid_list */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+ XID* xid_list, /*!< in/out: prepared transactions */
+ ulint len) /*!< in: number of slots in xid_list */
+{
+ trx_t* trx;
+ ulint count = 0;
+
+ ut_ad(xid_list);
+ ut_ad(len);
+
+ /* We should copy the XIDs of those transactions which are in
+ the prepared state to xid_list */
+
+ mutex_enter(&kernel_mutex);
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx) {
+ if (trx->state == TRX_PREPARED) {
+ xid_list[count] = trx->xid;
+
+ if (count == 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Starting recovery for"
+ " XA transactions...\n");
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Transaction " TRX_ID_FMT " in"
+ " prepared state after recovery\n",
+ (ullint) trx->id);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Transaction contains changes"
+ " to %llu rows\n",
+ (ullint) trx->undo_no);
+
+ count++;
+
+ if (count == len) {
+ break;
+ }
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (count > 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: %lu transactions in prepared state"
+ " after recovery\n",
+ (ulong) count);
+ }
+
+ return((int) count);
+}
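+
+/* A hypothetical usage sketch, not in the original source (hence the
+#if 0 guard): a caller could drain the prepared XIDs in fixed-size
+batches at recovery. The batch size and the helper name are
+assumptions. */
+#if 0
+static void
+trx_recover_usage_sketch(void)
+{
+ XID xid_list[128]; /* hypothetical batch size */
+ int n;
+
+ n = trx_recover_for_mysql(xid_list, 128);
+
+ /* The caller would now match each of the n XIDs against its
+ transaction coordinator log and commit or roll back the
+ corresponding transactions. */
+}
+#endif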
+
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state.
+@return trx or NULL; on match, the trx->xid will be invalidated */
+UNIV_INTERN
+trx_t*
+trx_get_trx_by_xid(
+/*===============*/
+ const XID* xid) /*!< in: X/Open XA transaction identifier */
+{
+ trx_t* trx;
+
+ if (xid == NULL) {
+
+ return(NULL);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx) {
+ /* Compare two X/Open XA transaction IDs: their
+ lengths should be the same, and the
+ gtrid_length+bqual_length data bytes should compare
+ equal */
+
+ if (trx->is_recovered
+ && trx->state == TRX_PREPARED
+ && xid->gtrid_length == trx->xid.gtrid_length
+ && xid->bqual_length == trx->xid.bqual_length
+ && memcmp(xid->data, trx->xid.data,
+ xid->gtrid_length + xid->bqual_length) == 0) {
+
+ /* Invalidate the XID, so that subsequent calls
+ will not find it. */
+ memset(&trx->xid, 0, sizeof(trx->xid));
+ trx->xid.formatID = -1;
+ break;
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return(trx);
+}
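+
+/* Illustrative sketch of the XID comparison used above, factored into
+a helper. Not part of the original source (hence the #if 0 guard); the
+helper name is hypothetical. */
+#if 0
+static ibool
+trx_xid_eq_sketch(
+/*==============*/
+ const XID* a, /*!< in: first XID */
+ const XID* b) /*!< in: second XID */
+{
+ /* Two XIDs match if their lengths are equal and the
+ gtrid_length + bqual_length data bytes compare equal */
+ return(a->gtrid_length == b->gtrid_length
+ && a->bqual_length == b->bqual_length
+ && memcmp(a->data, b->data,
+ a->gtrid_length + a->bqual_length) == 0);
+}
+#endif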
diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.c
new file mode 100644
index 00000000000..3d794c69c8b
--- /dev/null
+++ b/storage/xtradb/trx/trx0undo.c
@@ -0,0 +1,2000 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.c
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+#ifdef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#ifndef UNIV_HOTBACKUP
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0purge.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may also have to
+purge old versions which might still be needed by some consistent read.
+How do we trigger the start of a purge? When a transaction writes to an
+undo log, it may notice that the space is running out. When a read view
+is closed, it may make some history superfluous. The server can have a
+utility which periodically checks if it can purge some history.
+ In a parallelized purge we have the problem that one query thread
+can remove a delete-marked clustered index record before another query
+thread has processed an earlier version of the record; that earlier
+version can then no longer be processed, because the row cannot be
+constructed from the clustered index record. To avoid this problem, we
+also store in the update and delete-mark undo records the columns
+necessary to construct the secondary index entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+The contention of the kernel mutex should be minimized. When a transaction
+does its first insert or modify in an index, an undo log is assigned for it.
+Then we must have an x-latch on the rollback segment header.
+ When the transaction does more modifications or rolls back, the
+undo log is protected with the undo_mutex in the transaction.
+ When the transaction commits, its insert undo log is either reset and
+cached for a fast reuse, or freed. In these cases we must have an x-latch on
+the rollback segment page. The update undo log is put to the history list. If
+it is not suitable for reuse, its slot in the rollback segment is reset. In
+both cases, an x-latch must be acquired on the rollback segment.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*===============*/
+ page_t* undo_page, /*!< in: undo log segment page */
+ ulint type, /*!< in: undo log segment type */
+ mtr_t* mtr); /*!< in: mtr */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header byte offset on page */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function!
+@return undo log header byte offset on page */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+ page_t* undo_page, /*!< in/out: insert undo log segment
+ header page, x-latched */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+ page_t* undo_page, /*!< in: header page of an undo log of size 1 */
+ mtr_t* mtr); /*!< in: mtr */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Gets the previous record in an undo log from the previous page.
+@return undo log record, the page s-latched, NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(
+/*=================================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ ulint prev_page_no;
+ page_t* prev_page;
+ page_t* undo_page;
+
+ undo_page = page_align(rec);
+
+ prev_page_no = flst_get_prev_addr(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE, mtr)
+ .page;
+
+ if (prev_page_no == FIL_NULL) {
+
+ return(NULL);
+ }
+
+ space = page_get_space_id(undo_page);
+ zip_size = fil_space_get_zip_size(space);
+
+ prev_page = trx_undo_page_get_s_latched(space, zip_size,
+ prev_page_no, mtr);
+
+ return(trx_undo_page_get_last_rec(prev_page, page_no, offset));
+}
+
+/***********************************************************************//**
+Gets the previous record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_rec_t* prev_rec;
+
+ prev_rec = trx_undo_page_get_prev_rec(rec, page_no, offset);
+
+ if (prev_rec) {
+
+ return(prev_rec);
+ }
+
+ /* We have to go to the previous undo log page to look for the
+ previous record */
+
+ return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset,
+ mtr));
+}
+
+/***********************************************************************//**
+Gets the next record in an undo log from the next page.
+@return undo log record, the page latched, NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(
+/*=================================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ page_t* undo_page, /*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latch mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_ulogf_t* log_hdr;
+ ulint next_page_no;
+ page_t* next_page;
+ ulint next;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ log_hdr = undo_page + offset;
+ next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
+
+ if (next != 0) {
+
+ return(NULL);
+ }
+ }
+
+ next_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE, mtr)
+ .page;
+ if (next_page_no == FIL_NULL) {
+
+ return(NULL);
+ }
+
+ if (mode == RW_S_LATCH) {
+ next_page = trx_undo_page_get_s_latched(space, zip_size,
+ next_page_no, mtr);
+ } else {
+ ut_ad(mode == RW_X_LATCH);
+ next_page = trx_undo_page_get(space, zip_size,
+ next_page_no, mtr);
+ }
+
+ return(trx_undo_page_get_first_rec(next_page, page_no, offset));
+}
+
+/***********************************************************************//**
+Gets the next record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ trx_undo_rec_t* next_rec;
+
+ next_rec = trx_undo_page_get_next_rec(rec, page_no, offset);
+
+ if (next_rec) {
+ return(next_rec);
+ }
+
+ space = page_get_space_id(page_align(rec));
+ zip_size = fil_space_get_zip_size(space);
+
+ return(trx_undo_get_next_rec_from_next_page(space, zip_size,
+ page_align(rec),
+ page_no, offset,
+ RW_S_LATCH, mtr));
+}
+
+/***********************************************************************//**
+Gets the first record in an undo log.
+@return undo log record, the page latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ trx_undo_rec_t* rec;
+
+ if (mode == RW_S_LATCH) {
+ undo_page = trx_undo_page_get_s_latched(space, zip_size,
+ page_no, mtr);
+ } else {
+ undo_page = trx_undo_page_get(space, zip_size, page_no, mtr);
+ }
+
+ rec = trx_undo_page_get_first_rec(undo_page, page_no, offset);
+
+ if (rec) {
+ return(rec);
+ }
+
+ return(trx_undo_get_next_rec_from_next_page(space, zip_size,
+ undo_page, page_no, offset,
+ mode, mtr));
+}
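+
+/* Illustrative sketch, not in the original source (hence the #if 0
+guard): scanning all records of one undo log with the accessors above.
+The helper name is hypothetical; note how a single mtr keeps the
+visited pages s-latched during the scan. */
+#if 0
+static ulint
+trx_undo_count_recs_sketch(
+/*=======================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size, /*!< in: compressed page size or 0 */
+ ulint page_no, /*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ trx_undo_rec_t* rec;
+ ulint n = 0;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ rec = trx_undo_get_first_rec(space, zip_size, page_no, offset,
+ RW_S_LATCH, &mtr);
+ while (rec != NULL) {
+ n++;
+ rec = trx_undo_get_next_rec(rec, page_no, offset, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ return(n);
+}
+#endif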
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log page initialization. */
+UNIV_INLINE
+void
+trx_undo_page_init_log(
+/*===================*/
+ page_t* undo_page, /*!< in: undo log page */
+ ulint type, /*!< in: undo log type */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr);
+
+ mlog_catenate_ulint_compressed(mtr, type);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_page_init_log(undo_page,type,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page initialization.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ulint type;
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &type);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ trx_undo_page_init(page, type, mtr);
+ }
+
+ return(ptr);
+}
+
+/********************************************************************//**
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*===============*/
+ page_t* undo_page, /*!< in: undo log segment page */
+ ulint type, /*!< in: undo log segment type */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+
+ fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG);
+
+ trx_undo_page_init_log(undo_page, type, mtr);
+}
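+
+/* Illustrative sketch, not in the original source (hence the #if 0
+guard): on a freshly initialized undo page both the start and the free
+offset point just past the undo page header. The helper name is
+hypothetical. */
+#if 0
+static void
+trx_undo_page_init_check_sketch(
+/*============================*/
+ const page_t* undo_page) /*!< in: just-initialized undo page */
+{
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_START)
+ == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE)
+ == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+}
+#endif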
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Creates a new undo log segment in file.
+@return DB_SUCCESS if page creation was OK; possible error codes are:
+DB_TOO_MANY_CONCURRENT_TRXS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+trx_undo_seg_create(
+/*================*/
+ trx_rseg_t* rseg __attribute__((unused)),/*!< in: rollback segment */
+ trx_rsegf_t* rseg_hdr,/*!< in: rollback segment header, page
+ x-latched */
+ ulint type, /*!< in: type of the segment: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint* id, /*!< out: slot index within rseg header */
+ page_t** undo_page,
+ /*!< out: segment header page x-latched, NULL
+ if there was an error */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint slot_no;
+ ulint space;
+ buf_block_t* block;
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ ulint n_reserved;
+ ibool success;
+ ulint err = DB_SUCCESS;
+
+ ut_ad(mtr && id && rseg_hdr);
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ /* fputs(type == TRX_UNDO_INSERT
+ ? "Creating insert undo log segment\n"
+ : "Creating update undo log segment\n", stderr); */
+ slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr);
+
+ if (slot_no == ULINT_UNDEFINED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: cannot find a free slot for"
+ " an undo log. Do you have too\n"
+ "InnoDB: many active transactions"
+ " running concurrently?\n");
+
+ return(DB_TOO_MANY_CONCURRENT_TRXS);
+ }
+
+ space = page_get_space_id(page_align(rseg_hdr));
+
+ success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
+ if (!success) {
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ /* Allocate a new file segment for the undo log */
+ block = fseg_create_general(space, 0,
+ TRX_UNDO_SEG_HDR
+ + TRX_UNDO_FSEG_HEADER, TRUE, mtr);
+
+ fil_space_release_free_extents(space, n_reserved);
+
+ if (block == NULL) {
+ /* No space left */
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ *undo_page = buf_block_get_frame(block);
+
+ page_hdr = *undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = *undo_page + TRX_UNDO_SEG_HDR;
+
+ trx_undo_page_init(*undo_page, type, mtr);
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr);
+
+ flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr);
+
+ flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST,
+ page_hdr + TRX_UNDO_PAGE_NODE, mtr);
+
+ trx_rsegf_set_nth_undo(rseg_hdr, slot_no,
+ page_get_page_no(*undo_page), mtr);
+ *id = slot_no;
+
+ return(err);
+}
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log header initialization. */
+UNIV_INLINE
+void
+trx_undo_header_create_log(
+/*=======================*/
+ const page_t* undo_page, /*!< in: undo log header page */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr);
+
+ mlog_catenate_ull_compressed(mtr, trx_id);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_header_create_log(undo_page,trx_id,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Creates a new undo log header in file. NOTE that this function has its own
+log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of
+this function!
+@return header byte offset on page */
+static
+ulint
+trx_undo_header_create(
+/*===================*/
+ page_t* undo_page, /*!< in/out: undo log segment
+ header page, x-latched; it is
+ assumed that there is
+ TRX_UNDO_LOG_XA_HDR_SIZE bytes
+ free space on it */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_ulogf_t* prev_log_hdr;
+ ulint prev_log;
+ ulint free;
+ ulint new_free;
+
+ ut_ad(mtr && undo_page);
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
+
+ log_hdr = undo_page + free;
+
+ new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
+
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+ prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+
+ if (prev_log != 0) {
+ prev_log_hdr = undo_page + prev_log;
+
+ mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free);
+ }
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free);
+
+ log_hdr = undo_page + free;
+
+ mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE);
+
+ mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+ mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+
+ mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+ mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+ mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0);
+ mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log);
+
+ /* Write the log record about the header creation */
+ trx_undo_header_create_log(undo_page, trx_id, mtr);
+
+ return(free);
+}
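+
+/* Illustrative sketch, not in the original source (hence the #if 0
+guard): the undo log headers on a segment header page form a two-way
+list through the TRX_UNDO_PREV_LOG and TRX_UNDO_NEXT_LOG fields, and
+TRX_UNDO_LAST_LOG in the segment header points to the newest one. The
+helper name is hypothetical. */
+#if 0
+static trx_ulogf_t*
+trx_undo_page_last_log_hdr_sketch(
+/*==============================*/
+ page_t* undo_page) /*!< in: undo log segment header page */
+{
+ ulint last = mach_read_from_2(undo_page + TRX_UNDO_SEG_HDR
+ + TRX_UNDO_LAST_LOG);
+
+ return(last != 0 ? undo_page + last : NULL);
+}
+#endif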
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Write X/Open XA Transaction Identification (XID) to the undo log header. */
+static
+void
+trx_undo_write_xid(
+/*===============*/
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ const XID* xid, /*!< in: X/Open XA Transaction Identification */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT,
+ (ulint)xid->formatID, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN,
+ (ulint)xid->gtrid_length, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+ (ulint)xid->bqual_length, MLOG_4BYTES, mtr);
+
+ mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data,
+ XIDDATASIZE, mtr);
+}
+
+/********************************************************************//**
+Read X/Open XA Transaction Identification (XID) from the undo log header. */
+static
+void
+trx_undo_read_xid(
+/*==============*/
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ XID* xid) /*!< out: X/Open XA Transaction Identification */
+{
+ xid->formatID = (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+
+ xid->gtrid_length
+ = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
+ xid->bqual_length
+ = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN);
+
+ memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
+
+/***************************************************************//**
+Adds space for the XA XID after an undo log old-style header. */
+static
+void
+trx_undo_header_add_space_for_xid(
+/*==============================*/
+ page_t* undo_page,/*!< in: undo log segment header page */
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ ulint free;
+ ulint new_free;
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
+
+ /* free is now the end offset of the old style undo log header */
+
+ ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+ new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE
+ - TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+ /* Add space for a XID after the header, update the free offset
+ fields on the undo log page and in the undo log header */
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free,
+ MLOG_2BYTES, mtr);
+}
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log header reuse. */
+UNIV_INLINE
+void
+trx_undo_insert_header_reuse_log(
+/*=============================*/
+ const page_t* undo_page, /*!< in: undo log header page */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr);
+
+ mlog_catenate_ull_compressed(mtr, trx_id);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_insert_header_reuse_log(undo_page,trx_id,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ trx_id_t trx_id;
+ /* Silence a GCC warning about possibly uninitialized variable
+ when mach_ull_parse_compressed() is not inlined. */
+ ut_d(trx_id = 0);
+ /* Declare the variable uninitialized in Valgrind, so that the
+ above initialization will not mask any bugs. */
+ UNIV_MEM_INVALID(&trx_id, sizeof trx_id);
+
+ ptr = mach_ull_parse_compressed(ptr, end_ptr, &trx_id);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (type == MLOG_UNDO_HDR_CREATE) {
+ trx_undo_header_create(page, trx_id, mtr);
+ } else {
+ ut_ad(type == MLOG_UNDO_HDR_REUSE);
+ trx_undo_insert_header_reuse(page, trx_id, mtr);
+ }
+ }
+
+ return(ptr);
+}
+
+/***************************************************************//**
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function!
+@return undo log header byte offset on page */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+ page_t* undo_page, /*!< in/out: insert undo log segment
+ header page, x-latched */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* log_hdr;
+ ulint free;
+ ulint new_free;
+
+ ut_ad(mtr && undo_page);
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE;
+
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+ log_hdr = undo_page + free;
+
+ new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
+
+ /* Insert undo data is not needed after commit: we may free all
+ the space on the page */
+
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_INSERT);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+ log_hdr = undo_page + free;
+
+ mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+ mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+
+ mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+ mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+ /* Write the log record MLOG_UNDO_HDR_REUSE */
+ trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr);
+
+ return(free);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Writes the redo log entry of an update undo log header discard. */
+UNIV_INLINE
+void
+trx_undo_discard_latest_log(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log header page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_discard_latest_log(undo_page, mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(end_ptr);
+
+ if (page) {
+ trx_undo_discard_latest_update_undo(page, mtr);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+ page_t* undo_page, /*!< in: header page of an undo log of size 1 */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_upagef_t* page_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_ulogf_t* prev_log_hdr;
+ ulint free;
+ ulint prev_hdr_offset;
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+ log_hdr = undo_page + free;
+
+ prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG);
+
+ if (prev_hdr_offset != 0) {
+ prev_log_hdr = undo_page + prev_hdr_offset;
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+ mach_read_from_2(prev_log_hdr
+ + TRX_UNDO_LOG_START));
+ mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0);
+ }
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED);
+ mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset);
+
+ trx_undo_discard_latest_log(undo_page, mtr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to add a page to the undo log segment where the undo log is placed.
+@return X-latched block if success, else NULL */
+UNIV_INTERN
+buf_block_t*
+trx_undo_add_page(
+/*==============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory object */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ page_t* header_page;
+ buf_block_t* new_block;
+ page_t* new_page;
+ trx_rseg_t* rseg;
+ ulint n_reserved;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(!mutex_own(&kernel_mutex));
+ ut_ad(mutex_own(&(trx->rseg->mutex)));
+
+ rseg = trx->rseg;
+
+ if (rseg->curr_size == rseg->max_size) {
+
+ return(NULL);
+ }
+
+ header_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ if (!fsp_reserve_free_extents(&n_reserved, undo->space, 1,
+ FSP_UNDO, mtr)) {
+
+ return(NULL);
+ }
+
+ new_block = fseg_alloc_free_page_general(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_page,
+ undo->top_page_no + 1, FSP_UP, TRUE, mtr, mtr);
+
+ fil_space_release_free_extents(undo->space, n_reserved);
+
+ if (new_block == NULL) {
+
+ /* No space left */
+
+ return(NULL);
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE);
+ undo->last_page_no = buf_block_get_page_no(new_block);
+
+ new_page = buf_block_get_frame(new_block);
+
+ trx_undo_page_init(new_page, undo->type, mtr);
+
+ flst_add_last(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ undo->size++;
+ rseg->curr_size++;
+
+ return(new_block);
+}
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+@return last page number in remaining log */
+static
+ulint
+trx_undo_free_page(
+/*===============*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ibool in_history, /*!< in: TRUE if the undo log is in the history
+ list */
+ ulint space, /*!< in: space */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint page_no, /*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ page_t* header_page;
+ page_t* undo_page;
+ fil_addr_t last_addr;
+ trx_rsegf_t* rseg_header;
+ ulint hist_size;
+ ulint zip_size;
+
+ ut_a(hdr_page_no != page_no);
+ ut_ad(!mutex_own(&kernel_mutex));
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ zip_size = rseg->zip_size;
+
+ undo_page = trx_undo_page_get(space, zip_size, page_no, mtr);
+
+ header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr);
+
+ flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+ space, page_no, mtr);
+
+ last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR
+ + TRX_UNDO_PAGE_LIST, mtr);
+ rseg->curr_size--;
+
+ if (in_history) {
+ rseg_header = trx_rsegf_get(space, zip_size,
+ rseg->page_no, mtr);
+
+ hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, mtr);
+ ut_ad(hist_size > 0);
+ mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ hist_size - 1, MLOG_4BYTES, mtr);
+ }
+
+ return(last_addr.page);
+}
+
+/********************************************************************//**
+Frees the last undo log page.
+The caller must hold the rollback segment mutex. */
+UNIV_INTERN
+void
+trx_undo_free_last_page_func(
+/*==========================*/
+#ifdef UNIV_DEBUG
+ const trx_t* trx, /*!< in: transaction */
+#endif /* UNIV_DEBUG */
+ trx_undo_t* undo, /*!< in/out: undo log memory copy */
+ mtr_t* mtr) /*!< in/out: mini-transaction which does not
+ have a latch to any undo log page or which
+ has allocated the undo log page */
+{
+ ut_ad(mutex_own(&trx->undo_mutex));
+ ut_ad(undo->hdr_page_no != undo->last_page_no);
+ ut_ad(undo->size > 0);
+
+ undo->last_page_no = trx_undo_free_page(
+ undo->rseg, FALSE, undo->space,
+ undo->hdr_page_no, undo->last_page_no, mtr);
+
+ undo->size--;
+}
+
+/********************************************************************//**
+Empties an undo log header page of undo records for that undo log. Other
+undo logs may still have records on that page, if it is an update undo log. */
+static
+void
+trx_undo_empty_header_page(
+/*=======================*/
+ ulint space, /*!< in: space */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* header_page;
+ trx_ulogf_t* log_hdr;
+ ulint end;
+
+ header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr);
+
+ log_hdr = header_page + hdr_offset;
+
+ end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr);
+}
+
+/***********************************************************************//**
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+UNIV_INTERN
+void
+trx_undo_truncate_end_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+ const trx_t* trx, /*!< in: transaction whose undo log it is */
+#endif /* UNIV_DEBUG */
+ trx_undo_t* undo, /*!< in: undo log */
+ undo_no_t limit) /*!< in: all undo records with undo number
+ >= this value should be truncated */
+{
+ page_t* undo_page;
+ ulint last_page_no;
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* trunc_here;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(mutex_own(&(trx->rseg->mutex)));
+
+ for (;;) {
+ mtr_start(&mtr);
+
+ trunc_here = NULL;
+
+ last_page_no = undo->last_page_no;
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ last_page_no, &mtr);
+
+ rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no,
+ undo->hdr_offset);
+ while (rec) {
+ if (trx_undo_rec_get_undo_no(rec) >= limit) {
+ /* Truncate at least this record off, maybe
+ more */
+ trunc_here = rec;
+ } else {
+ goto function_exit;
+ }
+
+ rec = trx_undo_page_get_prev_rec(rec,
+ undo->hdr_page_no,
+ undo->hdr_offset);
+ }
+
+ if (last_page_no == undo->hdr_page_no) {
+
+ goto function_exit;
+ }
+
+ ut_ad(last_page_no == undo->last_page_no);
+ trx_undo_free_last_page(trx, undo, &mtr);
+
+ mtr_commit(&mtr);
+ }
+
+function_exit:
+ if (trunc_here) {
+ mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE,
+ trunc_here - undo_page, MLOG_2BYTES, &mtr);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+UNIV_INTERN
+void
+trx_undo_truncate_start(
+/*====================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ulint space, /*!< in: space id of the log */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset on the page */
+ undo_no_t limit) /*!< in: all undo pages with
+ undo numbers < this value
+ should be truncated; NOTE that
+ the function only frees whole
+ pages; the header page is not
+ freed, but emptied, if all the
+ records there are < limit */
+{
+ page_t* undo_page;
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* last_rec;
+ ulint page_no;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (!limit) {
+
+ return;
+ }
+loop:
+ mtr_start(&mtr);
+
+ rec = trx_undo_get_first_rec(space, rseg->zip_size,
+ hdr_page_no, hdr_offset,
+ RW_X_LATCH, &mtr);
+ if (rec == NULL) {
+ /* Already empty */
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ undo_page = page_align(rec);
+
+ last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+ hdr_offset);
+ if (trx_undo_rec_get_undo_no(last_rec) >= limit) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ page_no = page_get_page_no(undo_page);
+
+ if (page_no == hdr_page_no) {
+ trx_undo_empty_header_page(space, rseg->zip_size,
+ hdr_page_no, hdr_offset,
+ &mtr);
+ } else {
+ trx_undo_free_page(rseg, TRUE, space, hdr_page_no,
+ page_no, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ goto loop;
+}
+
+/**********************************************************************//**
+Frees an undo log segment which is not in the history list. */
+static
+void
+trx_undo_seg_free(
+/*==============*/
+ trx_undo_t* undo) /*!< in: undo log */
+{
+ trx_rseg_t* rseg;
+ fseg_header_t* file_seg;
+ trx_rsegf_t* rseg_header;
+ trx_usegf_t* seg_header;
+ ibool finished;
+ mtr_t mtr;
+
+ rseg = undo->rseg;
+
+ do {
+
+ mtr_start(&mtr);
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ mutex_enter(&(rseg->mutex));
+
+ seg_header = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no,
+ &mtr) + TRX_UNDO_SEG_HDR;
+
+ file_seg = seg_header + TRX_UNDO_FSEG_HEADER;
+
+ finished = fseg_free_step(file_seg, &mtr);
+
+ if (finished) {
+ /* Update the rseg header */
+ rseg_header = trx_rsegf_get(
+ rseg->space, rseg->zip_size, rseg->page_no,
+ &mtr);
+ trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL,
+ &mtr);
+ }
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+ } while (!finished);
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/********************************************************************//**
+Creates and initializes an undo log memory object according to the values
+in the header in the file, when the database is started. The memory object
+is inserted in the appropriate list of the rseg.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create_at_db_start(
+/*============================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint page_no,/*!< in: undo log segment page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ trx_upagef_t* page_header;
+ trx_usegf_t* seg_header;
+ trx_ulogf_t* undo_header;
+ trx_undo_t* undo;
+ ulint type;
+ ulint state;
+ trx_id_t trx_id;
+ ulint offset;
+ fil_addr_t last_addr;
+ page_t* last_page;
+ trx_undo_rec_t* rec;
+ XID xid;
+ ibool xid_exists = FALSE;
+
+ if (id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ page_no, mtr);
+
+ page_header = undo_page + TRX_UNDO_PAGE_HDR;
+
+ type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES,
+ mtr);
+ seg_header = undo_page + TRX_UNDO_SEG_HDR;
+
+ state = mach_read_from_2(seg_header + TRX_UNDO_STATE);
+
+ offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG);
+
+ undo_header = undo_page + offset;
+
+ trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID);
+
+ xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+ MLOG_1BYTE, mtr);
+
+ /* Read the X/Open XA transaction identification if it exists, or
+ set it to a null XID. */
+
+ memset(&xid, 0, sizeof(xid));
+ xid.formatID = -1;
+
+ if (xid_exists == TRUE) {
+ trx_undo_read_xid(undo_header, &xid);
+ }
+
+ mutex_enter(&(rseg->mutex));
+
+ undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid,
+ page_no, offset);
+ mutex_exit(&(rseg->mutex));
+
+ undo->dict_operation = mtr_read_ulint(
+ undo_header + TRX_UNDO_DICT_TRANS, MLOG_1BYTE, mtr);
+
+ undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID);
+ undo->state = state;
+ undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+ /* If the log segment is being freed, the page list is inconsistent! */
+ if (state == TRX_UNDO_TO_FREE) {
+
+ goto add_to_list;
+ }
+
+ last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+ undo->last_page_no = last_addr.page;
+ undo->top_page_no = last_addr.page;
+
+ last_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ undo->last_page_no, mtr);
+
+ rec = trx_undo_page_get_last_rec(last_page, page_no, offset);
+
+ if (rec == NULL) {
+ undo->empty = TRUE;
+ } else {
+ undo->empty = FALSE;
+ undo->top_offset = rec - last_page;
+ undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+ }
+add_to_list:
+ if (type == TRX_UNDO_INSERT) {
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list,
+ undo);
+ } else {
+ UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached,
+ undo);
+ }
+ } else {
+ ut_ad(type == TRX_UNDO_UPDATE);
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list,
+ undo);
+ } else {
+ UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached,
+ undo);
+ }
+ }
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes the undo log lists for a rollback segment memory copy. This
+function is only called when the database is started or a new rollback
+segment is created.
+@return the combined size of undo log segments in pages */
+UNIV_INTERN
+ulint
+trx_undo_lists_init(
+/*================*/
+ trx_rseg_t* rseg) /*!< in: rollback segment memory object */
+{
+ ulint page_no;
+ trx_undo_t* undo;
+ ulint size = 0;
+ trx_rsegf_t* rseg_header;
+ ulint i;
+ mtr_t mtr;
+
+ UT_LIST_INIT(rseg->update_undo_list);
+ UT_LIST_INIT(rseg->update_undo_cached);
+ UT_LIST_INIT(rseg->insert_undo_list);
+ UT_LIST_INIT(rseg->insert_undo_cached);
+
+ mtr_start(&mtr);
+
+ rseg_header = trx_rsegf_get_new(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+ page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr);
+
+ /* In forced recovery: try to avoid operations which look
+ at database pages; undo logs are rapidly changing data, and
+ the probability that they are in an inconsistent state is
+ high */
+
+ if (page_no != FIL_NULL
+ && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+ undo = trx_undo_mem_create_at_db_start(rseg, i,
+ page_no, &mtr);
+ size += undo->size;
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ rseg_header = trx_rsegf_get(
+ rseg->space, rseg->zip_size, rseg->page_no,
+ &mtr);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ return(size);
+}
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header byte offset on page */
+{
+ trx_undo_t* undo;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
+ ut_error;
+ }
+
+ undo = mem_alloc(sizeof(trx_undo_t));
+
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ undo->id = id;
+ undo->type = type;
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->del_marks = FALSE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->rseg = rseg;
+
+ undo->space = rseg->space;
+ undo->zip_size = rseg->zip_size;
+ undo->hdr_page_no = page_no;
+ undo->hdr_offset = offset;
+ undo->last_page_no = page_no;
+ undo->size = 1;
+
+ undo->empty = TRUE;
+ undo->top_page_no = page_no;
+ undo->guess_block = NULL;
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes a cached undo log object for new use. */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+ trx_undo_t* undo, /*!< in: undo log to init */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ ulint offset) /*!< in: undo log header byte offset on page */
+{
+ ut_ad(mutex_own(&((undo->rseg)->mutex)));
+
+ if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->del_marks = FALSE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->hdr_offset = offset;
+ undo->empty = TRUE;
+}
+
+/********************************************************************//**
+Frees an undo log memory copy. */
+UNIV_INTERN
+void
+trx_undo_mem_free(
+/*==============*/
+ trx_undo_t* undo) /*!< in: the undo object to be freed */
+{
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id);
+ ut_error;
+ }
+
+ mem_free(undo);
+}
+
+/**********************************************************************//**
+Creates a new undo log.
+@return DB_SUCCESS if successful in creating the new undo log object,
+possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS
+DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY */
+static
+ulint
+trx_undo_create(
+/*============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_rseg_t* rseg, /*!< in: rollback segment memory copy */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification*/
+ trx_undo_t** undo, /*!< out: the new undo log object, undefined
+ if it did not succeed */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_rsegf_t* rseg_header;
+ ulint page_no;
+ ulint offset;
+ ulint id;
+ page_t* undo_page;
+ ulint err;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (rseg->curr_size == rseg->max_size) {
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ rseg->curr_size++;
+
+ rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, rseg->page_no,
+ mtr);
+
+ err = trx_undo_seg_create(rseg, rseg_header, type, &id,
+ &undo_page, mtr);
+
+ if (err != DB_SUCCESS) {
+ /* Did not succeed */
+
+ rseg->curr_size--;
+
+ return(err);
+ }
+
+ page_no = page_get_page_no(undo_page);
+
+ offset = trx_undo_header_create(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(undo_page,
+ undo_page + offset, mtr);
+ }
+
+ *undo = trx_undo_mem_create(rseg, id, type, trx_id, xid,
+ page_no, offset);
+ if (*undo == NULL) {
+
+ err = DB_OUT_OF_MEMORY;
+ }
+
+ return(err);
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/********************************************************************//**
+Reuses a cached undo log.
+@return the undo log memory object, NULL if none cached */
+static
+trx_undo_t*
+trx_undo_reuse_cached(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is used */
+ const XID* xid, /*!< in: X/Open XA transaction identification */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_t* undo;
+ page_t* undo_page;
+ ulint offset;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (type == TRX_UNDO_INSERT) {
+
+ undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
+ } else {
+ ut_ad(type == TRX_UNDO_UPDATE);
+
+ undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
+ }
+
+ ut_ad(undo->size == 1);
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ if (type == TRX_UNDO_INSERT) {
+ offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(
+ undo_page, undo_page + offset, mtr);
+ }
+ } else {
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_UPDATE);
+
+ offset = trx_undo_header_create(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(
+ undo_page, undo_page + offset, mtr);
+ }
+ }
+
+ trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset);
+
+ return(undo);
+}
+
+/**********************************************************************//**
+Marks an undo log header as a header of a data dictionary operation
+transaction. */
+static
+void
+trx_undo_mark_as_dict_operation(
+/*============================*/
+ trx_t* trx, /*!< in: dict op transaction */
+ trx_undo_t* undo, /*!< in: assigned undo log */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* hdr_page;
+
+ hdr_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ ut_error;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ undo->table_id = 0;
+ break;
+ case TRX_DICT_OP_TABLE:
+ undo->table_id = trx->table_id;
+ break;
+ }
+
+ mlog_write_ulint(hdr_page + undo->hdr_offset
+ + TRX_UNDO_DICT_TRANS,
+ TRUE, MLOG_1BYTE, mtr);
+
+ mlog_write_ull(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID,
+ undo->table_id, mtr);
+
+ undo->dict_operation = TRUE;
+}
+
+/**********************************************************************//**
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused.
+@return DB_SUCCESS if the undo log assignment was successful; possible
+error codes are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+DB_OUT_OF_MEMORY */
+UNIV_INTERN
+ulint
+trx_undo_assign_undo(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+{
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+ mtr_t mtr;
+ ulint err = DB_SUCCESS;
+
+ ut_ad(trx);
+ ut_ad(trx->rseg);
+
+ rseg = trx->rseg;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+
+ mtr_start(&mtr);
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ mutex_enter(&(rseg->mutex));
+
+ undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid,
+ &mtr);
+ if (undo == NULL) {
+ err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid,
+ &undo, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+ }
+
+ if (type == TRX_UNDO_INSERT) {
+ UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list, undo);
+ ut_ad(trx->insert_undo == NULL);
+ trx->insert_undo = undo;
+ } else {
+ UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list, undo);
+ ut_ad(trx->update_undo == NULL);
+ trx->update_undo = undo;
+ }
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+ trx_undo_mark_as_dict_operation(trx, undo, &mtr);
+ }
+
+func_exit:
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ return(err);
+}
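+
+/* Hypothetical usage sketch, not in the original source (hence the
+#if 0 guard): a transaction doing its first update would assign an
+update undo log roughly like this. The helper name is an assumption,
+and trx->undo_mutex must be held as the asserts above require. */
+#if 0
+static ulint
+trx_assign_update_undo_sketch(
+/*==========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint err = DB_SUCCESS;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ if (trx->update_undo == NULL) {
+ err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+ }
+
+ mutex_exit(&(trx->undo_mutex));
+
+ return(err);
+}
+#endif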
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_upagef_t* page_hdr;
+ page_t* undo_page;
+ ulint state;
+
+ ut_ad(undo);
+ ut_ad(mtr);
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ if (undo->size == 1
+ && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE)
+ < TRX_UNDO_PAGE_REUSE_LIMIT) {
+
+ state = TRX_UNDO_CACHED;
+
+ } else if (undo->type == TRX_UNDO_INSERT) {
+
+ state = TRX_UNDO_TO_FREE;
+ } else {
+ state = TRX_UNDO_TO_PURGE;
+ }
+
+ undo->state = state;
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr);
+
+ return(undo_page);
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction prepare.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* undo_header;
+ page_t* undo_page;
+ ulint offset;
+
+ ut_ad(trx && undo && mtr);
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ /*------------------------------*/
+ undo->state = TRX_UNDO_PREPARED;
+ undo->xid = trx->xid;
+ /*------------------------------*/
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state,
+ MLOG_2BYTES, mtr);
+
+ offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+ undo_header = undo_page + offset;
+
+ mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+ TRUE, MLOG_1BYTE, mtr);
+
+ trx_undo_write_xid(undo_header, &undo->xid, mtr);
+
+ return(undo_page);
+}
+
+/**********************************************************************//**
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it on the list of cached update undo log
+segments. */
+UNIV_INTERN
+void
+trx_undo_update_cleanup(
+/*====================*/
+ trx_t* trx, /*!< in: trx owning the update undo log */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+
+ undo = trx->update_undo;
+ rseg = trx->rseg;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ trx_purge_add_update_undo_to_history(trx, undo_page, mtr);
+
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo);
+
+ trx->update_undo = NULL;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+
+ UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE
+ || undo->state == TRX_UNDO_TO_FREE);
+
+ trx_undo_mem_free(undo);
+ }
+}
+
+/******************************************************************//**
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+UNIV_INTERN
+void
+trx_undo_insert_cleanup(
+/*====================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ trx_undo_t* undo;
+ trx_rseg_t* rseg;
+
+ undo = trx->insert_undo;
+ ut_ad(undo);
+
+ rseg = trx->rseg;
+
+ mutex_enter(&(rseg->mutex));
+
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, undo);
+ trx->insert_undo = NULL;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+
+ UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_FREE);
+
+ /* Delete first the undo log segment in the file */
+
+ mutex_exit(&(rseg->mutex));
+
+ trx_undo_seg_free(undo);
+
+ mutex_enter(&(rseg->mutex));
+
+ ut_ad(rseg->curr_size > undo->size);
+
+ rseg->curr_size -= undo->size;
+
+ trx_undo_mem_free(undo);
+ }
+
+ mutex_exit(&(rseg->mutex));
+}
+
+/********************************************************************//**
+At shutdown, frees the undo logs of a PREPARED transaction. */
+UNIV_INTERN
+void
+trx_undo_free_prepared(
+/*===================*/
+ trx_t* trx) /*!< in/out: PREPARED transaction */
+{
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+
+ if (trx->update_undo) {
+ ut_a(trx->update_undo->state == TRX_UNDO_PREPARED);
+ UT_LIST_REMOVE(undo_list, trx->rseg->update_undo_list,
+ trx->update_undo);
+ trx_undo_mem_free(trx->update_undo);
+ }
+ if (trx->insert_undo) {
+ ut_a(trx->insert_undo->state == TRX_UNDO_PREPARED);
+ UT_LIST_REMOVE(undo_list, trx->rseg->insert_undo_list,
+ trx->insert_undo);
+ trx_undo_mem_free(trx->insert_undo);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */