diff options
author | Sergei Golubchik <serg@mariadb.org> | 2015-04-27 23:37:51 +0200 |
---|---|---|
committer | Sergei Golubchik <serg@mariadb.org> | 2015-04-27 23:37:51 +0200 |
commit | fd39c56effd5b56aae2ebe7709a1fbf73503edcd (patch) | |
tree | 17d03ddb4e6a2acbb6d90ba76753c7690653f7b4 /storage/xtradb/trx | |
parent | 13927f878e02c33d118cac43b14bd06d2382eb26 (diff) | |
download | mariadb-git-fd39c56effd5b56aae2ebe7709a1fbf73503edcd.tar.gz |
move to storage/xtradb/
Diffstat (limited to 'storage/xtradb/trx')
-rw-r--r-- | storage/xtradb/trx/trx0i_s.c | 1607 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0purge.c | 1254 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0rec.c | 1698 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0roll.c | 1357 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0rseg.c | 374 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0sys.c | 2049 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0trx.c | 2449 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0undo.c | 2000 |
8 files changed, 12788 insertions, 0 deletions
diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.c new file mode 100644 index 00000000000..8b3a83585cc --- /dev/null +++ b/storage/xtradb/trx/trx0i_s.c @@ -0,0 +1,1607 @@ +/***************************************************************************** + +Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0i_s.c +INFORMATION SCHEMA innodb_trx, innodb_locks and +innodb_lock_waits tables fetch code. + +The code below fetches information needed to fill those +3 dynamic tables and uploads it into a "transactions +table cache" for later retrieval. + +Created July 17, 2007 Vasil Dimov +*******************************************************/ + +/* Found during the build of 5.5.3 on Linux 2.4 and early 2.6 kernels: + The includes "univ.i" -> "my_global.h" cause a different path + to be taken further down with pthread functions and types, + so they must come first. + From the symptoms, this is related to bug#46587 in the MySQL bug DB. +*/ +#include "univ.i" + +#include <mysql/plugin.h> + +#include "buf0buf.h" +#include "dict0dict.h" +#include "ha0storage.h" +#include "ha_prototypes.h" +#include "hash0hash.h" +#include "lock0iter.h" +#include "lock0lock.h" +#include "mem0mem.h" +#include "page0page.h" +#include "rem0rec.h" +#include "row0row.h" +#include "srv0srv.h" +#include "sync0rw.h" +#include "sync0sync.h" +#include "sync0types.h" +#include "trx0i_s.h" +#include "trx0sys.h" +#include "trx0trx.h" +#include "ut0mem.h" +#include "ut0ut.h" + +/** Initial number of rows in the table cache */ +#define TABLE_CACHE_INITIAL_ROWSNUM 1024 + +/** @brief The maximum number of chunks to allocate for a table cache. + +The rows of a table cache are stored in a set of chunks. When a new +row is added a new chunk is allocated if necessary. Assuming that the +first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each +subsequent is N/2 where N is the number of rows we have allocated till +now, then 39th chunk would accommodate 1677416425 rows and all chunks +would accommodate 3354832851 rows. */ +#define MEM_CHUNKS_IN_TABLE_CACHE 39 + +/** The following are some testing auxiliary macros. Do not enable them +in a production environment. */ +/* @{ */ + +#if 0 +/** If this is enabled then lock folds will always be different +resulting in equal rows being put in a different cells of the hash +table. Checking for duplicates will be flawed because different +fold will be calculated when a row is searched in the hash table. */ +#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT +#endif + +#if 0 +/** This effectively kills the search-for-duplicate-before-adding-a-row +function, but searching in the hash is still performed. It will always +be assumed that lock is not present and insertion will be performed in +the hash table. */ +#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T +#endif + +#if 0 +/** This aggressively repeats adding each row many times. Depending on +the above settings this may be noop or may result in lots of rows being +added. */ +#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES +#endif + +#if 0 +/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash +table search is not performed at all. */ +#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS +#endif + +#if 0 +/** Do not insert each row into the hash table, duplicates may appear +if this is enabled, also if this is enabled searching into the hash is +noop because it will be empty. */ +#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE +#endif +/* @} */ + +/** Memory limit passed to ha_storage_put_memlim(). +@param cache hash storage +@return maximum allowed allocation size */ +#define MAX_ALLOWED_FOR_STORAGE(cache) \ + (TRX_I_S_MEM_LIMIT \ + - (cache)->mem_allocd) + +/** Memory limit in table_cache_create_empty_row(). +@param cache hash storage +@return maximum allowed allocation size */ +#define MAX_ALLOWED_FOR_ALLOC(cache) \ + (TRX_I_S_MEM_LIMIT \ + - (cache)->mem_allocd \ + - ha_storage_get_size((cache)->storage)) + +/** Memory for each table in the intermediate buffer is allocated in +separate chunks. These chunks are considered to be concatenated to +represent one flat array of rows. */ +typedef struct i_s_mem_chunk_struct { + ulint offset; /*!< offset, in number of rows */ + ulint rows_allocd; /*!< the size of this chunk, in number + of rows */ + void* base; /*!< start of the chunk */ +} i_s_mem_chunk_t; + +/** This represents one table's cache. */ +typedef struct i_s_table_cache_struct { + ulint rows_used; /*!< number of used rows */ + ulint rows_allocd; /*!< number of allocated rows */ + ulint row_size; /*!< size of a single row */ + i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of + memory chunks that stores the + rows */ +} i_s_table_cache_t; + +/** This structure describes the intermediate buffer */ +struct trx_i_s_cache_struct { + rw_lock_t rw_lock; /*!< read-write lock protecting + the rest of this structure */ + ullint last_read; /*!< last time the cache was read; + measured in microseconds since + epoch */ + mutex_t last_read_mutex;/*!< mutex protecting the + last_read member - it is updated + inside a shared lock of the + rw_lock member */ + i_s_table_cache_t innodb_trx; /*!< innodb_trx table */ + i_s_table_cache_t innodb_locks; /*!< innodb_locks table */ + i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */ +/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */ +#define LOCKS_HASH_CELLS_NUM 10000 + hash_table_t* locks_hash; /*!< hash table used to eliminate + duplicate entries in the + innodb_locks table */ +/** Initial size of the cache storage */ +#define CACHE_STORAGE_INITIAL_SIZE 1024 +/** Number of hash cells in the cache storage */ +#define CACHE_STORAGE_HASH_CELLS 2048 + ha_storage_t* storage; /*!< storage for external volatile + data that can possibly not be + available later, when we release + the kernel mutex */ + ulint mem_allocd; /*!< the amount of memory + allocated with mem_alloc*() */ + ibool is_truncated; /*!< this is TRUE if the memory + limit was hit and thus the data + in the cache is truncated */ +}; + +/** This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +static trx_i_s_cache_t trx_i_s_cache_static; +/** This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +UNIV_INTERN trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static; + +/* Key to register the lock/mutex with performance schema */ +#ifdef UNIV_PFS_RWLOCK +UNIV_INTERN mysql_pfs_key_t trx_i_s_cache_lock_key; +#endif /* UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t cache_last_read_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/*******************************************************************//** +For a record lock that is in waiting state retrieves the only bit that +is set, for a table lock returns ULINT_UNDEFINED. +@return record number within the heap */ +static +ulint +wait_lock_get_heap_no( +/*==================*/ + const lock_t* lock) /*!< in: lock */ +{ + ulint ret; + + switch (lock_get_type(lock)) { + case LOCK_REC: + ret = lock_rec_find_set_bit(lock); + ut_a(ret != ULINT_UNDEFINED); + break; + case LOCK_TABLE: + ret = ULINT_UNDEFINED; + break; + default: + ut_error; + } + + return(ret); +} + +/*******************************************************************//** +Initializes the members of a table cache. */ +static +void +table_cache_init( +/*=============*/ + i_s_table_cache_t* table_cache, /*!< out: table cache */ + size_t row_size) /*!< in: the size of a + row */ +{ + ulint i; + + table_cache->rows_used = 0; + table_cache->rows_allocd = 0; + table_cache->row_size = row_size; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + /* the memory is actually allocated in + table_cache_create_empty_row() */ + table_cache->chunks[i].base = NULL; + } +} + +/*******************************************************************//** +Frees a table cache. */ +static +void +table_cache_free( +/*=============*/ + i_s_table_cache_t* table_cache) /*!< in/out: table cache */ +{ + ulint i; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + /* the memory is actually allocated in + table_cache_create_empty_row() */ + if (table_cache->chunks[i].base) { + mem_free(table_cache->chunks[i].base); + table_cache->chunks[i].base = NULL; + } + } +} + +/*******************************************************************//** +Returns an empty row from a table cache. The row is allocated if no more +empty rows are available. The number of used rows is incremented. +If the memory limit is hit then NULL is returned and nothing is +allocated. +@return empty row, or NULL if out of memory */ +static +void* +table_cache_create_empty_row( +/*=========================*/ + i_s_table_cache_t* table_cache, /*!< in/out: table cache */ + trx_i_s_cache_t* cache) /*!< in/out: cache to record + how many bytes are + allocated */ +{ + ulint i; + void* row; + + ut_a(table_cache->rows_used <= table_cache->rows_allocd); + + if (table_cache->rows_used == table_cache->rows_allocd) { + + /* rows_used == rows_allocd means that new chunk needs + to be allocated: either no more empty rows in the + last allocated chunk or nothing has been allocated yet + (rows_num == rows_allocd == 0); */ + + i_s_mem_chunk_t* chunk; + ulint req_bytes; + ulint got_bytes; + ulint req_rows; + ulint got_rows; + + /* find the first not allocated chunk */ + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].base == NULL) { + + break; + } + } + + /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks + have been allocated :-X */ + ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE); + + /* allocate the chunk we just found */ + + if (i == 0) { + + /* first chunk, nothing is allocated yet */ + req_rows = TABLE_CACHE_INITIAL_ROWSNUM; + } else { + + /* Memory is increased by the formula + new = old + old / 2; We are trying not to be + aggressive here (= using the common new = old * 2) + because the allocated memory will not be freed + until InnoDB exit (it is reused). So it is better + to once allocate the memory in more steps, but + have less unused/wasted memory than to use less + steps in allocation (which is done once in a + lifetime) but end up with lots of unused/wasted + memory. */ + req_rows = table_cache->rows_allocd / 2; + } + req_bytes = req_rows * table_cache->row_size; + + if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) { + + return(NULL); + } + + chunk = &table_cache->chunks[i]; + + chunk->base = mem_alloc2(req_bytes, &got_bytes); + + got_rows = got_bytes / table_cache->row_size; + + cache->mem_allocd += got_bytes; + +#if 0 + printf("allocating chunk %d req bytes=%lu, got bytes=%lu, " + "row size=%lu, " + "req rows=%lu, got rows=%lu\n", + i, req_bytes, got_bytes, + table_cache->row_size, + req_rows, got_rows); +#endif + + chunk->rows_allocd = got_rows; + + table_cache->rows_allocd += got_rows; + + /* adjust the offset of the next chunk */ + if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) { + + table_cache->chunks[i + 1].offset + = chunk->offset + chunk->rows_allocd; + } + + /* return the first empty row in the newly allocated + chunk */ + row = chunk->base; + } else { + + char* chunk_start; + ulint offset; + + /* there is an empty row, no need to allocate new + chunks */ + + /* find the first chunk that contains allocated but + empty/unused rows */ + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].offset + + table_cache->chunks[i].rows_allocd + > table_cache->rows_used) { + + break; + } + } + + /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks + are full, but + table_cache->rows_used != table_cache->rows_allocd means + exactly the opposite - there are allocated but + empty/unused rows :-X */ + ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE); + + chunk_start = (char*) table_cache->chunks[i].base; + offset = table_cache->rows_used + - table_cache->chunks[i].offset; + + row = chunk_start + offset * table_cache->row_size; + } + + table_cache->rows_used++; + + return(row); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Validates a row in the locks cache. +@return TRUE if valid */ +static +ibool +i_s_locks_row_validate( +/*===================*/ + const i_s_locks_row_t* row) /*!< in: row to validate */ +{ + ut_ad(row->lock_trx_id != 0); + ut_ad(row->lock_mode != NULL); + ut_ad(row->lock_type != NULL); + ut_ad(row->lock_table != NULL); + ut_ad(row->lock_table_id != 0); + + if (row->lock_space == ULINT_UNDEFINED) { + /* table lock */ + ut_ad(!strcmp("TABLE", row->lock_type)); + ut_ad(row->lock_index == NULL); + ut_ad(row->lock_data == NULL); + ut_ad(row->lock_page == ULINT_UNDEFINED); + ut_ad(row->lock_rec == ULINT_UNDEFINED); + } else { + /* record lock */ + ut_ad(!strcmp("RECORD", row->lock_type)); + ut_ad(row->lock_index != NULL); + /* row->lock_data == NULL if buf_page_try_get() == NULL */ + ut_ad(row->lock_page != ULINT_UNDEFINED); + ut_ad(row->lock_rec != ULINT_UNDEFINED); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Fills i_s_trx_row_t object. +If memory can not be allocated then FALSE is returned. +@return FALSE if allocation fails */ +static +ibool +fill_trx_row( +/*=========*/ + i_s_trx_row_t* row, /*!< out: result object + that's filled */ + const trx_t* trx, /*!< in: transaction to + get data from */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + corresponding row in + innodb_locks if trx is + waiting or NULL if trx + is not waiting */ + trx_i_s_cache_t* cache) /*!< in/out: cache into + which to copy volatile + strings */ +{ + const char* stmt; + size_t stmt_len; + const char* s; + + ut_ad(mutex_own(&kernel_mutex)); + + row->trx_id = trx->id; + row->trx_started = (ib_time_t) trx->start_time; + row->trx_state = trx_get_que_state_str(trx); + row->requested_lock_row = requested_lock_row; + ut_ad(requested_lock_row == NULL + || i_s_locks_row_validate(requested_lock_row)); + + if (trx->wait_lock != NULL) { + ut_a(requested_lock_row != NULL); + row->trx_wait_started = (ib_time_t) trx->wait_started; + } else { + ut_a(requested_lock_row == NULL); + row->trx_wait_started = 0; + } + + row->trx_weight = (ullint) TRX_WEIGHT(trx); + + if (trx->mysql_thd == NULL) { + /* For internal transactions e.g., purge and transactions + being recovered at startup there is no associated MySQL + thread data structure. */ + row->trx_mysql_thread_id = 0; + row->trx_query = NULL; + goto thd_done; + } + + row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd); + stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len); + + if (stmt != NULL) { + char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1]; + + if (stmt_len > TRX_I_S_TRX_QUERY_MAX_LEN) { + stmt_len = TRX_I_S_TRX_QUERY_MAX_LEN; + } + + memcpy(query, stmt, stmt_len); + query[stmt_len] = '\0'; + + row->trx_query = ha_storage_put_memlim( + cache->storage, query, stmt_len + 1, + MAX_ALLOWED_FOR_STORAGE(cache)); + + row->trx_query_cs = innobase_get_charset(trx->mysql_thd); + + if (row->trx_query == NULL) { + + return(FALSE); + } + } else { + + row->trx_query = NULL; + } + +thd_done: + s = trx->op_info; + + if (s != NULL && s[0] != '\0') { + + TRX_I_S_STRING_COPY(s, row->trx_operation_state, + TRX_I_S_TRX_OP_STATE_MAX_LEN, cache); + + if (row->trx_operation_state == NULL) { + + return(FALSE); + } + } else { + + row->trx_operation_state = NULL; + } + + row->trx_tables_in_use = trx->n_mysql_tables_in_use; + + row->trx_tables_locked = trx->mysql_n_tables_locked; + + row->trx_lock_structs = UT_LIST_GET_LEN(trx->trx_locks); + + row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock_heap); + + row->trx_rows_locked = lock_number_of_rows_locked(trx); + + row->trx_rows_modified = trx->undo_no; + + row->trx_concurrency_tickets = trx->n_tickets_to_enter_innodb; + + switch (trx->isolation_level) { + case TRX_ISO_READ_UNCOMMITTED: + row->trx_isolation_level = "READ UNCOMMITTED"; + break; + case TRX_ISO_READ_COMMITTED: + row->trx_isolation_level = "READ COMMITTED"; + break; + case TRX_ISO_REPEATABLE_READ: + row->trx_isolation_level = "REPEATABLE READ"; + break; + case TRX_ISO_SERIALIZABLE: + row->trx_isolation_level = "SERIALIZABLE"; + break; + /* Should not happen as TRX_ISO_READ_COMMITTED is default */ + default: + row->trx_isolation_level = "UNKNOWN"; + } + + row->trx_unique_checks = (ibool) trx->check_unique_secondary; + + row->trx_foreign_key_checks = (ibool) trx->check_foreigns; + + s = trx->detailed_error; + + if (s != NULL && s[0] != '\0') { + + TRX_I_S_STRING_COPY(s, + row->trx_foreign_key_error, + TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache); + + if (row->trx_foreign_key_error == NULL) { + + return(FALSE); + } + } else { + row->trx_foreign_key_error = NULL; + } + + row->trx_has_search_latch = (ibool) trx->has_search_latch; + + row->trx_search_latch_timeout = trx->search_latch_timeout; + + return(TRUE); +} + +/*******************************************************************//** +Format the nth field of "rec" and put it in "buf". The result is always +NUL-terminated. Returns the number of bytes that were written to "buf" +(including the terminating NUL). +@return end of the result */ +static +ulint +put_nth_field( +/*==========*/ + char* buf, /*!< out: buffer */ + ulint buf_size,/*!< in: buffer size in bytes */ + ulint n, /*!< in: number of field */ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record */ + const ulint* offsets)/*!< in: record offsets, returned + by rec_get_offsets() */ +{ + const byte* data; + ulint data_len; + dict_field_t* dict_field; + ulint ret; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (buf_size == 0) { + + return(0); + } + + ret = 0; + + if (n > 0) { + /* we must append ", " before the actual data */ + + if (buf_size < 3) { + + buf[0] = '\0'; + return(1); + } + + memcpy(buf, ", ", 3); + + buf += 2; + buf_size -= 2; + ret += 2; + } + + /* now buf_size >= 1 */ + + data = rec_get_nth_field(rec, offsets, n, &data_len); + + dict_field = dict_index_get_nth_field(index, n); + + ret += row_raw_format((const char*) data, data_len, + dict_field, buf, buf_size); + + return(ret); +} + +/*******************************************************************//** +Fills the "lock_data" member of i_s_locks_row_t object. +If memory can not be allocated then FALSE is returned. +@return FALSE if allocation fails */ +static +ibool +fill_lock_data( +/*===========*/ + const char** lock_data,/*!< out: "lock_data" to fill */ + const lock_t* lock, /*!< in: lock used to find the data */ + ulint heap_no,/*!< in: rec num used to find the data */ + trx_i_s_cache_t* cache) /*!< in/out: cache where to store + volatile data */ +{ + mtr_t mtr; + + const buf_block_t* block; + const page_t* page; + const rec_t* rec; + + ut_a(lock_get_type(lock) == LOCK_REC); + + mtr_start(&mtr); + + block = buf_page_try_get(lock_rec_get_space_id(lock), + lock_rec_get_page_no(lock), + &mtr); + + if (block == NULL) { + + *lock_data = NULL; + + mtr_commit(&mtr); + + return(TRUE); + } + + page = (const page_t*) buf_block_get_frame(block); + + rec = page_find_rec_with_heap_no(page, heap_no); + + if (page_rec_is_infimum(rec)) { + + *lock_data = ha_storage_put_str_memlim( + cache->storage, "infimum pseudo-record", + MAX_ALLOWED_FOR_STORAGE(cache)); + } else if (page_rec_is_supremum(rec)) { + + *lock_data = ha_storage_put_str_memlim( + cache->storage, "supremum pseudo-record", + MAX_ALLOWED_FOR_STORAGE(cache)); + } else { + + const dict_index_t* index; + ulint n_fields; + mem_heap_t* heap; + ulint offsets_onstack[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + char buf[TRX_I_S_LOCK_DATA_MAX_LEN]; + ulint buf_used; + ulint i; + + rec_offs_init(offsets_onstack); + offsets = offsets_onstack; + + index = lock_rec_get_index(lock); + + n_fields = dict_index_get_n_unique(index); + + ut_a(n_fields > 0); + + heap = NULL; + offsets = rec_get_offsets(rec, index, offsets, n_fields, + &heap); + + /* format and store the data */ + + buf_used = 0; + for (i = 0; i < n_fields; i++) { + + buf_used += put_nth_field( + buf + buf_used, sizeof(buf) - buf_used, + i, index, rec, offsets) - 1; + } + + *lock_data = (const char*) ha_storage_put_memlim( + cache->storage, buf, buf_used + 1, + MAX_ALLOWED_FOR_STORAGE(cache)); + + if (UNIV_UNLIKELY(heap != NULL)) { + + /* this means that rec_get_offsets() has created a new + heap and has stored offsets in it; check that this is + really the case and free the heap */ + ut_a(offsets != offsets_onstack); + mem_heap_free(heap); + } + } + + mtr_commit(&mtr); + + if (*lock_data == NULL) { + + return(FALSE); + } + + return(TRUE); +} + +/*******************************************************************//** +Fills i_s_locks_row_t object. Returns its first argument. +If memory can not be allocated then FALSE is returned. +@return FALSE if allocation fails */ +static +ibool +fill_locks_row( +/*===========*/ + i_s_locks_row_t* row, /*!< out: result object that's filled */ + const lock_t* lock, /*!< in: lock to get data from */ + ulint heap_no,/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ + trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy + volatile strings */ +{ + row->lock_trx_id = lock_get_trx_id(lock); + row->lock_mode = lock_get_mode_str(lock); + row->lock_type = lock_get_type_str(lock); + + row->lock_table = ha_storage_put_str_memlim( + cache->storage, lock_get_table_name(lock), + MAX_ALLOWED_FOR_STORAGE(cache)); + + /* memory could not be allocated */ + if (row->lock_table == NULL) { + + return(FALSE); + } + + switch (lock_get_type(lock)) { + case LOCK_REC: + row->lock_index = ha_storage_put_str_memlim( + cache->storage, lock_rec_get_index_name(lock), + MAX_ALLOWED_FOR_STORAGE(cache)); + + /* memory could not be allocated */ + if (row->lock_index == NULL) { + + return(FALSE); + } + + row->lock_space = lock_rec_get_space_id(lock); + row->lock_page = lock_rec_get_page_no(lock); + row->lock_rec = heap_no; + + if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) { + + /* memory could not be allocated */ + return(FALSE); + } + + break; + case LOCK_TABLE: + row->lock_index = NULL; + + row->lock_space = ULINT_UNDEFINED; + row->lock_page = ULINT_UNDEFINED; + row->lock_rec = ULINT_UNDEFINED; + + row->lock_data = NULL; + + break; + default: + ut_error; + } + + row->lock_table_id = lock_get_table_id(lock); + + row->hash_chain.value = row; + ut_ad(i_s_locks_row_validate(row)); + + return(TRUE); +} + +/*******************************************************************//** +Fills i_s_lock_waits_row_t object. Returns its first argument. +@return result object that's filled */ +static +i_s_lock_waits_row_t* +fill_lock_waits_row( +/*================*/ + i_s_lock_waits_row_t* row, /*!< out: result object + that's filled */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + relevant requested lock + row in innodb_locks */ + const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the + relevant blocking lock + row in innodb_locks */ +{ + ut_ad(i_s_locks_row_validate(requested_lock_row)); + ut_ad(i_s_locks_row_validate(blocking_lock_row)); + + row->requested_lock_row = requested_lock_row; + row->blocking_lock_row = blocking_lock_row; + + return(row); +} + +/*******************************************************************//** +Calculates a hash fold for a lock. For a record lock the fold is +calculated from 4 elements, which uniquely identify a lock at a given +point in time: transaction id, space id, page number, record number. +For a table lock the fold is table's id. +@return fold */ +static +ulint +fold_lock( +/*======*/ + const lock_t* lock, /*!< in: lock object to fold */ + ulint heap_no)/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ +#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT + static ulint fold = 0; + + return(fold++); +#else + ulint ret; + + switch (lock_get_type(lock)) { + case LOCK_REC: + ut_a(heap_no != ULINT_UNDEFINED); + + ret = ut_fold_ulint_pair((ulint) lock_get_trx_id(lock), + lock_rec_get_space_id(lock)); + + ret = ut_fold_ulint_pair(ret, + lock_rec_get_page_no(lock)); + + ret = ut_fold_ulint_pair(ret, heap_no); + + break; + case LOCK_TABLE: + /* this check is actually not necessary for continuing + correct operation, but something must have gone wrong if + it fails. */ + ut_a(heap_no == ULINT_UNDEFINED); + + ret = (ulint) lock_get_table_id(lock); + + break; + default: + ut_error; + } + + return(ret); +#endif +} + +/*******************************************************************//** +Checks whether i_s_locks_row_t object represents a lock_t object. +@return TRUE if they match */ +static +ibool +locks_row_eq_lock( +/*==============*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + const lock_t* lock, /*!< in: lock object */ + ulint heap_no)/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ + ut_ad(i_s_locks_row_validate(row)); +#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T + return(0); +#else + switch (lock_get_type(lock)) { + case LOCK_REC: + ut_a(heap_no != ULINT_UNDEFINED); + + return(row->lock_trx_id == lock_get_trx_id(lock) + && row->lock_space == lock_rec_get_space_id(lock) + && row->lock_page == lock_rec_get_page_no(lock) + && row->lock_rec == heap_no); + + case LOCK_TABLE: + /* this check is actually not necessary for continuing + correct operation, but something must have gone wrong if + it fails. */ + ut_a(heap_no == ULINT_UNDEFINED); + + return(row->lock_trx_id == lock_get_trx_id(lock) + && row->lock_table_id == lock_get_table_id(lock)); + + default: + ut_error; + return(FALSE); + } +#endif +} + +/*******************************************************************//** +Searches for a row in the innodb_locks cache that has a specified id. +This happens in O(1) time since a hash table is used. Returns pointer to +the row or NULL if none is found. +@return row or NULL */ +static +i_s_locks_row_t* +search_innodb_locks( +/*================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + const lock_t* lock, /*!< in: lock to search for */ + ulint heap_no)/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ + i_s_hash_chain_t* hash_chain; + + HASH_SEARCH( + /* hash_chain->"next" */ + next, + /* the hash table */ + cache->locks_hash, + /* fold */ + fold_lock(lock, heap_no), + /* the type of the next variable */ + i_s_hash_chain_t*, + /* auxiliary variable */ + hash_chain, + /* assertion on every traversed item */ + ut_ad(i_s_locks_row_validate(hash_chain->value)), + /* this determines if we have found the lock */ + locks_row_eq_lock(hash_chain->value, lock, heap_no)); + + if (hash_chain == NULL) { + + return(NULL); + } + /* else */ + + return(hash_chain->value); +} + +/*******************************************************************//** +Adds new element to the locks cache, enlarging it if necessary. +Returns a pointer to the added row. If the row is already present then +no row is added and a pointer to the existing row is returned. +If row can not be allocated then NULL is returned. +@return row */ +static +i_s_locks_row_t* +add_lock_to_cache( +/*==============*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const lock_t* lock, /*!< in: the element to add */ + ulint heap_no)/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ + i_s_locks_row_t* dst_row; + +#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES + ulint i; + for (i = 0; i < 10000; i++) { +#endif +#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS + /* quit if this lock is already present */ + dst_row = search_innodb_locks(cache, lock, heap_no); + if (dst_row != NULL) { + + ut_ad(i_s_locks_row_validate(dst_row)); + return(dst_row); + } +#endif + + dst_row = (i_s_locks_row_t*) + table_cache_create_empty_row(&cache->innodb_locks, cache); + + /* memory could not be allocated */ + if (dst_row == NULL) { + + return(NULL); + } + + if (!fill_locks_row(dst_row, lock, heap_no, cache)) { + + /* memory could not be allocated */ + cache->innodb_locks.rows_used--; + return(NULL); + } + +#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE + HASH_INSERT( + /* the type used in the hash chain */ + i_s_hash_chain_t, + /* hash_chain->"next" */ + next, + /* the hash table */ + cache->locks_hash, + /* fold */ + fold_lock(lock, heap_no), + /* add this data to the hash */ + &dst_row->hash_chain); +#endif +#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES + } /* for()-loop */ +#endif + + ut_ad(i_s_locks_row_validate(dst_row)); + return(dst_row); +} + +/*******************************************************************//** +Adds new pair of locks to the lock waits cache. +If memory can not be allocated then FALSE is returned. +@return FALSE if allocation fails */ +static +ibool +add_lock_wait_to_cache( +/*===================*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + relevant requested lock + row in innodb_locks */ + const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the + relevant blocking lock + row in innodb_locks */ +{ + i_s_lock_waits_row_t* dst_row; + + dst_row = (i_s_lock_waits_row_t*) + table_cache_create_empty_row(&cache->innodb_lock_waits, + cache); + + /* memory could not be allocated */ + if (dst_row == NULL) { + + return(FALSE); + } + + fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row); + + return(TRUE); +} + +/*******************************************************************//** +Adds transaction's relevant (important) locks to cache. +If the transaction is waiting, then the wait lock is added to +innodb_locks and a pointer to the added row is returned in +requested_lock_row, otherwise requested_lock_row is set to NULL. +If rows can not be allocated then FALSE is returned and the value of +requested_lock_row is undefined. +@return FALSE if allocation fails */ +static +ibool +add_trx_relevant_locks_to_cache( +/*============================*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const trx_t* trx, /*!< in: transaction */ + i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the + requested lock row, or NULL or + undefined */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + /* If transaction is waiting we add the wait lock and all locks + from another transactions that are blocking the wait lock. */ + if (trx->que_state == TRX_QUE_LOCK_WAIT) { + + const lock_t* curr_lock; + ulint wait_lock_heap_no; + i_s_locks_row_t* blocking_lock_row; + lock_queue_iterator_t iter; + + ut_a(trx->wait_lock != NULL); + + wait_lock_heap_no + = wait_lock_get_heap_no(trx->wait_lock); + + /* add the requested lock */ + *requested_lock_row + = add_lock_to_cache(cache, trx->wait_lock, + wait_lock_heap_no); + + /* memory could not be allocated */ + if (*requested_lock_row == NULL) { + + return(FALSE); + } + + /* then iterate over the locks before the wait lock and + add the ones that are blocking it */ + + lock_queue_iterator_reset(&iter, trx->wait_lock, + ULINT_UNDEFINED); + + curr_lock = lock_queue_iterator_get_prev(&iter); + while (curr_lock != NULL) { + + if (lock_has_to_wait(trx->wait_lock, + curr_lock)) { + + /* add the lock that is + blocking trx->wait_lock */ + blocking_lock_row + = add_lock_to_cache( + cache, curr_lock, + /* heap_no is the same + for the wait and waited + locks */ + wait_lock_heap_no); + + /* memory could not be allocated */ + if (blocking_lock_row == NULL) { + + return(FALSE); + } + + /* add the relation between both locks + to innodb_lock_waits */ + if (!add_lock_wait_to_cache( + cache, *requested_lock_row, + blocking_lock_row)) { + + /* memory could not be allocated */ + return(FALSE); + } + } + + curr_lock = lock_queue_iterator_get_prev(&iter); + } + } else { + + *requested_lock_row = NULL; + } + + return(TRUE); +} + +/** The minimum time that a cache must not be updated after it has been +read for the last time; measured in microseconds. We use this technique +to ensure that SELECTs which join several INFORMATION SCHEMA tables read +the same version of the cache. */ +#define CACHE_MIN_IDLE_TIME_US 100000 /* 0.1 sec */ + +/*******************************************************************//** +Checks if the cache can safely be updated. +@return TRUE if can be updated */ +static +ibool +can_cache_be_updated( +/*=================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + ullint now; + + /* Here we read cache->last_read without acquiring its mutex + because last_read is only updated when a shared rw lock on the + whole cache is being held (see trx_i_s_cache_end_read()) and + we are currently holding an exclusive rw lock on the cache. + So it is not possible for last_read to be updated while we are + reading it. */ + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + now = ut_time_us(NULL); + if (now - cache->last_read > CACHE_MIN_IDLE_TIME_US) { + + return(TRUE); + } + + return(FALSE); +} + +/*******************************************************************//** +Declare a cache empty, preparing it to be filled up. Not all resources +are freed because they can be reused. */ +static +void +trx_i_s_cache_clear( +/*================*/ + trx_i_s_cache_t* cache) /*!< out: cache to clear */ +{ + cache->innodb_trx.rows_used = 0; + cache->innodb_locks.rows_used = 0; + cache->innodb_lock_waits.rows_used = 0; + + hash_table_clear(cache->locks_hash); + + ha_storage_empty(&cache->storage); +} + +/*******************************************************************//** +Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the +table cache buffer. Cache must be locked for write. */ +static +void +fetch_data_into_cache( +/*==================*/ + trx_i_s_cache_t* cache) /*!< in/out: cache */ +{ + trx_t* trx; + i_s_trx_row_t* trx_row; + i_s_locks_row_t* requested_lock_row; + + ut_ad(mutex_own(&kernel_mutex)); + + trx_i_s_cache_clear(cache); + + /* We iterate over the list of all transactions and add each one + to innodb_trx's cache. We also add all locks that are relevant + to each transaction into innodb_locks' and innodb_lock_waits' + caches. */ + + for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + if (!add_trx_relevant_locks_to_cache(cache, trx, + &requested_lock_row)) { + + cache->is_truncated = TRUE; + return; + } + + trx_row = (i_s_trx_row_t*) + table_cache_create_empty_row(&cache->innodb_trx, + cache); + + /* memory could not be allocated */ + if (trx_row == NULL) { + + cache->is_truncated = TRUE; + return; + } + + if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) { + + /* memory could not be allocated */ + cache->innodb_trx.rows_used--; + cache->is_truncated = TRUE; + return; + } + } + + cache->is_truncated = FALSE; +} + +/*******************************************************************//** +Update the transactions cache if it has not been read for some time. +Called from handler/i_s.cc. +@return 0 - fetched, 1 - not */ +UNIV_INTERN +int +trx_i_s_possibly_fetch_data_into_cache( +/*===================================*/ + trx_i_s_cache_t* cache) /*!< in/out: cache */ +{ + if (!can_cache_be_updated(cache)) { + + return(1); + } + + /* We need to read trx_sys and record/table lock queues */ + mutex_enter(&kernel_mutex); + + fetch_data_into_cache(cache); + + mutex_exit(&kernel_mutex); + + return(0); +} + +/*******************************************************************//** +Returns TRUE if the data in the cache is truncated due to the memory +limit posed by TRX_I_S_MEM_LIMIT. +@return TRUE if truncated */ +UNIV_INTERN +ibool +trx_i_s_cache_is_truncated( +/*=======================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + return(cache->is_truncated); +} + +/*******************************************************************//** +Initialize INFORMATION SCHEMA trx related cache. */ +UNIV_INTERN +void +trx_i_s_cache_init( +/*===============*/ + trx_i_s_cache_t* cache) /*!< out: cache to init */ +{ + /* The latching is done in the following order: + acquire trx_i_s_cache_t::rw_lock, X + acquire kernel_mutex + release kernel_mutex + release trx_i_s_cache_t::rw_lock + acquire trx_i_s_cache_t::rw_lock, S + acquire trx_i_s_cache_t::last_read_mutex + release trx_i_s_cache_t::last_read_mutex + release trx_i_s_cache_t::rw_lock */ + + rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock, + SYNC_TRX_I_S_RWLOCK); + + cache->last_read = 0; + + mutex_create(cache_last_read_mutex_key, + &cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ); + + table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t)); + table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t)); + table_cache_init(&cache->innodb_lock_waits, + sizeof(i_s_lock_waits_row_t)); + + cache->locks_hash = hash_create(LOCKS_HASH_CELLS_NUM); + + cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE, + CACHE_STORAGE_HASH_CELLS); + + cache->mem_allocd = 0; + + cache->is_truncated = FALSE; +} + +/*******************************************************************//** +Free the INFORMATION SCHEMA trx related cache. */ +UNIV_INTERN +void +trx_i_s_cache_free( +/*===============*/ + trx_i_s_cache_t* cache) /*!< in, own: cache to free */ +{ + hash_table_free(cache->locks_hash); + ha_storage_free(cache->storage); + table_cache_free(&cache->innodb_trx); + table_cache_free(&cache->innodb_locks); + table_cache_free(&cache->innodb_lock_waits); + memset(cache, 0, sizeof *cache); +} + +/*******************************************************************//** +Issue a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_read( +/*=====================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + rw_lock_s_lock(&cache->rw_lock); +} + +/*******************************************************************//** +Release a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_read( +/*===================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + ullint now; + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED)); +#endif + + /* update cache last read time */ + now = ut_time_us(NULL); + mutex_enter(&cache->last_read_mutex); + cache->last_read = now; + mutex_exit(&cache->last_read_mutex); + + rw_lock_s_unlock(&cache->rw_lock); +} + +/*******************************************************************//** +Issue an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_write( +/*======================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + rw_lock_x_lock(&cache->rw_lock); +} + +/*******************************************************************//** +Release an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_write( +/*====================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + rw_lock_x_unlock(&cache->rw_lock); +} + +/*******************************************************************//** +Selects a INFORMATION SCHEMA table cache from the whole cache. +@return table cache */ +static +i_s_table_cache_t* +cache_select_table( +/*===============*/ + trx_i_s_cache_t* cache, /*!< in: whole cache */ + enum i_s_table table) /*!< in: which table */ +{ + i_s_table_cache_t* table_cache; + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED) + || rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + switch (table) { + case I_S_INNODB_TRX: + table_cache = &cache->innodb_trx; + break; + case I_S_INNODB_LOCKS: + table_cache = &cache->innodb_locks; + break; + case I_S_INNODB_LOCK_WAITS: + table_cache = &cache->innodb_lock_waits; + break; + default: + ut_error; + } + + return(table_cache); +} + +/*******************************************************************//** +Retrieves the number of used rows in the cache for a given +INFORMATION SCHEMA table. +@return number of rows */ +UNIV_INTERN +ulint +trx_i_s_cache_get_rows_used( +/*========================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table) /*!< in: which table */ +{ + i_s_table_cache_t* table_cache; + + table_cache = cache_select_table(cache, table); + + return(table_cache->rows_used); +} + +/*******************************************************************//** +Retrieves the nth row (zero-based) in the cache for a given +INFORMATION SCHEMA table. +@return row */ +UNIV_INTERN +void* +trx_i_s_cache_get_nth_row( +/*======================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table, /*!< in: which table */ + ulint n) /*!< in: row number */ +{ + i_s_table_cache_t* table_cache; + ulint i; + void* row; + + table_cache = cache_select_table(cache, table); + + ut_a(n < table_cache->rows_used); + + row = NULL; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].offset + + table_cache->chunks[i].rows_allocd > n) { + + row = (char*) table_cache->chunks[i].base + + (n - table_cache->chunks[i].offset) + * table_cache->row_size; + break; + } + } + + ut_a(row != NULL); + + return(row); +} + +/*******************************************************************//** +Crafts a lock id string from a i_s_locks_row_t object. Returns its +second argument. This function aborts if there is not enough space in +lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you +want to be 100% sure that it will not abort. +@return resulting lock id */ +UNIV_INTERN +char* +trx_i_s_create_lock_id( +/*===================*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + char* lock_id,/*!< out: resulting lock_id */ + ulint lock_id_size)/*!< in: size of the lock id + buffer */ +{ + int res_len; + + /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */ + + if (row->lock_space != ULINT_UNDEFINED) { + /* record lock */ + res_len = ut_snprintf(lock_id, lock_id_size, + TRX_ID_FMT ":%lu:%lu:%lu", + row->lock_trx_id, row->lock_space, + row->lock_page, row->lock_rec); + } else { + /* table lock */ + res_len = ut_snprintf(lock_id, lock_id_size, + TRX_ID_FMT ":%llu", + row->lock_trx_id, + row->lock_table_id); + } + + /* the typecast is safe because snprintf(3) never returns + negative result */ + ut_a(res_len >= 0); + ut_a((ulint) res_len < lock_id_size); + + return(lock_id); +} diff --git a/storage/xtradb/trx/trx0purge.c b/storage/xtradb/trx/trx0purge.c new file mode 100644 index 00000000000..d343a73c9d8 --- /dev/null +++ b/storage/xtradb/trx/trx0purge.c @@ -0,0 +1,1254 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0purge.c +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0purge.h" + +#ifdef UNIV_NONINL +#include "trx0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "mtr0log.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "read0read.h" +#include "fut0fut.h" +#include "que0que.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "srv0srv.h" +#include "os0thread.h" + +/** The global data structure coordinating a purge */ +UNIV_INTERN trx_purge_t* purge_sys = NULL; + +/** A dummy undo record used as a return value when we have a whole undo log +which needs no purge */ +UNIV_INTERN trx_undo_rec_t trx_purge_dummy_rec; + +#ifdef UNIV_PFS_RWLOCK +/* Key to register trx_purge_latch with performance schema */ +UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +/* Key to register purge_sys_bh_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_DEBUG +UNIV_INTERN my_bool srv_purge_view_update_only_debug; +#endif /* UNIV_DEBUG */ + +/*****************************************************************//** +Checks if trx_id is >= purge_view: then it is guaranteed that its update +undo log still exists in the system. +@return TRUE if is sure that it is preserved, also if the function +returns FALSE, it is possible that the undo log still exists in the +system */ +UNIV_INTERN +ibool +trx_purge_update_undo_must_exist( +/*=============================*/ + trx_id_t trx_id) /*!< in: transaction id */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!read_view_sees_trx_id(purge_sys->view, trx_id)) { + + return(TRUE); + } + + return(FALSE); +} + +/*=================== PURGE RECORD ARRAY =============================*/ + +/*******************************************************************//** +Stores info of an undo log record during a purge. +@return pointer to the storage cell */ +static +trx_undo_inf_t* +trx_purge_arr_store_info( +/*=====================*/ + trx_id_t trx_no, /*!< in: transaction number */ + undo_no_t undo_no)/*!< in: undo number */ +{ + trx_undo_inf_t* cell; + trx_undo_arr_t* arr; + ulint i; + + arr = purge_sys->arr; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (!(cell->in_use)) { + /* Not in use, we may store here */ + cell->undo_no = undo_no; + cell->trx_no = trx_no; + cell->in_use = TRUE; + + arr->n_used++; + + return(cell); + } + } +} + +/*******************************************************************//** +Removes info of an undo log record during a purge. */ +UNIV_INLINE +void +trx_purge_arr_remove_info( +/*======================*/ + trx_undo_inf_t* cell) /*!< in: pointer to the storage cell */ +{ + trx_undo_arr_t* arr; + + arr = purge_sys->arr; + + cell->in_use = FALSE; + + ut_ad(arr->n_used > 0); + + arr->n_used--; +} + +/*******************************************************************//** +Gets the biggest pair of a trx number and an undo number in a purge array. */ +static +void +trx_purge_arr_get_biggest( +/*======================*/ + trx_undo_arr_t* arr, /*!< in: purge array */ + trx_id_t* trx_no, /*!< out: transaction number: 0 + if array is empty */ + undo_no_t* undo_no)/*!< out: undo number */ +{ + trx_undo_inf_t* cell; + trx_id_t pair_trx_no; + undo_no_t pair_undo_no; + ulint i; + ulint n; + + n = arr->n_used; + pair_trx_no = 0; + pair_undo_no = 0; + + if (n) { + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (!cell->in_use) { + continue; + } + + if ((cell->trx_no > pair_trx_no) + || ((cell->trx_no == pair_trx_no) + && cell->undo_no >= pair_undo_no)) { + + pair_trx_no = cell->trx_no; + pair_undo_no = cell->undo_no; + } + + if (!--n) { + break; + } + } + } + + *trx_no = pair_trx_no; + *undo_no = pair_undo_no; +} + +/****************************************************************//** +Builds a purge 'query' graph. The actual purge is performed by executing +this query graph. +@return own: the query graph */ +static +que_t* +trx_purge_graph_build(void) +/*=======================*/ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + /* que_thr_t* thr2; */ + + heap = mem_heap_create(512); + fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap); + fork->trx = purge_sys->trx; + + thr = que_thr_create(fork, heap); + + thr->child = row_purge_node_create(thr, heap); + + /* thr2 = que_thr_create(fork, fork, heap); + + thr2->child = row_purge_node_create(fork, thr2, heap); */ + + return(fork); +} + +/********************************************************************//** +Creates the global purge system control structure and inits the history +mutex. */ +UNIV_INTERN +void +trx_purge_sys_create( +/*=================*/ + ib_bh_t* ib_bh) /*!< in, own: UNDO log min binary heap */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + purge_sys = mem_zalloc(sizeof(trx_purge_t)); + + /* Take ownership of ib_bh, we are responsible for freeing it. */ + purge_sys->ib_bh = ib_bh; + purge_sys->state = TRX_STOP_PURGE; + + purge_sys->n_pages_handled = 0; + + purge_sys->purge_trx_no = 0; + purge_sys->purge_undo_no = 0; + purge_sys->next_stored = FALSE; + ut_d(purge_sys->done_trx_no = 0); + + rw_lock_create(trx_purge_latch_key, + &purge_sys->latch, SYNC_PURGE_LATCH); + + mutex_create( + purge_sys_bh_mutex_key, &purge_sys->bh_mutex, + SYNC_PURGE_QUEUE); + + purge_sys->heap = mem_heap_create(256); + + purge_sys->arr = trx_undo_arr_create(); + + purge_sys->sess = sess_open(); + + purge_sys->trx = purge_sys->sess->trx; + + purge_sys->trx->is_purge = 1; + + ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED)); + + purge_sys->query = trx_purge_graph_build(); + + purge_sys->prebuilt_view = + read_view_oldest_copy_or_open_new(0, NULL); + purge_sys->view = purge_sys->prebuilt_view; +} + +/************************************************************************ +Frees the global purge system control structure. */ +UNIV_INTERN +void +trx_purge_sys_close(void) +/*======================*/ +{ + ut_ad(!mutex_own(&kernel_mutex)); + + que_graph_free(purge_sys->query); + + ut_a(purge_sys->sess->trx->is_purge); + purge_sys->sess->trx->state = TRX_NOT_STARTED; + + mutex_enter(&kernel_mutex); + trx_release_descriptor(purge_sys->sess->trx); + mutex_exit(&kernel_mutex); + + sess_close(purge_sys->sess); + purge_sys->sess = NULL; + + if (purge_sys->view != NULL) { + /* Because acquiring the kernel mutex is a pre-condition + of read_view_close(). We don't really need it here. */ + mutex_enter(&kernel_mutex); + + read_view_close(purge_sys->view); + read_view_free(purge_sys->prebuilt_view); + purge_sys->prebuilt_view = NULL; + purge_sys->view = NULL; + + mutex_exit(&kernel_mutex); + } + + trx_undo_arr_free(purge_sys->arr); + + rw_lock_free(&purge_sys->latch); + mutex_free(&purge_sys->bh_mutex); + + mem_heap_free(purge_sys->heap); + + ib_bh_free(purge_sys->ib_bh); + + mem_free(purge_sys); + + purge_sys = NULL; +} + +/*================ UNDO LOG HISTORY LIST =============================*/ + +/********************************************************************//** +Adds the update undo log as the first log in the history list. Removes the +update undo log segment from the rseg slot if it is too big for reuse. */ +UNIV_INTERN +void +trx_purge_add_update_undo_to_history( +/*=================================*/ + trx_t* trx, /*!< in: transaction */ + page_t* undo_page, /*!< in: update undo log header page, + x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_undo_t* undo; + trx_rsegf_t* rseg_header; + trx_ulogf_t* undo_header; + + undo = trx->update_undo; + + ut_ad(undo); + + ut_ad(mutex_own(&undo->rseg->mutex)); + + rseg_header = trx_rsegf_get( + undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no, + mtr); + + undo_header = undo_page + undo->hdr_offset; + /* Add the log as the first in the history list */ + + if (undo->state != TRX_UNDO_CACHED) { + ulint hist_size; +#ifdef UNIV_DEBUG + trx_usegf_t* seg_header = undo_page + TRX_UNDO_SEG_HDR; +#endif /* UNIV_DEBUG */ + + /* The undo log segment will not be reused */ + + if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + ut_error; + } + + trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr); + + hist_size = mtr_read_ulint( + rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr); + + ut_ad(undo->size == flst_get_len( + seg_header + TRX_UNDO_PAGE_LIST, mtr)); + + mlog_write_ulint( + rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size + undo->size, MLOG_4BYTES, mtr); + } + + flst_add_first( + rseg_header + TRX_RSEG_HISTORY, + undo_header + TRX_UNDO_HISTORY_NODE, mtr); + + /* Write the trx number to the undo log header */ + + mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr); + + /* Write information about delete markings to the undo log header */ + + if (!undo->del_marks) { + mlog_write_ulint( + undo_header + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, mtr); + } + + if (undo->rseg->last_page_no == FIL_NULL) { + undo->rseg->last_trx_no = trx->no; + undo->rseg->last_offset = undo->hdr_offset; + undo->rseg->last_page_no = undo->hdr_page_no; + undo->rseg->last_del_marks = undo->del_marks; + + /* FIXME: Add a bin heap validate function to check that + the rseg exists. */ + } + + mutex_enter(&kernel_mutex); + trx_sys->rseg_history_len++; + mutex_exit(&kernel_mutex); + +// if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/ + /* Inform the purge thread that there is work to do. */ + srv_wake_purge_thread_if_not_active(); +// } +} + +/**********************************************************************//** +Frees an undo log segment which is in the history list. Cuts the end of the +history list at the youngest undo log in this segment. */ +static +void +trx_purge_free_segment( +/*===================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + fil_addr_t hdr_addr, /*!< in: the file address of log_hdr */ + ulint n_removed_logs) /*!< in: count of how many undo logs we + will cut off from the end of the + history list */ +{ + page_t* undo_page; + trx_rsegf_t* rseg_hdr; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + ibool freed; + ulint seg_size; + ulint hist_size; + ibool marked = FALSE; + mtr_t mtr; + + /* fputs("Freeing an update undo log segment\n", stderr); */ + +loop: + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, + hdr_addr.page, &mtr); + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + log_hdr = undo_page + hdr_addr.boffset; + + /* Mark the last undo log totally purged, so that if the system + crashes, the tail of the undo log will not get accessed again. The + list of pages in the undo log tail gets inconsistent during the + freeing of the segment, and therefore purge should not try to access + them again. */ + + if (!marked) { + mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, &mtr); + marked = TRUE; + } + + freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER, + &mtr); + if (!freed) { + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + goto loop; + } + + /* The page list may now be inconsistent, but the length field + stored in the list base node tells us how big it was before we + started the freeing. */ + + seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr); + + /* We may free the undo log segment header page; it must be freed + within the same mtr as the undo log header is removed from the + history list: otherwise, in case of a database crash, the segment + could become inaccessible garbage in the file space. */ + + flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY, + log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr); + + mutex_enter(&kernel_mutex); + ut_ad(trx_sys->rseg_history_len >= n_removed_logs); + trx_sys->rseg_history_len -= n_removed_logs; + mutex_exit(&kernel_mutex); + + freed = FALSE; + + while (!freed) { + /* Here we assume that a file segment with just the header + page can be freed in a few steps, so that the buffer pool + is not flooded with bufferfixed pages: see the note in + fsp0fsp.c. */ + + freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, + &mtr); + } + + hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, &mtr); + ut_ad(hist_size >= seg_size); + + mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, + hist_size - seg_size, MLOG_4BYTES, &mtr); + + ut_ad(rseg->curr_size >= seg_size); + + rseg->curr_size -= seg_size; + + mutex_exit(&(rseg->mutex)); + + mtr_commit(&mtr); +} + +/********************************************************************//** +Removes unnecessary history data from a rollback segment. */ +static +void +trx_purge_truncate_rseg_history( +/*============================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + trx_id_t limit_trx_no, /*!< in: remove update undo logs whose + trx number is < limit_trx_no */ + undo_no_t limit_undo_no) /*!< in: if transaction number is equal + to limit_trx_no, truncate undo records + with undo number < limit_undo_no */ +{ + fil_addr_t hdr_addr; + fil_addr_t prev_hdr_addr; + trx_rsegf_t* rseg_hdr; + page_t* undo_page; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + ulint n_removed_logs = 0; + mtr_t mtr; + trx_id_t undo_trx_no; + + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + hdr_addr = trx_purge_get_log_from_hist( + flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr)); +loop: + if (hdr_addr.page == FIL_NULL) { + + mutex_exit(&(rseg->mutex)); + + mtr_commit(&mtr); + + return; + } + + undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, + hdr_addr.page, &mtr); + + log_hdr = undo_page + hdr_addr.boffset; + undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); + + if (undo_trx_no >= limit_trx_no) { + if (undo_trx_no == limit_trx_no) { + trx_undo_truncate_start(rseg, rseg->space, + hdr_addr.page, + hdr_addr.boffset, + limit_undo_no); + } + + mutex_enter(&kernel_mutex); + ut_a(trx_sys->rseg_history_len >= n_removed_logs); + trx_sys->rseg_history_len -= n_removed_logs; + mutex_exit(&kernel_mutex); + + flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY, + log_hdr + TRX_UNDO_HISTORY_NODE, + n_removed_logs, &mtr); + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return; + } + + prev_hdr_addr = trx_purge_get_log_from_hist( + flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); + n_removed_logs++; + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE) + && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) { + + /* We can free the whole log segment */ + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + trx_purge_free_segment(rseg, hdr_addr, n_removed_logs); + + n_removed_logs = 0; + } else { + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + } + + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + hdr_addr = prev_hdr_addr; + + goto loop; +} + +/********************************************************************//** +Removes unnecessary history data from rollback segments. NOTE that when this +function is called, the caller must not have any latches on undo log pages! */ +static +void +trx_purge_truncate_history(void) +/*============================*/ +{ + trx_rseg_t* rseg; + trx_id_t limit_trx_no; + undo_no_t limit_undo_no; + + trx_purge_arr_get_biggest( + purge_sys->arr, &limit_trx_no, &limit_undo_no); + + if (limit_trx_no == 0) { + + limit_trx_no = purge_sys->purge_trx_no; + limit_undo_no = purge_sys->purge_undo_no; + } + + /* We play safe and set the truncate limit at most to the purge view + low_limit number, though this is not necessary */ + + if (limit_trx_no >= purge_sys->view->low_limit_no) { + limit_trx_no = purge_sys->view->low_limit_no; + limit_undo_no = 0; + } + + ut_ad(limit_trx_no <= purge_sys->view->low_limit_no); + + for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + rseg != NULL; + rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) { + + trx_purge_truncate_rseg_history( + rseg, limit_trx_no, limit_undo_no); + } +} + +/********************************************************************//** +Does a truncate if the purge array is empty. NOTE that when this function is +called, the caller must not have any latches on undo log pages! */ +UNIV_INLINE +void +trx_purge_truncate_if_arr_empty(void) +/*=================================*/ +{ + static ulint count; + +#ifdef UNIV_DEBUG + if (purge_sys->arr->n_used == 0) { + purge_sys->done_trx_no = purge_sys->purge_trx_no; + } +#endif /* UNIV_DEBUG */ + + if (!(++count % TRX_SYS_N_RSEGS) && purge_sys->arr->n_used == 0) { + + trx_purge_truncate_history(); + } +} + +/***********************************************************************//** +Updates the last not yet purged history log info in rseg when we have purged +a whole undo log. Advances also purge_sys->purge_trx_no past the purged log. */ +static +void +trx_purge_rseg_get_next_history_log( +/*================================*/ + trx_rseg_t* rseg) /*!< in: rollback segment */ +{ + page_t* undo_page; + trx_ulogf_t* log_hdr; + fil_addr_t prev_log_addr; + trx_id_t trx_no; + ibool del_marks; + mtr_t mtr; + rseg_queue_t rseg_queue; + const void* ptr; + + mutex_enter(&(rseg->mutex)); + + ut_a(rseg->last_page_no != FIL_NULL); + + purge_sys->purge_trx_no = rseg->last_trx_no + 1; + purge_sys->purge_undo_no = 0; + purge_sys->next_stored = FALSE; + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched( + rseg->space, rseg->zip_size, rseg->last_page_no, &mtr); + + log_hdr = undo_page + rseg->last_offset; + + /* Increase the purge page count by one for every handled log */ + + purge_sys->n_pages_handled++; + + prev_log_addr = trx_purge_get_log_from_hist( + flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); + + if (prev_log_addr.page == FIL_NULL) { + /* No logs left in the history list */ + + rseg->last_page_no = FIL_NULL; + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + mutex_enter(&kernel_mutex); + + /* Add debug code to track history list corruption reported + on the MySQL mailing list on Nov 9, 2004. The fut0lst.c + file-based list was corrupt. The prev node pointer was + FIL_NULL, even though the list length was over 8 million nodes! + We assume that purge truncates the history list in large + size pieces, and if we here reach the head of the list, the + list cannot be longer than 2000 000 undo logs now. */ + + if (trx_sys->rseg_history_len > 2000000) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: purge reached the" + " head of the history list,\n" + "InnoDB: but its length is still" + " reported as %lu! Make a detailed bug\n" + "InnoDB: report, and submit it" + " to http://bugs.mysql.com\n", + (ulong) trx_sys->rseg_history_len); + ut_ad(0); + } + + mutex_exit(&kernel_mutex); + + return; + } + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + /* Read the trx number and del marks from the previous log header */ + mtr_start(&mtr); + + log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size, + prev_log_addr.page, &mtr) + + prev_log_addr.boffset; + + trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); + + del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS); + + mtr_commit(&mtr); + + mutex_enter(&(rseg->mutex)); + + rseg->last_page_no = prev_log_addr.page; + rseg->last_offset = prev_log_addr.boffset; + rseg->last_trx_no = trx_no; + rseg->last_del_marks = del_marks; + + rseg_queue.rseg = rseg; + rseg_queue.trx_no = rseg->last_trx_no; + + /* Purge can also produce events, however these are already ordered + in the rollback segment and any user generated event will be greater + than the events that Purge produces. ie. Purge can never produce + events from an empty rollback segment. */ + + mutex_enter(&purge_sys->bh_mutex); + + ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); + ut_a(ptr != NULL); + + mutex_exit(&purge_sys->bh_mutex); + + mutex_exit(&(rseg->mutex)); +} + +/***********************************************************************//** +Chooses the rollback segment with the smallest trx_id. +@return zip_size if log is for a compressed table, ULINT_UNDEFINED if + no rollback segments to purge, 0 for non compressed tables. */ +static +ulint +trx_purge_get_rseg_with_min_trx_id( +/*===============================*/ + trx_purge_t* purge_sys) /*!< in/out: purge instance */ + +{ + ulint zip_size = 0; + + mutex_enter(&purge_sys->bh_mutex); + + /* Only purge consumes events from the binary heap, user + threads only produce the events. */ + + if (!ib_bh_is_empty(purge_sys->ib_bh)) { + trx_rseg_t* rseg; + + rseg = ((rseg_queue_t*) ib_bh_first(purge_sys->ib_bh))->rseg; + ib_bh_pop(purge_sys->ib_bh); + + mutex_exit(&purge_sys->bh_mutex); + + purge_sys->rseg = rseg; + } else { + mutex_exit(&purge_sys->bh_mutex); + + purge_sys->rseg = NULL; + + return(ULINT_UNDEFINED); + } + + ut_a(purge_sys->rseg != NULL); + + mutex_enter(&purge_sys->rseg->mutex); + + ut_a(purge_sys->rseg->last_page_no != FIL_NULL); + + /* We assume in purge of externally stored fields + that space id == 0 */ + ut_a(purge_sys->rseg->space == 0); + + zip_size = purge_sys->rseg->zip_size; + + ut_a(purge_sys->purge_trx_no <= purge_sys->rseg->last_trx_no); + + purge_sys->purge_trx_no = purge_sys->rseg->last_trx_no; + + purge_sys->hdr_offset = purge_sys->rseg->last_offset; + + purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; + + mutex_exit(&purge_sys->rseg->mutex); + + return(zip_size); +} + +/***********************************************************************//** +Position the purge sys "iterator" on the undo record to use for purging. */ +static +void +trx_purge_read_undo_rec( +/*====================*/ + trx_purge_t* purge_sys, /*!< in/out: purge instance */ + ulint zip_size) /*!< in: block size or 0 */ +{ + ulint page_no; + ulint offset = 0; + ib_uint64_t undo_no = 0; + + purge_sys->hdr_offset = purge_sys->rseg->last_offset; + page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; + + if (purge_sys->rseg->last_del_marks) { + mtr_t mtr; + trx_undo_rec_t* undo_rec; + + mtr_start(&mtr); + + undo_rec = trx_undo_get_first_rec( + 0 /* System space id */, zip_size, + purge_sys->hdr_page_no, + purge_sys->hdr_offset, RW_S_LATCH, &mtr); + + if (undo_rec != NULL) { + offset = page_offset(undo_rec); + undo_no = trx_undo_rec_get_undo_no(undo_rec); + page_no = page_get_page_no(page_align(undo_rec)); + } + + mtr_commit(&mtr); + } + + purge_sys->offset = offset; + purge_sys->page_no = page_no; + purge_sys->purge_undo_no = undo_no; + + purge_sys->next_stored = TRUE; +} + +/***********************************************************************//** +Chooses the next undo log to purge and updates the info in purge_sys. This +function is used to initialize purge_sys when the next record to purge is +not known, and also to update the purge system info on the next record when +purge has handled the whole undo log for a transaction. */ +static +void +trx_purge_choose_next_log(void) +/*===========================*/ +{ + ulint zip_size; + + ut_ad(purge_sys->next_stored == FALSE); + + zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys); + + if (purge_sys->rseg != NULL) { + + trx_purge_read_undo_rec(purge_sys, zip_size); + } else { + /* There is nothing to do yet. */ + os_thread_yield(); + } +} + +/***********************************************************************//** +Gets the next record to purge and updates the info in the purge system. +@return copy of an undo log record or pointer to the dummy undo log record */ +static +trx_undo_rec_t* +trx_purge_get_next_rec( +/*===================*/ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + trx_undo_rec_t* rec; + trx_undo_rec_t* rec_copy; + trx_undo_rec_t* rec2; + trx_undo_rec_t* next_rec; + page_t* undo_page; + page_t* page; + ulint offset; + ulint page_no; + ulint space; + ulint zip_size; + ulint type; + ulint cmpl_info; + mtr_t mtr; + + ut_ad(purge_sys->next_stored); + + space = purge_sys->rseg->space; + zip_size = purge_sys->rseg->zip_size; + page_no = purge_sys->page_no; + offset = purge_sys->offset; + + if (offset == 0) { + /* It is the dummy undo log record, which means that there is + no need to purge this undo log */ + + trx_purge_rseg_get_next_history_log(purge_sys->rseg); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + return(&trx_purge_dummy_rec); + } + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(space, zip_size, page_no, &mtr); + + rec = undo_page + offset; + + rec2 = rec; + + for (;;) { + /* Try first to find the next record which requires a purge + operation from the same page of the same undo log */ + + next_rec = trx_undo_page_get_next_rec( + rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset); + + if (next_rec == NULL) { + rec2 = trx_undo_get_next_rec( + rec2, purge_sys->hdr_page_no, + purge_sys->hdr_offset, &mtr); + break; + } + + rec2 = next_rec; + + type = trx_undo_rec_get_type(rec2); + + if (type == TRX_UNDO_DEL_MARK_REC) { + + break; + } + + cmpl_info = trx_undo_rec_get_cmpl_info(rec2); + + if (trx_undo_rec_get_extern_storage(rec2)) { + break; + } + + if ((type == TRX_UNDO_UPD_EXIST_REC) + && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + break; + } + } + + if (rec2 == NULL) { + mtr_commit(&mtr); + + trx_purge_rseg_get_next_history_log(purge_sys->rseg); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(space, zip_size, + page_no, &mtr); + + rec = undo_page + offset; + } else { + page = page_align(rec2); + + purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2); + purge_sys->page_no = page_get_page_no(page); + purge_sys->offset = rec2 - page; + + if (undo_page != page) { + /* We advance to a new page of the undo log: */ + purge_sys->n_pages_handled++; + } + } + + rec_copy = trx_undo_rec_copy(rec, heap); + + mtr_commit(&mtr); + + return(rec_copy); +} + +/********************************************************************//** +Fetches the next undo log record from the history list to purge. It must be +released with the corresponding release function. +@return copy of an undo log record or pointer to trx_purge_dummy_rec, +if the whole undo log can skipped in purge; NULL if none left */ +UNIV_INTERN +trx_undo_rec_t* +trx_purge_fetch_next_rec( +/*=====================*/ + roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */ + trx_undo_inf_t** cell, /*!< out: storage cell for the record in the + purge array */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + trx_undo_rec_t* undo_rec; + + + if (purge_sys->state == TRX_STOP_PURGE) { + trx_purge_truncate_if_arr_empty(); + + return(NULL); + } else if (!purge_sys->next_stored) { + trx_purge_choose_next_log(); + + if (!purge_sys->next_stored) { + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + if (srv_print_thread_releases) { + fprintf(stderr, + "Purge: No logs left in the" + " history list; pages handled %lu\n", + (ulong) purge_sys->n_pages_handled); + } + + return(NULL); + } + } + + if (purge_sys->n_pages_handled >= purge_sys->handle_limit) { + + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + return(NULL); + } else if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) { + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + return(NULL); + } + + /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n", + os_thread_get_curr_id(), + (ullint) purge_sys->purge_trx_no, + (ullint) purge_sys->purge_undo_no); */ + + + *roll_ptr = trx_undo_build_roll_ptr( + FALSE, (purge_sys->rseg)->id, purge_sys->page_no, + purge_sys->offset); + + *cell = trx_purge_arr_store_info( + purge_sys->purge_trx_no, purge_sys->purge_undo_no); + + ut_ad(purge_sys->purge_trx_no < purge_sys->view->low_limit_no); + + /* The following call will advance the stored values of purge_trx_no + and purge_undo_no, therefore we had to store them first */ + + undo_rec = trx_purge_get_next_rec(heap); + + return(undo_rec); +} + +/*******************************************************************//** +Releases a reserved purge undo record. */ +UNIV_INTERN +void +trx_purge_rec_release( +/*==================*/ + trx_undo_inf_t* cell) /*!< in: storage cell */ +{ + trx_purge_arr_remove_info(cell); +} + +/*******************************************************************//** +This function runs a purge batch. +@return number of undo log pages handled in the batch */ +UNIV_INTERN +ulint +trx_purge( +/*======*/ + ulint limit) /*!< in: the maximum number of records to + purge in one batch */ +{ + que_thr_t* thr; + ulint old_pages_handled; + + ut_a(purge_sys->trx->n_active_thrs == 0); + + rw_lock_x_lock(&purge_sys->latch); + + mutex_enter(&kernel_mutex); + + /* Close and free the old purge view */ + + read_view_close(purge_sys->view); + purge_sys->view = NULL; + mem_heap_empty(purge_sys->heap); + + /* Determine how much data manipulation language (DML) statements + need to be delayed in order to reduce the lagging of the purge + thread. */ + srv_dml_needed_delay = 0; /* in microseconds; default: no delay */ + + /* If we cannot advance the 'purge view' because of an old + 'consistent read view', then the DML statements cannot be delayed. + Also, srv_max_purge_lag <= 0 means 'infinity'. */ + if (srv_max_purge_lag > 0) { + float ratio = (float) trx_sys->rseg_history_len + / srv_max_purge_lag; + if (ratio > ULINT_MAX / 10000) { + /* Avoid overflow: maximum delay is 4295 seconds */ + srv_dml_needed_delay = ULINT_MAX; + } else if (ratio > 1) { + /* If the history list length exceeds the + innodb_max_purge_lag, the + data manipulation statements are delayed + by at least 5000 microseconds. */ + srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000); + } + } + + purge_sys->view = read_view_oldest_copy_or_open_new( + 0, purge_sys->prebuilt_view); + + mutex_exit(&kernel_mutex); + + rw_lock_x_unlock(&(purge_sys->latch)); + +#ifdef UNIV_DEBUG + if (srv_purge_view_update_only_debug) { + return(0); + } +#endif + + purge_sys->state = TRX_PURGE_ON; + + purge_sys->handle_limit = purge_sys->n_pages_handled + limit; + + old_pages_handled = purge_sys->n_pages_handled; + + + mutex_enter(&kernel_mutex); + + thr = que_fork_start_command(purge_sys->query); + + ut_ad(thr); + + mutex_exit(&kernel_mutex); + + if (srv_print_thread_releases) { + + fputs("Starting purge\n", stderr); + } + + que_run_threads(thr); + + if (srv_print_thread_releases) { + + fprintf(stderr, + "Purge ends; pages handled %lu\n", + (ulong) purge_sys->n_pages_handled); + } + + return((ulint) (purge_sys->n_pages_handled - old_pages_handled)); +} + +/******************************************************************//** +Prints information of the purge system to stderr. */ +UNIV_INTERN +void +trx_purge_sys_print(void) +/*=====================*/ +{ + fprintf(stderr, "InnoDB: Purge system view:\n"); + read_view_print(stderr, purge_sys->view); + + fprintf(stderr, "InnoDB: Purge trx n:o " TRX_ID_FMT + ", undo n:o " TRX_ID_FMT "\n", + (ullint) purge_sys->purge_trx_no, + (ullint) purge_sys->purge_undo_no); + fprintf(stderr, + "InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n" + "InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n", + (ulong) purge_sys->next_stored, + (ulong) purge_sys->page_no, + (ulong) purge_sys->offset, + (ulong) purge_sys->hdr_page_no, + (ulong) purge_sys->hdr_offset); +} diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.c new file mode 100644 index 00000000000..ef42152aeb7 --- /dev/null +++ b/storage/xtradb/trx/trx0rec.c @@ -0,0 +1,1698 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0rec.c +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rec.h" + +#ifdef UNIV_NONINL +#include "trx0rec.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0undo.h" +#include "mtr0log.h" +#ifndef UNIV_HOTBACKUP +#include "dict0dict.h" +#include "ut0mem.h" +#include "read0read.h" +#include "row0ext.h" +#include "row0upd.h" +#include "que0que.h" +#include "trx0purge.h" +#include "trx0rseg.h" +#include "row0row.h" + +/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/ + +/**********************************************************************//** +Writes the mtr log entry of the inserted undo log record on the undo log +page. */ +UNIV_INLINE +void +trx_undof_page_add_undo_rec_log( +/*============================*/ + page_t* undo_page, /*!< in: undo log page */ + ulint old_free, /*!< in: start offset of the inserted entry */ + ulint new_free, /*!< in: end offset of the entry */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + const byte* log_end; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN); + + if (log_ptr == NULL) { + + return; + } + + log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN]; + log_ptr = mlog_write_initial_log_record_fast( + undo_page, MLOG_UNDO_INSERT, log_ptr, mtr); + len = new_free - old_free - 4; + + mach_write_to_2(log_ptr, len); + log_ptr += 2; + + if (log_ptr + len <= log_end) { + memcpy(log_ptr, undo_page + old_free + 2, len); + mlog_close(mtr, log_ptr + len); + } else { + mlog_close(mtr, log_ptr); + mlog_catenate_string(mtr, undo_page + old_free + 2, len); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a redo log record of adding an undo log record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_add_undo_rec( +/*========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page) /*!< in: page or NULL */ +{ + ulint len; + byte* rec; + ulint first_free; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + len = mach_read_from_2(ptr); + ptr += 2; + + if (end_ptr < ptr + len) { + + return(NULL); + } + + if (page == NULL) { + + return(ptr + len); + } + + first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + rec = page + first_free; + + mach_write_to_2(rec, first_free + 4 + len); + mach_write_to_2(rec + 2 + len, first_free); + + mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + first_free + 4 + len); + ut_memcpy(rec + 2, ptr, len); + + return(ptr + len); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Calculates the free space left for extending an undo log record. +@return bytes left */ +UNIV_INLINE +ulint +trx_undo_left( +/*==========*/ + const page_t* page, /*!< in: undo log page */ + const byte* ptr) /*!< in: pointer to page */ +{ + /* The '- 10' is a safety margin, in case we have some small + calculation error below */ + + return(UNIV_PAGE_SIZE - (ptr - page) - 10 - FIL_PAGE_DATA_END); +} + +/**********************************************************************//** +Set the next and previous pointers in the undo page for the undo record +that was written to ptr. Update the first free value by the number of bytes +written for this undo record. +@return offset of the inserted entry on the page if succeeded, 0 if fail */ +static +ulint +trx_undo_page_set_next_prev_and_add( +/*================================*/ + page_t* undo_page, /*!< in/out: undo log page */ + byte* ptr, /*!< in: ptr up to where data has been + written on this undo page. */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint first_free; /*!< offset within undo_page */ + ulint end_of_rec; /*!< offset within undo_page */ + byte* ptr_to_first_free; + /* pointer within undo_page + that points to the next free + offset value within undo_page.*/ + + ut_ad(ptr > undo_page); + ut_ad(ptr < undo_page + UNIV_PAGE_SIZE); + + if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) { + + return(0); + } + + ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE; + + first_free = mach_read_from_2(ptr_to_first_free); + + /* Write offset of the previous undo log record */ + mach_write_to_2(ptr, first_free); + ptr += 2; + + end_of_rec = ptr - undo_page; + + /* Write offset of the next undo log record */ + mach_write_to_2(undo_page + first_free, end_of_rec); + + /* Update the offset to first free undo record */ + mach_write_to_2(ptr_to_first_free, end_of_rec); + + /* Write this log entry to the UNDO log */ + trx_undof_page_add_undo_rec_log(undo_page, first_free, + end_of_rec, mtr); + + return(first_free); +} + +/**********************************************************************//** +Reports in the undo log of an insert of a clustered index record. +@return offset of the inserted entry on the page if succeed, 0 if fail */ +static +ulint +trx_undo_page_report_insert( +/*========================*/ + page_t* undo_page, /*!< in: undo log page */ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* clust_entry, /*!< in: index entry which will be + inserted to the clustered index */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint first_free; + byte* ptr; + ulint i; + + ut_ad(dict_index_is_clust(index)); + ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT); + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + ptr = undo_page + first_free; + + ut_ad(first_free <= UNIV_PAGE_SIZE); + + if (trx_undo_left(undo_page, ptr) < 2 + 1 + 11 + 11) { + + /* Not enough space for writing the general parameters */ + + return(0); + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + /* Store first some general parameters to the undo log */ + *ptr++ = TRX_UNDO_INSERT_REC; + ptr += mach_ull_write_much_compressed(ptr, trx->undo_no); + ptr += mach_ull_write_much_compressed(ptr, index->table->id); + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the record + to be inserted in the clustered index */ + + for (i = 0; i < dict_index_get_n_unique(index); i++) { + + const dfield_t* field = dtuple_get_nth_field(clust_entry, i); + ulint flen = dfield_get_len(field); + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, flen); + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, dfield_get_data(field), flen); + ptr += flen; + } + } + + return(trx_undo_page_set_next_prev_and_add(undo_page, ptr, mtr)); +} + +/**********************************************************************//** +Reads from an undo log record the general parameters. +@return remaining part of undo log record after reading these values */ +UNIV_INTERN +byte* +trx_undo_rec_get_pars( +/*==================*/ + trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + ulint* type, /*!< out: undo record type: + TRX_UNDO_INSERT_REC, ... */ + ulint* cmpl_info, /*!< out: compiler info, relevant only + for update type records */ + ibool* updated_extern, /*!< out: TRUE if we updated an + externally stored fild */ + undo_no_t* undo_no, /*!< out: undo log record number */ + table_id_t* table_id) /*!< out: table id */ +{ + byte* ptr; + ulint type_cmpl; + + ptr = undo_rec + 2; + + type_cmpl = mach_read_from_1(ptr); + ptr++; + + if (type_cmpl & TRX_UNDO_UPD_EXTERN) { + *updated_extern = TRUE; + type_cmpl -= TRX_UNDO_UPD_EXTERN; + } else { + *updated_extern = FALSE; + } + + *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1); + *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT; + + *undo_no = mach_ull_read_much_compressed(ptr); + ptr += mach_ull_get_much_compressed_size(*undo_no); + + *table_id = mach_ull_read_much_compressed(ptr); + ptr += mach_ull_get_much_compressed_size(*table_id); + + return(ptr); +} + +/**********************************************************************//** +Reads from an undo log record a stored column value. +@return remaining part of undo log record after reading these values */ +static +byte* +trx_undo_rec_get_col_val( +/*=====================*/ + byte* ptr, /*!< in: pointer to remaining part of undo log record */ + byte** field, /*!< out: pointer to stored field */ + ulint* len, /*!< out: length of the field, or UNIV_SQL_NULL */ + ulint* orig_len)/*!< out: original length of the locally + stored part of an externally stored column, or 0 */ +{ + *len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*len); + + *orig_len = 0; + + switch (*len) { + case UNIV_SQL_NULL: + *field = NULL; + break; + case UNIV_EXTERN_STORAGE_FIELD: + *orig_len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*orig_len); + *len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*len); + *field = ptr; + ptr += *len; + + ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(*len > *orig_len); + /* @see dtuple_convert_big_rec() */ + ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE); + /* we do not have access to index->table here + ut_ad(dict_table_get_format(index->table) >= DICT_TF_FORMAT_ZIP + || *len >= col->max_prefix + + BTR_EXTERN_FIELD_REF_SIZE); + */ + + *len += UNIV_EXTERN_STORAGE_FIELD; + break; + default: + *field = ptr; + if (*len >= UNIV_EXTERN_STORAGE_FIELD) { + ptr += *len - UNIV_EXTERN_STORAGE_FIELD; + } else { + ptr += *len; + } + } + + return(ptr); +} + +/*******************************************************************//** +Builds a row reference from an undo log record. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_get_row_ref( +/*=====================*/ + byte* ptr, /*!< in: remaining part of a copy of an undo log + record, at the start of the row reference; + NOTE that this copy of the undo log record must + be preserved as long as the row reference is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t** ref, /*!< out, own: row reference */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + ulint ref_len; + ulint i; + + ut_ad(index && ptr && ref && heap); + ut_a(dict_index_is_clust(index)); + + ref_len = dict_index_get_n_unique(index); + + *ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(*ref, index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield_t* dfield; + byte* field; + ulint len; + ulint orig_len; + + dfield = dtuple_get_nth_field(*ref, i); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + dfield_set_data(dfield, field, len); + } + + return(ptr); +} + +/*******************************************************************//** +Skips a row reference from an undo log record. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_skip_row_ref( +/*======================*/ + byte* ptr, /*!< in: remaining part in update undo log + record, at the start of the row reference */ + dict_index_t* index) /*!< in: clustered index */ +{ + ulint ref_len; + ulint i; + + ut_ad(index && ptr); + ut_a(dict_index_is_clust(index)); + + ref_len = dict_index_get_n_unique(index); + + for (i = 0; i < ref_len; i++) { + byte* field; + ulint len; + ulint orig_len; + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + } + + return(ptr); +} + +/**********************************************************************//** +Fetch a prefix of an externally stored column, for writing to the undo log +of an update or delete marking of a clustered index record. +@return ext_buf */ +static +byte* +trx_undo_page_fetch_ext( +/*====================*/ + byte* ext_buf, /*!< in: buffer to hold the prefix + data and BLOB pointer */ + ulint prefix_len, /*!< in: prefix size to store + in the undo log */ + ulint zip_size, /*!< compressed page size in bytes, + or 0 for uncompressed BLOB */ + const byte* field, /*!< in: an externally stored column */ + ulint* len) /*!< in: length of field; + out: used length of ext_buf */ +{ + /* Fetch the BLOB. */ + ulint ext_len = btr_copy_externally_stored_field_prefix( + ext_buf, prefix_len, zip_size, field, *len); + /* BLOBs should always be nonempty. */ + ut_a(ext_len); + /* Append the BLOB pointer to the prefix. */ + memcpy(ext_buf + ext_len, + field + *len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE; + return(ext_buf); +} + +/**********************************************************************//** +Writes to the undo log a prefix of an externally stored column. +@return undo log position */ +static +byte* +trx_undo_page_report_modify_ext( +/*============================*/ + byte* ptr, /*!< in: undo log position, + at least 15 bytes must be available */ + byte* ext_buf, /*!< in: a buffer of + DICT_MAX_FIELD_LEN_BY_FORMAT() size, + or NULL when should not fetch + a longer prefix */ + ulint prefix_len, /*!< prefix size to store in the + undo log */ + ulint zip_size, /*!< compressed page size in bytes, + or 0 for uncompressed BLOB */ + const byte** field, /*!< in/out: the locally stored part of + the externally stored column */ + ulint* len) /*!< in/out: length of field, in bytes */ +{ + if (ext_buf) { + ut_a(prefix_len > 0); + + /* If an ordering column is externally stored, we will + have to store a longer prefix of the field. In this + case, write to the log a marker followed by the + original length and the real length of the field. */ + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD); + + ptr += mach_write_compressed(ptr, *len); + + *field = trx_undo_page_fetch_ext(ext_buf, prefix_len, zip_size, + *field, len); + + ptr += mach_write_compressed(ptr, *len); + } else { + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD + + *len); + } + + return(ptr); +} + +/**********************************************************************//** +Reports in the undo log of an update or delete marking of a clustered index +record. +@return byte offset of the inserted undo log entry on the page if +succeed, 0 if fail */ +static +ulint +trx_undo_page_report_modify( +/*========================*/ + page_t* undo_page, /*!< in: undo log page */ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: clustered index where update or + delete marking is done */ + const rec_t* rec, /*!< in: clustered index record which + has NOT yet been modified */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector which tells the + columns to be updated; in the case of + a delete, this should be set to NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_table_t* table; + ulint first_free; + byte* ptr; + const byte* field; + ulint flen; + ulint col_no; + ulint type_cmpl; + byte* type_cmpl_ptr; + ulint i; + trx_id_t trx_id; + ibool ignore_prefix = FALSE; + byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE]; + + ut_a(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); + table = index->table; + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + ptr = undo_page + first_free; + + ut_ad(first_free <= UNIV_PAGE_SIZE); + + if (trx_undo_left(undo_page, ptr) < 50) { + + /* NOTE: the value 50 must be big enough so that the general + fields written below fit on the undo log page */ + + return(0); + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + /* Store first some general parameters to the undo log */ + + if (!update) { + type_cmpl = TRX_UNDO_DEL_MARK_REC; + } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) { + type_cmpl = TRX_UNDO_UPD_DEL_REC; + /* We are about to update a delete marked record. + We don't typically need the prefix in this case unless + the delete marking is done by the same transaction + (which we check below). */ + ignore_prefix = TRUE; + } else { + type_cmpl = TRX_UNDO_UPD_EXIST_REC; + } + + type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT; + type_cmpl_ptr = ptr; + + *ptr++ = (byte) type_cmpl; + ptr += mach_ull_write_much_compressed(ptr, trx->undo_no); + + ptr += mach_ull_write_much_compressed(ptr, table->id); + + /*----------------------------------------*/ + /* Store the state of the info bits */ + + *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table)); + + /* Store the values of the system columns */ + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos( + index, DATA_TRX_ID), &flen); + ut_ad(flen == DATA_TRX_ID_LEN); + + trx_id = trx_read_trx_id(field); + + /* If it is an update of a delete marked record, then we are + allowed to ignore blob prefixes if the delete marking was done + by some other trx as it must have committed by now for us to + allow an over-write. */ + if (ignore_prefix) { + ignore_prefix = (trx_id != trx->id); + } + ptr += mach_ull_write_compressed(ptr, trx_id); + + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos( + index, DATA_ROLL_PTR), &flen); + ut_ad(flen == DATA_ROLL_PTR_LEN); + + ptr += mach_ull_write_compressed(ptr, trx_read_roll_ptr(field)); + + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the + record which will be modified in the clustered index */ + + for (i = 0; i < dict_index_get_n_unique(index); i++) { + + field = rec_get_nth_field(rec, offsets, i, &flen); + + /* The ordering columns must not be stored externally. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + ut_ad(dict_index_get_nth_col(index, i)->ord_part); + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, flen); + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + + /*----------------------------------------*/ + /* Save to the undo log the old values of the columns to be updated. */ + + if (update) { + ulint extended = 0; + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + if (srv_use_sys_stats_table + && index == UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes)) { + for (i = 0; i < upd_get_n_fields(update); i++) { + ulint pos = upd_get_nth_field(update, i)->field_no; + + if (pos >= rec_offs_n_fields(offsets)) { + extended++; + } + } + } + + ptr += mach_write_compressed(ptr, upd_get_n_fields(update) - extended); + + for (i = 0; i < upd_get_n_fields(update) - extended; i++) { + + ulint pos = upd_get_nth_field(update, i)->field_no; + + /* Write field number to undo log */ + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, pos); + + /* Save the old value of field */ + field = rec_get_nth_field(rec, offsets, pos, &flen); + + if (trx_undo_left(undo_page, ptr) < 15) { + + return(0); + } + + if (rec_offs_nth_extern(offsets, pos)) { + const dict_col_t* col + = dict_index_get_nth_col(index, pos); + ulint prefix_len + = dict_max_field_len_store_undo( + table, col); + + ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE + <= sizeof ext_buf); + + ptr = trx_undo_page_report_modify_ext( + ptr, + col->ord_part + && !ignore_prefix + && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN + ? ext_buf : NULL, prefix_len, + dict_table_zip_size(table), + &field, &flen); + + /* Notify purge that it eventually has to + free the old externally stored field */ + + trx->update_undo->del_marks = TRUE; + + *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN; + } else { + ptr += mach_write_compressed(ptr, flen); + } + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + /*----------------------------------------*/ + /* In the case of a delete marking, and also in the case of an update + where any ordering field of any index changes, store the values of all + columns which occur as ordering fields in any index. This info is used + in the purge of old versions where we use it to build and search the + delete marked index records, to look if we can remove them from the + index tree. Note that starting from 4.0.14 also externally stored + fields can be ordering in some index. Starting from 5.2, we no longer + store REC_MAX_INDEX_COL_LEN first bytes to the undo log record, + but we can construct the column prefix fields in the index by + fetching the first page of the BLOB that is pointed to by the + clustered index. This works also in crash recovery, because all pages + (including BLOBs) are recovered before anything is rolled back. */ + + if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + byte* old_ptr = ptr; + + trx->update_undo->del_marks = TRUE; + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + /* Reserve 2 bytes to write the number of bytes the stored + fields take in this undo record */ + + ptr += 2; + + for (col_no = 0; col_no < dict_table_get_n_cols(table); + col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + + if (col->ord_part) { + ulint pos; + + /* Write field number to undo log */ + if (trx_undo_left(undo_page, ptr) < 5 + 15) { + + return(0); + } + + pos = dict_index_get_nth_col_pos(index, + col_no); + ptr += mach_write_compressed(ptr, pos); + + /* Save the old value of field */ + field = rec_get_nth_field(rec, offsets, pos, + &flen); + + if (rec_offs_nth_extern(offsets, pos)) { + const dict_col_t* col = + dict_index_get_nth_col( + index, pos); + ulint prefix_len = + dict_max_field_len_store_undo( + table, col); + + ut_a(prefix_len < sizeof ext_buf); + + ptr = trx_undo_page_report_modify_ext( + ptr, + flen < REC_ANTELOPE_MAX_INDEX_COL_LEN + && !ignore_prefix + ? ext_buf : NULL, prefix_len, + dict_table_zip_size(table), + &field, &flen); + } else { + ptr += mach_write_compressed( + ptr, flen); + } + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) + < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + mach_write_to_2(old_ptr, ptr - old_ptr); + } + + /*----------------------------------------*/ + /* Write pointers to the previous and the next undo log records */ + if (trx_undo_left(undo_page, ptr) < 2) { + + return(0); + } + + mach_write_to_2(ptr, first_free); + ptr += 2; + mach_write_to_2(undo_page + first_free, ptr - undo_page); + + mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + ptr - undo_page); + + /* Write to the REDO log about this change in the UNDO log */ + + trx_undof_page_add_undo_rec_log(undo_page, first_free, + ptr - undo_page, mtr); + return(first_free); +} + +/**********************************************************************//** +Reads from an undo log update record the system field values of the old +version. +@return remaining part of undo log record after reading these values */ +UNIV_INTERN +byte* +trx_undo_update_rec_get_sys_cols( +/*=============================*/ + byte* ptr, /*!< in: remaining part of undo + log record after reading + general parameters */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr, /*!< out: roll ptr */ + ulint* info_bits) /*!< out: info bits state */ +{ + /* Read the state of the info bits */ + *info_bits = mach_read_from_1(ptr); + ptr += 1; + + /* Read the values of the system columns */ + + *trx_id = mach_ull_read_compressed(ptr); + ptr += mach_ull_get_compressed_size(*trx_id); + + *roll_ptr = mach_ull_read_compressed(ptr); + ptr += mach_ull_get_compressed_size(*roll_ptr); + + return(ptr); +} + +/**********************************************************************//** +Reads from an update undo log record the number of updated fields. +@return remaining part of undo log record after reading this value */ +UNIV_INLINE +byte* +trx_undo_update_rec_get_n_upd_fields( +/*=================================*/ + byte* ptr, /*!< in: pointer to remaining part of undo log record */ + ulint* n) /*!< out: number of fields */ +{ + *n = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*n); + + return(ptr); +} + +/**********************************************************************//** +Reads from an update undo log record a stored field number. +@return remaining part of undo log record after reading this value */ +UNIV_INLINE +byte* +trx_undo_update_rec_get_field_no( +/*=============================*/ + byte* ptr, /*!< in: pointer to remaining part of undo log record */ + ulint* field_no)/*!< out: field number */ +{ + *field_no = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*field_no); + + return(ptr); +} + +/*******************************************************************//** +Builds an update vector based on a remaining part of an undo log record. +@return remaining part of the record, NULL if an error detected, which +means that the record is corrupted */ +UNIV_INTERN +byte* +trx_undo_update_rec_get_update( +/*===========================*/ + byte* ptr, /*!< in: remaining part in update undo log + record, after reading the row reference + NOTE that this copy of the undo log record must + be preserved as long as the update vector is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC, + TRX_UNDO_UPD_DEL_REC, or + TRX_UNDO_DEL_MARK_REC; in the last case, + only trx id and roll ptr fields are added to + the update vector */ + trx_id_t trx_id, /*!< in: transaction id from this undo record */ + roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */ + ulint info_bits,/*!< in: info bits from this undo record */ + trx_t* trx, /*!< in: transaction */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + upd_t** upd) /*!< out, own: update vector */ +{ + upd_field_t* upd_field; + upd_t* update; + ulint n_fields; + byte* buf; + ulint i; + + ut_a(dict_index_is_clust(index)); + + if (type != TRX_UNDO_DEL_MARK_REC) { + ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields); + } else { + n_fields = 0; + } + + update = upd_create(n_fields + 2, heap); + + update->info_bits = info_bits; + + /* Store first trx id and roll ptr to update vector */ + + upd_field = upd_get_nth_field(update, n_fields); + buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN); + trx_write_trx_id(buf, trx_id); + + upd_field_set_field_no(upd_field, + dict_index_get_sys_col_pos(index, DATA_TRX_ID), + index, trx); + dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN); + + upd_field = upd_get_nth_field(update, n_fields + 1); + buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(buf, roll_ptr); + + upd_field_set_field_no( + upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), + index, trx); + dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN); + + /* Store then the updated ordinary columns to the update vector */ + + for (i = 0; i < n_fields; i++) { + + byte* field; + ulint len; + ulint field_no; + ulint orig_len; + + ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + if (field_no >= dict_index_get_n_fields(index)) { + fprintf(stderr, + "InnoDB: Error: trying to access" + " update undo rec field %lu in ", + (ulong) field_no); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, "\n" + "InnoDB: but index has only %lu fields\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n" + "InnoDB: Run also CHECK TABLE ", + (ulong) dict_index_get_n_fields(index)); + ut_print_name(stderr, trx, TRUE, index->table_name); + fprintf(stderr, "\n" + "InnoDB: n_fields = %lu, i = %lu, ptr %p\n", + (ulong) n_fields, (ulong) i, ptr); + ut_ad(0); + *upd = NULL; + return(NULL); + } + + upd_field = upd_get_nth_field(update, i); + + upd_field_set_field_no(upd_field, field_no, index, trx); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + upd_field->orig_len = orig_len; + + if (len == UNIV_SQL_NULL) { + dfield_set_null(&upd_field->new_val); + } else if (len < UNIV_EXTERN_STORAGE_FIELD) { + dfield_set_data(&upd_field->new_val, field, len); + } else { + len -= UNIV_EXTERN_STORAGE_FIELD; + + dfield_set_data(&upd_field->new_val, field, len); + dfield_set_ext(&upd_field->new_val); + } + } + + *upd = update; + + return(ptr); +} + +/*******************************************************************//** +Builds a partial row from an update undo log record. It contains the +columns which occur as ordering in any index of the table. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_get_partial_row( +/*=========================*/ + byte* ptr, /*!< in: remaining part in update undo log + record of a suitable type, at the start of + the stored index columns; + NOTE that this copy of the undo log record must + be preserved as long as the partial row is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t** row, /*!< out, own: partial row */ + ibool ignore_prefix, /*!< in: flag to indicate if we + expect blob prefixes in undo. Used + only in the assertion. */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + const byte* end_ptr; + ulint row_len; + + ut_ad(index); + ut_ad(ptr); + ut_ad(row); + ut_ad(heap); + ut_ad(dict_index_is_clust(index)); + + row_len = dict_table_get_n_cols(index->table); + + *row = dtuple_create(heap, row_len); + + dict_table_copy_types(*row, index->table); + + end_ptr = ptr + mach_read_from_2(ptr); + ptr += 2; + + while (ptr != end_ptr) { + dfield_t* dfield; + byte* field; + ulint field_no; + const dict_col_t* col; + ulint col_no; + ulint len; + ulint orig_len; + + ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + col = dict_index_get_nth_col(index, field_no); + col_no = dict_col_get_no(col); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + dfield = dtuple_get_nth_field(*row, col_no); + + dfield_set_data(dfield, field, len); + + if (len != UNIV_SQL_NULL + && len >= UNIV_EXTERN_STORAGE_FIELD) { + dfield_set_len(dfield, + len - UNIV_EXTERN_STORAGE_FIELD); + dfield_set_ext(dfield); + /* If the prefix of this column is indexed, + ensure that enough prefix is stored in the + undo log record. */ + if (!ignore_prefix && col->ord_part) { + ut_a(dfield_get_len(dfield) + >= BTR_EXTERN_FIELD_REF_SIZE); + ut_a(dict_table_get_format(index->table) + >= DICT_TF_FORMAT_ZIP + || dfield_get_len(dfield) + >= REC_ANTELOPE_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + return(ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +Erases the unused undo log page end. +@return TRUE if the page contained something, FALSE if it was empty */ +static __attribute__((nonnull)) +ibool +trx_undo_erase_page_end( +/*====================*/ + page_t* undo_page, /*!< in/out: undo page whose end to erase */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint first_free; + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + memset(undo_page + first_free, 0xff, + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free); + + mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr); + return(first_free != TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); +} + +/***********************************************************//** +Parses a redo log record of erasing of an undo page end. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_erase_page_end( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr __attribute__((unused)), /*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + if (page == NULL) { + + return(ptr); + } + + trx_undo_erase_page_end(page, mtr); + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Writes information to an undo log about an insert, update, or a delete marking +of a clustered index record. This information is used in a rollback of the +transaction and in consistent reads that must look to the history of this +transaction. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ulint +trx_undo_report_row_operation( +/*==========================*/ + ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is + set, does nothing */ + ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or + TRX_UNDO_MODIFY_OP */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* clust_entry, /*!< in: in the case of an insert, + index entry to insert into the + clustered index, otherwise NULL */ + const upd_t* update, /*!< in: in the case of an update, + the update vector, otherwise NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + const rec_t* rec, /*!< in: in case of an update or delete + marking, the record in the clustered + index, otherwise NULL */ + roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the + inserted undo log record, + 0 if BTR_NO_UNDO_LOG + flag was specified */ +{ + trx_t* trx; + trx_undo_t* undo; + ulint page_no; + buf_block_t* undo_block; + trx_rseg_t* rseg; + mtr_t mtr; + ulint err = DB_SUCCESS; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; +#ifdef UNIV_DEBUG + int loop_count = 0; +#endif /* UNIV_DEBUG */ + rec_offs_init(offsets_); + + ut_a(dict_index_is_clust(index)); + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + + *roll_ptr = 0; + + return(DB_SUCCESS); + } + + ut_ad(thr); + ut_ad((op_type != TRX_UNDO_INSERT_OP) + || (clust_entry && !update && !rec)); + + trx = thr_get_trx(thr); + rseg = trx->rseg; + + mutex_enter(&(trx->undo_mutex)); + + /* If the undo log is not assigned yet, assign one */ + + if (op_type == TRX_UNDO_INSERT_OP) { + + if (trx->insert_undo == NULL) { + + err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT); + } + + undo = trx->insert_undo; + + if (UNIV_UNLIKELY(!undo)) { + /* Did not succeed */ + ut_ad(err != DB_SUCCESS); + mutex_exit(&(trx->undo_mutex)); + + return(err); + } + + ut_ad(err == DB_SUCCESS); + } else { + ut_ad(op_type == TRX_UNDO_MODIFY_OP); + + if (trx->update_undo == NULL) { + + err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + + } + + undo = trx->update_undo; + + if (UNIV_UNLIKELY(!undo)) { + /* Did not succeed */ + ut_ad(err != DB_SUCCESS); + mutex_exit(&(trx->undo_mutex)); + return(err); + } + + ut_ad(err == DB_SUCCESS); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + } + + mtr_start(&mtr); + + page_no = undo->last_page_no; + undo_block = buf_page_get_gen( + undo->space, undo->zip_size, page_no, RW_X_LATCH, + undo->guess_block, BUF_GET, __FILE__, __LINE__, &mtr); + buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE); + + do { + page_t* undo_page; + ulint offset; + + undo_page = buf_block_get_frame(undo_block); + ut_ad(page_no == buf_block_get_page_no(undo_block)); + + if (op_type == TRX_UNDO_INSERT_OP) { + offset = trx_undo_page_report_insert( + undo_page, trx, index, clust_entry, &mtr); + } else { + offset = trx_undo_page_report_modify( + undo_page, trx, index, rec, offsets, update, + cmpl_info, &mtr); + } + + if (UNIV_UNLIKELY(offset == 0)) { + /* The record did not fit on the page. We erase the + end segment of the undo log page and write a log + record of it: this is to ensure that in the debug + version the replicate page constructed using the log + records stays identical to the original page */ + + if (!trx_undo_erase_page_end(undo_page, &mtr)) { + /* The record did not fit on an empty + undo page. Discard the freshly allocated + page and return an error. */ + + /* When we remove a page from an undo + log, this is analogous to a + pessimistic insert in a B-tree, and we + must reserve the counterpart of the + tree latch, which is the rseg + mutex. We must commit the mini-transaction + first, because it may be holding lower-level + latches, such as SYNC_FSP and SYNC_FSP_PAGE. */ + + mtr_commit(&mtr); + mtr_start(&mtr); + + mutex_enter(&rseg->mutex); + trx_undo_free_last_page(trx, undo, &mtr); + mutex_exit(&rseg->mutex); + + err = DB_UNDO_RECORD_TOO_BIG; + goto err_exit; + } + + mtr_commit(&mtr); + } else { + /* Success */ + + mtr_commit(&mtr); + + undo->empty = FALSE; + undo->top_page_no = page_no; + undo->top_offset = offset; + undo->top_undo_no = trx->undo_no; + undo->guess_block = undo_block; + + trx->undo_no++; + + mutex_exit(&trx->undo_mutex); + + *roll_ptr = trx_undo_build_roll_ptr( + op_type == TRX_UNDO_INSERT_OP, + rseg->id, page_no, offset); + err = DB_SUCCESS; + goto func_exit; + } + + ut_ad(page_no == undo->last_page_no); + + /* We have to extend the undo log by one page */ + + ut_ad(++loop_count < 2); + mtr_start(&mtr); + + /* When we add a page to an undo log, this is analogous to + a pessimistic insert in a B-tree, and we must reserve the + counterpart of the tree latch, which is the rseg mutex. */ + + mutex_enter(&rseg->mutex); + undo_block = trx_undo_add_page(trx, undo, &mtr); + mutex_exit(&rseg->mutex); + page_no = undo->last_page_no; + } while (undo_block != NULL); + + /* Did not succeed: out of space */ + err = DB_OUT_OF_FILE_SPACE; + +err_exit: + mutex_exit(&trx->undo_mutex); + mtr_commit(&mtr); +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/ + +/******************************************************************//** +Copies an undo record to heap. This function can be called if we know that +the undo log record exists. +@return own: copy of the record */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_undo_rec_low( +/*======================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + trx_undo_rec_t* undo_rec; + ulint rseg_id; + ulint page_no; + ulint offset; + const page_t* undo_page; + trx_rseg_t* rseg; + ibool is_insert; + mtr_t mtr; + + trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, + &offset); + rseg = trx_rseg_get_on_id(rseg_id); + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size, + page_no, &mtr); + + undo_rec = trx_undo_rec_copy(undo_page + offset, heap); + + mtr_commit(&mtr); + + return(undo_rec); +} + +/******************************************************************//** +Copies an undo record to heap. + +NOTE: the caller must have latches on the clustered index page and +purge_view. + +@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been +truncated and we cannot fetch the old version */ +UNIV_INTERN +ulint +trx_undo_get_undo_rec( +/*==================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ + trx_id_t trx_id, /*!< in: id of the trx that generated + the roll pointer: it points to an + undo log of this transaction */ + trx_undo_rec_t** undo_rec, /*!< out, own: copy of the record */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!trx_purge_update_undo_must_exist(trx_id)) { + + /* It may be that the necessary undo log has already been + deleted */ + + return(DB_MISSING_HISTORY); + } + + *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); + + return(DB_SUCCESS); +} + +/*******************************************************************//** +Build a previous version of a clustered index record. This function checks +that the caller has a latch on the index page of the clustered index record +and an s-latch on the purge_view. This guarantees that the stack of versions +is locked all the way down to the purge_view. +@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is +earlier than purge_view, which means that it may have been removed, +DB_ERROR if corrupted record */ +UNIV_INTERN +ulint +trx_undo_prev_version_build( +/*========================*/ + const rec_t* index_rec,/*!< in: clustered index record in the + index tree */ + mtr_t* index_mtr __attribute__((unused)), + /*!< in: mtr which contains the latch to + index_rec page and purge_view */ + const rec_t* rec, /*!< in: version of a clustered index record */ + dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + rec_t** old_vers)/*!< out, own: previous version, or NULL if + rec is the first inserted version, or if + history data has been deleted (an error), + or if the purge COULD have removed the version + though it has not yet done so */ +{ + trx_undo_rec_t* undo_rec = NULL; + dtuple_t* entry; + trx_id_t rec_trx_id; + ulint type; + undo_no_t undo_no; + table_id_t table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + roll_ptr_t old_roll_ptr; + upd_t* update; + byte* ptr; + ulint info_bits; + ulint cmpl_info; + ibool dummy_extern; + byte* buf; + ulint err; +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains_page(index_mtr, index_rec, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!dict_index_is_clust(index)) { + fprintf(stderr, "InnoDB: Error: trying to access" + " update undo rec for non-clustered index %s\n" + "InnoDB: Submit a detailed bug report to" + " http://bugs.mysql.com\n" + "InnoDB: index record ", index->name); + rec_print(stderr, index_rec, index); + fputs("\n" + "InnoDB: record version ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + ut_ad(0); + return(DB_ERROR); + } + + roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); + old_roll_ptr = roll_ptr; + + *old_vers = NULL; + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + /* The record rec is the first inserted version */ + + return(DB_SUCCESS); + } + + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + + err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + /* The undo record may already have been purged. + This should never happen in InnoDB. */ + + return(err); + } + + ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + /* (a) If a clustered index record version is such that the + trx id stamp in it is bigger than purge_sys->view, then the + BLOBs in that version are known to exist (the purge has not + progressed that far); + + (b) if the version is the first version such that trx id in it + is less than purge_sys->view, and it is not delete-marked, + then the BLOBs in that version are known to exist (the purge + cannot have purged the BLOBs referenced by that version + yet). + + This function does not fetch any BLOBs. The callers might, by + possibly invoking row_ext_create() via row_build(). However, + they should have all needed information in the *old_vers + returned by this function. This is because *old_vers is based + on the transaction undo log records. The function + trx_undo_page_fetch_ext() will write BLOB prefixes to the + transaction undo log that are at least as long as the longest + possible column prefix in a secondary index. Thus, secondary + index entries for *old_vers can be constructed without + dereferencing any BLOB pointers. */ + + ptr = trx_undo_rec_skip_row_ref(ptr, index); + + ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, + roll_ptr, info_bits, + NULL, heap, &update); + + if (UNIV_UNLIKELY(table_id != index->table->id)) { + ptr = NULL; + + fprintf(stderr, + "InnoDB: Error: trying to access update undo rec" + " for table %s\n" + "InnoDB: but the table id in the" + " undo record is wrong\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n" + "InnoDB: Run also CHECK TABLE %s\n", + index->table_name, index->table_name); + } + + if (ptr == NULL) { + /* The record was corrupted, return an error; these printfs + should catch an elusive bug in row_vers_old_has_index_entry */ + + fprintf(stderr, + "InnoDB: table %s, index %s, n_uniq %lu\n" + "InnoDB: undo rec address %p, type %lu cmpl_info %lu\n" + "InnoDB: undo rec table id %llu," + " index table id %llu\n" + "InnoDB: dump of 150 bytes in undo rec: ", + index->table_name, index->name, + (ulong) dict_index_get_n_unique(index), + undo_rec, (ulong) type, (ulong) cmpl_info, + (ullint) table_id, + (ullint) index->table->id); + ut_print_buf(stderr, undo_rec, 150); + fputs("\n" + "InnoDB: index record ", stderr); + rec_print(stderr, index_rec, index); + fputs("\n" + "InnoDB: record version ", stderr); + rec_print_new(stderr, rec, offsets); + fprintf(stderr, "\n" + "InnoDB: Record trx id " TRX_ID_FMT + ", update rec trx id " TRX_ID_FMT "\n" + "InnoDB: Roll ptr in rec " TRX_ID_FMT + ", in update rec" TRX_ID_FMT "\n", + (ullint) rec_trx_id, (ullint) trx_id, + (ullint) old_roll_ptr, (ullint) roll_ptr); + + trx_purge_sys_print(); + ut_ad(0); + return(DB_ERROR); + } + +# ifdef UNIV_BLOB_NULL_DEBUG + ut_a(!rec_offs_any_null_extern(rec, offsets)); +# endif /* UNIV_BLOB_NULL_DEBUG */ + + if (row_upd_changes_field_size_or_external(index, offsets, update)) { + ulint n_ext; + + /* We should confirm the existence of disowned external data, + if the previous version record is delete marked. If the trx_id + of the previous record is seen by purge view, we should treat + it as missing history, because the disowned external data + might be purged already. + + The inherited external data (BLOBs) can be freed (purged) + after trx_id was committed, provided that no view was started + before trx_id. If the purge view can see the committed + delete-marked record by trx_id, no transactions need to access + the BLOB. */ + + if ((update->info_bits & REC_INFO_DELETED_FLAG) + && read_view_sees_trx_id(purge_sys->view, trx_id)) { + /* treat as a fresh insert, not to + cause assertion error at the caller. */ + return(DB_SUCCESS); + } + + /* We have to set the appropriate extern storage bits in the + old version of the record: the extern bits in rec for those + fields that update does NOT update, as well as the bits for + those fields that update updates to become externally stored + fields. Store the info: */ + + entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, + offsets, &n_ext, heap); + n_ext += btr_push_update_extern_fields(entry, update, heap); + /* The page containing the clustered index record + corresponding to entry is latched in mtr. Thus the + following call is safe. */ + row_upd_index_replace_new_col_vals(entry, index, update, heap); + + buf = mem_heap_alloc(heap, rec_get_converted_size(index, entry, + n_ext)); + + *old_vers = rec_convert_dtuple_to_rec(buf, index, + entry, n_ext); + } else { + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + *old_vers = rec_copy(buf, rec, offsets); + rec_offs_make_valid(*old_vers, index, offsets); + row_upd_rec_in_place(*old_vers, index, offsets, update, NULL); + } + + return(DB_SUCCESS); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/trx/trx0roll.c b/storage/xtradb/trx/trx0roll.c new file mode 100644 index 00000000000..25c1d5d4692 --- /dev/null +++ b/storage/xtradb/trx/trx0roll.c @@ -0,0 +1,1357 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0roll.c +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0roll.h" + +#ifdef UNIV_NONINL +#include "trx0roll.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0undo.h" +#include "trx0rec.h" +#include "que0que.h" +#include "usr0sess.h" +#include "srv0start.h" +#include "row0undo.h" +#include "row0mysql.h" +#include "lock0lock.h" +#include "pars0pars.h" + +/** This many pages must be undone before a truncate is tried within +rollback */ +#define TRX_ROLL_TRUNC_THRESHOLD 1 + +/** In crash recovery, the current trx to be rolled back; NULL otherwise */ +static const trx_t* trx_roll_crash_recv_trx = NULL; + +/** In crash recovery we set this to the undo n:o of the current trx to be +rolled back. Then we can print how many % the rollback has progressed. */ +static undo_no_t trx_roll_max_undo_no; + +/** Auxiliary variable which tells the previous progress % we printed */ +static ulint trx_roll_progress_printed_pct; + +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +trx_general_rollback_for_mysql( +/*===========================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if + partial rollback requested, or NULL for + complete rollback */ +{ + mem_heap_t* heap; + que_thr_t* thr; + roll_node_t* roll_node; + + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + trx_start_if_not_started(trx); + + heap = mem_heap_create(512); + + roll_node = roll_node_create(heap); + + if (savept) { + roll_node->partial = TRUE; + roll_node->savept = *savept; + } + + trx->error_state = DB_SUCCESS; + + thr = pars_complete_graph_for_exec(roll_node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + que_run_threads(thr); + + mutex_enter(&kernel_mutex); + + while (trx->que_state != TRX_QUE_RUNNING) { + + mutex_exit(&kernel_mutex); + + os_thread_sleep(100000); + + mutex_enter(&kernel_mutex); + } + + mutex_exit(&kernel_mutex); + + mem_heap_free(heap); + + ut_a(trx->error_state == DB_SUCCESS); + + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + return((int) trx->error_state); +} + +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +trx_rollback_for_mysql( +/*===================*/ + trx_t* trx) /*!< in: transaction handle */ +{ + int err; + + if (trx->state == TRX_NOT_STARTED) { + + return(DB_SUCCESS); + } + + trx->op_info = "rollback"; + + /* If we are doing the XA recovery of prepared transactions, then + the transaction object does not have an InnoDB session object, and we + set a dummy session that we use for all MySQL transactions. */ + + err = trx_general_rollback_for_mysql(trx, NULL); + + trx->op_info = ""; + + return(err); +} + +/*******************************************************************//** +Rollback the latest SQL statement for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +trx_rollback_last_sql_stat_for_mysql( +/*=================================*/ + trx_t* trx) /*!< in: transaction handle */ +{ + int err; + + if (trx->state == TRX_NOT_STARTED) { + + return(DB_SUCCESS); + } + + trx->op_info = "rollback of SQL statement"; + + err = trx_general_rollback_for_mysql(trx, &trx->last_sql_stat_start); + /* The following call should not be needed, but we play safe: */ + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); +} + +/*******************************************************************//** +Frees a single savepoint struct. */ +UNIV_INTERN +void +trx_roll_savepoint_free( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_named_savept_t* savep) /*!< in: savepoint to free */ +{ + ut_a(savep != NULL); + ut_a(UT_LIST_GET_LEN(trx->trx_savepoints) > 0); + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + mem_free(savep->name); + mem_free(savep); +} + +/*******************************************************************//** +Frees savepoint structs starting from savep, if savep == NULL then +free all savepoints. */ +UNIV_INTERN +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_named_savept_t* savep) /*!< in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ +{ + trx_named_savept_t* next_savep; + + if (savep == NULL) { + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + } else { + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + while (savep != NULL) { + next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + + trx_roll_savepoint_free(trx, savep); + + savep = next_savep; + } +} + +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +UNIV_INTERN +ulint +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + trx_named_savept_t* savep; + ulint err; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep == NULL) { + + return(DB_NO_SAVEPOINT); + } + + if (trx->state == TRX_NOT_STARTED) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: transaction has a savepoint ", stderr); + ut_print_name(stderr, trx, FALSE, savep->name); + fputs(" though it is not started\n", stderr); + return(DB_ERROR); + } + + /* We can now free all savepoints strictly later than this one */ + + trx_roll_savepoints_free(trx, savep); + + *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; + + trx->op_info = "rollback to a savepoint"; + + err = trx_general_rollback_for_mysql(trx, &savep->savept); + + /* Store the current undo_no of the transaction so that we know where + to roll back if we have to roll back the next SQL statement: */ + + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); +} + +/*******************************************************************//** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. +@return always DB_SUCCESS */ +UNIV_INTERN +ulint +trx_savepoint_for_mysql( +/*====================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ +{ + trx_named_savept_t* savep; + + ut_a(trx); + ut_a(savepoint_name); + + trx_start_if_not_started(trx); + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep) { + /* There is a savepoint with the same name: free that */ + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + + mem_free(savep->name); + mem_free(savep); + } + + /* Create a new savepoint and add it as the last in the list */ + + savep = mem_alloc(sizeof(trx_named_savept_t)); + + savep->name = mem_strdup(savepoint_name); + + savep->savept = trx_savept_take(trx); + + savep->mysql_binlog_cache_pos = binlog_cache_pos; + + UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep); + + return(DB_SUCCESS); +} + +/*******************************************************************//** +Releases only the named savepoint. Savepoints which were set after this +savepoint are left as is. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +UNIV_INTERN +ulint +trx_release_savepoint_for_mysql( +/*============================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name) /*!< in: savepoint name */ +{ + trx_named_savept_t* savep; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + /* Search for the savepoint by name and free if found. */ + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + trx_roll_savepoint_free(trx, savep); + return(DB_SUCCESS); + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + return(DB_NO_SAVEPOINT); +} + +/*******************************************************************//** +Determines if this transaction is rolling back an incomplete transaction +in crash recovery. +@return TRUE if trx is an incomplete transaction that is being rolled +back in crash recovery */ +UNIV_INTERN +ibool +trx_is_recv( +/*========*/ + const trx_t* trx) /*!< in: transaction */ +{ + return(trx == trx_roll_crash_recv_trx); +} + +/*******************************************************************//** +Returns a transaction savepoint taken at this point in time. +@return savepoint */ +UNIV_INTERN +trx_savept_t +trx_savept_take( +/*============*/ + trx_t* trx) /*!< in: transaction */ +{ + trx_savept_t savept; + + savept.least_undo_no = trx->undo_no; + + return(savept); +} + +/*******************************************************************//** +Roll back an active transaction. */ +static +void +trx_rollback_active( +/*================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + roll_node_t* roll_node; + dict_table_t* table; + ib_int64_t rows_to_undo; + const char* unit = ""; + ibool dictionary_locked = FALSE; + + heap = mem_heap_create(512); + + fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + + roll_node = roll_node_create(heap); + + thr->child = roll_node; + roll_node->common.parent = thr; + + mutex_enter(&kernel_mutex); + + trx->graph = fork; + + ut_a(thr == que_fork_start_command(fork)); + + trx_roll_crash_recv_trx = trx; + trx_roll_max_undo_no = trx->undo_no; + trx_roll_progress_printed_pct = 0; + rows_to_undo = trx_roll_max_undo_no; + + if (rows_to_undo > 1000000000) { + rows_to_undo = rows_to_undo / 1000000; + unit = "M"; + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s" + " rows to undo\n", + (ullint) trx->id, + (ulong) rows_to_undo, unit); + mutex_exit(&kernel_mutex); + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + row_mysql_lock_data_dictionary(trx); + dictionary_locked = TRUE; + } + + que_run_threads(thr); + + mutex_enter(&kernel_mutex); + + while (trx->que_state != TRX_QUE_RUNNING) { + + mutex_exit(&kernel_mutex); + + fprintf(stderr, + "InnoDB: Waiting for rollback of trx id " + TRX_ID_FMT " to end\n", + (ullint) trx->id); + os_thread_sleep(100000); + + mutex_enter(&kernel_mutex); + } + + mutex_exit(&kernel_mutex); + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE + && trx->table_id != 0) { + + /* If the transaction was for a dictionary operation, we + drop the relevant table, if it still exists */ + + fprintf(stderr, + "InnoDB: Dropping table with id %llu" + " in recovery if it exists\n", + (ullint) trx->table_id); + + table = dict_table_get_on_id_low(trx->table_id); + + if (table) { + ulint err; + + fputs("InnoDB: Table found: dropping table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" in recovery\n", stderr); + + err = row_drop_table_for_mysql(table->name, trx, TRUE); + trx_commit_for_mysql(trx); + + ut_a(err == (int) DB_SUCCESS); + } + } + + if (dictionary_locked) { + row_mysql_unlock_data_dictionary(trx); + } + + fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT + " completed\n", + (ullint) trx->id); + mem_heap_free(heap); + + trx_roll_crash_recv_trx = NULL; +} + +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. */ +UNIV_INTERN +void +trx_rollback_or_clean_recovered( +/*============================*/ + ibool all) /*!< in: FALSE=roll back dictionary transactions; + TRUE=roll back all non-PREPARED transactions */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + + if (!UT_LIST_GET_FIRST(trx_sys->trx_list)) { + goto leave_function; + } + + if (all) { + fprintf(stderr, + "InnoDB: Starting in background the rollback" + " of uncommitted transactions\n"); + } + + mutex_exit(&kernel_mutex); + +loop: + mutex_enter(&kernel_mutex); + + for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + if (!trx->is_recovered) { + continue; + } + + switch (trx->state) { + case TRX_NOT_STARTED: + case TRX_PREPARED: + continue; + + case TRX_COMMITTED_IN_MEMORY: + mutex_exit(&kernel_mutex); + fprintf(stderr, + "InnoDB: Cleaning up trx with id " + TRX_ID_FMT "\n", + (ullint) trx->id); + trx_cleanup_at_db_startup(trx); + goto loop; + + case TRX_ACTIVE: + if (all || trx_get_dict_operation(trx) + != TRX_DICT_OP_NONE) { + mutex_exit(&kernel_mutex); + trx_rollback_active(trx); + goto loop; + } + } + } + + if (all) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Rollback of non-prepared" + " transactions completed\n"); + } + +leave_function: + mutex_exit(&kernel_mutex); +} + +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +trx_rollback_or_clean_all_recovered( +/*================================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ +#ifdef UNIV_PFS_THREAD + pfs_register_thread(trx_rollback_clean_thread_key); +#endif /* UNIV_PFS_THREAD */ + + trx_rollback_or_clean_recovered(TRUE); + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} + +/*******************************************************************//** +Creates an undo number array. +@return own: undo number array */ +UNIV_INTERN +trx_undo_arr_t* +trx_undo_arr_create(void) +/*=====================*/ +{ + trx_undo_arr_t* arr; + mem_heap_t* heap; + ulint i; + + heap = mem_heap_create(1024); + + arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t)); + + arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t) + * UNIV_MAX_PARALLELISM); + arr->n_cells = UNIV_MAX_PARALLELISM; + arr->n_used = 0; + + arr->heap = heap; + + for (i = 0; i < UNIV_MAX_PARALLELISM; i++) { + + (trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE; + } + + return(arr); +} + +/*******************************************************************//** +Frees an undo number array. */ +UNIV_INTERN +void +trx_undo_arr_free( +/*==============*/ + trx_undo_arr_t* arr) /*!< in: undo number array */ +{ + ut_ad(arr->n_used == 0); + + mem_heap_free(arr->heap); +} + +/*******************************************************************//** +Stores info of an undo log record to the array if it is not stored yet. +@return FALSE if the record already existed in the array */ +static +ibool +trx_undo_arr_store_info( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + undo_no_t undo_no)/*!< in: undo number */ +{ + trx_undo_inf_t* cell; + trx_undo_inf_t* stored_here; + trx_undo_arr_t* arr; + ulint n_used; + ulint n; + ulint i; + + n = 0; + arr = trx->undo_no_arr; + n_used = arr->n_used; + stored_here = NULL; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (!cell->in_use) { + if (!stored_here) { + /* Not in use, we may store here */ + cell->undo_no = undo_no; + cell->in_use = TRUE; + + arr->n_used++; + + stored_here = cell; + } + } else { + n++; + + if (cell->undo_no == undo_no) { + + if (stored_here) { + stored_here->in_use = FALSE; + ut_ad(arr->n_used > 0); + arr->n_used--; + } + + ut_ad(arr->n_used == n_used); + + return(FALSE); + } + } + + if (n == n_used && stored_here) { + + ut_ad(arr->n_used == 1 + n_used); + + return(TRUE); + } + } +} + +/*******************************************************************//** +Removes an undo number from the array. */ +static +void +trx_undo_arr_remove_info( +/*=====================*/ + trx_undo_arr_t* arr, /*!< in: undo number array */ + undo_no_t undo_no)/*!< in: undo number */ +{ + trx_undo_inf_t* cell; + ulint i; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use + && cell->undo_no == undo_no) { + + cell->in_use = FALSE; + + ut_ad(arr->n_used > 0); + + arr->n_used--; + + return; + } + } +} + +/*******************************************************************//** +Gets the biggest undo number in an array. +@return biggest value, 0 if the array is empty */ +static +undo_no_t +trx_undo_arr_get_biggest( +/*=====================*/ + trx_undo_arr_t* arr) /*!< in: undo number array */ +{ + trx_undo_inf_t* cell; + ulint n_used; + undo_no_t biggest; + ulint n; + ulint i; + + n = 0; + n_used = arr->n_used; + biggest = 0; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use) { + n++; + if (cell->undo_no > biggest) { + + biggest = cell->undo_no; + } + } + + if (n == n_used) { + return(biggest); + } + } +} + +/***********************************************************************//** +Tries truncate the undo logs. */ +UNIV_INTERN +void +trx_roll_try_truncate( +/*==================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + trx_undo_arr_t* arr; + undo_no_t limit; + undo_no_t biggest; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&((trx->rseg)->mutex))); + + trx->pages_undone = 0; + + arr = trx->undo_no_arr; + + limit = trx->undo_no; + + if (arr->n_used > 0) { + biggest = trx_undo_arr_get_biggest(arr); + + if (biggest >= limit) { + + limit = biggest + 1; + } + } + + if (trx->insert_undo) { + trx_undo_truncate_end(trx, trx->insert_undo, limit); + } + + if (trx->update_undo) { + trx_undo_truncate_end(trx, trx->update_undo, limit); + } +} + +/***********************************************************************//** +Pops the topmost undo log record in a single undo log and updates the info +about the topmost record in the undo log memory struct. +@return undo log record, the page s-latched */ +static +trx_undo_rec_t* +trx_roll_pop_top_rec( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* undo_page; + ulint offset; + trx_undo_rec_t* prev_rec; + page_t* prev_rec_page; + + ut_ad(mutex_own(&(trx->undo_mutex))); + + undo_page = trx_undo_page_get_s_latched(undo->space, undo->zip_size, + undo->top_page_no, mtr); + offset = undo->top_offset; + + /* fprintf(stderr, "Thread %lu undoing trx " TRX_ID_FMT + " undo record " TRX_ID_FMT "\n", + os_thread_get_curr_id(), trx->id, undo->top_undo_no); */ + + prev_rec = trx_undo_get_prev_rec(undo_page + offset, + undo->hdr_page_no, undo->hdr_offset, + mtr); + if (prev_rec == NULL) { + + undo->empty = TRUE; + } else { + prev_rec_page = page_align(prev_rec); + + if (prev_rec_page != undo_page) { + + trx->pages_undone++; + } + + undo->top_page_no = page_get_page_no(prev_rec_page); + undo->top_offset = prev_rec - prev_rec_page; + undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); + } + + return(undo_page + offset); +} + +/********************************************************************//** +Pops the topmost record when the two undo logs of a transaction are seen +as a single stack of records ordered by their undo numbers. Inserts the +undo number of the popped undo record to the array of currently processed +undo numbers in the transaction. When the query thread finishes processing +of this undo record, it must be released with trx_undo_rec_release. +@return undo log record copied to heap, NULL if none left, or if the +undo number of the top record would be less than the limit */ +UNIV_INTERN +trx_undo_rec_t* +trx_roll_pop_top_rec_of_trx( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + undo_no_t limit, /*!< in: least undo number we need */ + roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + trx_undo_t* undo; + trx_undo_t* ins_undo; + trx_undo_t* upd_undo; + trx_undo_rec_t* undo_rec; + trx_undo_rec_t* undo_rec_copy; + undo_no_t undo_no; + ibool is_insert; + trx_rseg_t* rseg; + ulint progress_pct; + mtr_t mtr; + + rseg = trx->rseg; +try_again: + mutex_enter(&(trx->undo_mutex)); + + if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) { + mutex_enter(&(rseg->mutex)); + + trx_roll_try_truncate(trx); + + mutex_exit(&(rseg->mutex)); + } + + ins_undo = trx->insert_undo; + upd_undo = trx->update_undo; + + if (!ins_undo || ins_undo->empty) { + undo = upd_undo; + } else if (!upd_undo || upd_undo->empty) { + undo = ins_undo; + } else if (upd_undo->top_undo_no > ins_undo->top_undo_no) { + undo = upd_undo; + } else { + undo = ins_undo; + } + + if (!undo || undo->empty + || limit > undo->top_undo_no) { + + if ((trx->undo_no_arr)->n_used == 0) { + /* Rollback is ending */ + + mutex_enter(&(rseg->mutex)); + + trx_roll_try_truncate(trx); + + mutex_exit(&(rseg->mutex)); + } + + mutex_exit(&(trx->undo_mutex)); + + return(NULL); + } + + if (undo == ins_undo) { + is_insert = TRUE; + } else { + is_insert = FALSE; + } + + *roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id, + undo->top_page_no, + undo->top_offset); + mtr_start(&mtr); + + undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr); + + undo_no = trx_undo_rec_get_undo_no(undo_rec); + + ut_ad(undo_no + 1 == trx->undo_no); + + /* We print rollback progress info if we are in a crash recovery + and the transaction has at least 1000 row operations to undo. */ + + if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) { + + progress_pct = 100 - (ulint) + ((undo_no * 100) / trx_roll_max_undo_no); + if (progress_pct != trx_roll_progress_printed_pct) { + if (trx_roll_progress_printed_pct == 0) { + fprintf(stderr, + "\nInnoDB: Progress in percents:" + " %lu", (ulong) progress_pct); + } else { + fprintf(stderr, + " %lu", (ulong) progress_pct); + } + fflush(stderr); + trx_roll_progress_printed_pct = progress_pct; + } + } + + trx->undo_no = undo_no; + + if (!trx_undo_arr_store_info(trx, undo_no)) { + /* A query thread is already processing this undo log record */ + + mutex_exit(&(trx->undo_mutex)); + + mtr_commit(&mtr); + + goto try_again; + } + + undo_rec_copy = trx_undo_rec_copy(undo_rec, heap); + + mutex_exit(&(trx->undo_mutex)); + + mtr_commit(&mtr); + + return(undo_rec_copy); +} + +/********************************************************************//** +Reserves an undo log record for a query thread to undo. This should be +called if the query thread gets the undo log record not using the pop +function above. +@return TRUE if succeeded */ +UNIV_INTERN +ibool +trx_undo_rec_reserve( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + undo_no_t undo_no)/*!< in: undo number of the record */ +{ + ibool ret; + + mutex_enter(&(trx->undo_mutex)); + + ret = trx_undo_arr_store_info(trx, undo_no); + + mutex_exit(&(trx->undo_mutex)); + + return(ret); +} + +/*******************************************************************//** +Releases a reserved undo record. */ +UNIV_INTERN +void +trx_undo_rec_release( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + undo_no_t undo_no)/*!< in: undo number */ +{ + trx_undo_arr_t* arr; + + mutex_enter(&(trx->undo_mutex)); + + arr = trx->undo_no_arr; + + trx_undo_arr_remove_info(arr, undo_no); + + mutex_exit(&(trx->undo_mutex)); +} + +/*********************************************************************//** +Starts a rollback operation. */ +UNIV_INTERN +void +trx_rollback( +/*=========*/ + trx_t* trx, /*!< in: transaction */ + trx_sig_t* sig, /*!< in: signal starting the rollback */ + que_thr_t** next_thr)/*!< in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the passed value is + NULL, the parameter is ignored */ +{ + que_t* roll_graph; + que_thr_t* thr; + /* que_thr_t* thr2; */ + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0)); + + /* Initialize the rollback field in the transaction */ + + switch (sig->type) { + case TRX_SIG_TOTAL_ROLLBACK: + trx->roll_limit = 0; + break; + case TRX_SIG_ROLLBACK_TO_SAVEPT: + trx->roll_limit = (sig->savept).least_undo_no; + break; + case TRX_SIG_ERROR_OCCURRED: + trx->roll_limit = trx->last_sql_stat_start.least_undo_no; + break; + default: + ut_error; + } + + ut_a(trx->roll_limit <= trx->undo_no); + + trx->pages_undone = 0; + + if (trx->undo_no_arr == NULL) { + trx->undo_no_arr = trx_undo_arr_create(); + } + + /* Build a 'query' graph which will perform the undo operations */ + + roll_graph = trx_roll_graph_build(trx); + + trx->graph = roll_graph; + trx->que_state = TRX_QUE_ROLLING_BACK; + + thr = que_fork_start_command(roll_graph); + + ut_ad(thr); + + /* thr2 = que_fork_start_command(roll_graph); + + ut_ad(thr2); */ + + if (next_thr && (*next_thr == NULL)) { + *next_thr = thr; + /* srv_que_task_enqueue_low(thr2); */ + } else { + srv_que_task_enqueue_low(thr); + /* srv_que_task_enqueue_low(thr2); */ + } +} + +/****************************************************************//** +Builds an undo 'query' graph for a transaction. The actual rollback is +performed by executing this query graph like a query subprocedure call. +The reply about the completion of the rollback will be sent by this +graph. +@return own: the query graph */ +UNIV_INTERN +que_t* +trx_roll_graph_build( +/*=================*/ + trx_t* trx) /*!< in: trx handle */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + /* que_thr_t* thr2; */ + + ut_ad(mutex_own(&kernel_mutex)); + + heap = mem_heap_create(512); + fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + /* thr2 = que_thr_create(fork, heap); */ + + thr->child = row_undo_node_create(trx, thr, heap); + /* thr2->child = row_undo_node_create(trx, thr2, heap); */ + + return(fork); +} + +/*********************************************************************//** +Finishes error processing after the necessary partial rollback has been +done. */ +static +void +trx_finish_error_processing( +/*========================*/ + trx_t* trx) /*!< in: transaction */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + + ut_ad(mutex_own(&kernel_mutex)); + + sig = UT_LIST_GET_FIRST(trx->signals); + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_ERROR_OCCURRED) { + + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/*********************************************************************//** +Finishes a partial rollback operation. */ +static +void +trx_finish_partial_rollback_off_kernel( +/*===================================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t** next_thr)/*!< in/out: next query thread to run; + if the value which is passed in is a pointer + to a NULL pointer, then the calling function + can start running a new query thread; if this + parameter is NULL, it is ignored */ +{ + trx_sig_t* sig; + + ut_ad(mutex_own(&kernel_mutex)); + + sig = UT_LIST_GET_FIRST(trx->signals); + + /* Remove the signal from the signal queue and send reply message + to it */ + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + + trx->que_state = TRX_QUE_RUNNING; +} + +/****************************************************************//** +Finishes a transaction rollback. */ +UNIV_INTERN +void +trx_finish_rollback_off_kernel( +/*===========================*/ + que_t* graph, /*!< in: undo graph which can now be freed */ + trx_t* trx, /*!< in: transaction */ + que_thr_t** next_thr)/*!< in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if this parameter is + NULL, it is ignored */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + + ut_ad(mutex_own(&kernel_mutex)); + + ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0); + + /* Free the memory reserved by the undo graph */ + que_graph_free(graph); + + sig = UT_LIST_GET_FIRST(trx->signals); + + if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) { + + trx_finish_partial_rollback_off_kernel(trx, next_thr); + + return; + + } else if (sig->type == TRX_SIG_ERROR_OCCURRED) { + + trx_finish_error_processing(trx); + + return; + } + +#ifdef UNIV_DEBUG + if (lock_print_waits) { + fprintf(stderr, "Trx " TRX_ID_FMT " rollback finished\n", + (ullint) trx->id); + } +#endif /* UNIV_DEBUG */ + + trx_commit_off_kernel(trx); + + /* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and + send reply messages to them */ + + trx->que_state = TRX_QUE_RUNNING; + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { + + trx_sig_reply(sig, next_thr); + + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } +} + +/*********************************************************************//** +Creates a rollback command node struct. +@return own: rollback node struct */ +UNIV_INTERN +roll_node_t* +roll_node_create( +/*=============*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + roll_node_t* node; + + node = mem_heap_alloc(heap, sizeof(roll_node_t)); + node->common.type = QUE_NODE_ROLLBACK; + node->state = ROLL_NODE_SEND; + + node->partial = FALSE; + + return(node); +} + +/***********************************************************//** +Performs an execution step for a rollback command node in a query graph. +@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_rollback_step( +/*==============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + roll_node_t* node; + ulint sig_no; + trx_savept_t* savept; + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = ROLL_NODE_SEND; + } + + if (node->state == ROLL_NODE_SEND) { + mutex_enter(&kernel_mutex); + + node->state = ROLL_NODE_WAIT; + + if (node->partial) { + sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT; + savept = &(node->savept); + } else { + sig_no = TRX_SIG_TOTAL_ROLLBACK; + savept = NULL; + } + + /* Send a rollback signal to the transaction */ + + trx_sig_send(thr_get_trx(thr), sig_no, TRX_SIG_SELF, thr, + savept, NULL); + + thr->state = QUE_THR_SIG_REPLY_WAIT; + + mutex_exit(&kernel_mutex); + + return(NULL); + } + + ut_ad(node->state == ROLL_NODE_WAIT); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/xtradb/trx/trx0rseg.c b/storage/xtradb/trx/trx0rseg.c new file mode 100644 index 00000000000..ed3c27326d4 --- /dev/null +++ b/storage/xtradb/trx/trx0rseg.c @@ -0,0 +1,374 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle Corpn. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0rseg.c +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rseg.h" + +#ifdef UNIV_NONINL +#include "trx0rseg.ic" +#endif + +#include "trx0undo.h" +#include "fut0lst.h" +#include "srv0srv.h" +#include "trx0purge.h" + +#ifdef UNIV_PFS_MUTEX +/* Key to register rseg_mutex_key with performance schema */ +UNIV_INTERN mysql_pfs_key_t rseg_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/******************************************************************//** +Looks for a rollback segment, based on the rollback segment id. +@return rollback segment */ +UNIV_INTERN +trx_rseg_t* +trx_rseg_get_on_id( +/*===============*/ + ulint id) /*!< in: rollback segment id */ +{ + trx_rseg_t* rseg; + + ut_a(id < TRX_SYS_N_RSEGS); + + rseg = trx_sys->rseg_array[id]; + + ut_a(rseg == NULL || id == rseg->id); + + return(rseg); +} + +/****************************************************************//** +Creates a rollback segment header. This function is called only when +a new rollback segment is created in the database. +@return page number of the created segment, FIL_NULL if fail */ +UNIV_INTERN +ulint +trx_rseg_header_create( +/*===================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint max_size, /*!< in: max size in pages */ + ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_no; + trx_rsegf_t* rsegf; + trx_sysf_t* sys_header; + ulint i; + buf_block_t* block; + + ut_ad(mtr); + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL), + MTR_MEMO_X_LOCK)); + + /* Allocate a new file segment for the rollback segment */ + block = fseg_create(space, 0, + TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr); + + if (block == NULL) { + /* No space left */ + + return(FIL_NULL); + } + + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); + + page_no = buf_block_get_page_no(block); + + /* Get the rollback segment file page */ + rsegf = trx_rsegf_get_new(space, zip_size, page_no, mtr); + + /* Initialize max size field */ + mlog_write_ulint(rsegf + TRX_RSEG_MAX_SIZE, max_size, + MLOG_4BYTES, mtr); + + /* Initialize the history list */ + + mlog_write_ulint(rsegf + TRX_RSEG_HISTORY_SIZE, 0, MLOG_4BYTES, mtr); + flst_init(rsegf + TRX_RSEG_HISTORY, mtr); + + /* Reset the undo log slots */ + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + + trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr); + } + + /* Add the rollback segment info to the free slot in + the trx system header */ + + sys_header = trx_sysf_get(mtr); + + trx_sysf_rseg_set_space(sys_header, rseg_slot_no, space, mtr); + trx_sysf_rseg_set_page_no(sys_header, rseg_slot_no, page_no, mtr); + + return(page_no); +} + +/***********************************************************************//** +Free's an instance of the rollback segment in memory. */ +UNIV_INTERN +void +trx_rseg_mem_free( +/*==============*/ + trx_rseg_t* rseg) /* in, own: instance to free */ +{ + trx_undo_t* undo; + + mutex_free(&rseg->mutex); + + /* There can't be any active transactions. */ + ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0); + ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0); + + undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + + while (undo != NULL) { + trx_undo_t* prev_undo = undo; + + undo = UT_LIST_GET_NEXT(undo_list, undo); + UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, prev_undo); + + trx_undo_mem_free(prev_undo); + } + + undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); + + while (undo != NULL) { + trx_undo_t* prev_undo = undo; + + undo = UT_LIST_GET_NEXT(undo_list, undo); + UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, prev_undo); + + trx_undo_mem_free(prev_undo); + } + + trx_sys_set_nth_rseg(trx_sys, rseg->id, NULL); + + mem_free(rseg); +} + +/*************************************************************************** +Creates and initializes a rollback segment object. The values for the +fields are read from the header. The object is inserted to the rseg +list of the trx system object and a pointer is inserted in the rseg +array in the trx system object. +@return own: rollback segment object */ +static +trx_rseg_t* +trx_rseg_mem_create( +/*================*/ + ulint id, /*!< in: rollback segment id */ + ulint space, /*!< in: space where the segment + placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the segment + header */ + ib_bh_t* ib_bh, /*!< in/out: rseg queue */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint len; + trx_rseg_t* rseg; + fil_addr_t node_addr; + trx_rsegf_t* rseg_header; + trx_ulogf_t* undo_log_hdr; + ulint sum_of_undo_sizes; + + ut_ad(mutex_own(&kernel_mutex)); + + rseg = mem_zalloc(sizeof(trx_rseg_t)); + + rseg->id = id; + rseg->space = space; + rseg->zip_size = zip_size; + rseg->page_no = page_no; + + mutex_create(rseg_mutex_key, &rseg->mutex, SYNC_RSEG); + + UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg); + + trx_sys_set_nth_rseg(trx_sys, id, rseg); + + rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr); + + rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE, + MLOG_4BYTES, mtr); + + /* Initialize the undo log lists according to the rseg header */ + + sum_of_undo_sizes = trx_undo_lists_init(rseg); + + rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr) + + 1 + sum_of_undo_sizes; + + len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr); + if (len > 0) { + const void* ptr; + rseg_queue_t rseg_queue; + + trx_sys->rseg_history_len += len; + + node_addr = trx_purge_get_log_from_hist( + flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr)); + rseg->last_page_no = node_addr.page; + rseg->last_offset = node_addr.boffset; + + undo_log_hdr = trx_undo_page_get(rseg->space, rseg->zip_size, + node_addr.page, + mtr) + node_addr.boffset; + + rseg->last_trx_no = mach_read_from_8( + undo_log_hdr + TRX_UNDO_TRX_NO); + rseg->last_del_marks = mtr_read_ulint( + undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr); + + rseg_queue.rseg = rseg; + rseg_queue.trx_no = rseg->last_trx_no; + + if (rseg->last_page_no != FIL_NULL) { + /* There is no need to cover this operation by the purge + mutex because we are still bootstrapping. */ + + ptr = ib_bh_push(ib_bh, &rseg_queue); + ut_a(ptr != NULL); + } + } else { + rseg->last_page_no = FIL_NULL; + } + + return(rseg); +} + +/******************************************************************** +Creates the memory copies for the rollback segments and initializes the +rseg list and array in trx_sys at a database startup. */ +static +void +trx_rseg_create_instance( +/*=====================*/ + trx_sysf_t* sys_header, /*!< in: trx system header */ + ib_bh_t* ib_bh, /*!< in/out: rseg queue */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint i; + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + ulint page_no; + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no == FIL_NULL) { + trx_sys_set_nth_rseg(trx_sys, i, NULL); + } else { + ulint space; + ulint zip_size; + trx_rseg_t* rseg = NULL; + + ut_a(!trx_rseg_get_on_id(i)); + + space = trx_sysf_rseg_get_space(sys_header, i, mtr); + + zip_size = space ? fil_space_get_zip_size(space) : 0; + + rseg = trx_rseg_mem_create( + i, space, zip_size, page_no, ib_bh, mtr); + + ut_a(rseg->id == i); + } + } +} + +/********************************************************************* +Creates a rollback segment. +@return pointer to new rollback segment if create successful */ +UNIV_INTERN +trx_rseg_t* +trx_rseg_create(void) +/*=================*/ +{ + mtr_t mtr; + ulint slot_no; + trx_rseg_t* rseg = NULL; + + mtr_start(&mtr); + + /* To obey the latching order, acquire the file space + x-latch before the kernel mutex. */ + mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr); + + mutex_enter(&kernel_mutex); + + slot_no = trx_sysf_rseg_find_free(&mtr); + + if (slot_no != ULINT_UNDEFINED) { + ulint space; + ulint page_no; + ulint zip_size; + trx_sysf_t* sys_header; + + page_no = trx_rseg_header_create( + TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, &mtr); + + ut_a(page_no != FIL_NULL); + + ut_ad(!trx_rseg_get_on_id(slot_no)); + + sys_header = trx_sysf_get(&mtr); + + space = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr); + + zip_size = space ? fil_space_get_zip_size(space) : 0; + + rseg = trx_rseg_mem_create( + slot_no, space, zip_size, page_no, + purge_sys->ib_bh, &mtr); + } + + mutex_exit(&kernel_mutex); + mtr_commit(&mtr); + + return(rseg); +} + +/******************************************************************** +Initialize the rollback instance list. */ +UNIV_INTERN +void +trx_rseg_list_and_array_init( +/*=========================*/ + trx_sysf_t* sys_header, /*!< in: trx system header */ + ib_bh_t* ib_bh, /*!< in: rseg queue */ + mtr_t* mtr) /*!< in: mtr */ +{ + UT_LIST_INIT(trx_sys->rseg_list); + + trx_sys->rseg_history_len = 0; + + trx_rseg_create_instance(sys_header, ib_bh, mtr); +} + diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c new file mode 100644 index 00000000000..6b230a296c0 --- /dev/null +++ b/storage/xtradb/trx/trx0sys.c @@ -0,0 +1,2049 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0sys.c +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0sys.h" + +#ifdef UNIV_NONINL +#include "trx0sys.ic" +#endif + +#ifndef UNIV_HOTBACKUP +#include "fsp0fsp.h" +#include "mtr0log.h" +#include "mtr0log.h" +#include "trx0trx.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0purge.h" +#include "log0log.h" +#include "log0recv.h" +#include "os0file.h" +#include "read0read.h" + +/** The file format tag structure with id and name. */ +struct file_format_struct { + ulint id; /*!< id of the file format */ + const char* name; /*!< text representation of the + file format */ + mutex_t mutex; /*!< covers changes to the above + fields */ +}; + +/** The file format tag */ +typedef struct file_format_struct file_format_t; + +/** The transaction system */ +UNIV_INTERN trx_sys_t* trx_sys = NULL; +/** The doublewrite buffer */ +UNIV_INTERN trx_doublewrite_t* trx_doublewrite = NULL; + +/** The following is set to TRUE when we are upgrading from pre-4.1 +format data files to the multiple tablespaces format data files */ +UNIV_INTERN ibool trx_doublewrite_must_reset_space_ids = FALSE; +/** Set to TRUE when the doublewrite buffer is being created */ +UNIV_INTERN ibool trx_doublewrite_buf_is_being_created = FALSE; + +/** The following is TRUE when we are using the database in the +post-4.1 format, i.e., we have successfully upgraded, or have created +a new database installation */ +UNIV_INTERN ibool trx_sys_multiple_tablespace_format = FALSE; + +/** In a MySQL replication slave, in crash recovery we store the master log +file name and position here. */ +/* @{ */ +/** Master binlog file name */ +UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN]; +/** Master binlog file position. We have successfully got the updates +up to this position. -1 means that no crash recovery was needed, or +there was no master log position info inside InnoDB.*/ +UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1; +/* @} */ + +UNIV_INTERN char trx_sys_mysql_relay_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN]; +UNIV_INTERN ib_int64_t trx_sys_mysql_relay_log_pos = -1; + +/** If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. */ +/* @{ */ +/** Binlog file name */ +UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +/** Binlog file position, or -1 if unknown */ +UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1; +/* @} */ +#endif /* !UNIV_HOTBACKUP */ + +/** List of animal names representing file format. */ +static const char* file_format_name_map[] = { + "Antelope", + "Barracuda", + "Cheetah", + "Dragon", + "Elk", + "Fox", + "Gazelle", + "Hornet", + "Impala", + "Jaguar", + "Kangaroo", + "Leopard", + "Moose", + "Nautilus", + "Ocelot", + "Porpoise", + "Quail", + "Rabbit", + "Shark", + "Tiger", + "Urchin", + "Viper", + "Whale", + "Xenops", + "Yak", + "Zebra" +}; + +/** The number of elements in the file format name array. */ +static const ulint FILE_FORMAT_NAME_N + = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]); + +#ifdef UNIV_PFS_MUTEX +/* Key to register the mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t trx_doublewrite_mutex_key; +UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_DEBUG +/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ +UNIV_INTERN uint trx_rseg_n_slots_debug = 0; +#endif + +/** This is used to track the maximum file format id known to InnoDB. It's +updated via SET GLOBAL innodb_file_format_max = 'x' or when we open +or create a table. */ +static file_format_t file_format_max; + +/****************************************************************//** +Determines if a page number is located inside the doublewrite buffer. +@return TRUE if the location is inside the two blocks of the +doublewrite buffer */ +UNIV_INTERN +ibool +trx_doublewrite_page_inside( +/*========================*/ + ulint page_no) /*!< in: page number */ +{ + if (trx_doublewrite == NULL) { + + return(FALSE); + } + + if (page_no >= trx_doublewrite->block1 + && page_no < trx_doublewrite->block1 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + if (page_no >= trx_doublewrite->block2 + && page_no < trx_doublewrite->block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + return(FALSE); +} + +/****************************************************************//** +Creates or initialializes the doublewrite buffer at a database start. */ +static +void +trx_doublewrite_init( +/*=================*/ + byte* doublewrite) /*!< in: pointer to the doublewrite buf + header on trx sys page */ +{ + trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t)); + + /* Since we now start to use the doublewrite buffer, no need to call + fsync() after every write to a data file */ +#ifdef UNIV_DO_FLUSH + os_do_not_call_flush_at_each_write = TRUE; +#endif /* UNIV_DO_FLUSH */ + + mutex_create(trx_doublewrite_mutex_key, + &trx_doublewrite->mutex, SYNC_DOUBLEWRITE); + + trx_doublewrite->first_free = 0; + + trx_doublewrite->block1 = mach_read_from_4( + doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1); + trx_doublewrite->block2 = mach_read_from_4( + doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2); + trx_doublewrite->write_buf_unaligned = ut_malloc( + (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE); + + trx_doublewrite->write_buf = ut_align( + trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE); + trx_doublewrite->buf_block_arr = mem_alloc( + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*)); +} + +/****************************************************************//** +Marks the trx sys header when we have successfully upgraded to the >= 4.1.x +multiple tablespace format. */ +UNIV_INTERN +void +trx_sys_mark_upgraded_to_multiple_tablespaces(void) +/*===============================================*/ +{ + buf_block_t* block; + byte* doublewrite; + mtr_t mtr; + + /* We upgraded to 4.1.x and reset the space id fields in the + doublewrite buffer. Let us mark to the trx_sys header that the upgrade + has been done. */ + + mtr_start(&mtr); + + block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + trx_sys_multiple_tablespace_format = TRUE; +} + +/****************************************************************//** +Creates the doublewrite buffer to a new InnoDB installation. The header of the +doublewrite buffer is placed on the trx system header page. */ +UNIV_INTERN +void +trx_sys_create_doublewrite_buf(void) +/*================================*/ +{ + buf_block_t* block; + buf_block_t* block2; + buf_block_t* new_block; + byte* doublewrite; + byte* fseg_header; + ulint page_no; + ulint prev_page_no; + ulint i; + mtr_t mtr; + + if (trx_doublewrite) { + /* Already inited */ + + return; + } + +start_again: + mtr_start(&mtr); + trx_doublewrite_buf_is_being_created = TRUE; + + block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has already been created: + just read in some numbers */ + + trx_doublewrite_init(doublewrite); + + mtr_commit(&mtr); + trx_doublewrite_buf_is_being_created = FALSE; + } else { + fprintf(stderr, + "InnoDB: Doublewrite buffer not found:" + " creating new\n"); + + if (buf_pool_get_curr_size() + < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2 + 100) + * UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer:" + " you must\n" + "InnoDB: increase your buffer pool size.\n" + "InnoDB: Cannot continue operation.\n"); + + exit(1); + } + + block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); + + /* fseg_create acquires a second latch on the page, + therefore we must declare it: */ + + buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK); + + if (block2 == NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer:" + " you must\n" + "InnoDB: increase your tablespace size.\n" + "InnoDB: Cannot continue operation.\n"); + + /* We exit without committing the mtr to prevent + its modifications to the database getting to disk */ + + exit(1); + } + + fseg_header = buf_block_get_frame(block) + + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG; + prev_page_no = 0; + + for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2; i++) { + new_block = fseg_alloc_free_page( + fseg_header, prev_page_no + 1, FSP_UP, &mtr); + if (new_block == NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite" + " buffer: you must\n" + "InnoDB: increase your" + " tablespace size.\n" + "InnoDB: Cannot continue operation.\n" + ); + + exit(1); + } + + /* We read the allocated pages to the buffer pool; + when they are written to disk in a flush, the space + id and page number fields are also written to the + pages. When we at database startup read pages + from the doublewrite buffer, we know that if the + space id and page number in them are the same as + the page position in the tablespace, then the page + has not been written to in doublewrite. */ + + ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); + page_no = buf_block_get_page_no(new_block); + + if (i == FSP_EXTENT_SIZE / 2) { + ut_a(page_no == FSP_EXTENT_SIZE); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + } else if (i == FSP_EXTENT_SIZE / 2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + ut_a(page_no == 2 * FSP_EXTENT_SIZE); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + } else if (i > FSP_EXTENT_SIZE / 2) { + ut_a(page_no == prev_page_no + 1); + } + + prev_page_no = page_no; + } + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + fprintf(stderr, "InnoDB: Doublewrite buffer created\n"); + + trx_sys_multiple_tablespace_format = TRUE; + + goto start_again; + } + + if (srv_doublewrite_file) { + /* the same doublewrite buffer to TRX_SYS_SPACE should exist. + check and create if not exist.*/ + + mtr_start(&mtr); + trx_doublewrite_buf_is_being_created = TRUE; + + block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has already been created: + just read in some numbers */ + + trx_doublewrite_init(doublewrite); + + mtr_commit(&mtr); + trx_doublewrite_buf_is_being_created = FALSE; + } else { + fprintf(stderr, + "InnoDB: Doublewrite buffer not found in the doublewrite file:" + " creating new doublewrite buffer.\n"); + + if (buf_pool_get_curr_size() + < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2 + 100) + * UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Cannot create the doublewrite buffer:" + " You must\n" + "InnoDB: increase your buffer pool size.\n" + "InnoDB: Cannot continue processing.\n"); + + exit(1); + } + + block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO, + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); + + /* fseg_create acquires a second latch on the page, + therefore we must declare it: */ + + buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK); + + if (block2 == NULL) { + fprintf(stderr, + "InnoDB: Cannot create the doublewrite buffer:" + " You must\n" + "InnoDB: increase your tablespace size.\n" + "InnoDB: Cannot continue processing.\n"); + + /* We exit without committing the mtr to prevent + its modifications to the database getting to disk */ + + exit(1); + } + + fseg_header = buf_block_get_frame(block) + + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG; + prev_page_no = 0; + + for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2; i++) { + new_block = fseg_alloc_free_page( + fseg_header, prev_page_no + 1, FSP_UP, &mtr); + if (new_block == NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite" + " buffer: you must\n" + "InnoDB: increase your" + " tablespace size.\n" + "InnoDB: Cannot continue operation.\n" + ); + + exit(1); + } + + /* We read the allocated pages to the buffer pool; + when they are written to disk in a flush, the space + id and page number fields are also written to the + pages. When we at database startup read pages + from the doublewrite buffer, we know that if the + space id and page number in them are the same as + the page position in the tablespace, then the page + has not been written to in doublewrite. */ + + ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); + page_no = buf_block_get_page_no(new_block); + + if (i == FSP_EXTENT_SIZE / 2) { + ut_a(page_no == FSP_EXTENT_SIZE); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + } else if (i == FSP_EXTENT_SIZE / 2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + ut_a(page_no == 2 * FSP_EXTENT_SIZE); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + } else if (i > FSP_EXTENT_SIZE / 2) { + ut_a(page_no == prev_page_no + 1); + } + + prev_page_no = page_no; + } + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n"); + trx_sys_multiple_tablespace_format = TRUE; + } + trx_doublewrite_buf_is_being_created = FALSE; + } +} + +/****************************************************************//** +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. If we are in a crash +recovery, this function uses a possible doublewrite buffer to restore +half-written pages in the data files. */ +UNIV_INTERN +void +trx_sys_doublewrite_init_or_restore_pages( +/*======================================*/ + ibool restore_corrupt_pages) /*!< in: TRUE=restore pages */ +{ + byte* buf; + byte* read_buf; + byte* unaligned_read_buf; + ulint block1; + ulint block2; + ulint source_page_no; + byte* page; + byte* doublewrite; + ulint doublewrite_space_id; + ulint space_id; + ulint page_no; + ulint i; + + doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE); + + if (srv_doublewrite_file) { + fprintf(stderr, + "InnoDB: doublewrite file '%s' is used.\n", + srv_doublewrite_file); + } + + /* We do the file i/o past the buffer pool */ + + unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE); + read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE); + + /* Read the trx sys header to check if we are using the doublewrite + buffer */ + + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0, + UNIV_PAGE_SIZE, read_buf, NULL); + doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has been created */ + + trx_doublewrite_init(doublewrite); + + block1 = trx_doublewrite->block1; + block2 = trx_doublewrite->block2; + + buf = trx_doublewrite->write_buf; + } else { + goto leave_func; + } + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) + != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { + + /* We are upgrading from a version < 4.1.x to a version where + multiple tablespaces are supported. We must reset the space id + field in the pages in the doublewrite buffer because starting + from this version the space id is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ + + trx_doublewrite_must_reset_space_ids = TRUE; + + fprintf(stderr, + "InnoDB: Resetting space id's in the" + " doublewrite buffer\n"); + } else { + trx_sys_multiple_tablespace_format = TRUE; + } + + /* Read the pages from the doublewrite buffer to memory */ + + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf, NULL); + fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + NULL); + /* Check if any of these pages is half-written in data files, in the + intended position */ + + page = buf; + + for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { + + page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + + if (trx_doublewrite_must_reset_space_ids) { + + space_id = 0; + mach_write_to_4(page + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0); + /* We do not need to calculate new checksums for the + pages because the field .._SPACE_ID does not affect + them. Write the page back to where we read it from. */ + + if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + source_page_no = block1 + i; + } else { + source_page_no = block2 + + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + } + + fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0, + UNIV_PAGE_SIZE, page, NULL); + /* printf("Resetting space id in page %lu\n", + source_page_no); */ + } else { + space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + } + + if (!restore_corrupt_pages) { + /* The database was shut down gracefully: no need to + restore pages */ + + } else if (!fil_tablespace_exists_in_mem(space_id)) { + /* Maybe we have dropped the single-table tablespace + and this page once belonged to it: do nothing */ + + } else if (!fil_check_adress_in_tablespace(space_id, + page_no)) { + fprintf(stderr, + "InnoDB: Warning: a page in the" + " doublewrite buffer is not within space\n" + "InnoDB: bounds; space id %lu" + " page number %lu, page %lu in" + " doublewrite buf.\n", + (ulong) space_id, (ulong) page_no, (ulong) i); + + } else if ((space_id == TRX_SYS_SPACE + || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE)) + && ((page_no >= block1 + && page_no + < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + || (page_no >= block2 + && page_no + < (block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) { + + /* It is an unwritten doublewrite buffer page: + do nothing */ + } else { + ulint zip_size = fil_space_get_zip_size(space_id); + + /* Read in the actual page from the file */ + fil_io(OS_FILE_READ, TRUE, space_id, zip_size, + page_no, 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + read_buf, NULL); + + if (srv_recovery_stats && recv_recovery_is_on()) { + mutex_enter(&(recv_sys->mutex)); + recv_sys->stats_doublewrite_check_pages++; + mutex_exit(&(recv_sys->mutex)); + } + + /* Check if the page is corrupt */ + + if (UNIV_UNLIKELY + (buf_page_is_corrupted( + TRUE, read_buf, zip_size))) { + + fprintf(stderr, + "InnoDB: Warning: database page" + " corruption or a failed\n" + "InnoDB: file read of" + " space %lu page %lu.\n" + "InnoDB: Trying to recover it from" + " the doublewrite buffer.\n", + (ulong) space_id, (ulong) page_no); + + if (buf_page_is_corrupted( + TRUE, page, zip_size)) { + fprintf(stderr, + "InnoDB: Dump of the page:\n"); + buf_page_print( + read_buf, zip_size, + BUF_PAGE_PRINT_NO_CRASH); + fprintf(stderr, + "InnoDB: Dump of" + " corresponding page" + " in doublewrite buffer:\n"); + buf_page_print( + page, zip_size, + BUF_PAGE_PRINT_NO_CRASH); + + fprintf(stderr, + "InnoDB: Also the page in the" + " doublewrite buffer" + " is corrupt.\n" + "InnoDB: Cannot continue" + " operation.\n" + "InnoDB: You can try to" + " recover the database" + " with the my.cnf\n" + "InnoDB: option:\n" + "InnoDB:" + " innodb_force_recovery=6\n"); + ut_error; + } + + /* Write the good page from the + doublewrite buffer to the intended + position */ + + fil_io(OS_FILE_WRITE, TRUE, space_id, + zip_size, page_no, 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + page, NULL); + + if (srv_recovery_stats && recv_recovery_is_on()) { + mutex_enter(&(recv_sys->mutex)); + recv_sys->stats_doublewrite_overwrite_pages++; + mutex_exit(&(recv_sys->mutex)); + } + + fprintf(stderr, + "InnoDB: Recovered the page from" + " the doublewrite buffer.\n"); + } + } + + page += UNIV_PAGE_SIZE; + } + + fil_flush_file_spaces(FIL_TABLESPACE); + +leave_func: + ut_free(unaligned_read_buf); +} + +/****************************************************************//** +Checks that trx is in the trx list. +@return TRUE if is in */ +UNIV_INTERN +ibool +trx_in_trx_list( +/*============*/ + trx_t* in_trx) /*!< in: trx */ +{ + trx_t* trx; + + ut_ad(mutex_own(&(kernel_mutex))); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx != NULL) { + + if (trx == in_trx) { + + return(TRUE); + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + return(FALSE); +} + +/*****************************************************************//** +Writes the value of max_trx_id to the file based trx system header. */ +UNIV_INTERN +void +trx_sys_flush_max_trx_id(void) +/*==========================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + + ut_ad(mutex_own(&kernel_mutex)); + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + mlog_write_ull(sys_header + TRX_SYS_TRX_ID_STORE, + trx_sys->max_trx_id, &mtr); + mtr_commit(&mtr); +} + +/*****************************************************************//** +Updates the offset information about the end of the MySQL binlog entry +which corresponds to the transaction just being committed. In a MySQL +replication slave updates the latest master binlog position up to which +replication has proceeded. */ +UNIV_INTERN +void +trx_sys_update_mysql_binlog_offset( +/*===============================*/ + trx_sysf_t* sys_header, + const char* file_name_in,/*!< in: MySQL log file name */ + ib_int64_t offset, /*!< in: position in that log file */ + ulint field, /*!< in: offset of the MySQL log info field in + the trx sys header */ + mtr_t* mtr) /*!< in: mtr */ +{ + const char* file_name; + + if (ut_strlen(file_name_in) >= TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN) { + + /* We cannot fit the name to the 512 bytes we have reserved */ + /* -> To store relay log file information, file_name must fit to the 480 bytes */ + + file_name = ""; + } else { + file_name = file_name_in; + } + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD, + TRX_SYS_MYSQL_LOG_MAGIC_N, + MLOG_4BYTES, mtr); + } + + if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME), + file_name)) { + + mlog_write_string(sys_header + field + + TRX_SYS_MYSQL_LOG_NAME, + (byte*) file_name, 1 + ut_strlen(file_name), + mtr); + } + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0 + || (offset >> 32) > 0) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH, + (ulint)(offset >> 32), + MLOG_4BYTES, mtr); + } + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_LOW, + (ulint)(offset & 0xFFFFFFFFUL), + MLOG_4BYTES, mtr); +} + +/*****************************************************************//** +Stores the MySQL binlog offset info in the trx system header if +the magic number shows it valid, and print the info to stderr */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset(void) +/*===================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + ulint trx_sys_mysql_bin_log_pos_high; + ulint trx_sys_mysql_bin_log_pos_low; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + trx_sys_mysql_bin_log_pos_high = mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH); + trx_sys_mysql_bin_log_pos_low = mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW); + + trx_sys_mysql_bin_log_pos + = (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32) + + (ib_int64_t)trx_sys_mysql_bin_log_pos_low; + + ut_memcpy(trx_sys_mysql_bin_log_name, + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN); + + fprintf(stderr, + "InnoDB: Last MySQL binlog file position %lu %lu," + " file name %s\n", + trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low, + trx_sys_mysql_bin_log_name); + + mtr_commit(&mtr); +} + +/*****************************************************************//** +Reads the log coordinates at the given offset in the trx sys header. */ +static +void +trx_sys_read_log_pos( +/*=================*/ + const trx_sysf_t* sys_header, /*!< in: the trx sys header */ + uint header_offset, /*!< in: coord offset in the + header */ + char* log_fn, /*!< out: the log file name */ + ib_int64_t* log_pos) /*!< out: the log poistion */ +{ + ut_memcpy(log_fn, sys_header + header_offset + TRX_SYS_MYSQL_LOG_NAME, + TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN); + + *log_pos = + (((ib_int64_t)mach_read_from_4(sys_header + header_offset + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32) + + mach_read_from_4(sys_header + header_offset + + TRX_SYS_MYSQL_LOG_OFFSET_LOW); +} + +/*****************************************************************//** +Prints to stderr the MySQL master log offset info in the trx system header +PREPARE set of fields if the magic number shows it valid and stores it +in global variables. */ +UNIV_INTERN +void +trx_sys_print_mysql_master_log_pos(void) +/*====================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + /* Copy the master log position info to global variables we can + use in ha_innobase.cc to initialize glob_mi to right values */ + trx_sys_read_log_pos(sys_header, TRX_SYS_MYSQL_MASTER_LOG_INFO, + trx_sys_mysql_master_log_name, + &trx_sys_mysql_master_log_pos); + + trx_sys_read_log_pos(sys_header, TRX_SYS_MYSQL_RELAY_LOG_INFO, + trx_sys_mysql_relay_log_name, + &trx_sys_mysql_relay_log_pos); + + mtr_commit(&mtr); + + fprintf(stderr, + "InnoDB: In a MySQL replication slave the last" + " master binlog file\n" + "InnoDB: position %llu, file name %s\n", + trx_sys_mysql_master_log_pos, + trx_sys_mysql_master_log_name); + + fprintf(stderr, + "InnoDB: and relay log file\n" + "InnoDB: position %llu, file name %s\n", + trx_sys_mysql_relay_log_pos, + trx_sys_mysql_relay_log_name); +} + +/*****************************************************************//** +Prints to stderr the MySQL master log offset info in the trx system header +COMMIT set of fields if the magic number shows it valid and stores it +in global variables. */ +UNIV_INTERN +void +trx_sys_print_committed_mysql_master_log_pos(void) +/*==============================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_COMMIT_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + /* Copy the master log position info to global variables we can + use in ha_innobase.cc to initialize glob_mi to right values */ + trx_sys_read_log_pos(sys_header, TRX_SYS_COMMIT_MASTER_LOG_INFO, + trx_sys_mysql_master_log_name, + &trx_sys_mysql_master_log_pos); + + trx_sys_read_log_pos(sys_header, TRX_SYS_COMMIT_RELAY_LOG_INFO, + trx_sys_mysql_relay_log_name, + &trx_sys_mysql_relay_log_pos); + + mtr_commit(&mtr); + + fprintf(stderr, + "InnoDB: In a MySQL replication slave the last" + " master binlog file\n" + "InnoDB: position %llu, file name %s\n", + trx_sys_mysql_master_log_pos, trx_sys_mysql_master_log_name); + + fprintf(stderr, + "InnoDB: and relay log file\n" + "InnoDB: position %llu, file name %s\n", + trx_sys_mysql_relay_log_pos, trx_sys_mysql_relay_log_name); +} + +/****************************************************************//** +Looks for a free slot for a rollback segment in the trx system file copy. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +trx_sysf_rseg_find_free( +/*====================*/ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_sysf_t* sys_header; + ulint page_no; + ulint i; + + ut_ad(mutex_own(&(kernel_mutex))); + + sys_header = trx_sysf_get(mtr); + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/*****************************************************************//** +Creates the file page for the transaction system. This function is called only +at the database creation, before trx_sys_init. */ +static +void +trx_sysf_create( +/*============*/ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_sysf_t* sys_header; + ulint slot_no; + buf_block_t* block; + page_t* page; + ulint page_no; + byte* ptr; + ulint len; + + ut_ad(mtr); + + /* Note that below we first reserve the file space x-latch, and + then enter the kernel: we must do it in this order to conform + to the latching order rules. */ + + mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr); + mutex_enter(&kernel_mutex); + + /* Create the trx sys file block in a new allocated file segment */ + block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, + mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO); + + page = buf_block_get_frame(block); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS, + MLOG_2BYTES, mtr); + + /* Reset the doublewrite buffer magic number to zero so that we + know that the doublewrite buffer has not yet been created (this + suppresses a Valgrind warning) */ + + mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); + + sys_header = trx_sysf_get(mtr); + + /* Start counting transaction ids from number 1 up */ + mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1); + + /* Reset the rollback segment slots. Old versions of InnoDB + define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect + that the whole array is initialized. */ + ptr = TRX_SYS_RSEGS + sys_header; + len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS) + * TRX_SYS_RSEG_SLOT_SIZE; + memset(ptr, 0xff, len); + ptr += len; + ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END)); + + /* Initialize all of the page. This part used to be uninitialized. */ + memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr); + + mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + + page - sys_header, mtr); + + /* Create the first rollback segment in the SYSTEM tablespace */ + slot_no = trx_sysf_rseg_find_free(mtr); + page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, + mtr); + ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); + ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO); + + mutex_exit(&kernel_mutex); +} + +/*****************************************************************//** +Compare two trx_rseg_t instances on last_trx_no. */ +static +int +trx_rseg_compare_last_trx_no( +/*=========================*/ + const void* p1, /*!< in: elem to compare */ + const void* p2) /*!< in: elem to compare */ +{ + ib_int64_t cmp; + + const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1; + const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2; + + cmp = rseg_q1->trx_no - rseg_q2->trx_no; + + if (cmp < 0) { + return(-1); + } else if (cmp > 0) { + return(1); + } + + return(0); +} + +/*****************************************************************//** +Creates dummy of the file page for the transaction system. */ +static +void +trx_sysf_dummy_create( +/*==================*/ + ulint space, + mtr_t* mtr) +{ + buf_block_t* block; + page_t* page; + + ut_ad(mtr); + + /* Note that below we first reserve the file space x-latch, and + then enter the kernel: we must do it in this order to conform + to the latching order rules. */ + + mtr_x_lock(fil_space_get_latch(space, NULL), mtr); + mutex_enter(&kernel_mutex); + + /* Create the trx sys file block in a new allocated file segment */ + block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, + mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + fprintf(stderr, "%lu\n", buf_block_get_page_no(block)); + ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO); + + page = buf_block_get_frame(block); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS, + MLOG_2BYTES, mtr); + + /* Reset the doublewrite buffer magic number to zero so that we + know that the doublewrite buffer has not yet been created (this + suppresses a Valgrind warning) */ + + mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); + +#ifdef UNDEFINED + /* TODO: REMOVE IT: The bellow is not needed, I think */ + sys_header = trx_sysf_get(mtr); + + /* Start counting transaction ids from number 1 up */ + mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE, + ut_dulint_create(0, 1), mtr); + + /* Reset the rollback segment slots */ + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr); + trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr); + } + + /* The remaining area (up to the page trailer) is uninitialized. + Silence Valgrind warnings about it. */ + UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS + + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE), + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + - (TRX_SYS_RSEGS + + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE)) + + page - sys_header); + + /* Create the first rollback segment in the SYSTEM tablespace */ + page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no, + mtr); + ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); + ut_a(page_no != FIL_NULL); +#endif + + mutex_exit(&kernel_mutex); +} + +/*****************************************************************//** +Creates and initializes the central memory structures for the transaction +system. This is called when the database is started. */ +UNIV_INTERN +void +trx_sys_init_at_db_start(void) +/*==========================*/ +{ + trx_sysf_t* sys_header; + ib_uint64_t rows_to_undo = 0; + const char* unit = ""; + trx_t* trx; + mtr_t mtr; + ib_bh_t* ib_bh; + + mtr_start(&mtr); + + ut_ad(trx_sys == NULL); + + mutex_enter(&kernel_mutex); + + /* We create the min binary heap here and pass ownership to + purge when we init the purge sub-system. Purge is responsible + for freeing the binary heap. */ + + ib_bh = ib_bh_create( + trx_rseg_compare_last_trx_no, + sizeof(rseg_queue_t), TRX_SYS_N_RSEGS); + + trx_sys = mem_zalloc(sizeof(*trx_sys)); + + /* Allocate the trx descriptors array */ + trx_sys->descriptors = ut_malloc(sizeof(trx_id_t) * + TRX_DESCR_ARRAY_INITIAL_SIZE); + trx_sys->descr_n_max = TRX_DESCR_ARRAY_INITIAL_SIZE; + trx_sys->descr_n_used = 0; + srv_descriptors_memory = TRX_DESCR_ARRAY_INITIAL_SIZE * + sizeof(trx_id_t); + + sys_header = trx_sysf_get(&mtr); + + trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr); + + trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + /* VERY important: after the database is started, max_trx_id value is + divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in + trx_sys_get_new_trx_id will evaluate to TRUE when the function + is first time called, and the value for trx id will be written + to the disk-based header! Thus trx id values will not overlap when + the database is repeatedly started! */ + + trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN + + ut_uint64_align_up(mach_read_from_8(sys_header + + TRX_SYS_TRX_ID_STORE), + TRX_SYS_TRX_ID_WRITE_MARGIN); + + UT_LIST_INIT(trx_sys->mysql_trx_list); + trx_dummy_sess = sess_open(); + trx_lists_init_at_db_start(); + + if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + for (;;) { + + if (trx->state != TRX_PREPARED) { + rows_to_undo += trx->undo_no; + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + + if (!trx) { + break; + } + } + + if (rows_to_undo > 1000000000) { + unit = "M"; + rows_to_undo = rows_to_undo / 1000000; + } + + fprintf(stderr, + "InnoDB: %lu transaction(s) which must be" + " rolled back or cleaned up\n" + "InnoDB: in total %lu%s row operations to undo\n", + (ulong) UT_LIST_GET_LEN(trx_sys->trx_list), + (ulong) rows_to_undo, unit); + + fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n", + (ullint) trx_sys->max_trx_id); + } + + UT_LIST_INIT(trx_sys->view_list); + + /* Transfer ownership to purge. */ + trx_purge_sys_create(ib_bh); + + mutex_exit(&kernel_mutex); + + mtr_commit(&mtr); +} + +/*****************************************************************//** +Creates and initializes the transaction system at the database creation. */ +UNIV_INTERN +void +trx_sys_create(void) +/*================*/ +{ + mtr_t mtr; + + mtr_start(&mtr); + + trx_sysf_create(&mtr); + + mtr_commit(&mtr); + + trx_sys_init_at_db_start(); +} + +/*****************************************************************//** +Update the file format tag. +@return always TRUE */ +static +ibool +trx_sys_file_format_max_write( +/*==========================*/ + ulint format_id, /*!< in: file format id */ + const char** name) /*!< out: max file format name, can + be NULL */ +{ + mtr_t mtr; + byte* ptr; + buf_block_t* block; + ib_uint64_t tag_value; + + mtr_start(&mtr); + + block = buf_page_get( + TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + + file_format_max.id = format_id; + file_format_max.name = trx_sys_file_format_id_to_name(format_id); + + ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; + tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (name) { + *name = file_format_max.name; + } + + mlog_write_ull(ptr, tag_value, &mtr); + + mtr_commit(&mtr); + + return(TRUE); +} + +/*****************************************************************//** +Read the file format tag. +@return the file format or ULINT_UNDEFINED if not set. */ +static +ulint +trx_sys_file_format_max_read(void) +/*==============================*/ +{ + mtr_t mtr; + const byte* ptr; + const buf_block_t* block; + ib_id_t file_format_id; + + /* Since this is called during the startup phase it's safe to + read the value without a covering mutex. */ + mtr_start(&mtr); + + block = buf_page_get( + TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + + ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; + file_format_id = mach_read_from_8(ptr); + + mtr_commit(&mtr); + + file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (file_format_id >= FILE_FORMAT_NAME_N) { + + /* Either it has never been tagged, or garbage in it. */ + return(ULINT_UNDEFINED); + } + + return((ulint) file_format_id); +} + +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id) /*!< in: id of the file format */ +{ + ut_a(id < FILE_FORMAT_NAME_N); + + return(file_format_name_map[id]); +} + +/*****************************************************************//** +Check for the max file format tag stored on disk. Note: If max_format_id +is == DICT_TF_FORMAT_MAX + 1 then we only print a warning. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ulint +trx_sys_file_format_max_check( +/*==========================*/ + ulint max_format_id) /*!< in: max format id to check */ +{ + ulint format_id; + + /* Check the file format in the tablespace. Do not try to + recover if the file format is not supported by the engine + unless forced by the user. */ + format_id = trx_sys_file_format_max_read(); + if (format_id == ULINT_UNDEFINED) { + /* Format ID was not set. Set it to minimum possible + value. */ + format_id = DICT_TF_FORMAT_MIN; + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: highest supported file format is %s.\n", + trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX)); + + if (format_id > DICT_TF_FORMAT_MAX) { + + ut_a(format_id < FILE_FORMAT_NAME_N); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: %s: the system tablespace is in a file " + "format that this version doesn't support - %s\n", + ((max_format_id <= DICT_TF_FORMAT_MAX) + ? "Error" : "Warning"), + trx_sys_file_format_id_to_name(format_id)); + + if (max_format_id <= DICT_TF_FORMAT_MAX) { + return(DB_ERROR); + } + } + + format_id = (format_id > max_format_id) ? format_id : max_format_id; + + /* We don't need a mutex here, as this function should only + be called once at start up. */ + file_format_max.id = format_id; + file_format_max.name = trx_sys_file_format_id_to_name(format_id); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Set the file format id unconditionally except if it's already the +same value. +@return TRUE if value updated */ +UNIV_INTERN +ibool +trx_sys_file_format_max_set( +/*========================*/ + ulint format_id, /*!< in: file format id */ + const char** name) /*!< out: max file format name or + NULL if not needed. */ +{ + ibool ret = FALSE; + + ut_a(format_id <= DICT_TF_FORMAT_MAX); + + mutex_enter(&file_format_max.mutex); + + /* Only update if not already same value. */ + if (format_id != file_format_max.id) { + + ret = trx_sys_file_format_max_write(format_id, name); + } + + mutex_exit(&file_format_max.mutex); + + return(ret); +} + +/********************************************************************//** +Tags the system table space with minimum format id if it has not been +tagged yet. +WARNING: This function is only called during the startup and AFTER the +redo log application during recovery has finished. */ +UNIV_INTERN +void +trx_sys_file_format_tag_init(void) +/*==============================*/ +{ + ulint format_id; + + format_id = trx_sys_file_format_max_read(); + + /* If format_id is not set then set it to the minimum. */ + if (format_id == ULINT_UNDEFINED) { + trx_sys_file_format_max_set(DICT_TF_FORMAT_MIN, NULL); + } +} + +/********************************************************************//** +Update the file format tag in the system tablespace only if the given +format id is greater than the known max id. +@return TRUE if format_id was bigger than the known max id */ +UNIV_INTERN +ibool +trx_sys_file_format_max_upgrade( +/*============================*/ + const char** name, /*!< out: max file format name */ + ulint format_id) /*!< in: file format identifier */ +{ + ibool ret = FALSE; + + ut_a(name); + ut_a(file_format_max.name != NULL); + ut_a(format_id <= DICT_TF_FORMAT_MAX); + + mutex_enter(&file_format_max.mutex); + + if (format_id > file_format_max.id) { + + ret = trx_sys_file_format_max_write(format_id, name); + } + + mutex_exit(&file_format_max.mutex); + + return(ret); +} + +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the max format name */ +UNIV_INTERN +const char* +trx_sys_file_format_max_get(void) +/*=============================*/ +{ + return(file_format_max.name); +} + +/*****************************************************************//** +Initializes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_init(void) +/*==========================*/ +{ + mutex_create(file_format_max_mutex_key, + &file_format_max.mutex, SYNC_FILE_FORMAT_TAG); + + /* We don't need a mutex here, as this function should only + be called once at start up. */ + file_format_max.id = DICT_TF_FORMAT_MIN; + + file_format_max.name = trx_sys_file_format_id_to_name( + file_format_max.id); +} + +/*****************************************************************//** +Closes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_close(void) +/*===========================*/ +{ + /* Does nothing at the moment */ +} + +/*****************************************************************//** +Creates and initializes the dummy transaction system page for tablespace. */ +UNIV_INTERN +void +trx_sys_dummy_create( +/*=================*/ + ulint space) +{ + mtr_t mtr; + + /* This function is only for doublewrite file for now */ + ut_a(space == TRX_DOUBLEWRITE_SPACE); + + mtr_start(&mtr); + + trx_sysf_dummy_create(space, &mtr); + + mtr_commit(&mtr); +} + +/********************************************************************* +Creates the rollback segments */ +UNIV_INTERN +void +trx_sys_create_rsegs( +/*=================*/ + ulint n_rsegs) /*!< number of rollback segments to create */ +{ + ulint new_rsegs = 0; + + /* Do not create additional rollback segments if + innodb_force_recovery has been set and the database + was not shutdown cleanly. */ + if (!srv_force_recovery && !recv_needed_recovery) { + ulint i; + + for (i = 0; i < n_rsegs; ++i) { + + if (trx_rseg_create() != NULL) { + ++new_rsegs; + } else { + break; + } + } + } + + if (new_rsegs > 0) { + fprintf(stderr, + "InnoDB: %lu rollback segment(s) active.\n", + new_rsegs); + } +} + +#else /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Prints to stderr the MySQL binlog info in the system header if the +magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset_from_page( +/*========================================*/ + const byte* page) /*!< in: buffer containing the trx + system header page, i.e., page number + TRX_SYS_PAGE_NO in the tablespace */ +{ + const trx_sysf_t* sys_header; + + sys_header = page + TRX_SYS; + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + == TRX_SYS_MYSQL_LOG_MAGIC_N) { + + fprintf(stderr, + "ibbackup: Last MySQL binlog file position %lu %lu," + " file name %s\n", + (ulong) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + } +} + + +/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE + (This code duplicaton should be fixed at some point!) +*/ + +#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */ +/* The offset of the file format tag on the trx system header page */ +#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16) +/* We use these random constants to reduce the probability of reading +garbage (from previous versions) that maps to an actual format id. We +use these as bit masks at the time of reading and writing from/to disk. */ +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL + +/* END OF COPIED DEFINITIONS */ + + +/*****************************************************************//** +Reads the file format id from the first system table space file. +Even if the call succeeds and returns TRUE, the returned format id +may be ULINT_UNDEFINED signalling that the format id was not present +in the data file. +@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_file_format_id( +/*========================*/ + const char *pathname, /*!< in: pathname of the first system + table space file */ + ulint *format_id) /*!< out: file format of the system table + space */ +{ + os_file_t file; + ibool success; + byte buf[UNIV_PAGE_SIZE * 2]; + page_t* page = ut_align(buf, UNIV_PAGE_SIZE); + const byte* ptr; + ib_id_t file_format_id; + + *format_id = ULINT_UNDEFINED; + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, + pathname, + OS_FILE_OPEN, + OS_FILE_READ_ONLY, + &success + ); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + ut_print_timestamp(stderr); + + fprintf(stderr, +" ibbackup: Error: trying to read system tablespace file format,\n" +" ibbackup: but could not open the tablespace file %s!\n", + pathname + ); + return(FALSE); + } + + /* Read the page on which file format is stored */ + + success = os_file_read_no_error_handling( + file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE + ); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + ut_print_timestamp(stderr); + + fprintf(stderr, +" ibbackup: Error: trying to read system table space file format,\n" +" ibbackup: but failed to read the tablespace file %s!\n", + pathname + ); + os_file_close(file); + return(FALSE); + } + os_file_close(file); + + /* get the file format from the page */ + ptr = page + TRX_SYS_FILE_FORMAT_TAG; + file_format_id = mach_read_from_8(ptr); + file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (file_format_id >= FILE_FORMAT_NAME_N) { + + /* Either it has never been tagged, or garbage in it. */ + return(TRUE); + } + + *format_id = (ulint) file_format_id; + + return(TRUE); +} + + +/*****************************************************************//** +Reads the file format id from the given per-table data file. +@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_pertable_file_format_id( +/*=================================*/ + const char *pathname, /*!< in: pathname of a per-table + datafile */ + ulint *format_id) /*!< out: file format of the per-table + data file */ +{ + os_file_t file; + ibool success; + byte buf[UNIV_PAGE_SIZE * 2]; + page_t* page = ut_align(buf, UNIV_PAGE_SIZE); + const byte* ptr; + ib_uint32_t flags; + + *format_id = ULINT_UNDEFINED; + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, + pathname, + OS_FILE_OPEN, + OS_FILE_READ_ONLY, + &success + ); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + ut_print_timestamp(stderr); + + fprintf(stderr, +" ibbackup: Error: trying to read per-table tablespace format,\n" +" ibbackup: but could not open the tablespace file %s!\n", + pathname + ); + return(FALSE); + } + + /* Read the first page of the per-table datafile */ + + success = os_file_read_no_error_handling( + file, page, 0, 0, UNIV_PAGE_SIZE + ); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + ut_print_timestamp(stderr); + + fprintf(stderr, +" ibbackup: Error: trying to per-table data file format,\n" +" ibbackup: but failed to read the tablespace file %s!\n", + pathname + ); + os_file_close(file); + return(FALSE); + } + os_file_close(file); + + /* get the file format from the page */ + ptr = page + 54; + flags = mach_read_from_4(ptr); + if (flags == 0) { + /* file format is Antelope */ + *format_id = 0; + return (TRUE); + } else if (flags & 1) { + /* tablespace flags are ok */ + *format_id = (flags / 32) % 128; + return (TRUE); + } else { + /* bad tablespace flags */ + return(FALSE); + } +} + + +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id) /*!< in: id of the file format */ +{ + if (!(id < FILE_FORMAT_NAME_N)) { + /* unknown id */ + return ("Unknown"); + } + + return(file_format_name_map[id]); +} + +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_HOTBACKUP +/********************************************************************* +Shutdown/Close the transaction system. */ +UNIV_INTERN +void +trx_sys_close(void) +/*===============*/ +{ + trx_t* trx; + trx_rseg_t* rseg; + read_view_t* view; + + ut_ad(trx_sys != NULL); + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + + /* Check that all read views are closed except read view owned + by a purge. */ + + if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) { + fprintf(stderr, + "InnoDB: Error: all read views were not closed" + " before shutdown:\n" + "InnoDB: %lu read views open \n", + UT_LIST_GET_LEN(trx_sys->view_list) - 1); + } + + sess_close(trx_dummy_sess); + trx_dummy_sess = NULL; + + trx_purge_sys_close(); + + mutex_enter(&kernel_mutex); + + /* Free the double write data structures. */ + ut_a(trx_doublewrite != NULL); + ut_free(trx_doublewrite->write_buf_unaligned); + trx_doublewrite->write_buf_unaligned = NULL; + + mem_free(trx_doublewrite->buf_block_arr); + trx_doublewrite->buf_block_arr = NULL; + + mutex_free(&trx_doublewrite->mutex); + mem_free(trx_doublewrite); + trx_doublewrite = NULL; + + /* Only prepared transactions may be left in the system. Free them. */ + ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == trx_n_prepared); + + while ((trx = UT_LIST_GET_FIRST(trx_sys->trx_list)) != NULL) { + trx_free_prepared(trx); + } + + /* There can't be any active transactions. */ + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + while (rseg != NULL) { + trx_rseg_t* prev_rseg = rseg; + + rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg); + UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg); + + trx_rseg_mem_free(prev_rseg); + } + + view = UT_LIST_GET_FIRST(trx_sys->view_list); + + while (view != NULL) { + read_view_t* prev_view = view; + + view = UT_LIST_GET_NEXT(view_list, prev_view); + + /* Views are allocated from the trx_sys->global_read_view_heap. + So, we simply remove the element here. */ + UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view); + } + + ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0); + + ut_ad(trx_sys->descr_n_used == 0); + ut_free(trx_sys->descriptors); + + mem_free(trx_sys); + + trx_sys = NULL; + mutex_exit(&kernel_mutex); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c new file mode 100644 index 00000000000..09f425cfa55 --- /dev/null +++ b/storage/xtradb/trx/trx0trx.c @@ -0,0 +1,2449 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0trx.c +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0trx.h" + +#ifdef UNIV_NONINL +#include "trx0trx.ic" +#endif + +#include "trx0undo.h" +#include "trx0rseg.h" +#include "log0log.h" +#include "que0que.h" +#include "lock0lock.h" +#include "trx0roll.h" +#include "usr0sess.h" +#include "read0read.h" +#include "srv0srv.h" +#include "btr0sea.h" +#include "os0proc.h" +#include "trx0xa.h" +#include "trx0purge.h" +#include "ha_prototypes.h" + +/** Dummy session used currently in MySQL interface */ +UNIV_INTERN sess_t* trx_dummy_sess = NULL; + +/** Number of transactions currently allocated for MySQL: protected by +the kernel mutex */ +UNIV_INTERN ulint trx_n_mysql_transactions = 0; +/** Number of transactions currently in the XA PREPARED state: protected by +the kernel mutex */ +UNIV_INTERN ulint trx_n_prepared = 0; + +#ifdef UNIV_PFS_MUTEX +/* Key to register the mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/*************************************************************//** +Set detailed error message for the transaction. */ +UNIV_INTERN +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /*!< in: transaction struct */ + const char* msg) /*!< in: detailed error message */ +{ + ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error)); +} + +/*************************************************************//** +Set detailed error message for the transaction from a file. Note that the +file is rewinded before reading from it. */ +UNIV_INTERN +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /*!< in: transaction struct */ + FILE* file) /*!< in: file to read message from */ +{ + os_file_read_string(file, trx->detailed_error, + sizeof(trx->detailed_error)); +} + +/*************************************************************//** +Callback function for trx_find_descriptor() to compare trx IDs. */ +UNIV_INTERN +int +trx_descr_cmp( +/*==========*/ + const void *a, /*!< in: pointer to first comparison argument */ + const void *b) /*!< in: pointer to second comparison argument */ +{ + const trx_id_t* da = (const trx_id_t*) a; + const trx_id_t* db = (const trx_id_t*) b; + + if (*da < *db) { + return -1; + } else if (*da > *db) { + return 1; + } + + return 0; +} + +/*************************************************************//** +Reserve a slot for a given trx in the global descriptors array. */ +UNIV_INLINE +void +trx_reserve_descriptor( +/*===================*/ + const trx_t* trx) /*!< in: trx pointer */ +{ + ulint n_used; + ulint n_max; + trx_id_t* descr; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(!trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + trx->id)); + + n_used = trx_sys->descr_n_used + 1; + n_max = trx_sys->descr_n_max; + + if (UNIV_UNLIKELY(n_used > n_max)) { + + n_max = n_max * 2; + + trx_sys->descriptors = + ut_realloc(trx_sys->descriptors, + n_max * sizeof(trx_id_t)); + + trx_sys->descr_n_max = n_max; + srv_descriptors_memory = n_max * sizeof(trx_id_t); + } + + descr = trx_sys->descriptors + n_used - 1; + + if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) { + + /* Find the slot where it should be inserted. We could use a + binary search, but in reality linear search should be faster, + because the slot we are looking for is near the array end. */ + + trx_id_t* tdescr; + + for (tdescr = descr - 1; + tdescr >= trx_sys->descriptors && *tdescr > trx->id; + tdescr--) { + } + + tdescr++; + + ut_memmove(tdescr + 1, tdescr, (descr - tdescr) * + sizeof(trx_id_t)); + + descr = tdescr; + } + + *descr = trx->id; + + trx_sys->descr_n_used = n_used; +} + +/*************************************************************//** +Release a slot for a given trx in the global descriptors array. */ +UNIV_INTERN +void +trx_release_descriptor( +/*===================*/ + trx_t* trx) /*!< in: trx pointer */ +{ + ulint size; + trx_id_t* descr; + + ut_ad(mutex_own(&kernel_mutex)); + + if (UNIV_LIKELY(trx->is_in_trx_serial_list)) { + + UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list, + trx); + trx->is_in_trx_serial_list = 0; + } + + descr = trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + trx->id); + + if (UNIV_UNLIKELY(descr == NULL)) { + + return; + } + + size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) * + sizeof(trx_id_t); + + if (UNIV_LIKELY(size > 0)) { + + ut_memmove(descr, descr + 1, size); + } + + trx_sys->descr_n_used--; +} + +/****************************************************************//** +Creates and initializes a transaction object. +@return own: the transaction */ +UNIV_INTERN +trx_t* +trx_create( +/*=======*/ + sess_t* sess) /*!< in: session */ +{ + trx_t* trx; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(sess); + + trx = mem_alloc(sizeof(trx_t)); + + trx->magic_n = TRX_MAGIC_N; + + trx->op_info = ""; + + trx->is_purge = 0; + trx->is_recovered = 0; + trx->state = TRX_NOT_STARTED; + + trx->is_registered = 0; + trx->owns_prepare_mutex = 0; + trx->called_commit_ordered = 0; + + trx->start_time = ut_time(); + + trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->id = 0; + trx->no = IB_ULONGLONG_MAX; + trx->is_in_trx_serial_list = 0; + + trx->support_xa = TRUE; + + trx->fake_changes = FALSE; + + trx->check_foreigns = TRUE; + trx->check_unique_secondary = TRUE; + + trx->flush_log_later = FALSE; + trx->must_flush_log_later = FALSE; + + trx->dict_operation = TRX_DICT_OP_NONE; + trx->table_id = 0; + + trx->mysql_thd = NULL; + trx->duplicates = 0; + + trx->n_mysql_tables_in_use = 0; + trx->mysql_n_tables_locked = 0; + + trx->mysql_log_file_name = NULL; + trx->mysql_log_offset = 0; + trx->mysql_master_log_file_name = ""; + trx->mysql_master_log_pos = 0; + trx->mysql_relay_log_file_name = ""; + trx->mysql_relay_log_pos = 0; + + trx->idle_start = 0; + trx->last_stmt_start = 0; + + mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO); + + trx->rseg = NULL; + + trx->undo_no = 0; + trx->last_sql_stat_start.least_undo_no = 0; + trx->insert_undo = NULL; + trx->update_undo = NULL; + trx->undo_no_arr = NULL; + + trx->error_state = DB_SUCCESS; + trx->error_key_num = 0; + trx->detailed_error[0] = '\0'; + + trx->sess = sess; + trx->que_state = TRX_QUE_RUNNING; + trx->n_active_thrs = 0; + + trx->handling_signals = FALSE; + + UT_LIST_INIT(trx->signals); + UT_LIST_INIT(trx->reply_signals); + + trx->graph = NULL; + + trx->wait_lock = NULL; + trx->was_chosen_as_deadlock_victim = FALSE; + UT_LIST_INIT(trx->wait_thrs); + + trx->lock_heap = mem_heap_create_in_buffer(256); + UT_LIST_INIT(trx->trx_locks); + + UT_LIST_INIT(trx->trx_savepoints); + + trx->dict_operation_lock_mode = 0; + trx->has_search_latch = FALSE; + trx->search_latch_timeout = BTR_SEA_TIMEOUT; + + trx->declared_to_be_inside_innodb = FALSE; + trx->n_tickets_to_enter_innodb = 0; + + trx->global_read_view = NULL; + trx->read_view = NULL; + trx->prebuilt_view = NULL; + + trx->io_reads = 0; + trx->io_read = 0; + trx->io_reads_wait_timer = 0; + trx->lock_que_wait_timer = 0; + trx->innodb_que_wait_timer = 0; + trx->distinct_page_access = 0; + trx->distinct_page_access_hash = NULL; + trx->take_stats = FALSE; + + /* Set X/Open XA transaction identification to NULL */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; + + trx->n_autoinc_rows = 0; + + /* Remember to free the vector explicitly. */ + trx->autoinc_locks = ib_vector_create( + mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4); + + return(trx); +} + +/********************************************************************//** +Creates a transaction object for MySQL. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_mysql(void) +/*========================*/ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + + trx = trx_create(trx_dummy_sess); + + trx_n_mysql_transactions++; + + UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + mutex_exit(&kernel_mutex); + + if (UNIV_UNLIKELY(trx->take_stats)) { + trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE); + memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); + } + + return(trx); +} + +/********************************************************************//** +Creates a transaction object for background operations by the master thread. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_background(void) +/*=============================*/ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + + trx = trx_create(trx_dummy_sess); + + mutex_exit(&kernel_mutex); + + return(trx); +} + +/********************************************************************//** +Frees a transaction object. */ +UNIV_INTERN +void +trx_free( +/*=====*/ + trx_t* trx) /*!< in, own: trx object */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + if (trx->declared_to_be_inside_innodb) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: Freeing a trx which is declared" + " to be processing\n" + "InnoDB: inside InnoDB.\n", stderr); + trx_print(stderr, trx, 600); + putc('\n', stderr); + + /* This is an error but not a fatal error. We must keep + the counters like srv_conc_n_threads accurate. */ + srv_conc_force_exit_innodb(trx); + } + + if (trx->n_mysql_tables_in_use != 0 + || trx->mysql_n_tables_locked != 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: MySQL is freeing a thd\n" + "InnoDB: though trx->n_mysql_tables_in_use is %lu\n" + "InnoDB: and trx->mysql_n_tables_locked is %lu.\n", + (ulong)trx->n_mysql_tables_in_use, + (ulong)trx->mysql_n_tables_locked); + + trx_print(stderr, trx, 600); + + ut_print_buf(stderr, trx, sizeof(trx_t)); + putc('\n', stderr); + } + + ut_a(trx->magic_n == TRX_MAGIC_N); + + trx->magic_n = 11112222; + + ut_a(trx->state == TRX_NOT_STARTED); + + mutex_free(&(trx->undo_mutex)); + + ut_a(trx->insert_undo == NULL); + ut_a(trx->update_undo == NULL); + + if (trx->undo_no_arr) { + trx_undo_arr_free(trx->undo_no_arr); + } + + ut_a(UT_LIST_GET_LEN(trx->signals) == 0); + ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0); + + ut_a(trx->wait_lock == NULL); + ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + ut_a(!trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); +#endif + + ut_a(trx->dict_operation_lock_mode == 0); + + if (trx->lock_heap) { + mem_heap_free(trx->lock_heap); + } + + ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0); + + if (trx->prebuilt_view != NULL) { + read_view_free(trx->prebuilt_view); + } + + ut_a(trx->read_view == NULL); + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + /* We allocated a dedicated heap for the vector. */ + ib_vector_free(trx->autoinc_locks); + + trx_release_descriptor(trx); + + mem_free(trx); +} + +/********************************************************************//** +At shutdown, frees a transaction object that is in the PREPARED state. */ +UNIV_INTERN +void +trx_free_prepared( +/*==============*/ + trx_t* trx) /*!< in, own: trx object */ +{ + ut_ad(mutex_own(&kernel_mutex)); + ut_a(trx->state == TRX_PREPARED); + ut_a(trx->magic_n == TRX_MAGIC_N); + + /* Prepared transactions are sort of active; they allow + ROLLBACK and COMMIT operations. Because the system does not + contain any other transactions than prepared transactions at + the shutdown stage and because a transaction cannot become + PREPARED while holding locks, it is safe to release the locks + held by PREPARED transactions here at shutdown.*/ + lock_release_off_kernel(trx); + + trx_undo_free_prepared(trx); + + mutex_free(&trx->undo_mutex); + + if (trx->undo_no_arr) { + trx_undo_arr_free(trx->undo_no_arr); + } + + ut_a(UT_LIST_GET_LEN(trx->signals) == 0); + ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0); + + ut_a(trx->wait_lock == NULL); + ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + ut_a(!trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); +#endif + + ut_a(trx->dict_operation_lock_mode == 0); + + if (trx->lock_heap) { + mem_heap_free(trx->lock_heap); + } + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + ib_vector_free(trx->autoinc_locks); + + trx_release_descriptor(trx); + + if (trx->prebuilt_view != NULL) { + read_view_free(trx->prebuilt_view); + } + + UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); + + ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list)); + + mem_free(trx); +} + +/********************************************************************//** +Frees a transaction object for MySQL. */ +UNIV_INTERN +void +trx_free_for_mysql( +/*===============*/ + trx_t* trx) /*!< in, own: trx object */ +{ + if (trx->distinct_page_access_hash) + { + mem_free(trx->distinct_page_access_hash); + trx->distinct_page_access_hash= NULL; + } + + mutex_enter(&kernel_mutex); + + UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + trx_free(trx); + + ut_a(trx_n_mysql_transactions > 0); + + trx_n_mysql_transactions--; + + mutex_exit(&kernel_mutex); +} + +/********************************************************************//** +Frees a transaction object of a background operation of the master thread. */ +UNIV_INTERN +void +trx_free_for_background( +/*====================*/ + trx_t* trx) /*!< in, own: trx object */ +{ + if (trx->distinct_page_access_hash) + { + mem_free(trx->distinct_page_access_hash); + trx->distinct_page_access_hash= NULL; + } + + mutex_enter(&kernel_mutex); + + trx_free(trx); + + mutex_exit(&kernel_mutex); +} + +/****************************************************************//** +Inserts the trx handle in the trx system trx list in the right position. +The list is sorted on the trx id so that the biggest id is at the list +start. This function is used at the database startup to insert incomplete +transactions to the list. */ +static +void +trx_list_insert_ordered( +/*====================*/ + trx_t* trx) /*!< in: trx handle */ +{ + trx_t* trx2; + + ut_ad(mutex_own(&kernel_mutex)); + + trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx2 != NULL) { + if (trx->id >= trx2->id) { + + ut_ad(trx->id > trx2->id); + break; + } + trx2 = UT_LIST_GET_NEXT(trx_list, trx2); + } + + if (trx2 != NULL) { + trx2 = UT_LIST_GET_PREV(trx_list, trx2); + + if (trx2 == NULL) { + UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx); + } else { + UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list, + trx2, trx); + } + } else { + UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx); + } +} + +/****************************************************************//** +Creates trx objects for transactions and initializes the trx list of +trx_sys at database start. Rollback segment and undo log lists must +already exist when this function is called, because the lists of +transactions to be rolled back or cleaned up are built based on the +undo log lists. */ +UNIV_INTERN +void +trx_lists_init_at_db_start(void) +/*============================*/ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + trx_t* trx; + + ut_ad(mutex_own(&kernel_mutex)); + UT_LIST_INIT(trx_sys->trx_list); + UT_LIST_INIT(trx_sys->trx_serial_list); + + /* Look from the rollback segments if there exist undo logs for + transactions */ + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + while (rseg != NULL) { + undo = UT_LIST_GET_FIRST(rseg->insert_undo_list); + + while (undo != NULL) { + + trx = trx_create(trx_dummy_sess); + + trx->is_recovered = TRUE; + trx->id = undo->trx_id; + trx->xid = undo->xid; + trx->insert_undo = undo; + trx->rseg = rseg; + + if (undo->state != TRX_UNDO_ACTIVE) { + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + + fprintf(stderr, + "InnoDB: Transaction " + TRX_ID_FMT + " was in the" + " XA prepared state.\n", + (ullint) trx->id); + + if (srv_force_recovery == 0) { + + trx->state = TRX_PREPARED; + trx_n_prepared++; + } else { + fprintf(stderr, + "InnoDB: Since" + " innodb_force_recovery" + " > 0, we will" + " rollback it" + " anyway.\n"); + + trx->state = TRX_ACTIVE; + } + + trx_reserve_descriptor(trx); + } else { + trx->state = TRX_COMMITTED_IN_MEMORY; + } + + /* We give a dummy value for the trx no; + this should have no relevance since purge + is not interested in committed transaction + numbers, unless they are in the history + list, in which case it looks the number + from the disk based undo log structure */ + + trx->no = trx->id; + } else { + trx->state = TRX_ACTIVE; + + /* A running transaction always has the number + field inited to IB_ULONGLONG_MAX */ + + trx->no = IB_ULONGLONG_MAX; + + trx_reserve_descriptor(trx); + + } + + if (undo->dict_operation) { + trx_set_dict_operation( + trx, TRX_DICT_OP_TABLE); + trx->table_id = undo->table_id; + } + + if (!undo->empty) { + trx->undo_no = undo->top_undo_no + 1; + } + + trx_list_insert_ordered(trx); + + undo = UT_LIST_GET_NEXT(undo_list, undo); + } + + undo = UT_LIST_GET_FIRST(rseg->update_undo_list); + + while (undo != NULL) { + trx = trx_get_on_id(undo->trx_id); + + if (NULL == trx) { + trx = trx_create(trx_dummy_sess); + + trx->is_recovered = TRUE; + trx->id = undo->trx_id; + trx->xid = undo->xid; + + if (undo->state != TRX_UNDO_ACTIVE) { + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + fprintf(stderr, + "InnoDB: Transaction " + TRX_ID_FMT " was in the" + " XA prepared state.\n", + (ullint) trx->id); + + if (srv_force_recovery == 0) { + + trx->state + = TRX_PREPARED; + trx_n_prepared++; + } else { + fprintf(stderr, + "InnoDB: Since" + " innodb_force_recovery" + " > 0, we will" + " rollback it" + " anyway.\n"); + + trx->state = TRX_ACTIVE; + trx_reserve_descriptor( + trx); + } + } else { + trx->state + = TRX_COMMITTED_IN_MEMORY; + } + + /* We give a dummy value for the trx + number */ + + trx->no = trx->id; + } else { + trx->state = TRX_ACTIVE; + /* A running transaction always has + the number field inited to + IB_ULONGLONG_MAX */ + + trx->no = IB_ULONGLONG_MAX; + + trx_reserve_descriptor(trx); + } + + trx->rseg = rseg; + trx_list_insert_ordered(trx); + + if (undo->dict_operation) { + trx_set_dict_operation( + trx, TRX_DICT_OP_TABLE); + trx->table_id = undo->table_id; + } + } + + trx->update_undo = undo; + + if ((!undo->empty) + && undo->top_undo_no >= trx->undo_no) { + + trx->undo_no = undo->top_undo_no + 1; + } + + undo = UT_LIST_GET_NEXT(undo_list, undo); + } + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + } +} + +/******************************************************************//** +Assigns a rollback segment to a transaction in a round-robin fashion. +@return assigned rollback segment instance */ +UNIV_INLINE +trx_rseg_t* +trx_assign_rseg( +/*============*/ + ulint max_undo_logs) /*!< in: maximum number of UNDO logs to use */ +{ + trx_rseg_t* rseg = trx_sys->latest_rseg; + + ut_ad(mutex_own(&kernel_mutex)); + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + + if (rseg == NULL || rseg->id == max_undo_logs - 1) { + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + } + + trx_sys->latest_rseg = rseg; + + return(rseg); +} + +/****************************************************************//** +Starts a new transaction. +@return TRUE */ +UNIV_INTERN +ibool +trx_start_low( +/*==========*/ + trx_t* trx, /*!< in: transaction */ + ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED + is passed, the system chooses the rollback segment + automatically in a round-robin fashion */ +{ + trx_rseg_t* rseg; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->rseg == NULL); + + if (trx->is_purge) { + trx->id = 0; + /* Don't reserve a descriptor, since this trx is not added to + trx_list. */ + trx->state = TRX_ACTIVE; + trx->start_time = time(NULL); + + return(TRUE); + } + + ut_ad(trx->state != TRX_ACTIVE); + + ut_a(rseg_id == ULINT_UNDEFINED); + + rseg = trx_assign_rseg(srv_rollback_segments); + + trx->id = trx_sys_get_new_trx_id(); + + /* The initial value for trx->no: IB_ULONGLONG_MAX is used in + read_view_open_now: */ + + trx->no = IB_ULONGLONG_MAX; + + trx->rseg = rseg; + + trx->state = TRX_ACTIVE; + + trx_reserve_descriptor(trx); + + trx->start_time = time(NULL); + + UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx); + + return(TRUE); +} + +/****************************************************************//** +Starts a new transaction. +@return TRUE */ +UNIV_INTERN +ibool +trx_start( +/*======*/ + trx_t* trx, /*!< in: transaction */ + ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED + is passed, the system chooses the rollback segment + automatically in a round-robin fashion */ +{ + ibool ret; + + /* Update the info whether we should skip XA steps that eat CPU time + For the duration of the transaction trx->support_xa is not reread + from thd so any changes in the value take effect in the next + transaction. This is to avoid a scenario where some undo + generated by a transaction, has XA stuff, and other undo, + generated by the same transaction, doesn't. */ + trx->support_xa = thd_supports_xa(trx->mysql_thd); + + mutex_enter(&kernel_mutex); + + ret = trx_start_low(trx, rseg_id); + + mutex_exit(&kernel_mutex); + + return(ret); +} + +/****************************************************************//** +Set the transaction serialisation number. */ +static +void +trx_serialisation_number_get( +/*=========================*/ + trx_t* trx) /*!< in: transaction */ +{ + trx_rseg_t* rseg; + + rseg = trx->rseg; + + ut_ad(mutex_own(&rseg->mutex)); + + mutex_enter(&kernel_mutex); + + trx->no = trx_sys_get_new_trx_id(); + + if (UNIV_LIKELY(trx->is_in_trx_serial_list == 0)) { + + UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list, + trx); + + trx->is_in_trx_serial_list = 1; + } + + /* If the rollack segment is not empty then the + new trx_t::no can't be less than any trx_t::no + already in the rollback segment. User threads only + produce events when a rollback segment is empty. */ + + if (rseg->last_page_no == FIL_NULL) { + void* ptr; + rseg_queue_t rseg_queue; + + rseg_queue.rseg = rseg; + rseg_queue.trx_no = trx->no; + + mutex_enter(&purge_sys->bh_mutex); + + /* This is to reduce the pressure on the kernel mutex, + though in reality it should make very little (read no) + difference because this code path is only taken when the + rbs is empty. */ + + mutex_exit(&kernel_mutex); + + ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); + ut_a(ptr); + + mutex_exit(&purge_sys->bh_mutex); + } else { + mutex_exit(&kernel_mutex); + } +} + +/****************************************************************//** +Assign the transaction its history serialisation number and write the +update UNDO log record to the assigned rollback segment. +@return the LSN of the UNDO log write. */ +static +ib_uint64_t +trx_write_serialisation_history( +/*============================*/ + trx_t* trx) /*!< in: transaction */ +{ + mtr_t mtr; + trx_rseg_t* rseg; + trx_sysf_t* sys_header = NULL; + + ut_ad(!mutex_own(&kernel_mutex)); + + rseg = trx->rseg; + + mtr_start(&mtr); + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to some other state: these modifications to the file data + structure define the transaction as committed in the file + based domain, at the serialization point of the log sequence + number lsn obtained below. */ + + if (trx->update_undo != NULL) { + page_t* undo_hdr_page; + trx_undo_t* undo = trx->update_undo; + + /* We have to hold the rseg mutex because update + log headers have to be put to the history list in the + (serialisation) order of the UNDO trx number. This is + required for the purge in-memory data structures too. */ + + mutex_enter(&rseg->mutex); + + /* Assign the transaction serialisation number and also + update the purge min binary heap if this is the first + UNDO log being written to the assigned rollback segment. */ + + trx_serialisation_number_get(trx); + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction commit for this transaction. */ + + undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr); + + trx_undo_update_cleanup(trx, undo_hdr_page, &mtr); + } else { + mutex_enter(&rseg->mutex); + } + + if (trx->insert_undo != NULL) { + trx_undo_set_state_at_finish(trx->insert_undo, &mtr); + } + + mutex_exit(&rseg->mutex); + + /* Update the latest MySQL binlog name and offset info + in trx sys header if MySQL binlogging is on or the database + server is a MySQL replication slave */ + + if (trx->mysql_log_file_name + && trx->mysql_log_file_name[0] != '\0') { + if (!sys_header) { + sys_header = trx_sysf_get(&mtr); + } + + trx_sys_update_mysql_binlog_offset( + sys_header, + trx->mysql_log_file_name, + trx->mysql_log_offset, + TRX_SYS_MYSQL_LOG_INFO, &mtr); + + trx->mysql_log_file_name = NULL; + } + + if (trx->mysql_master_log_file_name[0] != '\0') { + /* This database server is a MySQL replication slave */ + if (!sys_header) { + sys_header = trx_sysf_get(&mtr); + } + + trx_sys_update_mysql_binlog_offset( + sys_header, + trx->mysql_relay_log_file_name, + trx->mysql_relay_log_pos, + TRX_SYS_COMMIT_RELAY_LOG_INFO, &mtr); + + trx_sys_update_mysql_binlog_offset( + sys_header, + trx->mysql_master_log_file_name, + trx->mysql_master_log_pos, + TRX_SYS_COMMIT_MASTER_LOG_INFO, &mtr); + + trx_sys_update_mysql_binlog_offset( + sys_header, + trx->mysql_relay_log_file_name, + trx->mysql_relay_log_pos, + TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr); + + trx_sys_update_mysql_binlog_offset( + sys_header, + trx->mysql_master_log_file_name, + trx->mysql_master_log_pos, + TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr); + + trx->mysql_master_log_file_name = ""; + } + + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this + log sequence number. The transaction becomes 'durable' when + we write the log to disk, but in the logical sense the commit + in the file-based data structures (undo logs etc.) happens + here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come + in exactly the same order as commit lsn's, if the transactions + have different rollback segments. To get exactly the same + order we should hold the kernel mutex up to this point, + adding to the contention of the kernel mutex. However, if + a transaction T2 is able to see modifications made by + a transaction T1, T2 will always get a bigger transaction + number and a bigger commit lsn than T1. */ + + /*--------------*/ + mtr_commit(&mtr); + /*--------------*/ + + return(mtr.end_lsn); +} + +/****************************************************************//** +Commits a transaction. */ +UNIV_INTERN +void +trx_commit_off_kernel( +/*==================*/ + trx_t* trx) /*!< in: transaction */ +{ + ib_uint64_t lsn; + + ut_ad(mutex_own(&kernel_mutex)); + + trx->must_flush_log_later = FALSE; + + /* If the transaction made any updates then we need to write the + UNDO logs for the updates to the assigned rollback segment. */ + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + mutex_exit(&kernel_mutex); + + lsn = trx_write_serialisation_history(trx); + + mutex_enter(&kernel_mutex); + } else { + lsn = 0; + } + + ut_ad(trx->state == TRX_ACTIVE || trx->state == TRX_PREPARED); + ut_ad(mutex_own(&kernel_mutex)); + + if (UNIV_UNLIKELY(trx->state == TRX_PREPARED)) { + ut_a(trx_n_prepared > 0); + trx_n_prepared--; + } + + /* The following assignment makes the transaction committed in memory + and makes its changes to data visible to other transactions. + NOTE that there is a small discrepancy from the strict formal + visibility rules here: a human user of the database can see + modifications made by another transaction T even before the necessary + log segment has been flushed to the disk. If the database happens to + crash before the flush, the user has seen modifications from T which + will never be a committed transaction. However, any transaction T2 + which sees the modifications of the committing transaction T, and + which also itself makes modifications to the database, will get an lsn + larger than the committing transaction T. In the case where the log + flush fails, and T never gets committed, also T2 will never get + committed. */ + + /*--------------------------------------*/ + trx->state = TRX_COMMITTED_IN_MEMORY; + /* The following also removes trx from trx_serial_list */ + trx_release_descriptor(trx); + /*--------------------------------------*/ + + /* If we release kernel_mutex below and we are still doing + recovery i.e.: back ground rollback thread is still active + then there is a chance that the rollback thread may see + this trx as COMMITTED_IN_MEMORY and goes adhead to clean it + up calling trx_cleanup_at_db_startup(). This can happen + in the case we are committing a trx here that is left in + PREPARED state during the crash. Note that commit of the + rollback of a PREPARED trx happens in the recovery thread + while the rollback of other transactions happen in the + background thread. To avoid this race we unconditionally + unset the is_recovered flag from the trx. */ + + trx->is_recovered = FALSE; + + lock_release_off_kernel(trx); + + if (trx->global_read_view) { + read_view_close(trx->global_read_view); + trx->global_read_view = NULL; + } + + trx->read_view = NULL; + + if (lsn) { + ulint flush_log_at_trx_commit; + + mutex_exit(&kernel_mutex); + + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + + /* NOTE that we could possibly make a group commit more + efficient here: call os_thread_yield here to allow also other + trxs to come to commit! */ + + /*-------------------------------------*/ + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if + the OS does not crash. We may also flush the log files to + disk, making the transaction durable also at an OS crash or a + power outage. + + The idea in InnoDB's group commit is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which commits the whole + group. Note that this group commit will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + If we are calling trx_commit() under prepare_commit_mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the mutex. This is to make the + group commit algorithm to work. Otherwise, the prepare_commit + mutex would serialize all commits and prevent a group of + transactions from gathering. */ + + if (trx->flush_log_later) { + /* Do nothing yet */ + trx->must_flush_log_later = TRUE; + } else if (flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + trx->commit_lsn = lsn; + + /*-------------------------------------*/ + + mutex_enter(&kernel_mutex); + } + + /* Free all savepoints */ + trx_roll_free_all_savepoints(trx); + + trx->state = TRX_NOT_STARTED; + trx->rseg = NULL; + trx->undo_no = 0; + trx->last_sql_stat_start.least_undo_no = 0; + + ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0); + + UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); + + ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list)); + + trx->error_state = DB_SUCCESS; +} + +/****************************************************************//** +Cleans up a transaction at database startup. The cleanup is needed if +the transaction already got to the middle of a commit when the database +crashed, and we cannot roll it back. */ +UNIV_INTERN +void +trx_cleanup_at_db_startup( +/*======================*/ + trx_t* trx) /*!< in: transaction */ +{ + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + trx->state = TRX_NOT_STARTED; + + /* This code is executed in a single threaded context, but we acquire + kernel_mutex to satisfy a debug assertion in + trx_release_descriptor(). */ + + mutex_enter(&kernel_mutex); + trx_release_descriptor(trx); + mutex_exit(&kernel_mutex); + + trx->rseg = NULL; + trx->undo_no = 0; + trx->last_sql_stat_start.least_undo_no = 0; + + UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); + + ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list)); +} + +/********************************************************************//** +Assigns a read view for a consistent read query. All the consistent reads +within the same transaction will get the same read view, which is created +when this function is first called for a new started transaction. +@return consistent read view */ +UNIV_INTERN +read_view_t* +trx_assign_read_view( +/*=================*/ + trx_t* trx) /*!< in: active transaction */ +{ + ut_ad(trx->state == TRX_ACTIVE); + + if (trx->read_view) { + return(trx->read_view); + } + + mutex_enter(&kernel_mutex); + + trx->read_view = read_view_open_now(trx->id, trx->prebuilt_view, TRUE); + trx->prebuilt_view = trx->read_view; + trx->global_read_view = trx->read_view; + + mutex_exit(&kernel_mutex); + + return(trx->read_view); +} + +/****************************************************************//** +Commits a transaction. NOTE that the kernel mutex is temporarily released. */ +static +void +trx_handle_commit_sig_off_kernel( +/*=============================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t** next_thr) /*!< in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + + ut_ad(mutex_own(&kernel_mutex)); + + trx->que_state = TRX_QUE_COMMITTING; + + trx_commit_off_kernel(trx); + + ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + /* Remove all TRX_SIG_COMMIT signals from the signal queue and send + reply messages to them */ + + sig = UT_LIST_GET_FIRST(trx->signals); + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_COMMIT) { + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/***********************************************************//** +The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to +the TRX_QUE_RUNNING state and releases query threads which were +waiting for a lock in the wait_thrs list. */ +UNIV_INTERN +void +trx_end_lock_wait( +/*==============*/ + trx_t* trx) /*!< in: transaction */ +{ + que_thr_t* thr; + ulint sec; + ulint ms; + ib_uint64_t now; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + + while (thr != NULL) { + que_thr_end_wait_no_next_thr(thr); + + UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + } + + if (UNIV_UNLIKELY(trx->take_stats)) { + ut_usectime(&sec, &ms); + now = (ib_uint64_t)sec * 1000000 + ms; + trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted); + } + trx->que_state = TRX_QUE_RUNNING; +} + +/***********************************************************//** +Moves the query threads in the lock wait list to the SUSPENDED state and puts +the transaction to the TRX_QUE_RUNNING state. */ +static +void +trx_lock_wait_to_suspended( +/*=======================*/ + trx_t* trx) /*!< in: transaction in the TRX_QUE_LOCK_WAIT state */ +{ + que_thr_t* thr; + ulint sec; + ulint ms; + ib_uint64_t now; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + + while (thr != NULL) { + thr->state = QUE_THR_SUSPENDED; + + UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + } + + if (UNIV_UNLIKELY(trx->take_stats)) { + ut_usectime(&sec, &ms); + now = (ib_uint64_t)sec * 1000000 + ms; + trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted); + } + trx->que_state = TRX_QUE_RUNNING; +} + +/***********************************************************//** +Moves the query threads in the sig reply wait list of trx to the SUSPENDED +state. */ +static +void +trx_sig_reply_wait_to_suspended( +/*============================*/ + trx_t* trx) /*!< in: transaction */ +{ + trx_sig_t* sig; + que_thr_t* thr; + + ut_ad(mutex_own(&kernel_mutex)); + + sig = UT_LIST_GET_FIRST(trx->reply_signals); + + while (sig != NULL) { + thr = sig->receiver; + + ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT); + + thr->state = QUE_THR_SUSPENDED; + + sig->receiver = NULL; + + UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig); + + sig = UT_LIST_GET_FIRST(trx->reply_signals); + } +} + +/*****************************************************************//** +Checks the compatibility of a new signal with the other signals in the +queue. +@return TRUE if the signal can be queued */ +static +ibool +trx_sig_is_compatible( +/*==================*/ + trx_t* trx, /*!< in: trx handle */ + ulint type, /*!< in: signal type */ + ulint sender) /*!< in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */ +{ + trx_sig_t* sig; + + ut_ad(mutex_own(&kernel_mutex)); + + if (UT_LIST_GET_LEN(trx->signals) == 0) { + + return(TRUE); + } + + if (sender == TRX_SIG_SELF) { + if (type == TRX_SIG_ERROR_OCCURRED) { + + return(TRUE); + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + return(TRUE); + } else { + return(FALSE); + } + } + + ut_ad(sender == TRX_SIG_OTHER_SESS); + + sig = UT_LIST_GET_FIRST(trx->signals); + + if (type == TRX_SIG_COMMIT) { + while (sig != NULL) { + + if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { + + return(FALSE); + } + + sig = UT_LIST_GET_NEXT(signals, sig); + } + + return(TRUE); + + } else if (type == TRX_SIG_TOTAL_ROLLBACK) { + while (sig != NULL) { + + if (sig->type == TRX_SIG_COMMIT) { + + return(FALSE); + } + + sig = UT_LIST_GET_NEXT(signals, sig); + } + + return(TRUE); + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + return(TRUE); + } else { + ut_error; + + return(FALSE); + } +} + +/****************************************************************//** +Sends a signal to a trx object. */ +UNIV_INTERN +void +trx_sig_send( +/*=========*/ + trx_t* trx, /*!< in: trx handle */ + ulint type, /*!< in: signal type */ + ulint sender, /*!< in: TRX_SIG_SELF or + TRX_SIG_OTHER_SESS */ + que_thr_t* receiver_thr, /*!< in: query thread which wants the + reply, or NULL; if type is + TRX_SIG_END_WAIT, this must be NULL */ + trx_savept_t* savept, /*!< in: possible rollback savepoint, or + NULL */ + que_thr_t** next_thr) /*!< in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the parameter + is NULL, it is ignored */ +{ + trx_sig_t* sig; + trx_t* receiver_trx; + + ut_ad(trx); + ut_ad(mutex_own(&kernel_mutex)); + + if (!trx_sig_is_compatible(trx, type, sender)) { + /* The signal is not compatible with the other signals in + the queue: die */ + + ut_error; + } + + /* Queue the signal object */ + + if (UT_LIST_GET_LEN(trx->signals) == 0) { + + /* The signal list is empty: the 'sig' slot must be unused + (we improve performance a bit by avoiding mem_alloc) */ + sig = &(trx->sig); + } else { + /* It might be that the 'sig' slot is unused also in this + case, but we choose the easy way of using mem_alloc */ + + sig = mem_alloc(sizeof(trx_sig_t)); + } + + UT_LIST_ADD_LAST(signals, trx->signals, sig); + + sig->type = type; + sig->sender = sender; + sig->receiver = receiver_thr; + + if (savept) { + sig->savept = *savept; + } + + if (receiver_thr) { + receiver_trx = thr_get_trx(receiver_thr); + + UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals, + sig); + } + + if (trx->sess->state == SESS_ERROR) { + + trx_sig_reply_wait_to_suspended(trx); + } + + if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) { + ut_error; + } + + /* If there were no other signals ahead in the queue, try to start + handling of the signal */ + + if (UT_LIST_GET_FIRST(trx->signals) == sig) { + + trx_sig_start_handle(trx, next_thr); + } +} + +/****************************************************************//** +Ends signal handling. If the session is in the error state, and +trx->graph_before_signal_handling != NULL, then returns control to the error +handling routine of the graph (currently just returns the control to the +graph root which then will send an error message to the client). */ +UNIV_INTERN +void +trx_end_signal_handling( +/*====================*/ + trx_t* trx) /*!< in: trx */ +{ + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->handling_signals == TRUE); + + trx->handling_signals = FALSE; + + trx->graph = trx->graph_before_signal_handling; + + if (trx->graph && (trx->sess->state == SESS_ERROR)) { + + que_fork_error_handle(trx, trx->graph); + } +} + +/****************************************************************//** +Starts handling of a trx signal. */ +UNIV_INTERN +void +trx_sig_start_handle( +/*=================*/ + trx_t* trx, /*!< in: trx handle */ + que_thr_t** next_thr) /*!< in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the parameter + is NULL, it is ignored */ +{ + trx_sig_t* sig; + ulint type; +loop: + /* We loop in this function body as long as there are queued signals + we can process immediately */ + + ut_ad(trx); + ut_ad(mutex_own(&kernel_mutex)); + + if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) { + + trx_end_signal_handling(trx); + + return; + } + + if (trx->state == TRX_NOT_STARTED) { + + trx_start_low(trx, ULINT_UNDEFINED); + } + + /* If the trx is in a lock wait state, moves the waiting query threads + to the suspended state */ + + if (trx->que_state == TRX_QUE_LOCK_WAIT) { + + trx_lock_wait_to_suspended(trx); + } + + /* If the session is in the error state and this trx has threads + waiting for reply from signals, moves these threads to the suspended + state, canceling wait reservations; note that if the transaction has + sent a commit or rollback signal to itself, and its session is not in + the error state, then nothing is done here. */ + + if (trx->sess->state == SESS_ERROR) { + trx_sig_reply_wait_to_suspended(trx); + } + + /* If there are no running query threads, we can start processing of a + signal, otherwise we have to wait until all query threads of this + transaction are aware of the arrival of the signal. */ + + if (trx->n_active_thrs > 0) { + + return; + } + + if (trx->handling_signals == FALSE) { + trx->graph_before_signal_handling = trx->graph; + + trx->handling_signals = TRUE; + } + + sig = UT_LIST_GET_FIRST(trx->signals); + type = sig->type; + + if (type == TRX_SIG_COMMIT) { + + trx_handle_commit_sig_off_kernel(trx, next_thr); + + } else if ((type == TRX_SIG_TOTAL_ROLLBACK) + || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) { + + trx_rollback(trx, sig, next_thr); + + /* No further signals can be handled until the rollback + completes, therefore we return */ + + return; + + } else if (type == TRX_SIG_ERROR_OCCURRED) { + + trx_rollback(trx, sig, next_thr); + + /* No further signals can be handled until the rollback + completes, therefore we return */ + + return; + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + } else { + ut_error; + } + + goto loop; +} + +/****************************************************************//** +Send the reply message when a signal in the queue of the trx has been +handled. */ +UNIV_INTERN +void +trx_sig_reply( +/*==========*/ + trx_sig_t* sig, /*!< in: signal */ + que_thr_t** next_thr) /*!< in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +{ + trx_t* receiver_trx; + + ut_ad(sig); + ut_ad(mutex_own(&kernel_mutex)); + + if (sig->receiver != NULL) { + ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT); + + receiver_trx = thr_get_trx(sig->receiver); + + UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals, + sig); + ut_ad(receiver_trx->sess->state != SESS_ERROR); + + que_thr_end_wait(sig->receiver, next_thr); + + sig->receiver = NULL; + + } +} + +/****************************************************************//** +Removes a signal object from the trx signal queue. */ +UNIV_INTERN +void +trx_sig_remove( +/*===========*/ + trx_t* trx, /*!< in: trx handle */ + trx_sig_t* sig) /*!< in, own: signal */ +{ + ut_ad(trx && sig); + ut_ad(mutex_own(&kernel_mutex)); + + ut_ad(sig->receiver == NULL); + + UT_LIST_REMOVE(signals, trx->signals, sig); + sig->type = 0; /* reset the field to catch possible bugs */ + + if (sig != &(trx->sig)) { + mem_free(sig); + } +} + +/*********************************************************************//** +Creates a commit command node struct. +@return own: commit node struct */ +UNIV_INTERN +commit_node_t* +commit_node_create( +/*===============*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + commit_node_t* node; + + node = mem_heap_alloc(heap, sizeof(commit_node_t)); + node->common.type = QUE_NODE_COMMIT; + node->state = COMMIT_NODE_SEND; + + return(node); +} + +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + commit_node_t* node; + que_thr_t* next_thr; + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = COMMIT_NODE_SEND; + } + + if (node->state == COMMIT_NODE_SEND) { + mutex_enter(&kernel_mutex); + + node->state = COMMIT_NODE_WAIT; + + next_thr = NULL; + + thr->state = QUE_THR_SIG_REPLY_WAIT; + + /* Send the commit signal to the transaction */ + + trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF, + thr, NULL, &next_thr); + + mutex_exit(&kernel_mutex); + + return(next_thr); + } + + ut_ad(node->state == COMMIT_NODE_WAIT); + + node->state = COMMIT_NODE_SEND; + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +UNIV_INTERN +ulint +trx_commit_for_mysql( +/*=================*/ + trx_t* trx) /*!< in: trx handle */ +{ + /* Because we do not do the commit by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx_start_if_not_started(trx); + + trx->op_info = "committing"; + + mutex_enter(&kernel_mutex); + + trx_commit_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(DB_SUCCESS); +} + +/**********************************************************************//** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. +@return 0 or error number */ +UNIV_INTERN +ulint +trx_commit_complete_for_mysql( +/*==========================*/ + trx_t* trx) /*!< in: trx handle */ +{ + ib_uint64_t lsn = trx->commit_lsn; + ulint flush_log_at_trx_commit; + + ut_a(trx); + + trx->op_info = "flushing log"; + + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + + if (!trx->must_flush_log_later) { + /* Do nothing */ + } else if (flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + /* Write the log to the log files AND flush them to + disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + trx->must_flush_log_later = FALSE; + + trx->op_info = ""; + + return(0); +} + +/**********************************************************************//** +Marks the latest SQL statement ended. */ +UNIV_INTERN +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx) /*!< in: trx handle */ +{ + ut_a(trx); + + if (trx->state == TRX_NOT_STARTED) { + trx->undo_no = 0; + } + + trx->last_sql_stat_start.least_undo_no = trx->undo_no; +} + +/**********************************************************************//** +Prints info about a transaction to the given file. The caller must own the +kernel mutex. */ +UNIV_INTERN +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, or 0 to + use the default max length */ +{ + ibool newline; + + fprintf(f, "TRANSACTION " TRX_ID_FMT, (ullint) trx->id); + + switch (trx->state) { + case TRX_NOT_STARTED: + fputs(", not started", f); + break; + case TRX_ACTIVE: + fprintf(f, ", ACTIVE %lu sec", + (ulong)difftime(time(NULL), trx->start_time)); + break; + case TRX_PREPARED: + fprintf(f, ", ACTIVE (PREPARED) %lu sec", + (ulong)difftime(time(NULL), trx->start_time)); + break; + case TRX_COMMITTED_IN_MEMORY: + fputs(", COMMITTED IN MEMORY", f); + break; + default: + fprintf(f, " state %lu", (ulong) trx->state); + } + + if (*trx->op_info) { + putc(' ', f); + fputs(trx->op_info, f); + } + + if (trx->is_recovered) { + fputs(" recovered trx", f); + } + + if (trx->is_purge) { + fputs(" purge trx", f); + } + + if (trx->declared_to_be_inside_innodb) { + fprintf(f, ", thread declared inside InnoDB %lu", + (ulong) trx->n_tickets_to_enter_innodb); + } + + putc('\n', f); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } + + newline = TRUE; + + switch (trx->que_state) { + case TRX_QUE_RUNNING: + newline = FALSE; break; + case TRX_QUE_LOCK_WAIT: + fputs("LOCK WAIT ", f); break; + case TRX_QUE_ROLLING_BACK: + fputs("ROLLING BACK ", f); break; + case TRX_QUE_COMMITTING: + fputs("COMMITTING ", f); break; + default: + fprintf(f, "que state %lu ", (ulong) trx->que_state); + } + + if (0 < UT_LIST_GET_LEN(trx->trx_locks) + || mem_heap_get_size(trx->lock_heap) > 400) { + newline = TRUE; + + fprintf(f, "%lu lock struct(s), heap size %lu," + " %lu row lock(s)", + (ulong) UT_LIST_GET_LEN(trx->trx_locks), + (ulong) mem_heap_get_size(trx->lock_heap), + (ulong) lock_number_of_rows_locked(trx)); + } + + if (trx->has_search_latch) { + newline = TRUE; + fputs(", holds adaptive hash latch", f); + } + + if (trx->undo_no != 0) { + newline = TRUE; + fprintf(f, ", undo log entries %llu", + (ullint) trx->undo_no); + } + + if (newline) { + putc('\n', f); + } + + if (trx->mysql_thd != NULL) { + innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len); + } +} + +/*******************************************************************//** +Compares the "weight" (or size) of two transactions. Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. +@return TRUE if weight(a) >= weight(b) */ +UNIV_INTERN +ibool +trx_weight_ge( +/*==========*/ + const trx_t* a, /*!< in: the first transaction to be compared */ + const trx_t* b) /*!< in: the second transaction to be compared */ +{ + ibool a_notrans_edit; + ibool b_notrans_edit; + + /* If mysql_thd is NULL for a transaction we assume that it has + not edited non-transactional tables. */ + + a_notrans_edit = a->mysql_thd != NULL + && thd_has_edited_nontrans_tables(a->mysql_thd); + + b_notrans_edit = b->mysql_thd != NULL + && thd_has_edited_nontrans_tables(b->mysql_thd); + + if (a_notrans_edit != b_notrans_edit) { + + return(a_notrans_edit); + } + + /* Either both had edited non-transactional tables or both had + not, we fall back to comparing the number of altered/locked + rows. */ + +#if 0 + fprintf(stderr, + "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n", + __func__, + a->undo_no, UT_LIST_GET_LEN(a->trx_locks), + b->undo_no, UT_LIST_GET_LEN(b->trx_locks)); +#endif + + return(TRX_WEIGHT(a) >= TRX_WEIGHT(b)); +} + +/****************************************************************//** +Prepares a transaction. */ +UNIV_INTERN +void +trx_prepare_off_kernel( +/*===================*/ + trx_t* trx) /*!< in: transaction */ +{ + trx_rseg_t* rseg; + ib_uint64_t lsn = 0; + mtr_t mtr; + + ut_ad(mutex_own(&kernel_mutex)); + + rseg = trx->rseg; + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to TRX_UNDO_PREPARED: these modifications to the file data + structure define the transaction as prepared in the + file-based world, at the serialization point of lsn. */ + + mutex_enter(&(rseg->mutex)); + + if (trx->insert_undo != NULL) { + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction prepare for this transaction. */ + + trx_undo_set_state_at_prepare(trx, trx->insert_undo, + &mtr); + } + + if (trx->update_undo) { + trx_undo_set_state_at_prepare( + trx, trx->update_undo, &mtr); + } + + mutex_exit(&(rseg->mutex)); + + if (trx->mysql_master_log_file_name[0] != '\0') { + /* This database server is a MySQL replication slave */ + trx_sysf_t* sys_header = trx_sysf_get(&mtr); + + trx_sys_update_mysql_binlog_offset( + sys_header, + trx->mysql_relay_log_file_name, + trx->mysql_relay_log_pos, + TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr); + trx_sys_update_mysql_binlog_offset( + sys_header, + trx->mysql_master_log_file_name, + trx->mysql_master_log_pos, + TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr); + trx->mysql_master_log_file_name = ""; + } + + /*--------------*/ + mtr_commit(&mtr); /* This mtr commit makes the + transaction prepared in the file-based + world */ + /*--------------*/ + lsn = mtr.end_lsn; + + mutex_enter(&kernel_mutex); + } + + ut_ad(mutex_own(&kernel_mutex)); + + /*--------------------------------------*/ + if (UNIV_UNLIKELY(trx->state != TRX_ACTIVE)) { + + trx_reserve_descriptor(trx); + } + trx->state = TRX_PREPARED; + trx_n_prepared++; + /*--------------------------------------*/ + + if (lsn) { + ulint flush_log_at_trx_commit; + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the prepared state of the + transaction durable if the OS does not crash. We may also + flush the log files to disk, making the prepared state of the + transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group prepare is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which prepares the whole + group. Note that this group prepare will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + TODO: find out if MySQL holds some mutex when calling this. + That would spoil our group prepare algorithm. */ + + mutex_exit(&kernel_mutex); + + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + + if (flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + mutex_enter(&kernel_mutex); + } +} + +/**********************************************************************//** +Does the transaction prepare for MySQL. +@return 0 or error number */ +UNIV_INTERN +ulint +trx_prepare_for_mysql( +/*==================*/ + trx_t* trx) /*!< in: trx handle */ +{ + /* Because we do not do the prepare by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx->op_info = "preparing"; + + trx_start_if_not_started(trx); + + mutex_enter(&kernel_mutex); + + trx_prepare_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(0); +} + +/**********************************************************************//** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. +@return number of prepared transactions stored in xid_list */ +UNIV_INTERN +int +trx_recover_for_mysql( +/*==================*/ + XID* xid_list, /*!< in/out: prepared transactions */ + ulint len) /*!< in: number of slots in xid_list */ +{ + trx_t* trx; + ulint count = 0; + + ut_ad(xid_list); + ut_ad(len); + + /* We should set those transactions which are in the prepared state + to the xid_list */ + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + if (trx->state == TRX_PREPARED) { + xid_list[count] = trx->xid; + + if (count == 0) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Starting recovery for" + " XA transactions...\n"); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Transaction " TRX_ID_FMT " in" + " prepared state after recovery\n", + (ullint) trx->id); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Transaction contains changes" + " to %llu rows\n", + (ullint) trx->undo_no); + + count++; + + if (count == len) { + break; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + if (count > 0){ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: %lu transactions in prepared state" + " after recovery\n", + (ulong) count); + } + + return ((int) count); +} + +/*******************************************************************//** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state +@return trx or NULL; on match, the trx->xid will be invalidated */ +UNIV_INTERN +trx_t* +trx_get_trx_by_xid( +/*===============*/ + const XID* xid) /*!< in: X/Open XA transaction identifier */ +{ + trx_t* trx; + + if (xid == NULL) { + + return(NULL); + } + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + /* Compare two X/Open XA transaction id's: their + length should be the same and binary comparison + of gtrid_length+bqual_length bytes should be + the same */ + + if (trx->is_recovered + && trx->state == TRX_PREPARED + && xid->gtrid_length == trx->xid.gtrid_length + && xid->bqual_length == trx->xid.bqual_length + && memcmp(xid->data, trx->xid.data, + xid->gtrid_length + xid->bqual_length) == 0) { + + /* Invalidate the XID, so that subsequent calls + will not find it. */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; + break; + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + return(trx); +} diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.c new file mode 100644 index 00000000000..3d794c69c8b --- /dev/null +++ b/storage/xtradb/trx/trx0undo.c @@ -0,0 +1,2000 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0undo.c +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0undo.h" + +#ifdef UNIV_NONINL +#include "trx0undo.ic" +#endif + +#include "fsp0fsp.h" +#ifndef UNIV_HOTBACKUP +#include "mach0data.h" +#include "mtr0log.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0rec.h" +#include "trx0purge.h" + +/* How should the old versions in the history list be managed? + ---------------------------------------------------------- +If each transaction is given a whole page for its update undo log, file +space consumption can be 10 times higher than necessary. Therefore, +partly filled update undo log pages should be reusable. But then there +is no way individual pages can be ordered so that the ordering agrees +with the serialization numbers of the transactions on the pages. Thus, +the history list must be formed of undo logs, not their header pages as +it was in the old implementation. + However, on a single header page the transactions are placed in +the order of their serialization numbers. As old versions are purged, we +may free the page when the last transaction on the page has been purged. + A problem is that the purge has to go through the transactions +in the serialization order. This means that we have to look through all +rollback segments for the one that has the smallest transaction number +in its history list. + When should we do a purge? A purge is necessary when space is +running out in any of the rollback segments. Then we may have to purge +also old version which might be needed by some consistent read. How do +we trigger the start of a purge? When a transaction writes to an undo log, +it may notice that the space is running out. When a read view is closed, +it may make some history superfluous. The server can have an utility which +periodically checks if it can purge some history. + In a parallellized purge we have the problem that a query thread +can remove a delete marked clustered index record before another query +thread has processed an earlier version of the record, which cannot then +be done because the row cannot be constructed from the clustered index +record. To avoid this problem, we will store in the update and delete mark +undo record also the columns necessary to construct the secondary index +entries which are modified. + We can latch the stack of versions of a single clustered index record +by taking a latch on the clustered index page. As long as the latch is held, +no new versions can be added and no versions removed by undo. But, a purge +can still remove old versions from the bottom of the stack. */ + +/* How to protect rollback segments, undo logs, and history lists with + ------------------------------------------------------------------- +latches? +------- +The contention of the kernel mutex should be minimized. When a transaction +does its first insert or modify in an index, an undo log is assigned for it. +Then we must have an x-latch to the rollback segment header. + When the transaction does more modifys or rolls back, the undo log is +protected with undo_mutex in the transaction. + When the transaction commits, its insert undo log is either reset and +cached for a fast reuse, or freed. In these cases we must have an x-latch on +the rollback segment page. The update undo log is put to the history list. If +it is not suitable for reuse, its slot in the rollback segment is reset. In +both cases, an x-latch must be acquired on the rollback segment. + The purge operation steps through the history list without modifying +it until a truncate operation occurs, which can remove undo logs from the end +of the list and release undo log segments. In stepping through the list, +s-latches on the undo log pages are enough, but in a truncate, x-latches must +be obtained on the rollback segment and individual pages. */ +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Initializes the fields in an undo log segment page. */ +static +void +trx_undo_page_init( +/*===============*/ + page_t* undo_page, /*!< in: undo log segment page */ + ulint type, /*!< in: undo log segment type */ + mtr_t* mtr); /*!< in: mtr */ + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Creates and initializes an undo log memory object. +@return own: the undo log memory object */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint id, /*!< in: slot index within rseg */ + ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open XA transaction identification*/ + ulint page_no,/*!< in: undo log header page number */ + ulint offset);/*!< in: undo log header byte offset on page */ +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! +@return undo log header byte offset on page */ +static +ulint +trx_undo_insert_header_reuse( +/*=========================*/ + page_t* undo_page, /*!< in/out: insert undo log segment + header page, x-latched */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +If an update undo log can be discarded immediately, this function frees the +space, resetting the page to the proper state for caching. */ +static +void +trx_undo_discard_latest_update_undo( +/*================================*/ + page_t* undo_page, /*!< in: header page of an undo log of size 1 */ + mtr_t* mtr); /*!< in: mtr */ + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Gets the previous record in an undo log from the previous page. +@return undo log record, the page s-latched, NULL if none */ +static +trx_undo_rec_t* +trx_undo_get_prev_rec_from_prev_page( +/*=================================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + ulint prev_page_no; + page_t* prev_page; + page_t* undo_page; + + undo_page = page_align(rec); + + prev_page_no = flst_get_prev_addr(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE, mtr) + .page; + + if (prev_page_no == FIL_NULL) { + + return(NULL); + } + + space = page_get_space_id(undo_page); + zip_size = fil_space_get_zip_size(space); + + prev_page = trx_undo_page_get_s_latched(space, zip_size, + prev_page_no, mtr); + + return(trx_undo_page_get_last_rec(prev_page, page_no, offset)); +} + +/***********************************************************************//** +Gets the previous record in an undo log. +@return undo log record, the page s-latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_prev_rec( +/*==================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_undo_rec_t* prev_rec; + + prev_rec = trx_undo_page_get_prev_rec(rec, page_no, offset); + + if (prev_rec) { + + return(prev_rec); + } + + /* We have to go to the previous undo log page to look for the + previous record */ + + return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset, + mtr)); +} + +/***********************************************************************//** +Gets the next record in an undo log from the next page. +@return undo log record, the page latched, NULL if none */ +static +trx_undo_rec_t* +trx_undo_get_next_rec_from_next_page( +/*=================================*/ + ulint space, /*!< in: undo log header space */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* undo_page, /*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + ulint mode, /*!< in: latch mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_ulogf_t* log_hdr; + ulint next_page_no; + page_t* next_page; + ulint next; + + if (page_no == page_get_page_no(undo_page)) { + + log_hdr = undo_page + offset; + next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); + + if (next != 0) { + + return(NULL); + } + } + + next_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE, mtr) + .page; + if (next_page_no == FIL_NULL) { + + return(NULL); + } + + if (mode == RW_S_LATCH) { + next_page = trx_undo_page_get_s_latched(space, zip_size, + next_page_no, mtr); + } else { + ut_ad(mode == RW_X_LATCH); + next_page = trx_undo_page_get(space, zip_size, + next_page_no, mtr); + } + + return(trx_undo_page_get_first_rec(next_page, page_no, offset)); +} + +/***********************************************************************//** +Gets the next record in an undo log. +@return undo log record, the page s-latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_next_rec( +/*==================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + trx_undo_rec_t* next_rec; + + next_rec = trx_undo_page_get_next_rec(rec, page_no, offset); + + if (next_rec) { + return(next_rec); + } + + space = page_get_space_id(page_align(rec)); + zip_size = fil_space_get_zip_size(space); + + return(trx_undo_get_next_rec_from_next_page(space, zip_size, + page_align(rec), + page_no, offset, + RW_S_LATCH, mtr)); +} + +/***********************************************************************//** +Gets the first record in an undo log. +@return undo log record, the page latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_first_rec( +/*===================*/ + ulint space, /*!< in: undo log header space */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* undo_page; + trx_undo_rec_t* rec; + + if (mode == RW_S_LATCH) { + undo_page = trx_undo_page_get_s_latched(space, zip_size, + page_no, mtr); + } else { + undo_page = trx_undo_page_get(space, zip_size, page_no, mtr); + } + + rec = trx_undo_page_get_first_rec(undo_page, page_no, offset); + + if (rec) { + return(rec); + } + + return(trx_undo_get_next_rec_from_next_page(space, zip_size, + undo_page, page_no, offset, + mode, mtr)); +} + +/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/ + +/**********************************************************************//** +Writes the mtr log entry of an undo log page initialization. */ +UNIV_INLINE +void +trx_undo_page_init_log( +/*===================*/ + page_t* undo_page, /*!< in: undo log page */ + ulint type, /*!< in: undo log type */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr); + + mlog_catenate_ulint_compressed(mtr, type); +} +#else /* !UNIV_HOTBACKUP */ +# define trx_undo_page_init_log(undo_page,type,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses the redo log entry of an undo log page initialization. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_page_init( +/*=====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ulint type; + + ptr = mach_parse_compressed(ptr, end_ptr, &type); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + trx_undo_page_init(page, type, mtr); + } + + return(ptr); +} + +/********************************************************************//** +Initializes the fields in an undo log segment page. */ +static +void +trx_undo_page_init( +/*===============*/ + page_t* undo_page, /*!< in: undo log segment page */ + ulint type, /*!< in: undo log segment type */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_upagef_t* page_hdr; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + + fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG); + + trx_undo_page_init_log(undo_page, type, mtr); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Creates a new undo log segment in file. +@return DB_SUCCESS if page creation OK possible error codes are: +DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE */ +static +ulint +trx_undo_seg_create( +/*================*/ + trx_rseg_t* rseg __attribute__((unused)),/*!< in: rollback segment */ + trx_rsegf_t* rseg_hdr,/*!< in: rollback segment header, page + x-latched */ + ulint type, /*!< in: type of the segment: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + ulint* id, /*!< out: slot index within rseg header */ + page_t** undo_page, + /*!< out: segment header page x-latched, NULL + if there was an error */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint slot_no; + ulint space; + buf_block_t* block; + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + ulint n_reserved; + ibool success; + ulint err = DB_SUCCESS; + + ut_ad(mtr && id && rseg_hdr); + ut_ad(mutex_own(&(rseg->mutex))); + + /* fputs(type == TRX_UNDO_INSERT + ? "Creating insert undo log segment\n" + : "Creating update undo log segment\n", stderr); */ + slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr); + + if (slot_no == ULINT_UNDEFINED) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: cannot find a free slot for" + " an undo log. Do you have too\n" + "InnoDB: many active transactions" + " running concurrently?\n"); + + return(DB_TOO_MANY_CONCURRENT_TRXS); + } + + space = page_get_space_id(page_align(rseg_hdr)); + + success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, + mtr); + if (!success) { + + return(DB_OUT_OF_FILE_SPACE); + } + + /* Allocate a new file segment for the undo log */ + block = fseg_create_general(space, 0, + TRX_UNDO_SEG_HDR + + TRX_UNDO_FSEG_HEADER, TRUE, mtr); + + fil_space_release_free_extents(space, n_reserved); + + if (block == NULL) { + /* No space left */ + + return(DB_OUT_OF_FILE_SPACE); + } + + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + *undo_page = buf_block_get_frame(block); + + page_hdr = *undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = *undo_page + TRX_UNDO_SEG_HDR; + + trx_undo_page_init(*undo_page, type, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, + TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE, + MLOG_2BYTES, mtr); + + mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr); + + flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr); + + flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST, + page_hdr + TRX_UNDO_PAGE_NODE, mtr); + + trx_rsegf_set_nth_undo(rseg_hdr, slot_no, + page_get_page_no(*undo_page), mtr); + *id = slot_no; + + return(err); +} + +/**********************************************************************//** +Writes the mtr log entry of an undo log header initialization. */ +UNIV_INLINE +void +trx_undo_header_create_log( +/*=======================*/ + const page_t* undo_page, /*!< in: undo log header page */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr); + + mlog_catenate_ull_compressed(mtr, trx_id); +} +#else /* !UNIV_HOTBACKUP */ +# define trx_undo_header_create_log(undo_page,trx_id,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +Creates a new undo log header in file. NOTE that this function has its own +log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of +this function! +@return header byte offset on page */ +static +ulint +trx_undo_header_create( +/*===================*/ + page_t* undo_page, /*!< in/out: undo log segment + header page, x-latched; it is + assumed that there is + TRX_UNDO_LOG_XA_HDR_SIZE bytes + free space on it */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + trx_ulogf_t* log_hdr; + trx_ulogf_t* prev_log_hdr; + ulint prev_log; + ulint free; + ulint new_free; + + ut_ad(mtr && undo_page); + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + log_hdr = undo_page + free; + + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; + + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); + + prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + + if (prev_log != 0) { + prev_log_hdr = undo_page + prev_log; + + mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free); + } + + mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free); + + log_hdr = undo_page + free; + + mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE); + + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0); + mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log); + + /* Write the log record about the header creation */ + trx_undo_header_create_log(undo_page, trx_id, mtr); + + return(free); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Write X/Open XA Transaction Identification (XID) to undo log header */ +static +void +trx_undo_write_xid( +/*===============*/ + trx_ulogf_t* log_hdr,/*!< in: undo log header */ + const XID* xid, /*!< in: X/Open XA Transaction Identification */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT, + (ulint)xid->formatID, MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN, + (ulint)xid->gtrid_length, MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN, + (ulint)xid->bqual_length, MLOG_4BYTES, mtr); + + mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data, + XIDDATASIZE, mtr); +} + +/********************************************************************//** +Read X/Open XA Transaction Identification (XID) from undo log header */ +static +void +trx_undo_read_xid( +/*==============*/ + trx_ulogf_t* log_hdr,/*!< in: undo log header */ + XID* xid) /*!< out: X/Open XA Transaction Identification */ +{ + xid->formatID = (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT); + + xid->gtrid_length + = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN); + xid->bqual_length + = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN); + + memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE); +} + +/***************************************************************//** +Adds space for the XA XID after an undo log old-style header. */ +static +void +trx_undo_header_add_space_for_xid( +/*==============================*/ + page_t* undo_page,/*!< in: undo log segment header page */ + trx_ulogf_t* log_hdr,/*!< in: undo log header */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_upagef_t* page_hdr; + ulint free; + ulint new_free; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + /* free is now the end offset of the old style undo log header */ + + ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE); + + new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE + - TRX_UNDO_LOG_OLD_HDR_SIZE); + + /* Add space for a XID after the header, update the free offset + fields on the undo log page and in the undo log header */ + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free, + MLOG_2BYTES, mtr); +} + +/**********************************************************************//** +Writes the mtr log entry of an undo log header reuse. */ +UNIV_INLINE +void +trx_undo_insert_header_reuse_log( +/*=============================*/ + const page_t* undo_page, /*!< in: undo log header page */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr); + + mlog_catenate_ull_compressed(mtr, trx_id); +} +#else /* !UNIV_HOTBACKUP */ +# define trx_undo_insert_header_reuse_log(undo_page,trx_id,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses the redo log entry of an undo log page header create or reuse. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_page_header( +/*=======================*/ + ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + trx_id_t trx_id; + /* Silence a GCC warning about possibly uninitialized variable + when mach_ull_parse_compressed() is not inlined. */ + ut_d(trx_id = 0); + /* Declare the variable uninitialized in Valgrind, so that the + above initialization will not mask any bugs. */ + UNIV_MEM_INVALID(&trx_id, sizeof trx_id); + + ptr = mach_ull_parse_compressed(ptr, end_ptr, &trx_id); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + if (type == MLOG_UNDO_HDR_CREATE) { + trx_undo_header_create(page, trx_id, mtr); + } else { + ut_ad(type == MLOG_UNDO_HDR_REUSE); + trx_undo_insert_header_reuse(page, trx_id, mtr); + } + } + + return(ptr); +} + +/***************************************************************//** +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! +@return undo log header byte offset on page */ +static +ulint +trx_undo_insert_header_reuse( +/*=========================*/ + page_t* undo_page, /*!< in/out: insert undo log segment + header page, x-latched */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + trx_ulogf_t* log_hdr; + ulint free; + ulint new_free; + + ut_ad(mtr && undo_page); + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE; + + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + + log_hdr = undo_page + free; + + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; + + /* Insert undo data is not needed after commit: we may free all + the space on the page */ + + ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_INSERT); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); + + log_hdr = undo_page + free; + + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + /* Write the log record MLOG_UNDO_HDR_REUSE */ + trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr); + + return(free); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Writes the redo log entry of an update undo log header discard. */ +UNIV_INLINE +void +trx_undo_discard_latest_log( +/*========================*/ + page_t* undo_page, /*!< in: undo log header page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr); +} +#else /* !UNIV_HOTBACKUP */ +# define trx_undo_discard_latest_log(undo_page, mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses the redo log entry of an undo log page header discard. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_discard_latest( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr __attribute__((unused)), /*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ut_ad(end_ptr); + + if (page) { + trx_undo_discard_latest_update_undo(page, mtr); + } + + return(ptr); +} + +/**********************************************************************//** +If an update undo log can be discarded immediately, this function frees the +space, resetting the page to the proper state for caching. */ +static +void +trx_undo_discard_latest_update_undo( +/*================================*/ + page_t* undo_page, /*!< in: header page of an undo log of size 1 */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + trx_ulogf_t* log_hdr; + trx_ulogf_t* prev_log_hdr; + ulint free; + ulint prev_hdr_offset; + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + log_hdr = undo_page + free; + + prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG); + + if (prev_hdr_offset != 0) { + prev_log_hdr = undo_page + prev_hdr_offset; + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, + mach_read_from_2(prev_log_hdr + + TRX_UNDO_LOG_START)); + mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0); + } + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED); + mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset); + + trx_undo_discard_latest_log(undo_page, mtr); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Tries to add a page to the undo log segment where the undo log is placed. +@return X-latched block if success, else NULL */ +UNIV_INTERN +buf_block_t* +trx_undo_add_page( +/*==============*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log memory object */ + mtr_t* mtr) /*!< in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + page_t* header_page; + buf_block_t* new_block; + page_t* new_page; + trx_rseg_t* rseg; + ulint n_reserved; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(!mutex_own(&kernel_mutex)); + ut_ad(mutex_own(&(trx->rseg->mutex))); + + rseg = trx->rseg; + + if (rseg->curr_size == rseg->max_size) { + + return(NULL); + } + + header_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + if (!fsp_reserve_free_extents(&n_reserved, undo->space, 1, + FSP_UNDO, mtr)) { + + return(NULL); + } + + new_block = fseg_alloc_free_page_general( + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + header_page, + undo->top_page_no + 1, FSP_UP, TRUE, mtr, mtr); + + fil_space_release_free_extents(undo->space, n_reserved); + + if (new_block == NULL) { + + /* No space left */ + + return(NULL); + } + + ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); + buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE); + undo->last_page_no = buf_block_get_page_no(new_block); + + new_page = buf_block_get_frame(new_block); + + trx_undo_page_init(new_page, undo->type, mtr); + + flst_add_last(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + new_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + undo->size++; + rseg->curr_size++; + + return(new_block); +} + +/********************************************************************//** +Frees an undo log page that is not the header page. +@return last page number in remaining log */ +static +ulint +trx_undo_free_page( +/*===============*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + ibool in_history, /*!< in: TRUE if the undo log is in the history + list */ + ulint space, /*!< in: space */ + ulint hdr_page_no, /*!< in: header page number */ + ulint page_no, /*!< in: page number to free: must not be the + header page */ + mtr_t* mtr) /*!< in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + page_t* header_page; + page_t* undo_page; + fil_addr_t last_addr; + trx_rsegf_t* rseg_header; + ulint hist_size; + ulint zip_size; + + ut_a(hdr_page_no != page_no); + ut_ad(!mutex_own(&kernel_mutex)); + ut_ad(mutex_own(&(rseg->mutex))); + + zip_size = rseg->zip_size; + + undo_page = trx_undo_page_get(space, zip_size, page_no, mtr); + + header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr); + + flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + + fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, + space, page_no, mtr); + + last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR + + TRX_UNDO_PAGE_LIST, mtr); + rseg->curr_size--; + + if (in_history) { + rseg_header = trx_rsegf_get(space, zip_size, + rseg->page_no, mtr); + + hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr); + ut_ad(hist_size > 0); + mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size - 1, MLOG_4BYTES, mtr); + } + + return(last_addr.page); +} + +/********************************************************************//** +Frees the last undo log page. +The caller must hold the rollback segment mutex. */ +UNIV_INTERN +void +trx_undo_free_last_page_func( +/*==========================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log memory copy */ + mtr_t* mtr) /*!< in/out: mini-transaction which does not + have a latch to any undo log page or which + has allocated the undo log page */ +{ + ut_ad(mutex_own(&trx->undo_mutex)); + ut_ad(undo->hdr_page_no != undo->last_page_no); + ut_ad(undo->size > 0); + + undo->last_page_no = trx_undo_free_page( + undo->rseg, FALSE, undo->space, + undo->hdr_page_no, undo->last_page_no, mtr); + + undo->size--; +} + +/********************************************************************//** +Empties an undo log header page of undo records for that undo log. Other +undo logs may still have records on that page, if it is an update undo log. */ +static +void +trx_undo_empty_header_page( +/*=======================*/ + ulint space, /*!< in: space */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint hdr_page_no, /*!< in: header page number */ + ulint hdr_offset, /*!< in: header offset */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* header_page; + trx_ulogf_t* log_hdr; + ulint end; + + header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr); + + log_hdr = header_page + hdr_offset; + + end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr); +} + +/***********************************************************************//** +Truncates an undo log from the end. This function is used during a rollback +to free space from an undo log. */ +UNIV_INTERN +void +trx_undo_truncate_end_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction whose undo log it is */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in: undo log */ + undo_no_t limit) /*!< in: all undo records with undo number + >= this value should be truncated */ +{ + page_t* undo_page; + ulint last_page_no; + trx_undo_rec_t* rec; + trx_undo_rec_t* trunc_here; + mtr_t mtr; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&(trx->rseg->mutex))); + + for (;;) { + mtr_start(&mtr); + + trunc_here = NULL; + + last_page_no = undo->last_page_no; + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + last_page_no, &mtr); + + rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no, + undo->hdr_offset); + while (rec) { + if (trx_undo_rec_get_undo_no(rec) >= limit) { + /* Truncate at least this record off, maybe + more */ + trunc_here = rec; + } else { + goto function_exit; + } + + rec = trx_undo_page_get_prev_rec(rec, + undo->hdr_page_no, + undo->hdr_offset); + } + + if (last_page_no == undo->hdr_page_no) { + + goto function_exit; + } + + ut_ad(last_page_no == undo->last_page_no); + trx_undo_free_last_page(trx, undo, &mtr); + + mtr_commit(&mtr); + } + +function_exit: + if (trunc_here) { + mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE, + trunc_here - undo_page, MLOG_2BYTES, &mtr); + } + + mtr_commit(&mtr); +} + +/***********************************************************************//** +Truncates an undo log from the start. This function is used during a purge +operation. */ +UNIV_INTERN +void +trx_undo_truncate_start( +/*====================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + ulint space, /*!< in: space id of the log */ + ulint hdr_page_no, /*!< in: header page number */ + ulint hdr_offset, /*!< in: header offset on the page */ + undo_no_t limit) /*!< in: all undo pages with + undo numbers < this value + should be truncated; NOTE that + the function only frees whole + pages; the header page is not + freed, but emptied, if all the + records there are < limit */ +{ + page_t* undo_page; + trx_undo_rec_t* rec; + trx_undo_rec_t* last_rec; + ulint page_no; + mtr_t mtr; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (!limit) { + + return; + } +loop: + mtr_start(&mtr); + + rec = trx_undo_get_first_rec(space, rseg->zip_size, + hdr_page_no, hdr_offset, + RW_X_LATCH, &mtr); + if (rec == NULL) { + /* Already empty */ + + mtr_commit(&mtr); + + return; + } + + undo_page = page_align(rec); + + last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no, + hdr_offset); + if (trx_undo_rec_get_undo_no(last_rec) >= limit) { + + mtr_commit(&mtr); + + return; + } + + page_no = page_get_page_no(undo_page); + + if (page_no == hdr_page_no) { + trx_undo_empty_header_page(space, rseg->zip_size, + hdr_page_no, hdr_offset, + &mtr); + } else { + trx_undo_free_page(rseg, TRUE, space, hdr_page_no, + page_no, &mtr); + } + + mtr_commit(&mtr); + + goto loop; +} + +/**********************************************************************//** +Frees an undo log segment which is not in the history list. */ +static +void +trx_undo_seg_free( +/*==============*/ + trx_undo_t* undo) /*!< in: undo log */ +{ + trx_rseg_t* rseg; + fseg_header_t* file_seg; + trx_rsegf_t* rseg_header; + trx_usegf_t* seg_header; + ibool finished; + mtr_t mtr; + + rseg = undo->rseg; + + do { + + mtr_start(&mtr); + + ut_ad(!mutex_own(&kernel_mutex)); + + mutex_enter(&(rseg->mutex)); + + seg_header = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, + &mtr) + TRX_UNDO_SEG_HDR; + + file_seg = seg_header + TRX_UNDO_FSEG_HEADER; + + finished = fseg_free_step(file_seg, &mtr); + + if (finished) { + /* Update the rseg header */ + rseg_header = trx_rsegf_get( + rseg->space, rseg->zip_size, rseg->page_no, + &mtr); + trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, + &mtr); + } + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + } while (!finished); +} + +/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/ + +/********************************************************************//** +Creates and initializes an undo log memory object according to the values +in the header in file, when the database is started. The memory object is +inserted in the appropriate list of rseg. +@return own: the undo log memory object */ +static +trx_undo_t* +trx_undo_mem_create_at_db_start( +/*============================*/ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint id, /*!< in: slot index within rseg */ + ulint page_no,/*!< in: undo log segment page number */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* undo_page; + trx_upagef_t* page_header; + trx_usegf_t* seg_header; + trx_ulogf_t* undo_header; + trx_undo_t* undo; + ulint type; + ulint state; + trx_id_t trx_id; + ulint offset; + fil_addr_t last_addr; + page_t* last_page; + trx_undo_rec_t* rec; + XID xid; + ibool xid_exists = FALSE; + + if (id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) id); + ut_error; + } + + undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, + page_no, mtr); + + page_header = undo_page + TRX_UNDO_PAGE_HDR; + + type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES, + mtr); + seg_header = undo_page + TRX_UNDO_SEG_HDR; + + state = mach_read_from_2(seg_header + TRX_UNDO_STATE); + + offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG); + + undo_header = undo_page + offset; + + trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID); + + xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS, + MLOG_1BYTE, mtr); + + /* Read X/Open XA transaction identification if it exists, or + set it to NULL. */ + + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; + + if (xid_exists == TRUE) { + trx_undo_read_xid(undo_header, &xid); + } + + mutex_enter(&(rseg->mutex)); + + undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid, + page_no, offset); + mutex_exit(&(rseg->mutex)); + + undo->dict_operation = mtr_read_ulint( + undo_header + TRX_UNDO_DICT_TRANS, MLOG_1BYTE, mtr); + + undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID); + undo->state = state; + undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr); + + /* If the log segment is being freed, the page list is inconsistent! */ + if (state == TRX_UNDO_TO_FREE) { + + goto add_to_list; + } + + last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr); + + undo->last_page_no = last_addr.page; + undo->top_page_no = last_addr.page; + + last_page = trx_undo_page_get(rseg->space, rseg->zip_size, + undo->last_page_no, mtr); + + rec = trx_undo_page_get_last_rec(last_page, page_no, offset); + + if (rec == NULL) { + undo->empty = TRUE; + } else { + undo->empty = FALSE; + undo->top_offset = rec - last_page; + undo->top_undo_no = trx_undo_rec_get_undo_no(rec); + } +add_to_list: + if (type == TRX_UNDO_INSERT) { + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list, + undo); + } else { + UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached, + undo); + } + } else { + ut_ad(type == TRX_UNDO_UPDATE); + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list, + undo); + } else { + UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached, + undo); + } + } + + return(undo); +} + +/********************************************************************//** +Initializes the undo log lists for a rollback segment memory copy. This +function is only called when the database is started or a new rollback +segment is created. +@return the combined size of undo log segments in pages */ +UNIV_INTERN +ulint +trx_undo_lists_init( +/*================*/ + trx_rseg_t* rseg) /*!< in: rollback segment memory object */ +{ + ulint page_no; + trx_undo_t* undo; + ulint size = 0; + trx_rsegf_t* rseg_header; + ulint i; + mtr_t mtr; + + UT_LIST_INIT(rseg->update_undo_list); + UT_LIST_INIT(rseg->update_undo_cached); + UT_LIST_INIT(rseg->insert_undo_list); + UT_LIST_INIT(rseg->insert_undo_cached); + + mtr_start(&mtr); + + rseg_header = trx_rsegf_get_new(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr); + + /* In forced recovery: try to avoid operations which look + at database pages; undo logs are rapidly changing data, and + the probability that they are in an inconsistent state is + high */ + + if (page_no != FIL_NULL + && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + + undo = trx_undo_mem_create_at_db_start(rseg, i, + page_no, &mtr); + size += undo->size; + + mtr_commit(&mtr); + + mtr_start(&mtr); + + rseg_header = trx_rsegf_get( + rseg->space, rseg->zip_size, rseg->page_no, + &mtr); + } + } + + mtr_commit(&mtr); + + return(size); +} + +/********************************************************************//** +Creates and initializes an undo log memory object. +@return own: the undo log memory object */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint id, /*!< in: slot index within rseg */ + ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open transaction identification */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header byte offset on page */ +{ + trx_undo_t* undo; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) id); + ut_error; + } + + undo = mem_alloc(sizeof(trx_undo_t)); + + if (undo == NULL) { + + return NULL; + } + + undo->id = id; + undo->type = type; + undo->state = TRX_UNDO_ACTIVE; + undo->del_marks = FALSE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->rseg = rseg; + + undo->space = rseg->space; + undo->zip_size = rseg->zip_size; + undo->hdr_page_no = page_no; + undo->hdr_offset = offset; + undo->last_page_no = page_no; + undo->size = 1; + + undo->empty = TRUE; + undo->top_page_no = page_no; + undo->guess_block = NULL; + + return(undo); +} + +/********************************************************************//** +Initializes a cached undo log object for new use. */ +static +void +trx_undo_mem_init_for_reuse( +/*========================*/ + trx_undo_t* undo, /*!< in: undo log to init */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open XA transaction identification*/ + ulint offset) /*!< in: undo log header byte offset on page */ +{ + ut_ad(mutex_own(&((undo->rseg)->mutex))); + + if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + + mem_analyze_corruption(undo); + ut_error; + } + + undo->state = TRX_UNDO_ACTIVE; + undo->del_marks = FALSE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->hdr_offset = offset; + undo->empty = TRUE; +} + +/********************************************************************//** +Frees an undo log memory copy. */ +UNIV_INTERN +void +trx_undo_mem_free( +/*==============*/ + trx_undo_t* undo) /*!< in: the undo object to be freed */ +{ + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); + ut_error; + } + + mem_free(undo); +} + +/**********************************************************************//** +Creates a new undo log. +@return DB_SUCCESS if successful in creating the new undo lob object, +possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS +DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY */ +static +ulint +trx_undo_create( +/*============*/ + trx_t* trx, /*!< in: transaction */ + trx_rseg_t* rseg, /*!< in: rollback segment memory copy */ + ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open transaction identification*/ + trx_undo_t** undo, /*!< out: the new undo log object, undefined + * if did not succeed */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_rsegf_t* rseg_header; + ulint page_no; + ulint offset; + ulint id; + page_t* undo_page; + ulint err; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (rseg->curr_size == rseg->max_size) { + + return(DB_OUT_OF_FILE_SPACE); + } + + rseg->curr_size++; + + rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, rseg->page_no, + mtr); + + err = trx_undo_seg_create(rseg, rseg_header, type, &id, + &undo_page, mtr); + + if (err != DB_SUCCESS) { + /* Did not succeed */ + + rseg->curr_size--; + + return(err); + } + + page_no = page_get_page_no(undo_page); + + offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } + + *undo = trx_undo_mem_create(rseg, id, type, trx_id, xid, + page_no, offset); + if (*undo == NULL) { + + err = DB_OUT_OF_MEMORY; + } + + return(err); +} + +/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/ + +/********************************************************************//** +Reuses a cached undo log. +@return the undo log memory object, NULL if none cached */ +static +trx_undo_t* +trx_undo_reuse_cached( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is used */ + const XID* xid, /*!< in: X/Open XA transaction identification */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_undo_t* undo; + page_t* undo_page; + ulint offset; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (type == TRX_UNDO_INSERT) { + + undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); + if (undo == NULL) { + + return(NULL); + } + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo); + } else { + ut_ad(type == TRX_UNDO_UPDATE); + + undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + if (undo == NULL) { + + return(NULL); + } + + UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo); + } + + ut_ad(undo->size == 1); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + if (type == TRX_UNDO_INSERT) { + offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid( + undo_page, undo_page + offset, mtr); + } + } else { + ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_UPDATE); + + offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid( + undo_page, undo_page + offset, mtr); + } + } + + trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset); + + return(undo); +} + +/**********************************************************************//** +Marks an undo log header as a header of a data dictionary operation +transaction. */ +static +void +trx_undo_mark_as_dict_operation( +/*============================*/ + trx_t* trx, /*!< in: dict op transaction */ + trx_undo_t* undo, /*!< in: assigned undo log */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* hdr_page; + + hdr_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + ut_error; + case TRX_DICT_OP_INDEX: + /* Do not discard the table on recovery. */ + undo->table_id = 0; + break; + case TRX_DICT_OP_TABLE: + undo->table_id = trx->table_id; + break; + } + + mlog_write_ulint(hdr_page + undo->hdr_offset + + TRX_UNDO_DICT_TRANS, + TRUE, MLOG_1BYTE, mtr); + + mlog_write_ull(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID, + undo->table_id, mtr); + + undo->dict_operation = TRUE; +} + +/**********************************************************************//** +Assigns an undo log for a transaction. A new undo log is created or a cached +undo log reused. +@return DB_SUCCESS if undo log assign successful, possible error codes +are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE +DB_OUT_OF_MEMORY */ +UNIV_INTERN +ulint +trx_undo_assign_undo( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + mtr_t mtr; + ulint err = DB_SUCCESS; + + ut_ad(trx); + ut_ad(trx->rseg); + + rseg = trx->rseg; + + ut_ad(mutex_own(&(trx->undo_mutex))); + + mtr_start(&mtr); + + ut_ad(!mutex_own(&kernel_mutex)); + + mutex_enter(&(rseg->mutex)); + + undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid, + &mtr); + if (undo == NULL) { + err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid, + &undo, &mtr); + if (err != DB_SUCCESS) { + + goto func_exit; + } + } + + if (type == TRX_UNDO_INSERT) { + UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list, undo); + ut_ad(trx->insert_undo == NULL); + trx->insert_undo = undo; + } else { + UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list, undo); + ut_ad(trx->update_undo == NULL); + trx->update_undo = undo; + } + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + trx_undo_mark_as_dict_operation(trx, undo, &mtr); + } + +func_exit: + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return err; +} + +/******************************************************************//** +Sets the state of the undo log segment at a transaction finish. +@return undo log segment header page, x-latched */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_finish( +/*=========================*/ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + page_t* undo_page; + ulint state; + + ut_ad(undo); + ut_ad(mtr); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + if (undo->size == 1 + && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE) + < TRX_UNDO_PAGE_REUSE_LIMIT) { + + state = TRX_UNDO_CACHED; + + } else if (undo->type == TRX_UNDO_INSERT) { + + state = TRX_UNDO_TO_FREE; + } else { + state = TRX_UNDO_TO_PURGE; + } + + undo->state = state; + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr); + + return(undo_page); +} + +/******************************************************************//** +Sets the state of the undo log segment at a transaction prepare. +@return undo log segment header page, x-latched */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_ulogf_t* undo_header; + page_t* undo_page; + ulint offset; + + ut_ad(trx && undo && mtr); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + /*------------------------------*/ + undo->state = TRX_UNDO_PREPARED; + undo->xid = trx->xid; + /*------------------------------*/ + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state, + MLOG_2BYTES, mtr); + + offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + undo_header = undo_page + offset; + + mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS, + TRUE, MLOG_1BYTE, mtr); + + trx_undo_write_xid(undo_header, &undo->xid, mtr); + + return(undo_page); +} + +/**********************************************************************//** +Adds the update undo log header as the first in the history list, and +frees the memory object, or puts it to the list of cached update undo log +segments. */ +UNIV_INTERN +void +trx_undo_update_cleanup( +/*====================*/ + trx_t* trx, /*!< in: trx owning the update undo log */ + page_t* undo_page, /*!< in: update undo log header page, + x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + + undo = trx->update_undo; + rseg = trx->rseg; + + ut_ad(mutex_own(&(rseg->mutex))); + + trx_purge_add_update_undo_to_history(trx, undo_page, mtr); + + UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo); + + trx->update_undo = NULL; + + if (undo->state == TRX_UNDO_CACHED) { + + UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo); + } else { + ut_ad(undo->state == TRX_UNDO_TO_PURGE + || undo->state == TRX_UNDO_TO_FREE); + + trx_undo_mem_free(undo); + } +} + +/******************************************************************//** +Frees or caches an insert undo log after a transaction commit or rollback. +Knowledge of inserts is not needed after a commit or rollback, therefore +the data can be discarded. */ +UNIV_INTERN +void +trx_undo_insert_cleanup( +/*====================*/ + trx_t* trx) /*!< in: transaction handle */ +{ + trx_undo_t* undo; + trx_rseg_t* rseg; + + undo = trx->insert_undo; + ut_ad(undo); + + rseg = trx->rseg; + + mutex_enter(&(rseg->mutex)); + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, undo); + trx->insert_undo = NULL; + + if (undo->state == TRX_UNDO_CACHED) { + + UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo); + } else { + ut_ad(undo->state == TRX_UNDO_TO_FREE); + + /* Delete first the undo log segment in the file */ + + mutex_exit(&(rseg->mutex)); + + trx_undo_seg_free(undo); + + mutex_enter(&(rseg->mutex)); + + ut_ad(rseg->curr_size > undo->size); + + rseg->curr_size -= undo->size; + + trx_undo_mem_free(undo); + } + + mutex_exit(&(rseg->mutex)); +} + +/********************************************************************//** +At shutdown, frees the undo logs of a PREPARED transaction. */ +UNIV_INTERN +void +trx_undo_free_prepared( +/*===================*/ + trx_t* trx) /*!< in/out: PREPARED transaction */ +{ + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + + if (trx->update_undo) { + ut_a(trx->update_undo->state == TRX_UNDO_PREPARED); + UT_LIST_REMOVE(undo_list, trx->rseg->update_undo_list, + trx->update_undo); + trx_undo_mem_free(trx->update_undo); + } + if (trx->insert_undo) { + ut_a(trx->insert_undo->state == TRX_UNDO_PREPARED); + UT_LIST_REMOVE(undo_list, trx->rseg->insert_undo_list, + trx->insert_undo); + trx_undo_mem_free(trx->insert_undo); + } +} +#endif /* !UNIV_HOTBACKUP */ |