diff options
author | Jan Lindström <jplindst@mariadb.org> | 2014-01-25 11:02:49 +0200 |
---|---|---|
committer | Jan Lindström <jplindst@mariadb.org> | 2014-01-25 11:02:49 +0200 |
commit | d43afb8828e358f9c3bb690d0fdcd88b0637f155 (patch) | |
tree | f977f3b5fa3c938183510750aecbea31bccc67ef /storage/xtradb/trx | |
parent | d0f77b83611077344ff29db02ea5593c9da62537 (diff) | |
parent | 02765f4c614069ece1f30976848b6299ba6f24bd (diff) | |
download | mariadb-git-d43afb8828e358f9c3bb690d0fdcd88b0637f155.tar.gz |
Merge MariaDB-10.0.7 revision 3961.
Diffstat (limited to 'storage/xtradb/trx')
-rw-r--r-- | storage/xtradb/trx/trx0i_s.cc (renamed from storage/xtradb/trx/trx0i_s.c) | 181 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0purge.cc (renamed from storage/xtradb/trx/trx0purge.c) | 1058 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0rec.cc (renamed from storage/xtradb/trx/trx0rec.c) | 344 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0roll.cc (renamed from storage/xtradb/trx/trx0roll.c) | 1013 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0rseg.cc (renamed from storage/xtradb/trx/trx0rseg.c) | 205 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0sys.c | 2136 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0sys.cc | 1414 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0trx.c | 2482 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0trx.cc | 2543 | ||||
-rw-r--r-- | storage/xtradb/trx/trx0undo.cc (renamed from storage/xtradb/trx/trx0undo.c) | 108 |
10 files changed, 5561 insertions, 5923 deletions
diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.cc index 8b3a83585cc..f5d4a6c862f 100644 --- a/storage/xtradb/trx/trx0i_s.c +++ b/storage/xtradb/trx/trx0i_s.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA *****************************************************************************/ /**************************************************//** -@file trx/trx0i_s.c +@file trx/trx0i_s.cc INFORMATION SCHEMA innodb_trx, innodb_locks and innodb_lock_waits tables fetch code. @@ -131,31 +131,31 @@ noop because it will be empty. */ /** Memory for each table in the intermediate buffer is allocated in separate chunks. These chunks are considered to be concatenated to represent one flat array of rows. */ -typedef struct i_s_mem_chunk_struct { +struct i_s_mem_chunk_t { ulint offset; /*!< offset, in number of rows */ ulint rows_allocd; /*!< the size of this chunk, in number of rows */ void* base; /*!< start of the chunk */ -} i_s_mem_chunk_t; +}; /** This represents one table's cache. */ -typedef struct i_s_table_cache_struct { +struct i_s_table_cache_t { ulint rows_used; /*!< number of used rows */ ulint rows_allocd; /*!< number of allocated rows */ ulint row_size; /*!< size of a single row */ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of memory chunks that stores the rows */ -} i_s_table_cache_t; +}; /** This structure describes the intermediate buffer */ -struct trx_i_s_cache_struct { +struct trx_i_s_cache_t { rw_lock_t rw_lock; /*!< read-write lock protecting the rest of this structure */ ullint last_read; /*!< last time the cache was read; measured in microseconds since epoch */ - mutex_t last_read_mutex;/*!< mutex protecting the + ib_mutex_t last_read_mutex;/*!< mutex protecting the last_read member - it is updated inside a shared lock of the rw_lock member */ @@ -172,9 +172,9 @@ struct trx_i_s_cache_struct { /** Number of hash cells in the cache storage */ #define CACHE_STORAGE_HASH_CELLS 2048 ha_storage_t* storage; /*!< storage for external volatile - data that can possibly not be - available later, when we release - the kernel mutex */ + data that may become unavailable + when we release + lock_sys->mutex or trx_sys->mutex */ ulint mem_allocd; /*!< the amount of memory allocated with mem_alloc*() */ ibool is_truncated; /*!< this is TRUE if the memory @@ -476,7 +476,7 @@ fill_trx_row( size_t stmt_len; const char* s; - ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_mutex_own()); row->trx_id = trx->id; row->trx_started = (ib_time_t) trx->start_time; @@ -485,9 +485,10 @@ fill_trx_row( ut_ad(requested_lock_row == NULL || i_s_locks_row_validate(requested_lock_row)); - if (trx->wait_lock != NULL) { + if (trx->lock.wait_lock != NULL) { + ut_a(requested_lock_row != NULL); - row->trx_wait_started = (ib_time_t) trx->wait_started; + row->trx_wait_started = (ib_time_t) trx->lock.wait_started; } else { ut_a(requested_lock_row == NULL); row->trx_wait_started = 0; @@ -505,6 +506,7 @@ fill_trx_row( } row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd); + stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len); if (stmt != NULL) { @@ -517,9 +519,10 @@ fill_trx_row( memcpy(query, stmt, stmt_len); query[stmt_len] = '\0'; - row->trx_query = ha_storage_put_memlim( - cache->storage, query, stmt_len + 1, - MAX_ALLOWED_FOR_STORAGE(cache)); + row->trx_query = static_cast<const char*>( + ha_storage_put_memlim( + cache->storage, query, stmt_len + 1, + MAX_ALLOWED_FOR_STORAGE(cache))); row->trx_query_cs = innobase_get_charset(trx->mysql_thd); @@ -553,11 +556,15 @@ thd_done: row->trx_tables_locked = trx->mysql_n_tables_locked; - row->trx_lock_structs = UT_LIST_GET_LEN(trx->trx_locks); + /* These are protected by both trx->mutex or lock_sys->mutex, + or just lock_sys->mutex. For reading, it suffices to hold + lock_sys->mutex. */ + + row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks); - row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock_heap); + row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap); - row->trx_rows_locked = lock_number_of_rows_locked(trx); + row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock); row->trx_rows_modified = trx->undo_no; @@ -605,6 +612,10 @@ thd_done: row->trx_search_latch_timeout = trx->search_latch_timeout; + row->trx_is_read_only = trx->read_only; + + row->trx_is_autocommit_non_locking = trx_is_autocommit_non_locking(trx); + return(TRUE); } @@ -1132,25 +1143,25 @@ add_trx_relevant_locks_to_cache( requested lock row, or NULL or undefined */ { - ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_mutex_own()); /* If transaction is waiting we add the wait lock and all locks from another transactions that are blocking the wait lock. */ - if (trx->que_state == TRX_QUE_LOCK_WAIT) { + if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) { const lock_t* curr_lock; ulint wait_lock_heap_no; i_s_locks_row_t* blocking_lock_row; lock_queue_iterator_t iter; - ut_a(trx->wait_lock != NULL); + ut_a(trx->lock.wait_lock != NULL); wait_lock_heap_no - = wait_lock_get_heap_no(trx->wait_lock); + = wait_lock_get_heap_no(trx->lock.wait_lock); /* add the requested lock */ *requested_lock_row - = add_lock_to_cache(cache, trx->wait_lock, + = add_lock_to_cache(cache, trx->lock.wait_lock, wait_lock_heap_no); /* memory could not be allocated */ @@ -1162,17 +1173,18 @@ add_trx_relevant_locks_to_cache( /* then iterate over the locks before the wait lock and add the ones that are blocking it */ - lock_queue_iterator_reset(&iter, trx->wait_lock, + lock_queue_iterator_reset(&iter, trx->lock.wait_lock, ULINT_UNDEFINED); - curr_lock = lock_queue_iterator_get_prev(&iter); - while (curr_lock != NULL) { + for (curr_lock = lock_queue_iterator_get_prev(&iter); + curr_lock != NULL; + curr_lock = lock_queue_iterator_get_prev(&iter)) { - if (lock_has_to_wait(trx->wait_lock, + if (lock_has_to_wait(trx->lock.wait_lock, curr_lock)) { /* add the lock that is - blocking trx->wait_lock */ + blocking trx->lock.wait_lock */ blocking_lock_row = add_lock_to_cache( cache, curr_lock, @@ -1197,8 +1209,6 @@ add_trx_relevant_locks_to_cache( return(FALSE); } } - - curr_lock = lock_queue_iterator_get_prev(&iter); } } else { @@ -1268,26 +1278,49 @@ Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the table cache buffer. Cache must be locked for write. */ static void -fetch_data_into_cache( -/*==================*/ - trx_i_s_cache_t* cache) /*!< in/out: cache */ +fetch_data_into_cache_low( +/*======================*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + ibool only_ac_nl, /*!< in: only select non-locking + autocommit transactions */ + trx_list_t* trx_list) /*!< in: trx list */ { - trx_t* trx; - i_s_trx_row_t* trx_row; - i_s_locks_row_t* requested_lock_row; + const trx_t* trx; - ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx_list == &trx_sys->rw_trx_list + || trx_list == &trx_sys->ro_trx_list + || trx_list == &trx_sys->mysql_trx_list); - trx_i_s_cache_clear(cache); + ut_ad(only_ac_nl == (trx_list == &trx_sys->mysql_trx_list)); - /* We iterate over the list of all transactions and add each one + /* Iterate over the transaction list and add each one to innodb_trx's cache. We also add all locks that are relevant to each transaction into innodb_locks' and innodb_lock_waits' caches. */ - for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + for (trx = UT_LIST_GET_FIRST(*trx_list); trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { + trx = + (trx_list == &trx_sys->mysql_trx_list + ? UT_LIST_GET_NEXT(mysql_trx_list, trx) + : UT_LIST_GET_NEXT(trx_list, trx))) { + + i_s_trx_row_t* trx_row; + i_s_locks_row_t* requested_lock_row; + + if (trx->state == TRX_STATE_NOT_STARTED + || (only_ac_nl && !trx_is_autocommit_non_locking(trx))) { + + continue; + } + + assert_trx_nonlocking_or_in_list(trx); + + ut_ad(trx->in_ro_trx_list + == (trx_list == &trx_sys->ro_trx_list)); + + ut_ad(trx->in_rw_trx_list + == (trx_list == &trx_sys->rw_trx_list)); if (!add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row)) { @@ -1315,6 +1348,28 @@ fetch_data_into_cache( return; } } +} + +/*******************************************************************//** +Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the +table cache buffer. Cache must be locked for write. */ +static +void +fetch_data_into_cache( +/*==================*/ + trx_i_s_cache_t* cache) /*!< in/out: cache */ +{ + ut_ad(lock_mutex_own()); + ut_ad(mutex_own(&trx_sys->mutex)); + + trx_i_s_cache_clear(cache); + + fetch_data_into_cache_low(cache, FALSE, &trx_sys->rw_trx_list); + fetch_data_into_cache_low(cache, FALSE, &trx_sys->ro_trx_list); + + /* Only select autocommit non-locking selects because they can + only be on the MySQL transaction list (TRUE). */ + fetch_data_into_cache_low(cache, TRUE, &trx_sys->mysql_trx_list); cache->is_truncated = FALSE; } @@ -1335,11 +1390,16 @@ trx_i_s_possibly_fetch_data_into_cache( } /* We need to read trx_sys and record/table lock queues */ - mutex_enter(&kernel_mutex); + + lock_mutex_enter(); + + mutex_enter(&trx_sys->mutex); fetch_data_into_cache(cache); - mutex_exit(&kernel_mutex); + mutex_exit(&trx_sys->mutex); + + lock_mutex_exit(); return(0); } @@ -1367,8 +1427,8 @@ trx_i_s_cache_init( { /* The latching is done in the following order: acquire trx_i_s_cache_t::rw_lock, X - acquire kernel_mutex - release kernel_mutex + acquire lock mutex + release lock mutex release trx_i_s_cache_t::rw_lock acquire trx_i_s_cache_t::rw_lock, S acquire trx_i_s_cache_t::last_read_mutex @@ -1593,7 +1653,7 @@ trx_i_s_create_lock_id( } else { /* table lock */ res_len = ut_snprintf(lock_id, lock_id_size, - TRX_ID_FMT ":%llu", + TRX_ID_FMT":"UINT64PF, row->lock_trx_id, row->lock_table_id); } @@ -1605,3 +1665,24 @@ trx_i_s_create_lock_id( return(lock_id); } + +UNIV_INTERN +void +trx_i_s_get_lock_sys_memory_usage(ulint *constant, ulint *variable) +{ + trx_t* trx; + + *constant = lock_sys->rec_hash->n_cells * sizeof(hash_cell_t); + *variable = 0; + + if (trx_sys) { + mutex_enter(&trx_sys->mutex); + trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); + while (trx) { + *variable += ((trx->lock.lock_heap) ? mem_heap_get_size(trx->lock.lock_heap) : 0); + trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); + } + mutex_exit(&trx_sys->mutex); + } + +} diff --git a/storage/xtradb/trx/trx0purge.c b/storage/xtradb/trx/trx0purge.cc index d343a73c9d8..3dfcf23c3f5 100644 --- a/storage/xtradb/trx/trx0purge.c +++ b/storage/xtradb/trx/trx0purge.cc @@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA *****************************************************************************/ /**************************************************//** -@file trx/trx0purge.c +@file trx/trx0purge.cc Purge old versions Created 3/26/1996 Heikki Tuuri @@ -31,7 +31,6 @@ Created 3/26/1996 Heikki Tuuri #include "fsp0fsp.h" #include "mach0data.h" -#include "mtr0log.h" #include "trx0rseg.h" #include "trx0trx.h" #include "trx0roll.h" @@ -42,7 +41,16 @@ Created 3/26/1996 Heikki Tuuri #include "row0upd.h" #include "trx0rec.h" #include "srv0srv.h" +#include "srv0start.h" #include "os0thread.h" +#include "srv0mon.h" +#include "mtr0log.h" + +/** Maximum allowable purge history length. <=0 means 'infinite'. */ +UNIV_INTERN ulong srv_max_purge_lag = 0; + +/** Max DML user threads delay in micro-seconds. */ +UNIV_INTERN ulong srv_max_purge_lag_delay = 0; /** The global data structure coordinating a purge */ UNIV_INTERN trx_purge_t* purge_sys = NULL; @@ -65,155 +73,33 @@ UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key; UNIV_INTERN my_bool srv_purge_view_update_only_debug; #endif /* UNIV_DEBUG */ -/*****************************************************************//** -Checks if trx_id is >= purge_view: then it is guaranteed that its update -undo log still exists in the system. -@return TRUE if is sure that it is preserved, also if the function -returns FALSE, it is possible that the undo log still exists in the -system */ -UNIV_INTERN -ibool -trx_purge_update_undo_must_exist( -/*=============================*/ - trx_id_t trx_id) /*!< in: transaction id */ -{ -#ifdef UNIV_SYNC_DEBUG - ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); -#endif /* UNIV_SYNC_DEBUG */ - - if (!read_view_sees_trx_id(purge_sys->view, trx_id)) { - - return(TRUE); - } - - return(FALSE); -} - -/*=================== PURGE RECORD ARRAY =============================*/ - -/*******************************************************************//** -Stores info of an undo log record during a purge. -@return pointer to the storage cell */ -static -trx_undo_inf_t* -trx_purge_arr_store_info( -/*=====================*/ - trx_id_t trx_no, /*!< in: transaction number */ - undo_no_t undo_no)/*!< in: undo number */ -{ - trx_undo_inf_t* cell; - trx_undo_arr_t* arr; - ulint i; - - arr = purge_sys->arr; - - for (i = 0;; i++) { - cell = trx_undo_arr_get_nth_info(arr, i); - - if (!(cell->in_use)) { - /* Not in use, we may store here */ - cell->undo_no = undo_no; - cell->trx_no = trx_no; - cell->in_use = TRUE; - - arr->n_used++; - - return(cell); - } - } -} - -/*******************************************************************//** -Removes info of an undo log record during a purge. */ -UNIV_INLINE -void -trx_purge_arr_remove_info( -/*======================*/ - trx_undo_inf_t* cell) /*!< in: pointer to the storage cell */ -{ - trx_undo_arr_t* arr; - - arr = purge_sys->arr; - - cell->in_use = FALSE; - - ut_ad(arr->n_used > 0); - - arr->n_used--; -} - -/*******************************************************************//** -Gets the biggest pair of a trx number and an undo number in a purge array. */ -static -void -trx_purge_arr_get_biggest( -/*======================*/ - trx_undo_arr_t* arr, /*!< in: purge array */ - trx_id_t* trx_no, /*!< out: transaction number: 0 - if array is empty */ - undo_no_t* undo_no)/*!< out: undo number */ -{ - trx_undo_inf_t* cell; - trx_id_t pair_trx_no; - undo_no_t pair_undo_no; - ulint i; - ulint n; - - n = arr->n_used; - pair_trx_no = 0; - pair_undo_no = 0; - - if (n) { - for (i = 0;; i++) { - cell = trx_undo_arr_get_nth_info(arr, i); - - if (!cell->in_use) { - continue; - } - - if ((cell->trx_no > pair_trx_no) - || ((cell->trx_no == pair_trx_no) - && cell->undo_no >= pair_undo_no)) { - - pair_trx_no = cell->trx_no; - pair_undo_no = cell->undo_no; - } - - if (!--n) { - break; - } - } - } - - *trx_no = pair_trx_no; - *undo_no = pair_undo_no; -} - /****************************************************************//** Builds a purge 'query' graph. The actual purge is performed by executing this query graph. @return own: the query graph */ static que_t* -trx_purge_graph_build(void) -/*=======================*/ +trx_purge_graph_build( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + ulint n_purge_threads) /*!< in: number of purge + threads */ { + ulint i; mem_heap_t* heap; que_fork_t* fork; - que_thr_t* thr; - /* que_thr_t* thr2; */ heap = mem_heap_create(512); fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap); - fork->trx = purge_sys->trx; - - thr = que_thr_create(fork, heap); + fork->trx = trx; - thr->child = row_purge_node_create(thr, heap); + for (i = 0; i < n_purge_threads; ++i) { + que_thr_t* thr; - /* thr2 = que_thr_create(fork, fork, heap); + thr = que_thr_create(fork, heap); - thr2->child = row_purge_node_create(fork, thr2, heap); */ + thr->child = row_purge_node_create(thr, heap); + } return(fork); } @@ -225,22 +111,18 @@ UNIV_INTERN void trx_purge_sys_create( /*=================*/ - ib_bh_t* ib_bh) /*!< in, own: UNDO log min binary heap */ + ulint n_purge_threads, /*!< in: number of purge + threads */ + ib_bh_t* ib_bh) /*!< in, own: UNDO log min + binary heap */ { - ut_ad(mutex_own(&kernel_mutex)); + purge_sys = static_cast<trx_purge_t*>(mem_zalloc(sizeof(*purge_sys))); - purge_sys = mem_zalloc(sizeof(trx_purge_t)); + purge_sys->state = PURGE_STATE_INIT; + purge_sys->event = os_event_create(); /* Take ownership of ib_bh, we are responsible for freeing it. */ purge_sys->ib_bh = ib_bh; - purge_sys->state = TRX_STOP_PURGE; - - purge_sys->n_pages_handled = 0; - - purge_sys->purge_trx_no = 0; - purge_sys->purge_undo_no = 0; - purge_sys->next_stored = FALSE; - ut_d(purge_sys->done_trx_no = 0); rw_lock_create(trx_purge_latch_key, &purge_sys->latch, SYNC_PURGE_LATCH); @@ -251,21 +133,27 @@ trx_purge_sys_create( purge_sys->heap = mem_heap_create(256); - purge_sys->arr = trx_undo_arr_create(); + ut_a(n_purge_threads > 0); purge_sys->sess = sess_open(); purge_sys->trx = purge_sys->sess->trx; - purge_sys->trx->is_purge = 1; + ut_a(purge_sys->trx->sess == purge_sys->sess); - ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED)); + /* A purge transaction is not a real transaction, we use a transaction + here only because the query threads code requires it. It is otherwise + quite unnecessary. We should get rid of it eventually. */ + purge_sys->trx->id = 0; + purge_sys->trx->start_time = ut_time(); + purge_sys->trx->state = TRX_STATE_ACTIVE; + purge_sys->trx->op_info = "purge trx"; - purge_sys->query = trx_purge_graph_build(); + purge_sys->query = trx_purge_graph_build( + purge_sys->trx, n_purge_threads); - purge_sys->prebuilt_view = - read_view_oldest_copy_or_open_new(0, NULL); - purge_sys->view = purge_sys->prebuilt_view; + purge_sys->view = read_view_purge_open(purge_sys->prebuilt_clone, + purge_sys->prebuilt_view); } /************************************************************************ @@ -275,34 +163,21 @@ void trx_purge_sys_close(void) /*======================*/ { - ut_ad(!mutex_own(&kernel_mutex)); - que_graph_free(purge_sys->query); - ut_a(purge_sys->sess->trx->is_purge); - purge_sys->sess->trx->state = TRX_NOT_STARTED; + ut_a(purge_sys->trx->id == 0); + ut_a(purge_sys->sess->trx == purge_sys->trx); - mutex_enter(&kernel_mutex); - trx_release_descriptor(purge_sys->sess->trx); - mutex_exit(&kernel_mutex); + purge_sys->trx->state = TRX_STATE_NOT_STARTED; sess_close(purge_sys->sess); - purge_sys->sess = NULL; - if (purge_sys->view != NULL) { - /* Because acquiring the kernel mutex is a pre-condition - of read_view_close(). We don't really need it here. */ - mutex_enter(&kernel_mutex); + purge_sys->sess = NULL; - read_view_close(purge_sys->view); - read_view_free(purge_sys->prebuilt_view); - purge_sys->prebuilt_view = NULL; - purge_sys->view = NULL; + read_view_free(purge_sys->prebuilt_view); + read_view_free(purge_sys->prebuilt_clone); - mutex_exit(&kernel_mutex); - } - - trx_undo_arr_free(purge_sys->arr); + purge_sys->view = NULL; rw_lock_free(&purge_sys->latch); mutex_free(&purge_sys->bh_mutex); @@ -311,6 +186,10 @@ trx_purge_sys_close(void) ib_bh_free(purge_sys->ib_bh); + os_event_free(purge_sys->event); + + purge_sys->event = NULL; + mem_free(purge_sys); purge_sys = NULL; @@ -331,21 +210,18 @@ trx_purge_add_update_undo_to_history( mtr_t* mtr) /*!< in: mtr */ { trx_undo_t* undo; + trx_rseg_t* rseg; trx_rsegf_t* rseg_header; trx_ulogf_t* undo_header; undo = trx->update_undo; - - ut_ad(undo); - - ut_ad(mutex_own(&undo->rseg->mutex)); + rseg = undo->rseg; rseg_header = trx_rsegf_get( undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no, mtr); undo_header = undo_page + undo->hdr_offset; - /* Add the log as the first in the history list */ if (undo->state != TRX_UNDO_CACHED) { ulint hist_size; @@ -364,6 +240,8 @@ trx_purge_add_update_undo_to_history( trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr); + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED); + hist_size = mtr_read_ulint( rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr); @@ -375,40 +253,36 @@ trx_purge_add_update_undo_to_history( hist_size + undo->size, MLOG_4BYTES, mtr); } - flst_add_first( - rseg_header + TRX_RSEG_HISTORY, - undo_header + TRX_UNDO_HISTORY_NODE, mtr); + /* Add the log as the first in the history list */ + flst_add_first(rseg_header + TRX_RSEG_HISTORY, + undo_header + TRX_UNDO_HISTORY_NODE, mtr); + +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_increment_ulint(&trx_sys->rseg_history_len, 1); +#else + mutex_enter(&trx_sys->mutex); + ++trx_sys->rseg_history_len; + mutex_exit(&trx_sys->mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ - /* Write the trx number to the undo log header */ + srv_wake_purge_thread_if_not_active(); + /* Write the trx number to the undo log header */ mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr); /* Write information about delete markings to the undo log header */ if (!undo->del_marks) { - mlog_write_ulint( - undo_header + TRX_UNDO_DEL_MARKS, FALSE, - MLOG_2BYTES, mtr); + mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, mtr); } - if (undo->rseg->last_page_no == FIL_NULL) { - undo->rseg->last_trx_no = trx->no; - undo->rseg->last_offset = undo->hdr_offset; - undo->rseg->last_page_no = undo->hdr_page_no; - undo->rseg->last_del_marks = undo->del_marks; - - /* FIXME: Add a bin heap validate function to check that - the rseg exists. */ + if (rseg->last_page_no == FIL_NULL) { + rseg->last_page_no = undo->hdr_page_no; + rseg->last_offset = undo->hdr_offset; + rseg->last_trx_no = trx->no; + rseg->last_del_marks = undo->del_marks; } - - mutex_enter(&kernel_mutex); - trx_sys->rseg_history_len++; - mutex_exit(&kernel_mutex); - -// if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/ - /* Inform the purge thread that there is work to do. */ - srv_wake_purge_thread_if_not_active(); -// } } /**********************************************************************//** @@ -424,49 +298,55 @@ trx_purge_free_segment( will cut off from the end of the history list */ { - page_t* undo_page; + mtr_t mtr; trx_rsegf_t* rseg_hdr; trx_ulogf_t* log_hdr; trx_usegf_t* seg_hdr; - ibool freed; ulint seg_size; ulint hist_size; ibool marked = FALSE; - mtr_t mtr; /* fputs("Freeing an update undo log segment\n", stderr); */ -loop: - mtr_start(&mtr); - mutex_enter(&(rseg->mutex)); + for (;;) { + page_t* undo_page; - rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, - rseg->page_no, &mtr); + mtr_start(&mtr); - undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, - hdr_addr.page, &mtr); - seg_hdr = undo_page + TRX_UNDO_SEG_HDR; - log_hdr = undo_page + hdr_addr.boffset; + mutex_enter(&rseg->mutex); - /* Mark the last undo log totally purged, so that if the system - crashes, the tail of the undo log will not get accessed again. The - list of pages in the undo log tail gets inconsistent during the - freeing of the segment, and therefore purge should not try to access - them again. */ + rseg_hdr = trx_rsegf_get( + rseg->space, rseg->zip_size, rseg->page_no, &mtr); - if (!marked) { - mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE, - MLOG_2BYTES, &mtr); - marked = TRUE; - } + undo_page = trx_undo_page_get( + rseg->space, rseg->zip_size, hdr_addr.page, &mtr); - freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER, - &mtr); - if (!freed) { - mutex_exit(&(rseg->mutex)); - mtr_commit(&mtr); + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + log_hdr = undo_page + hdr_addr.boffset; + + /* Mark the last undo log totally purged, so that if the + system crashes, the tail of the undo log will not get accessed + again. The list of pages in the undo log tail gets inconsistent + during the freeing of the segment, and therefore purge should + not try to access them again. */ - goto loop; + if (!marked) { + mlog_write_ulint( + log_hdr + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, &mtr); + + marked = TRUE; + } + + if (fseg_free_step_not_header( + seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)) { + + break; + } + + mutex_exit(&rseg->mutex); + + mtr_commit(&mtr); } /* The page list may now be inconsistent, but the length field @@ -483,22 +363,22 @@ loop: flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY, log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr); - mutex_enter(&kernel_mutex); - ut_ad(trx_sys->rseg_history_len >= n_removed_logs); +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_decrement_ulint(&trx_sys->rseg_history_len, n_removed_logs); +#else + mutex_enter(&trx_sys->mutex); trx_sys->rseg_history_len -= n_removed_logs; - mutex_exit(&kernel_mutex); + mutex_exit(&trx_sys->mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ - freed = FALSE; + do { - while (!freed) { /* Here we assume that a file segment with just the header page can be freed in a few steps, so that the buffer pool is not flooded with bufferfixed pages: see the note in - fsp0fsp.c. */ + fsp0fsp.cc. */ - freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, - &mtr); - } + } while(!fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)); hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, &mtr); @@ -522,12 +402,8 @@ static void trx_purge_truncate_rseg_history( /*============================*/ - trx_rseg_t* rseg, /*!< in: rollback segment */ - trx_id_t limit_trx_no, /*!< in: remove update undo logs whose - trx number is < limit_trx_no */ - undo_no_t limit_undo_no) /*!< in: if transaction number is equal - to limit_trx_no, truncate undo records - with undo number < limit_undo_no */ + trx_rseg_t* rseg, /*!< in: rollback segment */ + const purge_iter_t* limit) /*!< in: truncate offset */ { fil_addr_t hdr_addr; fil_addr_t prev_hdr_addr; @@ -561,20 +437,26 @@ loop: hdr_addr.page, &mtr); log_hdr = undo_page + hdr_addr.boffset; + undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); - if (undo_trx_no >= limit_trx_no) { - if (undo_trx_no == limit_trx_no) { - trx_undo_truncate_start(rseg, rseg->space, - hdr_addr.page, - hdr_addr.boffset, - limit_undo_no); + if (undo_trx_no >= limit->trx_no) { + + if (undo_trx_no == limit->trx_no) { + + trx_undo_truncate_start( + rseg, rseg->space, hdr_addr.page, + hdr_addr.boffset, limit->undo_no); } - mutex_enter(&kernel_mutex); - ut_a(trx_sys->rseg_history_len >= n_removed_logs); +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_decrement_ulint( + &trx_sys->rseg_history_len, n_removed_logs); +#else + mutex_enter(&trx_sys->mutex); trx_sys->rseg_history_len -= n_removed_logs; - mutex_exit(&kernel_mutex); + mutex_exit(&trx_sys->mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY, log_hdr + TRX_UNDO_HISTORY_NODE, @@ -624,60 +506,30 @@ Removes unnecessary history data from rollback segments. NOTE that when this function is called, the caller must not have any latches on undo log pages! */ static void -trx_purge_truncate_history(void) -/*============================*/ +trx_purge_truncate_history( +/*========================*/ + purge_iter_t* limit, /*!< in: truncate limit */ + const read_view_t* view) /*!< in: purge view */ { - trx_rseg_t* rseg; - trx_id_t limit_trx_no; - undo_no_t limit_undo_no; - - trx_purge_arr_get_biggest( - purge_sys->arr, &limit_trx_no, &limit_undo_no); - - if (limit_trx_no == 0) { - - limit_trx_no = purge_sys->purge_trx_no; - limit_undo_no = purge_sys->purge_undo_no; - } + ulint i; /* We play safe and set the truncate limit at most to the purge view low_limit number, though this is not necessary */ - if (limit_trx_no >= purge_sys->view->low_limit_no) { - limit_trx_no = purge_sys->view->low_limit_no; - limit_undo_no = 0; + if (limit->trx_no >= view->low_limit_no) { + limit->trx_no = view->low_limit_no; + limit->undo_no = 0; } - ut_ad(limit_trx_no <= purge_sys->view->low_limit_no); + ut_ad(limit->trx_no <= purge_sys->view->low_limit_no); - for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); - rseg != NULL; - rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) { + for (i = 0; i < TRX_SYS_N_RSEGS; ++i) { + trx_rseg_t* rseg = trx_sys->rseg_array[i]; - trx_purge_truncate_rseg_history( - rseg, limit_trx_no, limit_undo_no); - } -} - -/********************************************************************//** -Does a truncate if the purge array is empty. NOTE that when this function is -called, the caller must not have any latches on undo log pages! */ -UNIV_INLINE -void -trx_purge_truncate_if_arr_empty(void) -/*=================================*/ -{ - static ulint count; - -#ifdef UNIV_DEBUG - if (purge_sys->arr->n_used == 0) { - purge_sys->done_trx_no = purge_sys->purge_trx_no; - } -#endif /* UNIV_DEBUG */ - - if (!(++count % TRX_SYS_N_RSEGS) && purge_sys->arr->n_used == 0) { - - trx_purge_truncate_history(); + if (rseg != NULL) { + ut_a(rseg->id == i); + trx_purge_truncate_rseg_history(rseg, limit); + } } } @@ -688,8 +540,11 @@ static void trx_purge_rseg_get_next_history_log( /*================================*/ - trx_rseg_t* rseg) /*!< in: rollback segment */ + trx_rseg_t* rseg, /*!< in: rollback segment */ + ulint* n_pages_handled)/*!< in/out: number of UNDO pages + handled */ { + const void* ptr; page_t* undo_page; trx_ulogf_t* log_hdr; fil_addr_t prev_log_addr; @@ -697,14 +552,13 @@ trx_purge_rseg_get_next_history_log( ibool del_marks; mtr_t mtr; rseg_queue_t rseg_queue; - const void* ptr; mutex_enter(&(rseg->mutex)); ut_a(rseg->last_page_no != FIL_NULL); - purge_sys->purge_trx_no = rseg->last_trx_no + 1; - purge_sys->purge_undo_no = 0; + purge_sys->iter.trx_no = rseg->last_trx_no + 1; + purge_sys->iter.undo_no = 0; purge_sys->next_stored = FALSE; mtr_start(&mtr); @@ -716,7 +570,7 @@ trx_purge_rseg_get_next_history_log( /* Increase the purge page count by one for every handled log */ - purge_sys->n_pages_handled++; + (*n_pages_handled)++; prev_log_addr = trx_purge_get_log_from_hist( flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); @@ -729,10 +583,10 @@ trx_purge_rseg_get_next_history_log( mutex_exit(&(rseg->mutex)); mtr_commit(&mtr); - mutex_enter(&kernel_mutex); + mutex_enter(&trx_sys->mutex); /* Add debug code to track history list corruption reported - on the MySQL mailing list on Nov 9, 2004. The fut0lst.c + on the MySQL mailing list on Nov 9, 2004. The fut0lst.cc file-based list was corrupt. The prev node pointer was FIL_NULL, even though the list length was over 8 million nodes! We assume that purge truncates the history list in large @@ -752,12 +606,13 @@ trx_purge_rseg_get_next_history_log( ut_ad(0); } - mutex_exit(&kernel_mutex); + mutex_exit(&trx_sys->mutex); return; } - mutex_exit(&(rseg->mutex)); + mutex_exit(&rseg->mutex); + mtr_commit(&mtr); /* Read the trx number and del marks from the previous log header */ @@ -795,7 +650,7 @@ trx_purge_rseg_get_next_history_log( mutex_exit(&purge_sys->bh_mutex); - mutex_exit(&(rseg->mutex)); + mutex_exit(&rseg->mutex); } /***********************************************************************//** @@ -839,18 +694,16 @@ trx_purge_get_rseg_with_min_trx_id( ut_a(purge_sys->rseg->last_page_no != FIL_NULL); - /* We assume in purge of externally stored fields - that space id == 0 */ - ut_a(purge_sys->rseg->space == 0); + /* We assume in purge of externally stored fields that space id is + in the range of UNDO tablespace space ids */ + ut_a(purge_sys->rseg->space <= srv_undo_tablespaces_open); zip_size = purge_sys->rseg->zip_size; - ut_a(purge_sys->purge_trx_no <= purge_sys->rseg->last_trx_no); - - purge_sys->purge_trx_no = purge_sys->rseg->last_trx_no; + ut_a(purge_sys->iter.trx_no <= purge_sys->rseg->last_trx_no); + purge_sys->iter.trx_no = purge_sys->rseg->last_trx_no; purge_sys->hdr_offset = purge_sys->rseg->last_offset; - purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; mutex_exit(&purge_sys->rseg->mutex); @@ -867,21 +720,22 @@ trx_purge_read_undo_rec( trx_purge_t* purge_sys, /*!< in/out: purge instance */ ulint zip_size) /*!< in: block size or 0 */ { + ulint offset; ulint page_no; - ulint offset = 0; - ib_uint64_t undo_no = 0; + ib_uint64_t undo_no; purge_sys->hdr_offset = purge_sys->rseg->last_offset; page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; if (purge_sys->rseg->last_del_marks) { mtr_t mtr; - trx_undo_rec_t* undo_rec; + trx_undo_rec_t* undo_rec = NULL; mtr_start(&mtr); undo_rec = trx_undo_get_first_rec( - 0 /* System space id */, zip_size, + purge_sys->rseg->space, + zip_size, purge_sys->hdr_page_no, purge_sys->hdr_offset, RW_S_LATCH, &mtr); @@ -889,14 +743,20 @@ trx_purge_read_undo_rec( offset = page_offset(undo_rec); undo_no = trx_undo_rec_get_undo_no(undo_rec); page_no = page_get_page_no(page_align(undo_rec)); + } else { + offset = 0; + undo_no = 0; } mtr_commit(&mtr); + } else { + offset = 0; + undo_no = 0; } purge_sys->offset = offset; purge_sys->page_no = page_no; - purge_sys->purge_undo_no = undo_no; + purge_sys->iter.undo_no = undo_no; purge_sys->next_stored = TRUE; } @@ -918,7 +778,6 @@ trx_purge_choose_next_log(void) zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys); if (purge_sys->rseg != NULL) { - trx_purge_read_undo_rec(purge_sys, zip_size); } else { /* There is nothing to do yet. */ @@ -933,23 +792,23 @@ static trx_undo_rec_t* trx_purge_get_next_rec( /*===================*/ - mem_heap_t* heap) /*!< in: memory heap where copied */ + ulint* n_pages_handled,/*!< in/out: number of UNDO pages + handled */ + mem_heap_t* heap) /*!< in: memory heap where copied */ { trx_undo_rec_t* rec; trx_undo_rec_t* rec_copy; trx_undo_rec_t* rec2; - trx_undo_rec_t* next_rec; page_t* undo_page; page_t* page; ulint offset; ulint page_no; ulint space; ulint zip_size; - ulint type; - ulint cmpl_info; mtr_t mtr; ut_ad(purge_sys->next_stored); + ut_ad(purge_sys->iter.trx_no < purge_sys->view->low_limit_no); space = purge_sys->rseg->space; zip_size = purge_sys->rseg->zip_size; @@ -960,7 +819,8 @@ trx_purge_get_next_rec( /* It is the dummy undo log record, which means that there is no need to purge this undo log */ - trx_purge_rseg_get_next_history_log(purge_sys->rseg); + trx_purge_rseg_get_next_history_log( + purge_sys->rseg, n_pages_handled); /* Look for the next undo log and record to purge */ @@ -978,6 +838,10 @@ trx_purge_get_next_rec( rec2 = rec; for (;;) { + ulint type; + trx_undo_rec_t* next_rec; + ulint cmpl_info; + /* Try first to find the next record which requires a purge operation from the same page of the same undo log */ @@ -1015,7 +879,8 @@ trx_purge_get_next_rec( if (rec2 == NULL) { mtr_commit(&mtr); - trx_purge_rseg_get_next_history_log(purge_sys->rseg); + trx_purge_rseg_get_next_history_log( + purge_sys->rseg, n_pages_handled); /* Look for the next undo log and record to purge */ @@ -1023,20 +888,20 @@ trx_purge_get_next_rec( mtr_start(&mtr); - undo_page = trx_undo_page_get_s_latched(space, zip_size, - page_no, &mtr); + undo_page = trx_undo_page_get_s_latched( + space, zip_size, page_no, &mtr); rec = undo_page + offset; } else { page = page_align(rec2); - purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2); - purge_sys->page_no = page_get_page_no(page); purge_sys->offset = rec2 - page; + purge_sys->page_no = page_get_page_no(page); + purge_sys->iter.undo_no = trx_undo_rec_get_undo_no(rec2); if (undo_page != page) { /* We advance to a new page of the undo log: */ - purge_sys->n_pages_handled++; + (*n_pages_handled)++; } } @@ -1052,88 +917,262 @@ Fetches the next undo log record from the history list to purge. It must be released with the corresponding release function. @return copy of an undo log record or pointer to trx_purge_dummy_rec, if the whole undo log can skipped in purge; NULL if none left */ -UNIV_INTERN +static __attribute__((warn_unused_result, nonnull)) trx_undo_rec_t* trx_purge_fetch_next_rec( /*=====================*/ - roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */ - trx_undo_inf_t** cell, /*!< out: storage cell for the record in the - purge array */ - mem_heap_t* heap) /*!< in: memory heap where copied */ + roll_ptr_t* roll_ptr, /*!< out: roll pointer to undo record */ + ulint* n_pages_handled,/*!< in/out: number of UNDO log pages + handled */ + mem_heap_t* heap) /*!< in: memory heap where copied */ { - trx_undo_rec_t* undo_rec; - - - if (purge_sys->state == TRX_STOP_PURGE) { - trx_purge_truncate_if_arr_empty(); - - return(NULL); - } else if (!purge_sys->next_stored) { + if (!purge_sys->next_stored) { trx_purge_choose_next_log(); if (!purge_sys->next_stored) { - purge_sys->state = TRX_STOP_PURGE; - - trx_purge_truncate_if_arr_empty(); if (srv_print_thread_releases) { fprintf(stderr, "Purge: No logs left in the" - " history list; pages handled %lu\n", - (ulong) purge_sys->n_pages_handled); + " history list\n"); } return(NULL); } } - if (purge_sys->n_pages_handled >= purge_sys->handle_limit) { + if (purge_sys->iter.trx_no >= purge_sys->view->low_limit_no) { - purge_sys->state = TRX_STOP_PURGE; + return(NULL); + } - trx_purge_truncate_if_arr_empty(); + /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n", + os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */ - return(NULL); - } else if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) { - purge_sys->state = TRX_STOP_PURGE; + *roll_ptr = trx_undo_build_roll_ptr( + FALSE, purge_sys->rseg->id, + purge_sys->page_no, purge_sys->offset); - trx_purge_truncate_if_arr_empty(); + /* The following call will advance the stored values of the + purge iterator. */ - return(NULL); + return(trx_purge_get_next_rec(n_pages_handled, heap)); +} + +/*******************************************************************//** +This function runs a purge batch. +@return number of undo log pages handled in the batch */ +static +ulint +trx_purge_attach_undo_recs( +/*=======================*/ + ulint n_purge_threads,/*!< in: number of purge threads */ + trx_purge_t* purge_sys, /*!< in/out: purge instance */ + purge_iter_t* limit, /*!< out: records read up to */ + ulint batch_size) /*!< in: no. of pages to purge */ +{ + que_thr_t* thr; + ulint i = 0; + ulint n_pages_handled = 0; + ulint n_thrs = UT_LIST_GET_LEN(purge_sys->query->thrs); + + ut_a(n_purge_threads > 0); + + *limit = purge_sys->iter; + + /* Debug code to validate some pre-requisites and reset done flag. */ + for (thr = UT_LIST_GET_FIRST(purge_sys->query->thrs); + thr != NULL && i < n_purge_threads; + thr = UT_LIST_GET_NEXT(thrs, thr), ++i) { + + purge_node_t* node; + + /* Get the purge node. */ + node = (purge_node_t*) thr->child; + + ut_a(que_node_get_type(node) == QUE_NODE_PURGE); + ut_a(node->undo_recs == NULL); + ut_a(node->done); + + node->done = FALSE; } - /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n", - os_thread_get_curr_id(), - (ullint) purge_sys->purge_trx_no, - (ullint) purge_sys->purge_undo_no); */ + /* There should never be fewer nodes than threads, the inverse + however is allowed because we only use purge threads as needed. */ + ut_a(i == n_purge_threads); + /* Fetch and parse the UNDO records. The UNDO records are added + to a per purge node vector. */ + thr = UT_LIST_GET_FIRST(purge_sys->query->thrs); + ut_a(n_thrs > 0 && thr != NULL); - *roll_ptr = trx_undo_build_roll_ptr( - FALSE, (purge_sys->rseg)->id, purge_sys->page_no, - purge_sys->offset); + ut_ad(trx_purge_check_limit()); + + i = 0; - *cell = trx_purge_arr_store_info( - purge_sys->purge_trx_no, purge_sys->purge_undo_no); + for (;;) { + purge_node_t* node; + trx_purge_rec_t* purge_rec; - ut_ad(purge_sys->purge_trx_no < purge_sys->view->low_limit_no); + ut_a(!thr->is_active); - /* The following call will advance the stored values of purge_trx_no - and purge_undo_no, therefore we had to store them first */ + /* Get the purge node. */ + node = (purge_node_t*) thr->child; + ut_a(que_node_get_type(node) == QUE_NODE_PURGE); + + purge_rec = static_cast<trx_purge_rec_t*>( + mem_heap_zalloc(node->heap, sizeof(*purge_rec))); + + /* Track the max {trx_id, undo_no} for truncating the + UNDO logs once we have purged the records. */ + + if (purge_sys->iter.trx_no > limit->trx_no + || (purge_sys->iter.trx_no == limit->trx_no + && purge_sys->iter.undo_no >= limit->undo_no)) { + + *limit = purge_sys->iter; + } - undo_rec = trx_purge_get_next_rec(heap); + /* Fetch the next record, and advance the purge_sys->iter. */ + purge_rec->undo_rec = trx_purge_fetch_next_rec( + &purge_rec->roll_ptr, &n_pages_handled, node->heap); - return(undo_rec); + if (purge_rec->undo_rec != NULL) { + + if (node->undo_recs == NULL) { + node->undo_recs = ib_vector_create( + ib_heap_allocator_create(node->heap), + sizeof(trx_purge_rec_t), + batch_size); + } else { + ut_a(!ib_vector_is_empty(node->undo_recs)); + } + + ib_vector_push(node->undo_recs, purge_rec); + + if (n_pages_handled >= batch_size) { + + break; + } + } else { + break; + } + + thr = UT_LIST_GET_NEXT(thrs, thr); + + if (!(++i % n_purge_threads)) { + thr = UT_LIST_GET_FIRST(purge_sys->query->thrs); + } + + ut_a(thr != NULL); + } + + ut_ad(trx_purge_check_limit()); + + return(n_pages_handled); } /*******************************************************************//** -Releases a reserved purge undo record. */ -UNIV_INTERN +Calculate the DML delay required. +@return delay in microseconds or ULINT_MAX */ +static +ulint +trx_purge_dml_delay(void) +/*=====================*/ +{ + /* Determine how much data manipulation language (DML) statements + need to be delayed in order to reduce the lagging of the purge + thread. */ + ulint delay = 0; /* in microseconds; default: no delay */ + + /* If purge lag is set (ie. > 0) then calculate the new DML delay. + Note: we do a dirty read of the trx_sys_t data structure here, + without holding trx_sys->mutex. */ + + if (srv_max_purge_lag > 0) { + float ratio; + + ratio = float(trx_sys->rseg_history_len) / srv_max_purge_lag; + + if (ratio > 1.0) { + /* If the history list length exceeds the + srv_max_purge_lag, the data manipulation + statements are delayed by at least 5000 + microseconds. */ + delay = (ulint) ((ratio - .5) * 10000); + } + + if (delay > srv_max_purge_lag_delay) { + delay = srv_max_purge_lag_delay; + } + + MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay); + } + + return(delay); +} + +/*******************************************************************//** +Wait for pending purge jobs to complete. */ +static void -trx_purge_rec_release( -/*==================*/ - trx_undo_inf_t* cell) /*!< in: storage cell */ +trx_purge_wait_for_workers_to_complete( +/*===================================*/ + trx_purge_t* purge_sys) /*!< in: purge instance */ { - trx_purge_arr_remove_info(cell); + ulint n_submitted = purge_sys->n_submitted; + +#ifdef HAVE_ATOMIC_BUILTINS + /* Ensure that the work queue empties out. */ + while (!os_compare_and_swap_ulint( + &purge_sys->n_completed, n_submitted, n_submitted)) { +#else + mutex_enter(&purge_sys->bh_mutex); + + while (purge_sys->n_completed < n_submitted) { +#endif /* HAVE_ATOMIC_BUILTINS */ + +#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&purge_sys->bh_mutex); +#endif /* !HAVE_ATOMIC_BUILTINS */ + + if (srv_get_task_queue_length() > 0) { + srv_release_threads(SRV_WORKER, 1); + } + + os_thread_yield(); + +#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&purge_sys->bh_mutex); +#endif /* !HAVE_ATOMIC_BUILTINS */ + } + +#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&purge_sys->bh_mutex); +#endif /* !HAVE_ATOMIC_BUILTINS */ + + /* None of the worker threads should be doing any work. */ + ut_a(purge_sys->n_submitted == purge_sys->n_completed); + + /* There should be no outstanding tasks as long + as the worker threads are active. */ + ut_a(srv_get_task_queue_length() == 0); +} + +/******************************************************************//** +Remove old historical changes from the rollback segments. */ +static +void +trx_purge_truncate(void) +/*====================*/ +{ + ut_ad(trx_purge_check_limit()); + + if (purge_sys->limit.trx_no == 0) { + trx_purge_truncate_history(&purge_sys->iter, purge_sys->view); + } else { + trx_purge_truncate_history(&purge_sys->limit, purge_sys->view); + } } /*******************************************************************//** @@ -1143,112 +1182,227 @@ UNIV_INTERN ulint trx_purge( /*======*/ - ulint limit) /*!< in: the maximum number of records to - purge in one batch */ + ulint n_purge_threads, /*!< in: number of purge tasks + to submit to the work queue */ + ulint batch_size, /*!< in: the maximum number of records + to purge in one batch */ + bool truncate) /*!< in: truncate history if true */ { - que_thr_t* thr; - ulint old_pages_handled; + que_thr_t* thr = NULL; + ulint n_pages_handled; - ut_a(purge_sys->trx->n_active_thrs == 0); + ut_a(n_purge_threads > 0); - rw_lock_x_lock(&purge_sys->latch); + srv_dml_needed_delay = trx_purge_dml_delay(); - mutex_enter(&kernel_mutex); + /* The number of tasks submitted should be completed. */ + ut_a(purge_sys->n_submitted == purge_sys->n_completed); - /* Close and free the old purge view */ + rw_lock_x_lock(&purge_sys->latch); - read_view_close(purge_sys->view); purge_sys->view = NULL; + mem_heap_empty(purge_sys->heap); - /* Determine how much data manipulation language (DML) statements - need to be delayed in order to reduce the lagging of the purge - thread. */ - srv_dml_needed_delay = 0; /* in microseconds; default: no delay */ + purge_sys->view = read_view_purge_open(purge_sys->prebuilt_clone, + purge_sys->prebuilt_view); - /* If we cannot advance the 'purge view' because of an old - 'consistent read view', then the DML statements cannot be delayed. - Also, srv_max_purge_lag <= 0 means 'infinity'. */ - if (srv_max_purge_lag > 0) { - float ratio = (float) trx_sys->rseg_history_len - / srv_max_purge_lag; - if (ratio > ULINT_MAX / 10000) { - /* Avoid overflow: maximum delay is 4295 seconds */ - srv_dml_needed_delay = ULINT_MAX; - } else if (ratio > 1) { - /* If the history list length exceeds the - innodb_max_purge_lag, the - data manipulation statements are delayed - by at least 5000 microseconds. */ - srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000); - } + rw_lock_x_unlock(&purge_sys->latch); + +#ifdef UNIV_DEBUG + if (srv_purge_view_update_only_debug) { + return(0); } +#endif - purge_sys->view = read_view_oldest_copy_or_open_new( - 0, purge_sys->prebuilt_view); + /* Fetch the UNDO recs that need to be purged. */ + n_pages_handled = trx_purge_attach_undo_recs( + n_purge_threads, purge_sys, &purge_sys->limit, batch_size); + + /* Do we do an asynchronous purge or not ? */ + if (n_purge_threads > 1) { + ulint i = 0; + + /* Submit the tasks to the work queue. */ + for (i = 0; i < n_purge_threads - 1; ++i) { + thr = que_fork_scheduler_round_robin( + purge_sys->query, thr); + + ut_a(thr != NULL); + + srv_que_task_enqueue_low(thr); + } - mutex_exit(&kernel_mutex); + thr = que_fork_scheduler_round_robin(purge_sys->query, thr); + ut_a(thr != NULL); - rw_lock_x_unlock(&(purge_sys->latch)); + purge_sys->n_submitted += n_purge_threads - 1; + + goto run_synchronously; + + /* Do it synchronously. */ + } else { + thr = que_fork_scheduler_round_robin(purge_sys->query, NULL); + ut_ad(thr); + +run_synchronously: + ++purge_sys->n_submitted; + + que_run_threads(thr); + + os_atomic_inc_ulint( + &purge_sys->bh_mutex, &purge_sys->n_completed, 1); + + if (n_purge_threads > 1) { + trx_purge_wait_for_workers_to_complete(purge_sys); + } + } + + ut_a(purge_sys->n_submitted == purge_sys->n_completed); #ifdef UNIV_DEBUG - if (srv_purge_view_update_only_debug) { - return(0); + if (purge_sys->limit.trx_no == 0) { + purge_sys->done = purge_sys->iter; + } else { + purge_sys->done = purge_sys->limit; } -#endif +#endif /* UNIV_DEBUG */ - purge_sys->state = TRX_PURGE_ON; + if (truncate) { + trx_purge_truncate(); + } - purge_sys->handle_limit = purge_sys->n_pages_handled + limit; + MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1); + MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled); - old_pages_handled = purge_sys->n_pages_handled; + return(n_pages_handled); +} +/*******************************************************************//** +Get the purge state. +@return purge state. */ +UNIV_INTERN +purge_state_t +trx_purge_state(void) +/*=================*/ +{ + purge_state_t state; - mutex_enter(&kernel_mutex); + rw_lock_x_lock(&purge_sys->latch); + + state = purge_sys->state; + + rw_lock_x_unlock(&purge_sys->latch); - thr = que_fork_start_command(purge_sys->query); + return(state); +} - ut_ad(thr); +/*******************************************************************//** +Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */ +UNIV_INTERN +void +trx_purge_stop(void) +/*================*/ +{ + purge_state_t state; + ib_int64_t sig_count = os_event_reset(purge_sys->event); + + ut_a(srv_n_purge_threads > 0); + + rw_lock_x_lock(&purge_sys->latch); - mutex_exit(&kernel_mutex); + ut_a(purge_sys->state != PURGE_STATE_INIT); + ut_a(purge_sys->state != PURGE_STATE_EXIT); + ut_a(purge_sys->state != PURGE_STATE_DISABLED); - if (srv_print_thread_releases) { + ++purge_sys->n_stop; - fputs("Starting purge\n", stderr); + state = purge_sys->state; + + if (state == PURGE_STATE_RUN) { + ib_logf(IB_LOG_LEVEL_INFO, "Stopping purge"); + + /* We need to wakeup the purge thread in case it is suspended, + so that it can acknowledge the state change. */ + + srv_purge_wakeup(); } - que_run_threads(thr); + purge_sys->state = PURGE_STATE_STOP; + + rw_lock_x_unlock(&purge_sys->latch); + + if (state != PURGE_STATE_STOP) { + + /* Wait for purge coordinator to signal that it + is suspended. */ + os_event_wait_low(purge_sys->event, sig_count); + } else { + bool once = true; + + rw_lock_x_lock(&purge_sys->latch); + + /* Wait for purge to signal that it has actually stopped. */ + while (purge_sys->running) { + + if (once) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for purge to stop"); + once = false; + } + + rw_lock_x_unlock(&purge_sys->latch); + + os_thread_sleep(10000); - if (srv_print_thread_releases) { + rw_lock_x_lock(&purge_sys->latch); + } - fprintf(stderr, - "Purge ends; pages handled %lu\n", - (ulong) purge_sys->n_pages_handled); + rw_lock_x_unlock(&purge_sys->latch); } - return((ulint) (purge_sys->n_pages_handled - old_pages_handled)); + MONITOR_INC_VALUE(MONITOR_PURGE_STOP_COUNT, 1); } -/******************************************************************//** -Prints information of the purge system to stderr. */ +/*******************************************************************//** +Resume purge, move to PURGE_STATE_RUN. */ UNIV_INTERN void -trx_purge_sys_print(void) -/*=====================*/ +trx_purge_run(void) +/*===============*/ { - fprintf(stderr, "InnoDB: Purge system view:\n"); - read_view_print(stderr, purge_sys->view); - - fprintf(stderr, "InnoDB: Purge trx n:o " TRX_ID_FMT - ", undo n:o " TRX_ID_FMT "\n", - (ullint) purge_sys->purge_trx_no, - (ullint) purge_sys->purge_undo_no); - fprintf(stderr, - "InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n" - "InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n", - (ulong) purge_sys->next_stored, - (ulong) purge_sys->page_no, - (ulong) purge_sys->offset, - (ulong) purge_sys->hdr_page_no, - (ulong) purge_sys->hdr_offset); + rw_lock_x_lock(&purge_sys->latch); + + switch(purge_sys->state) { + case PURGE_STATE_INIT: + case PURGE_STATE_EXIT: + case PURGE_STATE_DISABLED: + ut_error; + + case PURGE_STATE_RUN: + case PURGE_STATE_STOP: + break; + } + + if (purge_sys->n_stop > 0) { + + ut_a(purge_sys->state == PURGE_STATE_STOP); + + --purge_sys->n_stop; + + if (purge_sys->n_stop == 0) { + + ib_logf(IB_LOG_LEVEL_INFO, "Resuming purge"); + + purge_sys->state = PURGE_STATE_RUN; + } + + MONITOR_INC_VALUE(MONITOR_PURGE_RESUME_COUNT, 1); + } else { + ut_a(purge_sys->state == PURGE_STATE_RUN); + } + + rw_lock_x_unlock(&purge_sys->latch); + + srv_purge_wakeup(); } diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.cc index ef42152aeb7..a698b37c2a6 100644 --- a/storage/xtradb/trx/trx0rec.c +++ b/storage/xtradb/trx/trx0rec.cc @@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc., *****************************************************************************/ /**************************************************//** -@file trx/trx0rec.c +@file trx/trx0rec.cc Transaction undo log record Created 3/26/1996 Heikki Tuuri @@ -287,7 +287,7 @@ trx_undo_rec_get_pars( TRX_UNDO_INSERT_REC, ... */ ulint* cmpl_info, /*!< out: compiler info, relevant only for update type records */ - ibool* updated_extern, /*!< out: TRUE if we updated an + bool* updated_extern, /*!< out: true if we updated an externally stored fild */ undo_no_t* undo_no, /*!< out: undo log record number */ table_id_t* table_id) /*!< out: table id */ @@ -300,12 +300,8 @@ trx_undo_rec_get_pars( type_cmpl = mach_read_from_1(ptr); ptr++; - if (type_cmpl & TRX_UNDO_UPD_EXTERN) { - *updated_extern = TRUE; - type_cmpl -= TRX_UNDO_UPD_EXTERN; - } else { - *updated_extern = FALSE; - } + *updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN); + type_cmpl &= ~TRX_UNDO_UPD_EXTERN; *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1); *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT; @@ -353,8 +349,9 @@ trx_undo_rec_get_col_val( ut_ad(*len > *orig_len); /* @see dtuple_convert_big_rec() */ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE); + /* we do not have access to index->table here - ut_ad(dict_table_get_format(index->table) >= DICT_TF_FORMAT_ZIP + ut_ad(dict_table_get_format(index->table) >= UNIV_FORMAT_B || *len >= col->max_prefix + BTR_EXTERN_FIELD_REF_SIZE); */ @@ -587,6 +584,7 @@ trx_undo_page_report_modify( /* Store first some general parameters to the undo log */ if (!update) { + ut_ad(!rec_get_deleted_flag(rec, dict_table_is_comp(table))); type_cmpl = TRX_UNDO_DEL_MARK_REC; } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) { type_cmpl = TRX_UNDO_UPD_DEL_REC; @@ -670,27 +668,14 @@ trx_undo_page_report_modify( /* Save to the undo log the old values of the columns to be updated. */ if (update) { - ulint extended = 0; - if (trx_undo_left(undo_page, ptr) < 5) { return(0); } - if (srv_use_sys_stats_table - && index == UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes)) { - for (i = 0; i < upd_get_n_fields(update); i++) { - ulint pos = upd_get_nth_field(update, i)->field_no; - - if (pos >= rec_offs_n_fields(offsets)) { - extended++; - } - } - } - - ptr += mach_write_compressed(ptr, upd_get_n_fields(update) - extended); + ptr += mach_write_compressed(ptr, upd_get_n_fields(update)); - for (i = 0; i < upd_get_n_fields(update) - extended; i++) { + for (i = 0; i < upd_get_n_fields(update); i++) { ulint pos = upd_get_nth_field(update, i)->field_no; @@ -973,7 +958,9 @@ trx_undo_update_rec_get_update( /* Store first trx id and roll ptr to update vector */ upd_field = upd_get_nth_field(update, n_fields); - buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN); + + buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN)); + trx_write_trx_id(buf, trx_id); upd_field_set_field_no(upd_field, @@ -982,7 +969,9 @@ trx_undo_update_rec_get_update( dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN); upd_field = upd_get_nth_field(update, n_fields + 1); - buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); + + buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN)); + trx_write_roll_ptr(buf, roll_ptr); upd_field_set_field_no( @@ -1048,8 +1037,9 @@ trx_undo_update_rec_get_update( } /*******************************************************************//** -Builds a partial row from an update undo log record. It contains the -columns which occur as ordering in any index of the table. +Builds a partial row from an update undo log record, for purge. +It contains the columns which occur as ordering in any index of the table. +Any missing columns are indicated by col->mtype == DATA_MISSING. @return pointer to remaining part of undo record */ UNIV_INTERN byte* @@ -1083,7 +1073,12 @@ trx_undo_rec_get_partial_row( *row = dtuple_create(heap, row_len); - dict_table_copy_types(*row, index->table); + /* Mark all columns in the row uninitialized, so that + we can distinguish missing fields from fields that are SQL NULL. */ + for (ulint i = 0; i < row_len; i++) { + dfield_get_type(dtuple_get_nth_field(*row, i)) + ->mtype = DATA_MISSING; + } end_ptr = ptr + mach_read_from_2(ptr); ptr += 2; @@ -1105,7 +1100,9 @@ trx_undo_rec_get_partial_row( ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); dfield = dtuple_get_nth_field(*row, col_no); - + dict_col_copy_type( + dict_table_get_nth_col(index->table, col_no), + dfield_get_type(dfield)); dfield_set_data(dfield, field, len); if (len != UNIV_SQL_NULL @@ -1120,9 +1117,9 @@ trx_undo_rec_get_partial_row( ut_a(dfield_get_len(dfield) >= BTR_EXTERN_FIELD_REF_SIZE); ut_a(dict_table_get_format(index->table) - >= DICT_TF_FORMAT_ZIP + >= UNIV_FORMAT_B || dfield_get_len(dfield) - >= REC_ANTELOPE_MAX_INDEX_COL_LEN + >= REC_ANTELOPE_MAX_INDEX_COL_LEN + BTR_EXTERN_FIELD_REF_SIZE); } } @@ -1185,7 +1182,7 @@ transaction and in consistent reads that must look to the history of this transaction. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t trx_undo_report_row_operation( /*==========================*/ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is @@ -1204,6 +1201,7 @@ trx_undo_report_row_operation( const rec_t* rec, /*!< in: in case of an update or delete marking, the record in the clustered index, otherwise NULL */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the inserted undo log record, 0 if BTR_NO_UNDO_LOG @@ -1215,16 +1213,14 @@ trx_undo_report_row_operation( buf_block_t* undo_block; trx_rseg_t* rseg; mtr_t mtr; - ulint err = DB_SUCCESS; - mem_heap_t* heap = NULL; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets = offsets_; + dberr_t err = DB_SUCCESS; #ifdef UNIV_DEBUG int loop_count = 0; #endif /* UNIV_DEBUG */ - rec_offs_init(offsets_); + ut_ad(!srv_read_only_mode); ut_a(dict_index_is_clust(index)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); if (flags & BTR_NO_UNDO_LOG_FLAG) { @@ -1238,55 +1234,61 @@ trx_undo_report_row_operation( || (clust_entry && !update && !rec)); trx = thr_get_trx(thr); + + /* This table is visible only to the session that created it. */ + if (trx->read_only) { + ut_ad(!srv_read_only_mode); + /* MySQL should block writes to non-temporary tables. */ + ut_a(DICT_TF2_FLAG_IS_SET(index->table, DICT_TF2_TEMPORARY)); + if (trx->rseg == 0) { + trx_assign_rseg(trx); + } + } + rseg = trx->rseg; - mutex_enter(&(trx->undo_mutex)); + mtr_start(&mtr); + mutex_enter(&trx->undo_mutex); /* If the undo log is not assigned yet, assign one */ - if (op_type == TRX_UNDO_INSERT_OP) { + switch (op_type) { + case TRX_UNDO_INSERT_OP: + undo = trx->insert_undo; - if (trx->insert_undo == NULL) { + if (undo == NULL) { err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT); - } + undo = trx->insert_undo; - undo = trx->insert_undo; - - if (UNIV_UNLIKELY(!undo)) { - /* Did not succeed */ - ut_ad(err != DB_SUCCESS); - mutex_exit(&(trx->undo_mutex)); + if (undo == NULL) { + /* Did not succeed */ + ut_ad(err != DB_SUCCESS); + goto err_exit; + } - return(err); + ut_ad(err == DB_SUCCESS); } - - ut_ad(err == DB_SUCCESS); - } else { + break; + default: ut_ad(op_type == TRX_UNDO_MODIFY_OP); - if (trx->update_undo == NULL) { + undo = trx->update_undo; + if (undo == NULL) { err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + undo = trx->update_undo; - } - - undo = trx->update_undo; - - if (UNIV_UNLIKELY(!undo)) { - /* Did not succeed */ - ut_ad(err != DB_SUCCESS); - mutex_exit(&(trx->undo_mutex)); - return(err); + if (undo == NULL) { + /* Did not succeed */ + ut_ad(err != DB_SUCCESS); + goto err_exit; + } } ut_ad(err == DB_SUCCESS); - offsets = rec_get_offsets(rec, index, offsets, - ULINT_UNDEFINED, &heap); } - mtr_start(&mtr); - page_no = undo->last_page_no; undo_block = buf_page_get_gen( undo->space, undo->zip_size, page_no, RW_X_LATCH, @@ -1300,10 +1302,13 @@ trx_undo_report_row_operation( undo_page = buf_block_get_frame(undo_block); ut_ad(page_no == buf_block_get_page_no(undo_block)); - if (op_type == TRX_UNDO_INSERT_OP) { + switch (op_type) { + case TRX_UNDO_INSERT_OP: offset = trx_undo_page_report_insert( undo_page, trx, index, clust_entry, &mtr); - } else { + break; + default: + ut_ad(op_type == TRX_UNDO_MODIFY_OP); offset = trx_undo_page_report_modify( undo_page, trx, index, rec, offsets, update, cmpl_info, &mtr); @@ -1360,8 +1365,7 @@ trx_undo_report_row_operation( *roll_ptr = trx_undo_build_roll_ptr( op_type == TRX_UNDO_INSERT_OP, rseg->id, page_no, offset); - err = DB_SUCCESS; - goto func_exit; + return(DB_SUCCESS); } ut_ad(page_no == undo->last_page_no); @@ -1378,6 +1382,7 @@ trx_undo_report_row_operation( mutex_enter(&rseg->mutex); undo_block = trx_undo_add_page(trx, undo, &mtr); mutex_exit(&rseg->mutex); + page_no = undo->last_page_no; } while (undo_block != NULL); @@ -1387,10 +1392,6 @@ trx_undo_report_row_operation( err_exit: mutex_exit(&trx->undo_mutex); mtr_commit(&mtr); -func_exit: - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } return(err); } @@ -1435,59 +1436,62 @@ trx_undo_get_undo_rec_low( /******************************************************************//** Copies an undo record to heap. -NOTE: the caller must have latches on the clustered index page and -purge_view. +NOTE: the caller must have latches on the clustered index page. -@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been -truncated and we cannot fetch the old version */ -UNIV_INTERN -ulint +@retval true if the undo log has been +truncated and we cannot fetch the old version +@retval false if the undo log record is available */ +static __attribute__((nonnull, warn_unused_result)) +bool trx_undo_get_undo_rec( /*==================*/ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ trx_id_t trx_id, /*!< in: id of the trx that generated the roll pointer: it points to an undo log of this transaction */ - trx_undo_rec_t** undo_rec, /*!< out, own: copy of the record */ + trx_undo_rec_t**undo_rec, /*!< out, own: copy of the record */ mem_heap_t* heap) /*!< in: memory heap where copied */ { -#ifdef UNIV_SYNC_DEBUG - ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); -#endif /* UNIV_SYNC_DEBUG */ - - if (!trx_purge_update_undo_must_exist(trx_id)) { + bool missing_history; - /* It may be that the necessary undo log has already been - deleted */ + rw_lock_s_lock(&purge_sys->latch); + missing_history = read_view_sees_trx_id(purge_sys->view, trx_id); - return(DB_MISSING_HISTORY); + if (!missing_history) { + *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); } - *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); + rw_lock_s_unlock(&purge_sys->latch); - return(DB_SUCCESS); + return(missing_history); } +#ifdef UNIV_DEBUG +#define ATTRIB_USED_ONLY_IN_DEBUG +#else /* UNIV_DEBUG */ +#define ATTRIB_USED_ONLY_IN_DEBUG __attribute__((unused)) +#endif /* UNIV_DEBUG */ + /*******************************************************************//** -Build a previous version of a clustered index record. This function checks -that the caller has a latch on the index page of the clustered index record -and an s-latch on the purge_view. This guarantees that the stack of versions -is locked all the way down to the purge_view. -@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is -earlier than purge_view, which means that it may have been removed, -DB_ERROR if corrupted record */ +Build a previous version of a clustered index record. The caller must +hold a latch on the index page of the clustered index record. +@retval true if previous version was built, or if it was an insert +or the table has been rebuilt +@retval false if the previous version is earlier than purge_view, +which means that it may have been removed */ UNIV_INTERN -ulint +bool trx_undo_prev_version_build( /*========================*/ - const rec_t* index_rec,/*!< in: clustered index record in the + const rec_t* index_rec ATTRIB_USED_ONLY_IN_DEBUG, + /*!< in: clustered index record in the index tree */ - mtr_t* index_mtr __attribute__((unused)), + mtr_t* index_mtr ATTRIB_USED_ONLY_IN_DEBUG, /*!< in: mtr which contains the latch to index_rec page and purge_view */ const rec_t* rec, /*!< in: version of a clustered index record */ dict_index_t* index, /*!< in: clustered index */ - ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ mem_heap_t* heap, /*!< in: memory heap from which the memory needed is allocated */ rec_t** old_vers)/*!< out, own: previous version, or NULL if @@ -1504,63 +1508,48 @@ trx_undo_prev_version_build( table_id_t table_id; trx_id_t trx_id; roll_ptr_t roll_ptr; - roll_ptr_t old_roll_ptr; upd_t* update; byte* ptr; ulint info_bits; ulint cmpl_info; - ibool dummy_extern; + bool dummy_extern; byte* buf; - ulint err; #ifdef UNIV_SYNC_DEBUG - ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&purge_sys->latch, RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX) || mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_X_FIX)); ut_ad(rec_offs_validate(rec, index, offsets)); - - if (!dict_index_is_clust(index)) { - fprintf(stderr, "InnoDB: Error: trying to access" - " update undo rec for non-clustered index %s\n" - "InnoDB: Submit a detailed bug report to" - " http://bugs.mysql.com\n" - "InnoDB: index record ", index->name); - rec_print(stderr, index_rec, index); - fputs("\n" - "InnoDB: record version ", stderr); - rec_print_new(stderr, rec, offsets); - putc('\n', stderr); - ut_ad(0); - return(DB_ERROR); - } + ut_a(dict_index_is_clust(index)); roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); - old_roll_ptr = roll_ptr; *old_vers = NULL; if (trx_undo_roll_ptr_is_insert(roll_ptr)) { - /* The record rec is the first inserted version */ - - return(DB_SUCCESS); + return(true); } rec_trx_id = row_get_rec_trx_id(rec, index, offsets); - err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap); - - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - /* The undo record may already have been purged. - This should never happen in InnoDB. */ - - return(err); + if (trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap)) { + /* The undo record may already have been purged, + during purge or semi-consistent read. */ + return(false); } ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, &dummy_extern, &undo_no, &table_id); + if (table_id != index->table->id) { + /* The table should have been rebuilt, but purge has + not yet removed the undo log records for the + now-dropped old table (table_id). */ + return(true); + } + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, &info_bits); @@ -1591,59 +1580,11 @@ trx_undo_prev_version_build( ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, roll_ptr, info_bits, NULL, heap, &update); + ut_a(ptr); - if (UNIV_UNLIKELY(table_id != index->table->id)) { - ptr = NULL; - - fprintf(stderr, - "InnoDB: Error: trying to access update undo rec" - " for table %s\n" - "InnoDB: but the table id in the" - " undo record is wrong\n" - "InnoDB: Submit a detailed bug report" - " to http://bugs.mysql.com\n" - "InnoDB: Run also CHECK TABLE %s\n", - index->table_name, index->table_name); - } - - if (ptr == NULL) { - /* The record was corrupted, return an error; these printfs - should catch an elusive bug in row_vers_old_has_index_entry */ - - fprintf(stderr, - "InnoDB: table %s, index %s, n_uniq %lu\n" - "InnoDB: undo rec address %p, type %lu cmpl_info %lu\n" - "InnoDB: undo rec table id %llu," - " index table id %llu\n" - "InnoDB: dump of 150 bytes in undo rec: ", - index->table_name, index->name, - (ulong) dict_index_get_n_unique(index), - undo_rec, (ulong) type, (ulong) cmpl_info, - (ullint) table_id, - (ullint) index->table->id); - ut_print_buf(stderr, undo_rec, 150); - fputs("\n" - "InnoDB: index record ", stderr); - rec_print(stderr, index_rec, index); - fputs("\n" - "InnoDB: record version ", stderr); - rec_print_new(stderr, rec, offsets); - fprintf(stderr, "\n" - "InnoDB: Record trx id " TRX_ID_FMT - ", update rec trx id " TRX_ID_FMT "\n" - "InnoDB: Roll ptr in rec " TRX_ID_FMT - ", in update rec" TRX_ID_FMT "\n", - (ullint) rec_trx_id, (ullint) trx_id, - (ullint) old_roll_ptr, (ullint) roll_ptr); - - trx_purge_sys_print(); - ut_ad(0); - return(DB_ERROR); - } - -# ifdef UNIV_BLOB_NULL_DEBUG +# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(rec, offsets)); -# endif /* UNIV_BLOB_NULL_DEBUG */ +# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint n_ext; @@ -1660,11 +1601,24 @@ trx_undo_prev_version_build( delete-marked record by trx_id, no transactions need to access the BLOB. */ + /* the row_upd_changes_disowned_external(update) call could be + omitted, but the synchronization on purge_sys->latch is likely + more expensive. */ + if ((update->info_bits & REC_INFO_DELETED_FLAG) - && read_view_sees_trx_id(purge_sys->view, trx_id)) { - /* treat as a fresh insert, not to - cause assertion error at the caller. */ - return(DB_SUCCESS); + && row_upd_changes_disowned_external(update)) { + bool missing_extern; + + rw_lock_s_lock(&purge_sys->latch); + missing_extern = read_view_sees_trx_id(purge_sys->view, + trx_id); + rw_lock_s_unlock(&purge_sys->latch); + + if (missing_extern) { + /* treat as a fresh insert, not to + cause assertion error at the caller. */ + return(true); + } } /* We have to set the appropriate extern storage bits in the @@ -1673,26 +1627,30 @@ trx_undo_prev_version_build( those fields that update updates to become externally stored fields. Store the info: */ - entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, - offsets, &n_ext, heap); + entry = row_rec_to_index_entry( + rec, index, offsets, &n_ext, heap); n_ext += btr_push_update_extern_fields(entry, update, heap); /* The page containing the clustered index record corresponding to entry is latched in mtr. Thus the following call is safe. */ row_upd_index_replace_new_col_vals(entry, index, update, heap); - buf = mem_heap_alloc(heap, rec_get_converted_size(index, entry, - n_ext)); + buf = static_cast<byte*>( + mem_heap_alloc( + heap, + rec_get_converted_size(index, entry, n_ext))); *old_vers = rec_convert_dtuple_to_rec(buf, index, entry, n_ext); } else { - buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + *old_vers = rec_copy(buf, rec, offsets); rec_offs_make_valid(*old_vers, index, offsets); row_upd_rec_in_place(*old_vers, index, offsets, update, NULL); } - return(DB_SUCCESS); + return(true); } #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/trx/trx0roll.c b/storage/xtradb/trx/trx0roll.cc index 2dde8900cda..eb2af877a6d 100644 --- a/storage/xtradb/trx/trx0roll.c +++ b/storage/xtradb/trx/trx0roll.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA *****************************************************************************/ /**************************************************//** -@file trx/trx0roll.c +@file trx/trx0roll.cc Transaction rollback Created 3/26/1996 Heikki Tuuri @@ -38,10 +38,13 @@ Created 3/26/1996 Heikki Tuuri #include "que0que.h" #include "usr0sess.h" #include "srv0start.h" +#include "read0read.h" #include "row0undo.h" #include "row0mysql.h" #include "lock0lock.h" #include "pars0pars.h" +#include "srv0mon.h" +#include "trx0sys.h" #ifdef WITH_WSREP #include "ha_prototypes.h" #endif /* WITH_WSREP */ @@ -60,176 +63,273 @@ static undo_no_t trx_roll_max_undo_no; /** Auxiliary variable which tells the previous progress % we printed */ static ulint trx_roll_progress_printed_pct; +/****************************************************************//** +Finishes a transaction rollback. */ +static +void +trx_rollback_finish( +/*================*/ + trx_t* trx); /*!< in: transaction */ + /*******************************************************************//** -Rollback a transaction used in MySQL. -@return error code or DB_SUCCESS */ -UNIV_INTERN -int -trx_general_rollback_for_mysql( -/*===========================*/ +Rollback a transaction used in MySQL. */ +static +void +trx_rollback_to_savepoint_low( +/*==========================*/ trx_t* trx, /*!< in: transaction handle */ trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if partial rollback requested, or NULL for complete rollback */ { - mem_heap_t* heap; que_thr_t* thr; + mem_heap_t* heap; roll_node_t* roll_node; - /* Tell Innobase server that there might be work for - utility threads: */ - - srv_active_wake_master_thread(); - - trx_start_if_not_started(trx); - heap = mem_heap_create(512); roll_node = roll_node_create(heap); - if (savept) { + if (savept != NULL) { roll_node->partial = TRUE; roll_node->savept = *savept; + assert_trx_in_list(trx); + } else { + assert_trx_nonlocking_or_in_list(trx); } trx->error_state = DB_SUCCESS; - thr = pars_complete_graph_for_exec(roll_node, trx, heap); - - ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); - que_run_threads(thr); + if (trx->insert_undo || trx->update_undo) { + thr = pars_complete_graph_for_exec(roll_node, trx, heap); - mutex_enter(&kernel_mutex); + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr)))); - while (trx->que_state != TRX_QUE_RUNNING) { + que_run_threads(thr); - mutex_exit(&kernel_mutex); + ut_a(roll_node->undo_thr != NULL); + que_run_threads(roll_node->undo_thr); - os_thread_sleep(100000); + /* Free the memory reserved by the undo graph. */ + que_graph_free(static_cast<que_t*>( + roll_node->undo_thr->common.parent)); + } - mutex_enter(&kernel_mutex); + if (savept == NULL) { + trx_rollback_finish(trx); + MONITOR_INC(MONITOR_TRX_ROLLBACK); + } else { + trx->lock.que_state = TRX_QUE_RUNNING; + MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT); } - mutex_exit(&kernel_mutex); + ut_a(trx->error_state == DB_SUCCESS); + ut_a(trx->lock.que_state == TRX_QUE_RUNNING); mem_heap_free(heap); - ut_a(trx->error_state == DB_SUCCESS); + MONITOR_DEC(MONITOR_TRX_ACTIVE); +} + +/*******************************************************************//** +Rollback a transaction to a given savepoint or do a complete rollback. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_to_savepoint( +/*======================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if + partial rollback requested, or NULL for + complete rollback */ +{ + ut_ad(!trx_mutex_own(trx)); /* Tell Innobase server that there might be work for utility threads: */ srv_active_wake_master_thread(); - return((int) trx->error_state); + trx_start_if_not_started_xa(trx); + + trx_rollback_to_savepoint_low(trx, savept); + + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + return(trx->error_state); } /*******************************************************************//** Rollback a transaction used in MySQL. @return error code or DB_SUCCESS */ -UNIV_INTERN -int -trx_rollback_for_mysql( -/*===================*/ - trx_t* trx) /*!< in: transaction handle */ +static +dberr_t +trx_rollback_for_mysql_low( +/*=======================*/ + trx_t* trx) /*!< in/out: transaction */ { - int err; - - if (trx->state == TRX_NOT_STARTED) { - - return(DB_SUCCESS); - } + srv_active_wake_master_thread(); trx->op_info = "rollback"; - /* If we are doing the XA recovery of prepared transactions, then - the transaction object does not have an InnoDB session object, and we - set a dummy session that we use for all MySQL transactions. */ + /* If we are doing the XA recovery of prepared transactions, + then the transaction object does not have an InnoDB session + object, and we set a dummy session that we use for all MySQL + transactions. */ - err = trx_general_rollback_for_mysql(trx, NULL); + trx_rollback_to_savepoint_low(trx, NULL); trx->op_info = ""; -#ifdef WITH_WSREP - if (wsrep_on(trx->mysql_thd) && - trx->was_chosen_as_deadlock_victim) { - trx->was_chosen_as_deadlock_victim = FALSE; + ut_a(trx->error_state == DB_SUCCESS); + + srv_active_wake_master_thread(); + + return(trx->error_state); +} + +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_for_mysql( +/*===================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* We are reading trx->state without holding trx_sys->mutex + here, because the rollback should be invoked for a running + active MySQL transaction (or recovered prepared transaction) + that is associated with the current thread. */ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + ut_ad(trx->in_mysql_trx_list); + return(DB_SUCCESS); + + case TRX_STATE_ACTIVE: + ut_ad(trx->in_mysql_trx_list); + assert_trx_nonlocking_or_in_list(trx); + return(trx_rollback_for_mysql_low(trx)); + + case TRX_STATE_PREPARED: + ut_ad(!trx_is_autocommit_non_locking(trx)); + return(trx_rollback_for_mysql_low(trx)); + + case TRX_STATE_COMMITTED_IN_MEMORY: + assert_trx_in_list(trx); + break; } -#endif - return(err); + + ut_error; + return(DB_CORRUPTION); } /*******************************************************************//** Rollback the latest SQL statement for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t trx_rollback_last_sql_stat_for_mysql( /*=================================*/ - trx_t* trx) /*!< in: transaction handle */ + trx_t* trx) /*!< in/out: transaction */ { - int err; + dberr_t err; - if (trx->state == TRX_NOT_STARTED) { + /* We are reading trx->state without holding trx_sys->mutex + here, because the statement rollback should be invoked for a + running active MySQL transaction that is associated with the + current thread. */ + ut_ad(trx->in_mysql_trx_list); + switch (trx->state) { + case TRX_STATE_NOT_STARTED: return(DB_SUCCESS); + case TRX_STATE_ACTIVE: + assert_trx_nonlocking_or_in_list(trx); + + trx->op_info = "rollback of SQL statement"; + + err = trx_rollback_to_savepoint( + trx, &trx->last_sql_stat_start); + + if (trx->fts_trx) { + fts_savepoint_rollback_last_stmt(trx); + } + + /* The following call should not be needed, + but we play it safe: */ + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + /* The statement rollback is only allowed on an ACTIVE + transaction, not a PREPARED or COMMITTED one. */ + break; } - trx->op_info = "rollback of SQL statement"; + ut_error; + return(DB_CORRUPTION); +} - err = trx_general_rollback_for_mysql(trx, &trx->last_sql_stat_start); - /* The following call should not be needed, but we play safe: */ - trx_mark_sql_stat_end(trx); +/*******************************************************************//** +Search for a savepoint using name. +@return savepoint if found else NULL */ +static +trx_named_savept_t* +trx_savepoint_find( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + const char* name) /*!< in: savepoint name */ +{ + trx_named_savept_t* savep; - trx->op_info = ""; + for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + savep != NULL; + savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) { -#ifdef WITH_WSREP - if (wsrep_on(trx->mysql_thd) && - trx->was_chosen_as_deadlock_victim) { - trx->was_chosen_as_deadlock_victim = FALSE; + if (0 == ut_strcmp(savep->name, name)) { + return(savep); + } } -#endif - return(err); + + return(NULL); } /*******************************************************************//** Frees a single savepoint struct. */ -UNIV_INTERN +static void trx_roll_savepoint_free( /*=====================*/ trx_t* trx, /*!< in: transaction handle */ trx_named_savept_t* savep) /*!< in: savepoint to free */ { - ut_a(savep != NULL); - ut_a(UT_LIST_GET_LEN(trx->trx_savepoints) > 0); - UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); mem_free(savep->name); mem_free(savep); } /*******************************************************************//** -Frees savepoint structs starting from savep, if savep == NULL then -free all savepoints. */ +Frees savepoint structs starting from savep. */ UNIV_INTERN void trx_roll_savepoints_free( /*=====================*/ trx_t* trx, /*!< in: transaction handle */ - trx_named_savept_t* savep) /*!< in: free all savepoints > this one; - if this is NULL, free all savepoints - of trx */ + trx_named_savept_t* savep) /*!< in: free all savepoints starting + with this savepoint i*/ { - trx_named_savept_t* next_savep; - - if (savep == NULL) { - savep = UT_LIST_GET_FIRST(trx->trx_savepoints); - } else { - savep = UT_LIST_GET_NEXT(trx_savepoints, savep); - } - while (savep != NULL) { + trx_named_savept_t* next_savep; + next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep); trx_roll_savepoint_free(trx, savep); @@ -247,8 +347,65 @@ the row, these locks are naturally released in the rollback. Savepoints which were set after this savepoint are deleted. @return if no savepoint of the name found then DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +trx_rollback_to_savepoint_for_mysql_low( +/*====================================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_named_savept_t* savep, /*!< in/out: savepoint */ + ib_int64_t* mysql_binlog_cache_pos) + /*!< out: the MySQL binlog + cache position corresponding + to this savepoint; MySQL needs + this information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + dberr_t err; + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->in_mysql_trx_list); + + /* Free all savepoints strictly later than savep. */ + + trx_roll_savepoints_free( + trx, UT_LIST_GET_NEXT(trx_savepoints, savep)); + + *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; + + trx->op_info = "rollback to a savepoint"; + + err = trx_rollback_to_savepoint(trx, &savep->savept); + + /* Store the current undo_no of the transaction so that + we know where to roll back if we have to roll back the + next SQL statement: */ + + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + trx->lock.was_chosen_as_deadlock_victim) { + trx->lock.was_chosen_as_deadlock_victim = FALSE; + } +#endif + + return(err); +} + +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_rollback_to_savepoint_for_mysql( /*================================*/ trx_t* trx, /*!< in: transaction handle */ @@ -261,49 +418,38 @@ trx_rollback_to_savepoint_for_mysql( executed after the savepoint */ { trx_named_savept_t* savep; - ulint err; - savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + /* We are reading trx->state without holding trx_sys->mutex + here, because the savepoint rollback should be invoked for a + running active MySQL transaction that is associated with the + current thread. */ + ut_ad(trx->in_mysql_trx_list); - while (savep != NULL) { - if (0 == ut_strcmp(savep->name, savepoint_name)) { - /* Found */ - break; - } - savep = UT_LIST_GET_NEXT(trx_savepoints, savep); - } + savep = trx_savepoint_find(trx, savepoint_name); if (savep == NULL) { - return(DB_NO_SAVEPOINT); } - if (trx->state == TRX_NOT_STARTED) { + switch (trx->state) { + case TRX_STATE_NOT_STARTED: ut_print_timestamp(stderr); fputs(" InnoDB: Error: transaction has a savepoint ", stderr); ut_print_name(stderr, trx, FALSE, savep->name); fputs(" though it is not started\n", stderr); return(DB_ERROR); + case TRX_STATE_ACTIVE: + return(trx_rollback_to_savepoint_for_mysql_low( + trx, savep, mysql_binlog_cache_pos)); + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + /* The savepoint rollback is only allowed on an ACTIVE + transaction, not a PREPARED or COMMITTED one. */ + break; } - /* We can now free all savepoints strictly later than this one */ - - trx_roll_savepoints_free(trx, savep); - - *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; - - trx->op_info = "rollback to a savepoint"; - - err = trx_general_rollback_for_mysql(trx, &savep->savept); - - /* Store the current undo_no of the transaction so that we know where - to roll back if we have to roll back the next SQL statement: */ - - trx_mark_sql_stat_end(trx); - - trx->op_info = ""; - - return(err); + ut_error; + return(DB_CORRUPTION); } /*******************************************************************//** @@ -313,7 +459,7 @@ savepoint and replaces it with a new. Savepoints are deleted in a transaction commit or rollback. @return always DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_savepoint_for_mysql( /*====================*/ trx_t* trx, /*!< in: transaction handle */ @@ -325,20 +471,9 @@ trx_savepoint_for_mysql( { trx_named_savept_t* savep; - ut_a(trx); - ut_a(savepoint_name); - - trx_start_if_not_started(trx); + trx_start_if_not_started_xa(trx); - savep = UT_LIST_GET_FIRST(trx->trx_savepoints); - - while (savep != NULL) { - if (0 == ut_strcmp(savep->name, savepoint_name)) { - /* Found */ - break; - } - savep = UT_LIST_GET_NEXT(trx_savepoints, savep); - } + savep = trx_savepoint_find(trx, savepoint_name); if (savep) { /* There is a savepoint with the same name: free that */ @@ -351,7 +486,7 @@ trx_savepoint_for_mysql( /* Create a new savepoint and add it as the last in the list */ - savep = mem_alloc(sizeof(trx_named_savept_t)); + savep = static_cast<trx_named_savept_t*>(mem_alloc(sizeof(*savep))); savep->name = mem_strdup(savepoint_name); @@ -370,7 +505,7 @@ savepoint are left as is. @return if no savepoint of the name found then DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_release_savepoint_for_mysql( /*============================*/ trx_t* trx, /*!< in: transaction handle */ @@ -378,18 +513,16 @@ trx_release_savepoint_for_mysql( { trx_named_savept_t* savep; - savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->in_mysql_trx_list); - /* Search for the savepoint by name and free if found. */ - while (savep != NULL) { - if (0 == ut_strcmp(savep->name, savepoint_name)) { - trx_roll_savepoint_free(trx, savep); - return(DB_SUCCESS); - } - savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + savep = trx_savepoint_find(trx, savepoint_name); + + if (savep != NULL) { + trx_roll_savepoint_free(trx, savep); } - return(DB_NO_SAVEPOINT); + return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT); } /*******************************************************************//** @@ -451,17 +584,22 @@ trx_rollback_active( thr->child = roll_node; roll_node->common.parent = thr; - mutex_enter(&kernel_mutex); - trx->graph = fork; ut_a(thr == que_fork_start_command(fork)); + mutex_enter(&trx_sys->mutex); + trx_roll_crash_recv_trx = trx; + trx_roll_max_undo_no = trx->undo_no; + trx_roll_progress_printed_pct = 0; + rows_to_undo = trx_roll_max_undo_no; + mutex_exit(&trx_sys->mutex); + if (rows_to_undo > 1000000000) { rows_to_undo = rows_to_undo / 1000000; unit = "M"; @@ -471,9 +609,8 @@ trx_rollback_active( fprintf(stderr, " InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s" " rows to undo\n", - (ullint) trx->id, + trx->id, (ulong) rows_to_undo, unit); - mutex_exit(&kernel_mutex); if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { row_mysql_lock_data_dictionary(trx); @@ -481,48 +618,51 @@ trx_rollback_active( } que_run_threads(thr); + ut_a(roll_node->undo_thr != NULL); - mutex_enter(&kernel_mutex); + que_run_threads(roll_node->undo_thr); - while (trx->que_state != TRX_QUE_RUNNING) { - - mutex_exit(&kernel_mutex); - - fprintf(stderr, - "InnoDB: Waiting for rollback of trx id " - TRX_ID_FMT " to end\n", - (ullint) trx->id); - os_thread_sleep(100000); + trx_rollback_finish(thr_get_trx(roll_node->undo_thr)); - mutex_enter(&kernel_mutex); - } + /* Free the memory reserved by the undo graph */ + que_graph_free(static_cast<que_t*>( + roll_node->undo_thr->common.parent)); - mutex_exit(&kernel_mutex); + ut_a(trx->lock.que_state == TRX_QUE_RUNNING); if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE && trx->table_id != 0) { - /* If the transaction was for a dictionary operation, we - drop the relevant table, if it still exists */ + /* If the transaction was for a dictionary operation, + we drop the relevant table only if it is not flagged + as DISCARDED. If it still exists. */ - fprintf(stderr, - "InnoDB: Dropping table with id %llu" - " in recovery if it exists\n", - (ullint) trx->table_id); + table = dict_table_open_on_id( + trx->table_id, dictionary_locked, + DICT_TABLE_OP_NORMAL); - table = dict_table_get_on_id_low(trx->table_id); + if (table && !dict_table_is_discarded(table)) { - if (table) { - ulint err; + dberr_t err; + + /* Ensure that the table doesn't get evicted from the + cache, keeps things simple for drop. */ + + if (table->can_be_evicted) { + dict_table_move_from_lru_to_non_lru(table); + } - fputs("InnoDB: Table found: dropping table ", stderr); - ut_print_name(stderr, trx, TRUE, table->name); - fputs(" in recovery\n", stderr); + dict_table_close(table, dictionary_locked, FALSE); + + ib_logf(IB_LOG_LEVEL_WARN, + "Dropping table '%s', with id " UINT64PF " " + "in recovery", + table->name, trx->table_id); err = row_drop_table_for_mysql(table->name, trx, TRUE); trx_commit_for_mysql(trx); - ut_a(err == (int) DB_SUCCESS); + ut_a(err == DB_SUCCESS); } } @@ -530,15 +670,72 @@ trx_rollback_active( row_mysql_unlock_data_dictionary(trx); } - fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT - " completed\n", - (ullint) trx->id); + ib_logf(IB_LOG_LEVEL_INFO, + "Rollback of trx with id " TRX_ID_FMT " completed", trx->id); + mem_heap_free(heap); trx_roll_crash_recv_trx = NULL; } /*******************************************************************//** +Rollback or clean up any resurrected incomplete transactions. It assumes +that the caller holds the trx_sys_t::mutex and it will release the +lock if it does a clean up or rollback. +@return TRUE if the transaction was cleaned up or rolled back +and trx_sys->mutex was released. */ +static +ibool +trx_rollback_resurrected( +/*=====================*/ + trx_t* trx, /*!< in: transaction to rollback or clean */ + ibool all) /*!< in: FALSE=roll back dictionary transactions; + TRUE=roll back all non-PREPARED transactions */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + /* The trx->is_recovered flag and trx->state are set + atomically under the protection of the trx->mutex (and + lock_sys->mutex) in lock_trx_release_locks(). We do not want + to accidentally clean up a non-recovered transaction here. */ + + trx_mutex_enter(trx); + bool is_recovered = trx->is_recovered; + trx_state_t state = trx->state; + trx_mutex_exit(trx); + + if (!is_recovered) { + return(FALSE); + } + + switch (state) { + case TRX_STATE_COMMITTED_IN_MEMORY: + mutex_exit(&trx_sys->mutex); + fprintf(stderr, + "InnoDB: Cleaning up trx with id " TRX_ID_FMT "\n", + trx->id); + trx_cleanup_at_db_startup(trx); + trx_free_for_background(trx); + return(TRUE); + case TRX_STATE_ACTIVE: + if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + mutex_exit(&trx_sys->mutex); + trx_rollback_active(trx); + trx_free_for_background(trx); + return(TRUE); + } + return(FALSE); + case TRX_STATE_PREPARED: + return(FALSE); + case TRX_STATE_NOT_STARTED: + break; + } + + ut_error; + return(FALSE); +} + +/*******************************************************************//** Rollback or clean up any incomplete transactions which were encountered in crash recovery. If the transaction already was committed, then we clean up a possible insert undo log. If the @@ -552,10 +749,11 @@ trx_rollback_or_clean_recovered( { trx_t* trx; - mutex_enter(&kernel_mutex); + ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO); + + if (trx_sys_get_n_rw_trx() == 0) { - if (!UT_LIST_GET_FIRST(trx_sys->trx_list)) { - goto leave_function; + return; } if (all) { @@ -564,40 +762,38 @@ trx_rollback_or_clean_recovered( " of uncommitted transactions\n"); } - mutex_exit(&kernel_mutex); + /* Note: For XA recovered transactions, we rely on MySQL to + do rollback. They will be in TRX_STATE_PREPARED state. If the server + is shutdown and they are still lingering in trx_sys_t::trx_list + then the shutdown will hang. */ -loop: - mutex_enter(&kernel_mutex); + /* Loop over the transaction list as long as there are + recovered transactions to clean up or recover. */ - for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { - if (!trx->is_recovered) { - continue; - } + do { + mutex_enter(&trx_sys->mutex); + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + assert_trx_in_rw_list(trx); + + /* If this function does a cleanup or rollback + then it will release the trx_sys->mutex, therefore + we need to reacquire it before retrying the loop. */ + + if (trx_rollback_resurrected(trx, all)) { - switch (trx->state) { - case TRX_NOT_STARTED: - case TRX_PREPARED: - continue; - - case TRX_COMMITTED_IN_MEMORY: - mutex_exit(&kernel_mutex); - fprintf(stderr, - "InnoDB: Cleaning up trx with id " - TRX_ID_FMT "\n", - (ullint) trx->id); - trx_cleanup_at_db_startup(trx); - goto loop; - - case TRX_ACTIVE: - if (all || trx_get_dict_operation(trx) - != TRX_DICT_OP_NONE) { - mutex_exit(&kernel_mutex); - trx_rollback_active(trx); - goto loop; + mutex_enter(&trx_sys->mutex); + + break; } } - } + + mutex_exit(&trx_sys->mutex); + + } while (trx != NULL); if (all) { ut_print_timestamp(stderr); @@ -605,9 +801,6 @@ loop: " InnoDB: Rollback of non-prepared" " transactions completed\n"); } - -leave_function: - mutex_exit(&kernel_mutex); } /*******************************************************************//** @@ -617,14 +810,16 @@ committed, then we clean up a possible insert undo log. If the transaction was not yet committed, then we roll it back. Note: this is done in a background thread. @return a dummy parameter */ -UNIV_INTERN +extern "C" UNIV_INTERN os_thread_ret_t -trx_rollback_or_clean_all_recovered( -/*================================*/ +DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( +/*================================================*/ void* arg __attribute__((unused))) /*!< in: a dummy parameter required by os_thread_create */ { + ut_ad(!srv_read_only_mode); + #ifdef UNIV_PFS_THREAD pfs_register_thread(trx_rollback_clean_thread_key); #endif /* UNIV_PFS_THREAD */ @@ -642,30 +837,25 @@ trx_rollback_or_clean_all_recovered( /*******************************************************************//** Creates an undo number array. @return own: undo number array */ -UNIV_INTERN +static trx_undo_arr_t* -trx_undo_arr_create(void) -/*=====================*/ +trx_undo_arr_create( +/*================*/ + ulint n_cells) /*!< Number of cells */ { trx_undo_arr_t* arr; mem_heap_t* heap; - ulint i; - - heap = mem_heap_create(1024); + ulint sz = sizeof(*arr) + sizeof(*arr->infos) * n_cells; - arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t)); + heap = mem_heap_create(sz); - arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t) - * UNIV_MAX_PARALLELISM); - arr->n_cells = UNIV_MAX_PARALLELISM; - arr->n_used = 0; + arr = static_cast<trx_undo_arr_t*>(mem_heap_zalloc(heap, sz)); - arr->heap = heap; + arr->n_cells = n_cells; - for (i = 0; i < UNIV_MAX_PARALLELISM; i++) { + arr->infos = (trx_undo_inf_t*) (arr + 1); - (trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE; - } + arr->heap = heap; return(arr); } @@ -678,8 +868,6 @@ trx_undo_arr_free( /*==============*/ trx_undo_arr_t* arr) /*!< in: undo number array */ { - ut_ad(arr->n_used == 0); - mem_heap_free(arr->heap); } @@ -693,19 +881,18 @@ trx_undo_arr_store_info( trx_t* trx, /*!< in: transaction */ undo_no_t undo_no)/*!< in: undo number */ { - trx_undo_inf_t* cell; - trx_undo_inf_t* stored_here; + ulint i; trx_undo_arr_t* arr; + ulint n = 0; ulint n_used; - ulint n; - ulint i; + trx_undo_inf_t* stored_here = NULL; - n = 0; arr = trx->undo_no_arr; n_used = arr->n_used; - stored_here = NULL; - for (i = 0;; i++) { + for (i = 0; i < arr->n_cells; i++) { + trx_undo_inf_t* cell; + cell = trx_undo_arr_get_nth_info(arr, i); if (!cell->in_use) { @@ -742,6 +929,10 @@ trx_undo_arr_store_info( return(TRUE); } } + + ut_error; + + return(FALSE); } /*******************************************************************//** @@ -753,22 +944,19 @@ trx_undo_arr_remove_info( trx_undo_arr_t* arr, /*!< in: undo number array */ undo_no_t undo_no)/*!< in: undo number */ { - trx_undo_inf_t* cell; ulint i; - for (i = 0;; i++) { - cell = trx_undo_arr_get_nth_info(arr, i); + for (i = 0; i < arr->n_cells; i++) { - if (cell->in_use - && cell->undo_no == undo_no) { + trx_undo_inf_t* cell; - cell->in_use = FALSE; + cell = trx_undo_arr_get_nth_info(arr, i); + if (cell->in_use && cell->undo_no == undo_no) { + cell->in_use = FALSE; ut_ad(arr->n_used > 0); - - arr->n_used--; - - return; + --arr->n_used; + break; } } } @@ -780,46 +968,40 @@ static undo_no_t trx_undo_arr_get_biggest( /*=====================*/ - trx_undo_arr_t* arr) /*!< in: undo number array */ + const trx_undo_arr_t* arr) /*!< in: undo number array */ { - trx_undo_inf_t* cell; - ulint n_used; - undo_no_t biggest; - ulint n; ulint i; + undo_no_t biggest = 0; + ulint n_checked = 0; - n = 0; - n_used = arr->n_used; - biggest = 0; + for (i = 0; i < arr->n_cells && n_checked < arr->n_used; ++i) { - for (i = 0;; i++) { - cell = trx_undo_arr_get_nth_info(arr, i); + const trx_undo_inf_t* cell = &arr->infos[i]; if (cell->in_use) { - n++; + + ++n_checked; + if (cell->undo_no > biggest) { biggest = cell->undo_no; } } - - if (n == n_used) { - return(biggest); - } } + + return(biggest); } /***********************************************************************//** Tries truncate the undo logs. */ -UNIV_INTERN +static void trx_roll_try_truncate( /*==================*/ trx_t* trx) /*!< in/out: transaction */ { - trx_undo_arr_t* arr; - undo_no_t limit; - undo_no_t biggest; + undo_no_t limit; + const trx_undo_arr_t* arr; ut_ad(mutex_own(&(trx->undo_mutex))); ut_ad(mutex_own(&((trx->rseg)->mutex))); @@ -831,6 +1013,8 @@ trx_roll_try_truncate( limit = trx->undo_no; if (arr->n_used > 0) { + undo_no_t biggest; + biggest = trx_undo_arr_get_biggest(arr); if (biggest >= limit) { @@ -846,6 +1030,12 @@ trx_roll_try_truncate( if (trx->update_undo) { trx_undo_truncate_end(trx, trx->update_undo, limit); } + +#ifdef WITH_WSREP_OUT + if (wsrep_on(trx->mysql_thd)) { + trx->lock.was_chosen_as_deadlock_victim = FALSE; + } +#endif /* WITH_WSREP */ } /***********************************************************************//** @@ -865,19 +1055,21 @@ trx_roll_pop_top_rec( trx_undo_rec_t* prev_rec; page_t* prev_rec_page; - ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&trx->undo_mutex)); + + undo_page = trx_undo_page_get_s_latched( + undo->space, undo->zip_size, undo->top_page_no, mtr); - undo_page = trx_undo_page_get_s_latched(undo->space, undo->zip_size, - undo->top_page_no, mtr); offset = undo->top_offset; /* fprintf(stderr, "Thread %lu undoing trx " TRX_ID_FMT " undo record " TRX_ID_FMT "\n", os_thread_get_curr_id(), trx->id, undo->top_undo_no); */ - prev_rec = trx_undo_get_prev_rec(undo_page + offset, - undo->hdr_page_no, undo->hdr_offset, - mtr); + prev_rec = trx_undo_get_prev_rec( + undo_page + offset, undo->hdr_page_no, undo->hdr_offset, + true, mtr); + if (prev_rec == NULL) { undo->empty = TRUE; @@ -930,11 +1122,11 @@ try_again: mutex_enter(&(trx->undo_mutex)); if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) { - mutex_enter(&(rseg->mutex)); + mutex_enter(&rseg->mutex); trx_roll_try_truncate(trx); - mutex_exit(&(rseg->mutex)); + mutex_exit(&rseg->mutex); } ins_undo = trx->insert_undo; @@ -950,8 +1142,7 @@ try_again: undo = ins_undo; } - if (!undo || undo->empty - || limit > undo->top_undo_no) { + if (!undo || undo->empty || limit > undo->top_undo_no) { if ((trx->undo_no_arr)->n_used == 0) { /* Rollback is ending */ @@ -968,15 +1159,11 @@ try_again: return(NULL); } - if (undo == ins_undo) { - is_insert = TRUE; - } else { - is_insert = FALSE; - } + is_insert = (undo == ins_undo); + + *roll_ptr = trx_undo_build_roll_ptr( + is_insert, undo->rseg->id, undo->top_page_no, undo->top_offset); - *roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id, - undo->top_page_no, - undo->top_offset); mtr_start(&mtr); undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr); @@ -1070,89 +1257,13 @@ trx_undo_rec_release( mutex_exit(&(trx->undo_mutex)); } -/*********************************************************************//** -Starts a rollback operation. */ -UNIV_INTERN -void -trx_rollback( -/*=========*/ - trx_t* trx, /*!< in: transaction */ - trx_sig_t* sig, /*!< in: signal starting the rollback */ - que_thr_t** next_thr)/*!< in/out: next query thread to run; - if the value which is passed in is - a pointer to a NULL pointer, then the - calling function can start running - a new query thread; if the passed value is - NULL, the parameter is ignored */ -{ - que_t* roll_graph; - que_thr_t* thr; - /* que_thr_t* thr2; */ - - ut_ad(mutex_own(&kernel_mutex)); - ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0)); - - /* Initialize the rollback field in the transaction */ - - switch (sig->type) { - case TRX_SIG_TOTAL_ROLLBACK: - trx->roll_limit = 0; - break; - case TRX_SIG_ROLLBACK_TO_SAVEPT: - trx->roll_limit = (sig->savept).least_undo_no; - break; - case TRX_SIG_ERROR_OCCURRED: - trx->roll_limit = trx->last_sql_stat_start.least_undo_no; - break; - default: - ut_error; - } - - ut_a(trx->roll_limit <= trx->undo_no); - - trx->pages_undone = 0; - - if (trx->undo_no_arr == NULL) { - trx->undo_no_arr = trx_undo_arr_create(); - } - - /* Build a 'query' graph which will perform the undo operations */ - - roll_graph = trx_roll_graph_build(trx); - - trx->graph = roll_graph; - trx->que_state = TRX_QUE_ROLLING_BACK; - - thr = que_fork_start_command(roll_graph); - - ut_ad(thr); - - /* thr2 = que_fork_start_command(roll_graph); - - ut_ad(thr2); */ - - if (next_thr && (*next_thr == NULL)) { - *next_thr = thr; - /* srv_que_task_enqueue_low(thr2); */ - } else { - srv_que_task_enqueue_low(thr); - /* srv_que_task_enqueue_low(thr2); */ - } -#ifdef WITH_WSREP - if (wsrep_on(trx->mysql_thd) && - trx->was_chosen_as_deadlock_victim) { - trx->was_chosen_as_deadlock_victim = FALSE; - } -#endif -} - /****************************************************************//** Builds an undo 'query' graph for a transaction. The actual rollback is performed by executing this query graph like a query subprocedure call. The reply about the completion of the rollback will be sent by this graph. @return own: the query graph */ -UNIV_INTERN +static que_t* trx_roll_graph_build( /*=================*/ @@ -1161,153 +1272,76 @@ trx_roll_graph_build( mem_heap_t* heap; que_fork_t* fork; que_thr_t* thr; - /* que_thr_t* thr2; */ - ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx_mutex_own(trx)); heap = mem_heap_create(512); fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap); fork->trx = trx; thr = que_thr_create(fork, heap); - /* thr2 = que_thr_create(fork, heap); */ thr->child = row_undo_node_create(trx, thr, heap); - /* thr2->child = row_undo_node_create(trx, thr2, heap); */ return(fork); } /*********************************************************************//** -Finishes error processing after the necessary partial rollback has been -done. */ +Starts a rollback operation, creates the UNDO graph that will do the +actual undo operation. +@return query graph thread that will perform the UNDO operations. */ static -void -trx_finish_error_processing( -/*========================*/ - trx_t* trx) /*!< in: transaction */ +que_thr_t* +trx_rollback_start( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + ib_id_t roll_limit) /*!< in: rollback to undo no (for + partial undo), 0 if we are rolling back + the entire transaction */ { - trx_sig_t* sig; - trx_sig_t* next_sig; - - ut_ad(mutex_own(&kernel_mutex)); + que_t* roll_graph; - sig = UT_LIST_GET_FIRST(trx->signals); + ut_ad(trx_mutex_own(trx)); - while (sig != NULL) { - next_sig = UT_LIST_GET_NEXT(signals, sig); + ut_ad(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0); - if (sig->type == TRX_SIG_ERROR_OCCURRED) { + /* Initialize the rollback field in the transaction */ - trx_sig_remove(trx, sig); - } + trx->roll_limit = roll_limit; - sig = next_sig; - } + ut_a(trx->roll_limit <= trx->undo_no); - trx->que_state = TRX_QUE_RUNNING; -} + trx->pages_undone = 0; -/*********************************************************************//** -Finishes a partial rollback operation. */ -static -void -trx_finish_partial_rollback_off_kernel( -/*===================================*/ - trx_t* trx, /*!< in: transaction */ - que_thr_t** next_thr)/*!< in/out: next query thread to run; - if the value which is passed in is a pointer - to a NULL pointer, then the calling function - can start running a new query thread; if this - parameter is NULL, it is ignored */ -{ - trx_sig_t* sig; + if (trx->undo_no_arr == NULL) { + /* Single query thread -> 1 */ + trx->undo_no_arr = trx_undo_arr_create(1); + } - ut_ad(mutex_own(&kernel_mutex)); + /* Build a 'query' graph which will perform the undo operations */ - sig = UT_LIST_GET_FIRST(trx->signals); + roll_graph = trx_roll_graph_build(trx); - /* Remove the signal from the signal queue and send reply message - to it */ + trx->graph = roll_graph; - trx_sig_reply(sig, next_thr); - trx_sig_remove(trx, sig); + trx->lock.que_state = TRX_QUE_ROLLING_BACK; - trx->que_state = TRX_QUE_RUNNING; + return(que_fork_start_command(roll_graph)); } /****************************************************************//** Finishes a transaction rollback. */ -UNIV_INTERN +static void -trx_finish_rollback_off_kernel( -/*===========================*/ - que_t* graph, /*!< in: undo graph which can now be freed */ - trx_t* trx, /*!< in: transaction */ - que_thr_t** next_thr)/*!< in/out: next query thread to run; - if the value which is passed in is - a pointer to a NULL pointer, then the - calling function can start running - a new query thread; if this parameter is - NULL, it is ignored */ +trx_rollback_finish( +/*================*/ + trx_t* trx) /*!< in: transaction */ { - trx_sig_t* sig; - trx_sig_t* next_sig; - - ut_ad(mutex_own(&kernel_mutex)); - ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0); - /* Free the memory reserved by the undo graph */ - que_graph_free(graph); - - sig = UT_LIST_GET_FIRST(trx->signals); - - if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) { - - trx_finish_partial_rollback_off_kernel(trx, next_thr); - - return; - - } else if (sig->type == TRX_SIG_ERROR_OCCURRED) { - - trx_finish_error_processing(trx); - - return; - } - -#ifdef UNIV_DEBUG - if (lock_print_waits) { - fprintf(stderr, "Trx " TRX_ID_FMT " rollback finished\n", - (ullint) trx->id); - } -#endif /* UNIV_DEBUG */ - - trx_commit_off_kernel(trx); - - /* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and - send reply messages to them */ + trx_commit(trx); - trx->que_state = TRX_QUE_RUNNING; - - while (sig != NULL) { - next_sig = UT_LIST_GET_NEXT(signals, sig); - - if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { - - trx_sig_reply(sig, next_thr); - - trx_sig_remove(trx, sig); - } - - sig = next_sig; - } -#ifdef WITH_WSREP - if (wsrep_on(trx->mysql_thd) && - trx->was_chosen_as_deadlock_victim) { - trx->was_chosen_as_deadlock_victim = FALSE; - } -#endif + trx->lock.que_state = TRX_QUE_RUNNING; } /*********************************************************************//** @@ -1321,11 +1355,11 @@ roll_node_create( { roll_node_t* node; - node = mem_heap_alloc(heap, sizeof(roll_node_t)); - node->common.type = QUE_NODE_ROLLBACK; + node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node))); + node->state = ROLL_NODE_SEND; - node->partial = FALSE; + node->common.type = QUE_NODE_ROLLBACK; return(node); } @@ -1340,10 +1374,8 @@ trx_rollback_step( que_thr_t* thr) /*!< in: query thread */ { roll_node_t* node; - ulint sig_no; - trx_savept_t* savept; - node = thr->run_node; + node = static_cast<roll_node_t*>(thr->run_node); ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK); @@ -1352,33 +1384,30 @@ trx_rollback_step( } if (node->state == ROLL_NODE_SEND) { - mutex_enter(&kernel_mutex); + trx_t* trx; + ib_id_t roll_limit = 0; - node->state = ROLL_NODE_WAIT; + trx = thr_get_trx(thr); - if (node->partial) { - sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT; - savept = &(node->savept); - } else { - sig_no = TRX_SIG_TOTAL_ROLLBACK; - savept = NULL; - } + trx_mutex_enter(trx); - /* Send a rollback signal to the transaction */ + node->state = ROLL_NODE_WAIT; - trx_sig_send(thr_get_trx(thr), sig_no, TRX_SIG_SELF, thr, - savept, NULL); + ut_a(node->undo_thr == NULL); - thr->state = QUE_THR_SIG_REPLY_WAIT; + roll_limit = node->partial ? node->savept.least_undo_no : 0; - mutex_exit(&kernel_mutex); + trx_commit_or_rollback_prepare(trx); - return(NULL); - } + node->undo_thr = trx_rollback_start(trx, roll_limit); - ut_ad(node->state == ROLL_NODE_WAIT); + trx_mutex_exit(trx); - thr->run_node = que_node_get_parent(node); + } else { + ut_ad(node->state == ROLL_NODE_WAIT); + + thr->run_node = que_node_get_parent(node); + } return(thr); } diff --git a/storage/xtradb/trx/trx0rseg.c b/storage/xtradb/trx/trx0rseg.cc index ed3c27326d4..003d1036a8c 100644 --- a/storage/xtradb/trx/trx0rseg.c +++ b/storage/xtradb/trx/trx0rseg.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle Corpn. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA *****************************************************************************/ /**************************************************//** -@file trx/trx0rseg.c +@file trx/trx0rseg.cc Rollback segment Created 3/26/1996 Heikki Tuuri @@ -33,32 +33,14 @@ Created 3/26/1996 Heikki Tuuri #include "fut0lst.h" #include "srv0srv.h" #include "trx0purge.h" +#include "ut0bh.h" +#include "srv0mon.h" #ifdef UNIV_PFS_MUTEX /* Key to register rseg_mutex_key with performance schema */ UNIV_INTERN mysql_pfs_key_t rseg_mutex_key; #endif /* UNIV_PFS_MUTEX */ -/******************************************************************//** -Looks for a rollback segment, based on the rollback segment id. -@return rollback segment */ -UNIV_INTERN -trx_rseg_t* -trx_rseg_get_on_id( -/*===============*/ - ulint id) /*!< in: rollback segment id */ -{ - trx_rseg_t* rseg; - - ut_a(id < TRX_SYS_N_RSEGS); - - rseg = trx_sys->rseg_array[id]; - - ut_a(rseg == NULL || id == rseg->id); - - return(rseg); -} - /****************************************************************//** Creates a rollback segment header. This function is called only when a new rollback segment is created in the database. @@ -81,13 +63,11 @@ trx_rseg_header_create( buf_block_t* block; ut_ad(mtr); - ut_ad(mutex_own(&kernel_mutex)); ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL), MTR_MEMO_X_LOCK)); /* Allocate a new file segment for the rollback segment */ - block = fseg_create(space, 0, - TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr); + block = fseg_create(space, 0, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr); if (block == NULL) { /* No space left */ @@ -137,6 +117,7 @@ trx_rseg_mem_free( trx_rseg_t* rseg) /* in, own: instance to free */ { trx_undo_t* undo; + trx_undo_t* next_undo; mutex_free(&rseg->mutex); @@ -144,29 +125,36 @@ trx_rseg_mem_free( ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0); ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0); - undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + for (undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + undo != NULL; + undo = next_undo) { + + next_undo = UT_LIST_GET_NEXT(undo_list, undo); - while (undo != NULL) { - trx_undo_t* prev_undo = undo; + UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo); - undo = UT_LIST_GET_NEXT(undo_list, undo); - UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, prev_undo); + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); - trx_undo_mem_free(prev_undo); + trx_undo_mem_free(undo); } - undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); + for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); + undo != NULL; + undo = next_undo) { - while (undo != NULL) { - trx_undo_t* prev_undo = undo; + next_undo = UT_LIST_GET_NEXT(undo_list, undo); - undo = UT_LIST_GET_NEXT(undo_list, undo); - UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, prev_undo); + UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo); - trx_undo_mem_free(prev_undo); + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + + trx_undo_mem_free(undo); } - trx_sys_set_nth_rseg(trx_sys, rseg->id, NULL); + /* const_cast<trx_rseg_t*>() because this function is + like a destructor. */ + + *((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = NULL; mem_free(rseg); } @@ -198,9 +186,7 @@ trx_rseg_mem_create( trx_ulogf_t* undo_log_hdr; ulint sum_of_undo_sizes; - ut_ad(mutex_own(&kernel_mutex)); - - rseg = mem_zalloc(sizeof(trx_rseg_t)); + rseg = static_cast<trx_rseg_t*>(mem_zalloc(sizeof(trx_rseg_t))); rseg->id = id; rseg->space = space; @@ -209,41 +195,43 @@ trx_rseg_mem_create( mutex_create(rseg_mutex_key, &rseg->mutex, SYNC_RSEG); - UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg); - - trx_sys_set_nth_rseg(trx_sys, id, rseg); + /* const_cast<trx_rseg_t*>() because this function is + like a constructor. */ + *((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = rseg; rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr); - rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE, - MLOG_4BYTES, mtr); + rseg->max_size = mtr_read_ulint( + rseg_header + TRX_RSEG_MAX_SIZE, MLOG_4BYTES, mtr); /* Initialize the undo log lists according to the rseg header */ sum_of_undo_sizes = trx_undo_lists_init(rseg); - rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, - MLOG_4BYTES, mtr) + rseg->curr_size = mtr_read_ulint( + rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr) + 1 + sum_of_undo_sizes; len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr); + if (len > 0) { - const void* ptr; rseg_queue_t rseg_queue; trx_sys->rseg_history_len += len; node_addr = trx_purge_get_log_from_hist( flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr)); + rseg->last_page_no = node_addr.page; rseg->last_offset = node_addr.boffset; - undo_log_hdr = trx_undo_page_get(rseg->space, rseg->zip_size, - node_addr.page, - mtr) + node_addr.boffset; + undo_log_hdr = trx_undo_page_get( + rseg->space, rseg->zip_size, node_addr.page, + mtr) + node_addr.boffset; rseg->last_trx_no = mach_read_from_8( undo_log_hdr + TRX_UNDO_TRX_NO); + rseg->last_del_marks = mtr_read_ulint( undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr); @@ -251,6 +239,8 @@ trx_rseg_mem_create( rseg_queue.trx_no = rseg->last_trx_no; if (rseg->last_page_no != FIL_NULL) { + const void* ptr; + /* There is no need to cover this operation by the purge mutex because we are still bootstrapping. */ @@ -266,7 +256,7 @@ trx_rseg_mem_create( /******************************************************************** Creates the memory copies for the rollback segments and initializes the -rseg list and array in trx_sys at a database startup. */ +rseg array in trx_sys at a database startup. */ static void trx_rseg_create_instance( @@ -282,9 +272,7 @@ trx_rseg_create_instance( page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); - if (page_no == FIL_NULL) { - trx_sys_set_nth_rseg(trx_sys, i, NULL); - } else { + if (page_no != FIL_NULL) { ulint space; ulint zip_size; trx_rseg_t* rseg = NULL; @@ -299,6 +287,8 @@ trx_rseg_create_instance( i, space, zip_size, page_no, ib_bh, mtr); ut_a(rseg->id == i); + } else { + ut_a(trx_sys->rseg_array[i] == NULL); } } } @@ -308,8 +298,9 @@ Creates a rollback segment. @return pointer to new rollback segment if create successful */ UNIV_INTERN trx_rseg_t* -trx_rseg_create(void) -/*=================*/ +trx_rseg_create( +/*============*/ + ulint space) /*!< in: id of UNDO tablespace */ { mtr_t mtr; ulint slot_no; @@ -318,29 +309,26 @@ trx_rseg_create(void) mtr_start(&mtr); /* To obey the latching order, acquire the file space - x-latch before the kernel mutex. */ - mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr); - - mutex_enter(&kernel_mutex); + x-latch before the trx_sys->mutex. */ + mtr_x_lock(fil_space_get_latch(space, NULL), &mtr); slot_no = trx_sysf_rseg_find_free(&mtr); if (slot_no != ULINT_UNDEFINED) { - ulint space; + ulint id; ulint page_no; ulint zip_size; trx_sysf_t* sys_header; page_no = trx_rseg_header_create( - TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, &mtr); + space, 0, ULINT_MAX, slot_no, &mtr); ut_a(page_no != FIL_NULL); - ut_ad(!trx_rseg_get_on_id(slot_no)); - sys_header = trx_sysf_get(&mtr); - space = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr); + id = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr); + ut_a(id == space); zip_size = space ? fil_space_get_zip_size(space) : 0; @@ -349,26 +337,89 @@ trx_rseg_create(void) purge_sys->ib_bh, &mtr); } - mutex_exit(&kernel_mutex); mtr_commit(&mtr); return(rseg); } -/******************************************************************** -Initialize the rollback instance list. */ +/*********************************************************************//** +Creates the memory copies for rollback segments and initializes the +rseg array in trx_sys at a database startup. */ UNIV_INTERN void -trx_rseg_list_and_array_init( -/*=========================*/ - trx_sysf_t* sys_header, /*!< in: trx system header */ +trx_rseg_array_init( +/*================*/ + trx_sysf_t* sys_header, /* in/out: trx system header */ ib_bh_t* ib_bh, /*!< in: rseg queue */ mtr_t* mtr) /*!< in: mtr */ { - UT_LIST_INIT(trx_sys->rseg_list); - trx_sys->rseg_history_len = 0; trx_rseg_create_instance(sys_header, ib_bh, mtr); } +/******************************************************************** +Get the number of unique rollback tablespaces in use except space id 0. +The last space id will be the sentinel value ULINT_UNDEFINED. The array +will be sorted on space id. Note: space_ids should have have space for +TRX_SYS_N_RSEGS + 1 elements. +@return number of unique rollback tablespaces in use. */ +UNIV_INTERN +ulint +trx_rseg_get_n_undo_tablespaces( +/*============================*/ + ulint* space_ids) /*!< out: array of space ids of + UNDO tablespaces */ +{ + ulint i; + mtr_t mtr; + trx_sysf_t* sys_header; + ulint n_undo_tablespaces = 0; + ulint space_ids_aux[TRX_SYS_N_RSEGS + 1]; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + ulint page_no; + ulint space; + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, &mtr); + + if (page_no == FIL_NULL) { + continue; + } + + space = trx_sysf_rseg_get_space(sys_header, i, &mtr); + + if (space != 0) { + ulint j; + ibool found = FALSE; + + for (j = 0; j < n_undo_tablespaces; ++j) { + if (space_ids[j] == space) { + found = TRUE; + break; + } + } + + if (!found) { + ut_a(n_undo_tablespaces <= i); + space_ids[n_undo_tablespaces++] = space; + } + } + } + + mtr_commit(&mtr); + + ut_a(n_undo_tablespaces <= TRX_SYS_N_RSEGS); + + space_ids[n_undo_tablespaces] = ULINT_UNDEFINED; + + if (n_undo_tablespaces > 0) { + ut_ulint_sort(space_ids, space_ids_aux, 0, n_undo_tablespaces); + } + + return(n_undo_tablespaces); +} diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c deleted file mode 100644 index d5d4590a23e..00000000000 --- a/storage/xtradb/trx/trx0sys.c +++ /dev/null @@ -1,2136 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file trx/trx0sys.c -Transaction system - -Created 3/26/1996 Heikki Tuuri -*******************************************************/ - -#include "trx0sys.h" - -#ifdef UNIV_NONINL -#include "trx0sys.ic" -#endif - -#ifndef UNIV_HOTBACKUP -#include "fsp0fsp.h" -#include "mtr0log.h" -#include "mtr0log.h" -#include "trx0trx.h" -#include "trx0rseg.h" -#include "trx0undo.h" -#include "srv0srv.h" -#include "srv0start.h" -#include "trx0purge.h" -#include "log0log.h" -#include "log0recv.h" -#include "os0file.h" -#include "read0read.h" - -#ifdef WITH_WSREP -#include "ha_prototypes.h" /* wsrep_is_wsrep_xid() */ -#endif /* */ - -/** The file format tag structure with id and name. */ -struct file_format_struct { - ulint id; /*!< id of the file format */ - const char* name; /*!< text representation of the - file format */ - mutex_t mutex; /*!< covers changes to the above - fields */ -}; - -/** The file format tag */ -typedef struct file_format_struct file_format_t; - -/** The transaction system */ -UNIV_INTERN trx_sys_t* trx_sys = NULL; -/** The doublewrite buffer */ -UNIV_INTERN trx_doublewrite_t* trx_doublewrite = NULL; - -/** The following is set to TRUE when we are upgrading from pre-4.1 -format data files to the multiple tablespaces format data files */ -UNIV_INTERN ibool trx_doublewrite_must_reset_space_ids = FALSE; -/** Set to TRUE when the doublewrite buffer is being created */ -UNIV_INTERN ibool trx_doublewrite_buf_is_being_created = FALSE; - -/** The following is TRUE when we are using the database in the -post-4.1 format, i.e., we have successfully upgraded, or have created -a new database installation */ -UNIV_INTERN ibool trx_sys_multiple_tablespace_format = FALSE; - -/** In a MySQL replication slave, in crash recovery we store the master log -file name and position here. */ -/* @{ */ -/** Master binlog file name */ -UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN]; -/** Master binlog file position. We have successfully got the updates -up to this position. -1 means that no crash recovery was needed, or -there was no master log position info inside InnoDB.*/ -UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1; -/* @} */ - -UNIV_INTERN char trx_sys_mysql_relay_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN]; -UNIV_INTERN ib_int64_t trx_sys_mysql_relay_log_pos = -1; - -/** If this MySQL server uses binary logging, after InnoDB has been inited -and if it has done a crash recovery, we store the binlog file name and position -here. */ -/* @{ */ -/** Binlog file name */ -UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; -/** Binlog file position, or -1 if unknown */ -UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1; -/* @} */ -#endif /* !UNIV_HOTBACKUP */ - -/** List of animal names representing file format. */ -static const char* file_format_name_map[] = { - "Antelope", - "Barracuda", - "Cheetah", - "Dragon", - "Elk", - "Fox", - "Gazelle", - "Hornet", - "Impala", - "Jaguar", - "Kangaroo", - "Leopard", - "Moose", - "Nautilus", - "Ocelot", - "Porpoise", - "Quail", - "Rabbit", - "Shark", - "Tiger", - "Urchin", - "Viper", - "Whale", - "Xenops", - "Yak", - "Zebra" -}; - -/** The number of elements in the file format name array. */ -static const ulint FILE_FORMAT_NAME_N - = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]); - -#ifdef UNIV_PFS_MUTEX -/* Key to register the mutex with performance schema */ -UNIV_INTERN mysql_pfs_key_t trx_doublewrite_mutex_key; -UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key; -#endif /* UNIV_PFS_MUTEX */ - -#ifndef UNIV_HOTBACKUP -#ifdef UNIV_DEBUG -/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ -UNIV_INTERN uint trx_rseg_n_slots_debug = 0; -#endif - -/** This is used to track the maximum file format id known to InnoDB. It's -updated via SET GLOBAL innodb_file_format_max = 'x' or when we open -or create a table. */ -static file_format_t file_format_max; - -/****************************************************************//** -Determines if a page number is located inside the doublewrite buffer. -@return TRUE if the location is inside the two blocks of the -doublewrite buffer */ -UNIV_INTERN -ibool -trx_doublewrite_page_inside( -/*========================*/ - ulint page_no) /*!< in: page number */ -{ - if (trx_doublewrite == NULL) { - - return(FALSE); - } - - if (page_no >= trx_doublewrite->block1 - && page_no < trx_doublewrite->block1 - + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { - return(TRUE); - } - - if (page_no >= trx_doublewrite->block2 - && page_no < trx_doublewrite->block2 - + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { - return(TRUE); - } - - return(FALSE); -} - -/****************************************************************//** -Creates or initialializes the doublewrite buffer at a database start. */ -static -void -trx_doublewrite_init( -/*=================*/ - byte* doublewrite) /*!< in: pointer to the doublewrite buf - header on trx sys page */ -{ - trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t)); - - /* Since we now start to use the doublewrite buffer, no need to call - fsync() after every write to a data file */ -#ifdef UNIV_DO_FLUSH - os_do_not_call_flush_at_each_write = TRUE; -#endif /* UNIV_DO_FLUSH */ - - mutex_create(trx_doublewrite_mutex_key, - &trx_doublewrite->mutex, SYNC_DOUBLEWRITE); - - trx_doublewrite->first_free = 0; - - trx_doublewrite->block1 = mach_read_from_4( - doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1); - trx_doublewrite->block2 = mach_read_from_4( - doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2); - trx_doublewrite->write_buf_unaligned = ut_malloc( - (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE); - - trx_doublewrite->write_buf = ut_align( - trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE); - trx_doublewrite->buf_block_arr = mem_alloc( - 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*)); -} - -/****************************************************************//** -Marks the trx sys header when we have successfully upgraded to the >= 4.1.x -multiple tablespace format. */ -UNIV_INTERN -void -trx_sys_mark_upgraded_to_multiple_tablespaces(void) -/*===============================================*/ -{ - buf_block_t* block; - byte* doublewrite; - mtr_t mtr; - - /* We upgraded to 4.1.x and reset the space id fields in the - doublewrite buffer. Let us mark to the trx_sys header that the upgrade - has been done. */ - - mtr_start(&mtr); - - block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); - - doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; - - mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, - TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, - MLOG_4BYTES, &mtr); - mtr_commit(&mtr); - - /* Flush the modified pages to disk and make a checkpoint */ - log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); - - trx_sys_multiple_tablespace_format = TRUE; -} - -/****************************************************************//** -Creates the doublewrite buffer to a new InnoDB installation. The header of the -doublewrite buffer is placed on the trx system header page. */ -UNIV_INTERN -void -trx_sys_create_doublewrite_buf(void) -/*================================*/ -{ - buf_block_t* block; - buf_block_t* block2; - buf_block_t* new_block; - byte* doublewrite; - byte* fseg_header; - ulint page_no; - ulint prev_page_no; - ulint i; - mtr_t mtr; - - if (trx_doublewrite) { - /* Already inited */ - - return; - } - -start_again: - mtr_start(&mtr); - trx_doublewrite_buf_is_being_created = TRUE; - - block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); - - doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; - - if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) - == TRX_SYS_DOUBLEWRITE_MAGIC_N) { - /* The doublewrite buffer has already been created: - just read in some numbers */ - - trx_doublewrite_init(doublewrite); - - mtr_commit(&mtr); - trx_doublewrite_buf_is_being_created = FALSE; - } else { - fprintf(stderr, - "InnoDB: Doublewrite buffer not found:" - " creating new\n"); - - if (buf_pool_get_curr_size() - < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE - + FSP_EXTENT_SIZE / 2 + 100) - * UNIV_PAGE_SIZE)) { - fprintf(stderr, - "InnoDB: Cannot create doublewrite buffer:" - " you must\n" - "InnoDB: increase your buffer pool size.\n" - "InnoDB: Cannot continue operation.\n"); - - exit(1); - } - - block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, - TRX_SYS_DOUBLEWRITE - + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); - - /* fseg_create acquires a second latch on the page, - therefore we must declare it: */ - - buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK); - - if (block2 == NULL) { - fprintf(stderr, - "InnoDB: Cannot create doublewrite buffer:" - " you must\n" - "InnoDB: increase your tablespace size.\n" - "InnoDB: Cannot continue operation.\n"); - - /* We exit without committing the mtr to prevent - its modifications to the database getting to disk */ - - exit(1); - } - - fseg_header = buf_block_get_frame(block) - + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG; - prev_page_no = 0; - - for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE - + FSP_EXTENT_SIZE / 2; i++) { - new_block = fseg_alloc_free_page( - fseg_header, prev_page_no + 1, FSP_UP, &mtr); - if (new_block == NULL) { - fprintf(stderr, - "InnoDB: Cannot create doublewrite" - " buffer: you must\n" - "InnoDB: increase your" - " tablespace size.\n" - "InnoDB: Cannot continue operation.\n" - ); - - exit(1); - } - - /* We read the allocated pages to the buffer pool; - when they are written to disk in a flush, the space - id and page number fields are also written to the - pages. When we at database startup read pages - from the doublewrite buffer, we know that if the - space id and page number in them are the same as - the page position in the tablespace, then the page - has not been written to in doublewrite. */ - - ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); - page_no = buf_block_get_page_no(new_block); - - if (i == FSP_EXTENT_SIZE / 2) { - ut_a(page_no == FSP_EXTENT_SIZE); - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_BLOCK1, - page_no, MLOG_4BYTES, &mtr); - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_REPEAT - + TRX_SYS_DOUBLEWRITE_BLOCK1, - page_no, MLOG_4BYTES, &mtr); - } else if (i == FSP_EXTENT_SIZE / 2 - + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { - ut_a(page_no == 2 * FSP_EXTENT_SIZE); - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_BLOCK2, - page_no, MLOG_4BYTES, &mtr); - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_REPEAT - + TRX_SYS_DOUBLEWRITE_BLOCK2, - page_no, MLOG_4BYTES, &mtr); - } else if (i > FSP_EXTENT_SIZE / 2) { - ut_a(page_no == prev_page_no + 1); - } - - prev_page_no = page_no; - } - - mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, - TRX_SYS_DOUBLEWRITE_MAGIC_N, - MLOG_4BYTES, &mtr); - mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC - + TRX_SYS_DOUBLEWRITE_REPEAT, - TRX_SYS_DOUBLEWRITE_MAGIC_N, - MLOG_4BYTES, &mtr); - - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, - TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, - MLOG_4BYTES, &mtr); - mtr_commit(&mtr); - - /* Flush the modified pages to disk and make a checkpoint */ - log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); - - fprintf(stderr, "InnoDB: Doublewrite buffer created\n"); - - trx_sys_multiple_tablespace_format = TRUE; - - goto start_again; - } - - if (srv_doublewrite_file) { - /* the same doublewrite buffer to TRX_SYS_SPACE should exist. - check and create if not exist.*/ - - mtr_start(&mtr); - trx_doublewrite_buf_is_being_created = TRUE; - - block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); - - doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; - - if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) - == TRX_SYS_DOUBLEWRITE_MAGIC_N) { - /* The doublewrite buffer has already been created: - just read in some numbers */ - - trx_doublewrite_init(doublewrite); - - mtr_commit(&mtr); - trx_doublewrite_buf_is_being_created = FALSE; - } else { - fprintf(stderr, - "InnoDB: Doublewrite buffer not found in the doublewrite file:" - " creating new doublewrite buffer.\n"); - - if (buf_pool_get_curr_size() - < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE - + FSP_EXTENT_SIZE / 2 + 100) - * UNIV_PAGE_SIZE)) { - fprintf(stderr, - "InnoDB: Cannot create the doublewrite buffer:" - " You must\n" - "InnoDB: increase your buffer pool size.\n" - "InnoDB: Cannot continue processing.\n"); - - exit(1); - } - - block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO, - TRX_SYS_DOUBLEWRITE - + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); - - /* fseg_create acquires a second latch on the page, - therefore we must declare it: */ - - buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK); - - if (block2 == NULL) { - fprintf(stderr, - "InnoDB: Cannot create the doublewrite buffer:" - " You must\n" - "InnoDB: increase your tablespace size.\n" - "InnoDB: Cannot continue processing.\n"); - - /* We exit without committing the mtr to prevent - its modifications to the database getting to disk */ - - exit(1); - } - - fseg_header = buf_block_get_frame(block) - + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG; - prev_page_no = 0; - - for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE - + FSP_EXTENT_SIZE / 2; i++) { - new_block = fseg_alloc_free_page( - fseg_header, prev_page_no + 1, FSP_UP, &mtr); - if (new_block == NULL) { - fprintf(stderr, - "InnoDB: Cannot create doublewrite" - " buffer: you must\n" - "InnoDB: increase your" - " tablespace size.\n" - "InnoDB: Cannot continue operation.\n" - ); - - exit(1); - } - - /* We read the allocated pages to the buffer pool; - when they are written to disk in a flush, the space - id and page number fields are also written to the - pages. When we at database startup read pages - from the doublewrite buffer, we know that if the - space id and page number in them are the same as - the page position in the tablespace, then the page - has not been written to in doublewrite. */ - - ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); - page_no = buf_block_get_page_no(new_block); - - if (i == FSP_EXTENT_SIZE / 2) { - ut_a(page_no == FSP_EXTENT_SIZE); - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_BLOCK1, - page_no, MLOG_4BYTES, &mtr); - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_REPEAT - + TRX_SYS_DOUBLEWRITE_BLOCK1, - page_no, MLOG_4BYTES, &mtr); - } else if (i == FSP_EXTENT_SIZE / 2 - + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { - ut_a(page_no == 2 * FSP_EXTENT_SIZE); - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_BLOCK2, - page_no, MLOG_4BYTES, &mtr); - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_REPEAT - + TRX_SYS_DOUBLEWRITE_BLOCK2, - page_no, MLOG_4BYTES, &mtr); - } else if (i > FSP_EXTENT_SIZE / 2) { - ut_a(page_no == prev_page_no + 1); - } - - prev_page_no = page_no; - } - - mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, - TRX_SYS_DOUBLEWRITE_MAGIC_N, - MLOG_4BYTES, &mtr); - mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC - + TRX_SYS_DOUBLEWRITE_REPEAT, - TRX_SYS_DOUBLEWRITE_MAGIC_N, - MLOG_4BYTES, &mtr); - - mlog_write_ulint(doublewrite - + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, - TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, - MLOG_4BYTES, &mtr); - mtr_commit(&mtr); - - /* Flush the modified pages to disk and make a checkpoint */ - log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); - - fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n"); - trx_sys_multiple_tablespace_format = TRUE; - } - trx_doublewrite_buf_is_being_created = FALSE; - } -} - -/****************************************************************//** -At a database startup initializes the doublewrite buffer memory structure if -we already have a doublewrite buffer created in the data files. If we are -upgrading to an InnoDB version which supports multiple tablespaces, then this -function performs the necessary update operations. If we are in a crash -recovery, this function uses a possible doublewrite buffer to restore -half-written pages in the data files. */ -UNIV_INTERN -void -trx_sys_doublewrite_init_or_restore_pages( -/*======================================*/ - ibool restore_corrupt_pages) /*!< in: TRUE=restore pages */ -{ - byte* buf; - byte* read_buf; - byte* unaligned_read_buf; - ulint block1; - ulint block2; - ulint source_page_no; - byte* page; - byte* doublewrite; - ulint doublewrite_space_id; - ulint space_id; - ulint page_no; - ulint i; - - doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE); - - if (srv_doublewrite_file) { - fprintf(stderr, - "InnoDB: doublewrite file '%s' is used.\n", - srv_doublewrite_file); - } - - /* We do the file i/o past the buffer pool */ - - unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE); - read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE); - - /* Read the trx sys header to check if we are using the doublewrite - buffer */ - - fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0, - UNIV_PAGE_SIZE, read_buf, NULL); - doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; - - if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) - == TRX_SYS_DOUBLEWRITE_MAGIC_N) { - /* The doublewrite buffer has been created */ - - trx_doublewrite_init(doublewrite); - - block1 = trx_doublewrite->block1; - block2 = trx_doublewrite->block2; - - buf = trx_doublewrite->write_buf; - } else { - goto leave_func; - } - - if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) - != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { - - /* We are upgrading from a version < 4.1.x to a version where - multiple tablespaces are supported. We must reset the space id - field in the pages in the doublewrite buffer because starting - from this version the space id is stored to - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ - - trx_doublewrite_must_reset_space_ids = TRUE; - - fprintf(stderr, - "InnoDB: Resetting space id's in the" - " doublewrite buffer\n"); - } else { - trx_sys_multiple_tablespace_format = TRUE; - } - - /* Read the pages from the doublewrite buffer to memory */ - - fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0, - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - buf, NULL); - fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0, - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - NULL); - /* Check if any of these pages is half-written in data files, in the - intended position */ - - page = buf; - - for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { - - page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); - - if (trx_doublewrite_must_reset_space_ids) { - - space_id = 0; - mach_write_to_4(page - + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0); - /* We do not need to calculate new checksums for the - pages because the field .._SPACE_ID does not affect - them. Write the page back to where we read it from. */ - - if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { - source_page_no = block1 + i; - } else { - source_page_no = block2 - + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; - } - - fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0, - UNIV_PAGE_SIZE, page, NULL); - /* printf("Resetting space id in page %lu\n", - source_page_no); */ - } else { - space_id = mach_read_from_4( - page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - } - - if (!restore_corrupt_pages) { - /* The database was shut down gracefully: no need to - restore pages */ - - } else if (!fil_tablespace_exists_in_mem(space_id)) { - /* Maybe we have dropped the single-table tablespace - and this page once belonged to it: do nothing */ - - } else if (!fil_check_adress_in_tablespace(space_id, - page_no)) { - fprintf(stderr, - "InnoDB: Warning: a page in the" - " doublewrite buffer is not within space\n" - "InnoDB: bounds; space id %lu" - " page number %lu, page %lu in" - " doublewrite buf.\n", - (ulong) space_id, (ulong) page_no, (ulong) i); - - } else if ((space_id == TRX_SYS_SPACE - || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE)) - && ((page_no >= block1 - && page_no - < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) - || (page_no >= block2 - && page_no - < (block2 - + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) { - - /* It is an unwritten doublewrite buffer page: - do nothing */ - } else { - ulint zip_size = fil_space_get_zip_size(space_id); - - /* Read in the actual page from the file */ - fil_io(OS_FILE_READ, TRUE, space_id, zip_size, - page_no, 0, - zip_size ? zip_size : UNIV_PAGE_SIZE, - read_buf, NULL); - - if (srv_recovery_stats && recv_recovery_is_on()) { - mutex_enter(&(recv_sys->mutex)); - recv_sys->stats_doublewrite_check_pages++; - mutex_exit(&(recv_sys->mutex)); - } - - /* Check if the page is corrupt */ - - if (UNIV_UNLIKELY - (buf_page_is_corrupted( - TRUE, read_buf, zip_size))) { - - fprintf(stderr, - "InnoDB: Warning: database page" - " corruption or a failed\n" - "InnoDB: file read of" - " space %lu page %lu.\n" - "InnoDB: Trying to recover it from" - " the doublewrite buffer.\n", - (ulong) space_id, (ulong) page_no); - - if (buf_page_is_corrupted( - TRUE, page, zip_size)) { - fprintf(stderr, - "InnoDB: Dump of the page:\n"); - buf_page_print( - read_buf, zip_size, - BUF_PAGE_PRINT_NO_CRASH); - fprintf(stderr, - "InnoDB: Dump of" - " corresponding page" - " in doublewrite buffer:\n"); - buf_page_print( - page, zip_size, - BUF_PAGE_PRINT_NO_CRASH); - - fprintf(stderr, - "InnoDB: Also the page in the" - " doublewrite buffer" - " is corrupt.\n" - "InnoDB: Cannot continue" - " operation.\n" - "InnoDB: You can try to" - " recover the database" - " with the my.cnf\n" - "InnoDB: option:\n" - "InnoDB:" - " innodb_force_recovery=6\n"); - ut_error; - } - - /* Write the good page from the - doublewrite buffer to the intended - position */ - - fil_io(OS_FILE_WRITE, TRUE, space_id, - zip_size, page_no, 0, - zip_size ? zip_size : UNIV_PAGE_SIZE, - page, NULL); - - if (srv_recovery_stats && recv_recovery_is_on()) { - mutex_enter(&(recv_sys->mutex)); - recv_sys->stats_doublewrite_overwrite_pages++; - mutex_exit(&(recv_sys->mutex)); - } - - fprintf(stderr, - "InnoDB: Recovered the page from" - " the doublewrite buffer.\n"); - } - } - - page += UNIV_PAGE_SIZE; - } - - fil_flush_file_spaces(FIL_TABLESPACE); - -leave_func: - ut_free(unaligned_read_buf); -} - -/****************************************************************//** -Checks that trx is in the trx list. -@return TRUE if is in */ -UNIV_INTERN -ibool -trx_in_trx_list( -/*============*/ - trx_t* in_trx) /*!< in: trx */ -{ - trx_t* trx; - - ut_ad(mutex_own(&(kernel_mutex))); - - trx = UT_LIST_GET_FIRST(trx_sys->trx_list); - - while (trx != NULL) { - - if (trx == in_trx) { - - return(TRUE); - } - - trx = UT_LIST_GET_NEXT(trx_list, trx); - } - - return(FALSE); -} - -/*****************************************************************//** -Writes the value of max_trx_id to the file based trx system header. */ -UNIV_INTERN -void -trx_sys_flush_max_trx_id(void) -/*==========================*/ -{ - trx_sysf_t* sys_header; - mtr_t mtr; - - ut_ad(mutex_own(&kernel_mutex)); - - mtr_start(&mtr); - - sys_header = trx_sysf_get(&mtr); - - mlog_write_ull(sys_header + TRX_SYS_TRX_ID_STORE, - trx_sys->max_trx_id, &mtr); - mtr_commit(&mtr); -} - -/*****************************************************************//** -Updates the offset information about the end of the MySQL binlog entry -which corresponds to the transaction just being committed. In a MySQL -replication slave updates the latest master binlog position up to which -replication has proceeded. */ -UNIV_INTERN -void -trx_sys_update_mysql_binlog_offset( -/*===============================*/ - trx_sysf_t* sys_header, - const char* file_name_in,/*!< in: MySQL log file name */ - ib_int64_t offset, /*!< in: position in that log file */ - ulint field, /*!< in: offset of the MySQL log info field in - the trx sys header */ - mtr_t* mtr) /*!< in: mtr */ -{ - const char* file_name; - - if (ut_strlen(file_name_in) >= TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN) { - - /* We cannot fit the name to the 512 bytes we have reserved */ - /* -> To store relay log file information, file_name must fit to the 480 bytes */ - - file_name = ""; - } else { - file_name = file_name_in; - } - - if (mach_read_from_4(sys_header + field - + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) - != TRX_SYS_MYSQL_LOG_MAGIC_N) { - - mlog_write_ulint(sys_header + field - + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD, - TRX_SYS_MYSQL_LOG_MAGIC_N, - MLOG_4BYTES, mtr); - } - - if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME), - file_name)) { - - mlog_write_string(sys_header + field - + TRX_SYS_MYSQL_LOG_NAME, - (byte*) file_name, 1 + ut_strlen(file_name), - mtr); - } - - if (mach_read_from_4(sys_header + field - + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0 - || (offset >> 32) > 0) { - - mlog_write_ulint(sys_header + field - + TRX_SYS_MYSQL_LOG_OFFSET_HIGH, - (ulint)(offset >> 32), - MLOG_4BYTES, mtr); - } - - mlog_write_ulint(sys_header + field - + TRX_SYS_MYSQL_LOG_OFFSET_LOW, - (ulint)(offset & 0xFFFFFFFFUL), - MLOG_4BYTES, mtr); -} - -/*****************************************************************//** -Stores the MySQL binlog offset info in the trx system header if -the magic number shows it valid, and print the info to stderr */ -UNIV_INTERN -void -trx_sys_print_mysql_binlog_offset(void) -/*===================================*/ -{ - trx_sysf_t* sys_header; - mtr_t mtr; - ulint trx_sys_mysql_bin_log_pos_high; - ulint trx_sys_mysql_bin_log_pos_low; - - mtr_start(&mtr); - - sys_header = trx_sysf_get(&mtr); - - if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) - != TRX_SYS_MYSQL_LOG_MAGIC_N) { - - mtr_commit(&mtr); - - return; - } - - trx_sys_mysql_bin_log_pos_high = mach_read_from_4( - sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_OFFSET_HIGH); - trx_sys_mysql_bin_log_pos_low = mach_read_from_4( - sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_OFFSET_LOW); - - trx_sys_mysql_bin_log_pos - = (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32) - + (ib_int64_t)trx_sys_mysql_bin_log_pos_low; - - ut_memcpy(trx_sys_mysql_bin_log_name, - sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN); - - fprintf(stderr, - "InnoDB: Last MySQL binlog file position %lu %lu," - " file name %s\n", - trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low, - trx_sys_mysql_bin_log_name); - - mtr_commit(&mtr); -} - -#ifdef WITH_WSREP - -void -trx_sys_update_wsrep_checkpoint( - const XID* xid, /*!< in: transaction XID */ - mtr_t* mtr) /*!< in: mtr */ -{ - trx_sysf_t* sys_header; - - ut_ad(xid && mtr); - ut_a(xid->formatID == -1 || wsrep_is_wsrep_xid(xid)); - - sys_header = trx_sysf_get(mtr); - if (mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_MAGIC_N_FLD) - != TRX_SYS_WSREP_XID_MAGIC_N) { - mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_MAGIC_N_FLD, - TRX_SYS_WSREP_XID_MAGIC_N, - MLOG_4BYTES, mtr); - } - - mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_FORMAT, - (int)xid->formatID, - MLOG_4BYTES, mtr); - mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_GTRID_LEN, - (int)xid->gtrid_length, - MLOG_4BYTES, mtr); - mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_BQUAL_LEN, - (int)xid->bqual_length, - MLOG_4BYTES, mtr); - mlog_write_string(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_DATA, - (const unsigned char*) xid->data, - XIDDATASIZE, mtr); - -} - -void -trx_sys_read_wsrep_checkpoint(XID* xid) -/*===================================*/ -{ - trx_sysf_t* sys_header; - mtr_t mtr; - ulint magic; - - ut_ad(xid); - - mtr_start(&mtr); - - sys_header = trx_sysf_get(&mtr); - - if ((magic = mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO - + TRX_SYS_WSREP_XID_MAGIC_N_FLD)) - != TRX_SYS_WSREP_XID_MAGIC_N) { - memset(xid, 0, sizeof(*xid)); - xid->formatID = -1; - trx_sys_update_wsrep_checkpoint(xid, &mtr); - mtr_commit(&mtr); - return; - } - - xid->formatID = (int)mach_read_from_4( - sys_header - + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_FORMAT); - xid->gtrid_length = (int)mach_read_from_4( - sys_header - + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_GTRID_LEN); - xid->bqual_length = (int)mach_read_from_4( - sys_header - + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_BQUAL_LEN); - ut_memcpy(xid->data, - sys_header + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_DATA, - XIDDATASIZE); - - mtr_commit(&mtr); -} - -#endif /* WITH_WSREP */ - -/*****************************************************************//** -Reads the log coordinates at the given offset in the trx sys header. */ -static -void -trx_sys_read_log_pos( -/*=================*/ - const trx_sysf_t* sys_header, /*!< in: the trx sys header */ - uint header_offset, /*!< in: coord offset in the - header */ - char* log_fn, /*!< out: the log file name */ - ib_int64_t* log_pos) /*!< out: the log poistion */ -{ - ut_memcpy(log_fn, sys_header + header_offset + TRX_SYS_MYSQL_LOG_NAME, - TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN); - - *log_pos = - (((ib_int64_t)mach_read_from_4(sys_header + header_offset - + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32) - + mach_read_from_4(sys_header + header_offset - + TRX_SYS_MYSQL_LOG_OFFSET_LOW); -} - -/*****************************************************************//** -Prints to stderr the MySQL master log offset info in the trx system header -PREPARE set of fields if the magic number shows it valid and stores it -in global variables. */ -UNIV_INTERN -void -trx_sys_print_mysql_master_log_pos(void) -/*====================================*/ -{ - trx_sysf_t* sys_header; - mtr_t mtr; - - mtr_start(&mtr); - - sys_header = trx_sysf_get(&mtr); - - if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO - + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) - != TRX_SYS_MYSQL_LOG_MAGIC_N) { - - mtr_commit(&mtr); - - return; - } - - /* Copy the master log position info to global variables we can - use in ha_innobase.cc to initialize glob_mi to right values */ - trx_sys_read_log_pos(sys_header, TRX_SYS_MYSQL_MASTER_LOG_INFO, - trx_sys_mysql_master_log_name, - &trx_sys_mysql_master_log_pos); - - trx_sys_read_log_pos(sys_header, TRX_SYS_MYSQL_RELAY_LOG_INFO, - trx_sys_mysql_relay_log_name, - &trx_sys_mysql_relay_log_pos); - - mtr_commit(&mtr); - - fprintf(stderr, - "InnoDB: In a MySQL replication slave the last" - " master binlog file\n" - "InnoDB: position %llu, file name %s\n", - trx_sys_mysql_master_log_pos, - trx_sys_mysql_master_log_name); - - fprintf(stderr, - "InnoDB: and relay log file\n" - "InnoDB: position %llu, file name %s\n", - trx_sys_mysql_relay_log_pos, - trx_sys_mysql_relay_log_name); -} - -/*****************************************************************//** -Prints to stderr the MySQL master log offset info in the trx system header -COMMIT set of fields if the magic number shows it valid and stores it -in global variables. */ -UNIV_INTERN -void -trx_sys_print_committed_mysql_master_log_pos(void) -/*==============================================*/ -{ - trx_sysf_t* sys_header; - mtr_t mtr; - - mtr_start(&mtr); - - sys_header = trx_sysf_get(&mtr); - - if (mach_read_from_4(sys_header + TRX_SYS_COMMIT_MASTER_LOG_INFO - + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) - != TRX_SYS_MYSQL_LOG_MAGIC_N) { - - mtr_commit(&mtr); - - return; - } - - /* Copy the master log position info to global variables we can - use in ha_innobase.cc to initialize glob_mi to right values */ - trx_sys_read_log_pos(sys_header, TRX_SYS_COMMIT_MASTER_LOG_INFO, - trx_sys_mysql_master_log_name, - &trx_sys_mysql_master_log_pos); - - trx_sys_read_log_pos(sys_header, TRX_SYS_COMMIT_RELAY_LOG_INFO, - trx_sys_mysql_relay_log_name, - &trx_sys_mysql_relay_log_pos); - - mtr_commit(&mtr); - - fprintf(stderr, - "InnoDB: In a MySQL replication slave the last" - " master binlog file\n" - "InnoDB: position %llu, file name %s\n", - trx_sys_mysql_master_log_pos, trx_sys_mysql_master_log_name); - - fprintf(stderr, - "InnoDB: and relay log file\n" - "InnoDB: position %llu, file name %s\n", - trx_sys_mysql_relay_log_pos, trx_sys_mysql_relay_log_name); -} - -/****************************************************************//** -Looks for a free slot for a rollback segment in the trx system file copy. -@return slot index or ULINT_UNDEFINED if not found */ -UNIV_INTERN -ulint -trx_sysf_rseg_find_free( -/*====================*/ - mtr_t* mtr) /*!< in: mtr */ -{ - trx_sysf_t* sys_header; - ulint page_no; - ulint i; - - ut_ad(mutex_own(&(kernel_mutex))); - - sys_header = trx_sysf_get(mtr); - - for (i = 0; i < TRX_SYS_N_RSEGS; i++) { - - page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); - - if (page_no == FIL_NULL) { - - return(i); - } - } - - return(ULINT_UNDEFINED); -} - -/*****************************************************************//** -Creates the file page for the transaction system. This function is called only -at the database creation, before trx_sys_init. */ -static -void -trx_sysf_create( -/*============*/ - mtr_t* mtr) /*!< in: mtr */ -{ - trx_sysf_t* sys_header; - ulint slot_no; - buf_block_t* block; - page_t* page; - ulint page_no; - byte* ptr; - ulint len; - - ut_ad(mtr); - - /* Note that below we first reserve the file space x-latch, and - then enter the kernel: we must do it in this order to conform - to the latching order rules. */ - - mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr); - mutex_enter(&kernel_mutex); - - /* Create the trx sys file block in a new allocated file segment */ - block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, - mtr); - buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); - - ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO); - - page = buf_block_get_frame(block); - - mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS, - MLOG_2BYTES, mtr); - - /* Reset the doublewrite buffer magic number to zero so that we - know that the doublewrite buffer has not yet been created (this - suppresses a Valgrind warning) */ - - mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE - + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); - - sys_header = trx_sysf_get(mtr); - - /* Start counting transaction ids from number 1 up */ - mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1); - - /* Reset the rollback segment slots. Old versions of InnoDB - define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect - that the whole array is initialized. */ - ptr = TRX_SYS_RSEGS + sys_header; - len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS) - * TRX_SYS_RSEG_SLOT_SIZE; - memset(ptr, 0xff, len); - ptr += len; - ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END)); - - /* Initialize all of the page. This part used to be uninitialized. */ - memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr); - - mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END - + page - sys_header, mtr); - - /* Create the first rollback segment in the SYSTEM tablespace */ - slot_no = trx_sysf_rseg_find_free(mtr); - page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, - mtr); - ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); - ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO); - - mutex_exit(&kernel_mutex); -} - -/*****************************************************************//** -Compare two trx_rseg_t instances on last_trx_no. */ -static -int -trx_rseg_compare_last_trx_no( -/*=========================*/ - const void* p1, /*!< in: elem to compare */ - const void* p2) /*!< in: elem to compare */ -{ - ib_int64_t cmp; - - const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1; - const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2; - - cmp = rseg_q1->trx_no - rseg_q2->trx_no; - - if (cmp < 0) { - return(-1); - } else if (cmp > 0) { - return(1); - } - - return(0); -} - -/*****************************************************************//** -Creates dummy of the file page for the transaction system. */ -static -void -trx_sysf_dummy_create( -/*==================*/ - ulint space, - mtr_t* mtr) -{ - buf_block_t* block; - page_t* page; - - ut_ad(mtr); - - /* Note that below we first reserve the file space x-latch, and - then enter the kernel: we must do it in this order to conform - to the latching order rules. */ - - mtr_x_lock(fil_space_get_latch(space, NULL), mtr); - mutex_enter(&kernel_mutex); - - /* Create the trx sys file block in a new allocated file segment */ - block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, - mtr); - buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); - - fprintf(stderr, "%lu\n", buf_block_get_page_no(block)); - ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO); - - page = buf_block_get_frame(block); - - mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS, - MLOG_2BYTES, mtr); - - /* Reset the doublewrite buffer magic number to zero so that we - know that the doublewrite buffer has not yet been created (this - suppresses a Valgrind warning) */ - - mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE - + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); - -#ifdef UNDEFINED - /* TODO: REMOVE IT: The bellow is not needed, I think */ - sys_header = trx_sysf_get(mtr); - - /* Start counting transaction ids from number 1 up */ - mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE, - ut_dulint_create(0, 1), mtr); - - /* Reset the rollback segment slots */ - for (i = 0; i < TRX_SYS_N_RSEGS; i++) { - - trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr); - trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr); - } - - /* The remaining area (up to the page trailer) is uninitialized. - Silence Valgrind warnings about it. */ - UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS - + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_SPACE), - (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END - - (TRX_SYS_RSEGS - + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_SPACE)) - + page - sys_header); - - /* Create the first rollback segment in the SYSTEM tablespace */ - page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no, - mtr); - ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); - ut_a(page_no != FIL_NULL); -#endif - - mutex_exit(&kernel_mutex); -} - -/*****************************************************************//** -Creates and initializes the central memory structures for the transaction -system. This is called when the database is started. */ -UNIV_INTERN -void -trx_sys_init_at_db_start(void) -/*==========================*/ -{ - trx_sysf_t* sys_header; - ib_uint64_t rows_to_undo = 0; - const char* unit = ""; - trx_t* trx; - mtr_t mtr; - ib_bh_t* ib_bh; - - mtr_start(&mtr); - - ut_ad(trx_sys == NULL); - - mutex_enter(&kernel_mutex); - - /* We create the min binary heap here and pass ownership to - purge when we init the purge sub-system. Purge is responsible - for freeing the binary heap. */ - - ib_bh = ib_bh_create( - trx_rseg_compare_last_trx_no, - sizeof(rseg_queue_t), TRX_SYS_N_RSEGS); - - trx_sys = mem_zalloc(sizeof(*trx_sys)); - - /* Allocate the trx descriptors array */ - trx_sys->descriptors = ut_malloc(sizeof(trx_id_t) * - TRX_DESCR_ARRAY_INITIAL_SIZE); - trx_sys->descr_n_max = TRX_DESCR_ARRAY_INITIAL_SIZE; - trx_sys->descr_n_used = 0; - srv_descriptors_memory = TRX_DESCR_ARRAY_INITIAL_SIZE * - sizeof(trx_id_t); - - sys_header = trx_sysf_get(&mtr); - - trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr); - - trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); - - /* VERY important: after the database is started, max_trx_id value is - divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in - trx_sys_get_new_trx_id will evaluate to TRUE when the function - is first time called, and the value for trx id will be written - to the disk-based header! Thus trx id values will not overlap when - the database is repeatedly started! */ - - trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN - + ut_uint64_align_up(mach_read_from_8(sys_header - + TRX_SYS_TRX_ID_STORE), - TRX_SYS_TRX_ID_WRITE_MARGIN); - - UT_LIST_INIT(trx_sys->mysql_trx_list); - trx_dummy_sess = sess_open(); - trx_lists_init_at_db_start(); - - if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { - trx = UT_LIST_GET_FIRST(trx_sys->trx_list); - - for (;;) { - - if (trx->state != TRX_PREPARED) { - rows_to_undo += trx->undo_no; - } - - trx = UT_LIST_GET_NEXT(trx_list, trx); - - if (!trx) { - break; - } - } - - if (rows_to_undo > 1000000000) { - unit = "M"; - rows_to_undo = rows_to_undo / 1000000; - } - - fprintf(stderr, - "InnoDB: %lu transaction(s) which must be" - " rolled back or cleaned up\n" - "InnoDB: in total %lu%s row operations to undo\n", - (ulong) UT_LIST_GET_LEN(trx_sys->trx_list), - (ulong) rows_to_undo, unit); - - fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n", - (ullint) trx_sys->max_trx_id); - } - - UT_LIST_INIT(trx_sys->view_list); - - /* Transfer ownership to purge. */ - trx_purge_sys_create(ib_bh); - - mutex_exit(&kernel_mutex); - - mtr_commit(&mtr); -} - -/*****************************************************************//** -Creates and initializes the transaction system at the database creation. */ -UNIV_INTERN -void -trx_sys_create(void) -/*================*/ -{ - mtr_t mtr; - - mtr_start(&mtr); - - trx_sysf_create(&mtr); - - mtr_commit(&mtr); - - trx_sys_init_at_db_start(); -} - -/*****************************************************************//** -Update the file format tag. -@return always TRUE */ -static -ibool -trx_sys_file_format_max_write( -/*==========================*/ - ulint format_id, /*!< in: file format id */ - const char** name) /*!< out: max file format name, can - be NULL */ -{ - mtr_t mtr; - byte* ptr; - buf_block_t* block; - ib_uint64_t tag_value; - - mtr_start(&mtr); - - block = buf_page_get( - TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); - - file_format_max.id = format_id; - file_format_max.name = trx_sys_file_format_id_to_name(format_id); - - ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; - tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; - - if (name) { - *name = file_format_max.name; - } - - mlog_write_ull(ptr, tag_value, &mtr); - - mtr_commit(&mtr); - - return(TRUE); -} - -/*****************************************************************//** -Read the file format tag. -@return the file format or ULINT_UNDEFINED if not set. */ -static -ulint -trx_sys_file_format_max_read(void) -/*==============================*/ -{ - mtr_t mtr; - const byte* ptr; - const buf_block_t* block; - ib_id_t file_format_id; - - /* Since this is called during the startup phase it's safe to - read the value without a covering mutex. */ - mtr_start(&mtr); - - block = buf_page_get( - TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); - - ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; - file_format_id = mach_read_from_8(ptr); - - mtr_commit(&mtr); - - file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; - - if (file_format_id >= FILE_FORMAT_NAME_N) { - - /* Either it has never been tagged, or garbage in it. */ - return(ULINT_UNDEFINED); - } - - return((ulint) file_format_id); -} - -/*****************************************************************//** -Get the name representation of the file format from its id. -@return pointer to the name */ -UNIV_INTERN -const char* -trx_sys_file_format_id_to_name( -/*===========================*/ - const ulint id) /*!< in: id of the file format */ -{ - ut_a(id < FILE_FORMAT_NAME_N); - - return(file_format_name_map[id]); -} - -/*****************************************************************//** -Check for the max file format tag stored on disk. Note: If max_format_id -is == DICT_TF_FORMAT_MAX + 1 then we only print a warning. -@return DB_SUCCESS or error code */ -UNIV_INTERN -ulint -trx_sys_file_format_max_check( -/*==========================*/ - ulint max_format_id) /*!< in: max format id to check */ -{ - ulint format_id; - - /* Check the file format in the tablespace. Do not try to - recover if the file format is not supported by the engine - unless forced by the user. */ - format_id = trx_sys_file_format_max_read(); - if (format_id == ULINT_UNDEFINED) { - /* Format ID was not set. Set it to minimum possible - value. */ - format_id = DICT_TF_FORMAT_MIN; - } - - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: highest supported file format is %s.\n", - trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX)); - - if (format_id > DICT_TF_FORMAT_MAX) { - - ut_a(format_id < FILE_FORMAT_NAME_N); - - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: %s: the system tablespace is in a file " - "format that this version doesn't support - %s\n", - ((max_format_id <= DICT_TF_FORMAT_MAX) - ? "Error" : "Warning"), - trx_sys_file_format_id_to_name(format_id)); - - if (max_format_id <= DICT_TF_FORMAT_MAX) { - return(DB_ERROR); - } - } - - format_id = (format_id > max_format_id) ? format_id : max_format_id; - - /* We don't need a mutex here, as this function should only - be called once at start up. */ - file_format_max.id = format_id; - file_format_max.name = trx_sys_file_format_id_to_name(format_id); - - return(DB_SUCCESS); -} - -/*****************************************************************//** -Set the file format id unconditionally except if it's already the -same value. -@return TRUE if value updated */ -UNIV_INTERN -ibool -trx_sys_file_format_max_set( -/*========================*/ - ulint format_id, /*!< in: file format id */ - const char** name) /*!< out: max file format name or - NULL if not needed. */ -{ - ibool ret = FALSE; - - ut_a(format_id <= DICT_TF_FORMAT_MAX); - - mutex_enter(&file_format_max.mutex); - - /* Only update if not already same value. */ - if (format_id != file_format_max.id) { - - ret = trx_sys_file_format_max_write(format_id, name); - } - - mutex_exit(&file_format_max.mutex); - - return(ret); -} - -/********************************************************************//** -Tags the system table space with minimum format id if it has not been -tagged yet. -WARNING: This function is only called during the startup and AFTER the -redo log application during recovery has finished. */ -UNIV_INTERN -void -trx_sys_file_format_tag_init(void) -/*==============================*/ -{ - ulint format_id; - - format_id = trx_sys_file_format_max_read(); - - /* If format_id is not set then set it to the minimum. */ - if (format_id == ULINT_UNDEFINED) { - trx_sys_file_format_max_set(DICT_TF_FORMAT_MIN, NULL); - } -} - -/********************************************************************//** -Update the file format tag in the system tablespace only if the given -format id is greater than the known max id. -@return TRUE if format_id was bigger than the known max id */ -UNIV_INTERN -ibool -trx_sys_file_format_max_upgrade( -/*============================*/ - const char** name, /*!< out: max file format name */ - ulint format_id) /*!< in: file format identifier */ -{ - ibool ret = FALSE; - - ut_a(name); - ut_a(file_format_max.name != NULL); - ut_a(format_id <= DICT_TF_FORMAT_MAX); - - mutex_enter(&file_format_max.mutex); - - if (format_id > file_format_max.id) { - - ret = trx_sys_file_format_max_write(format_id, name); - } - - mutex_exit(&file_format_max.mutex); - - return(ret); -} - -/*****************************************************************//** -Get the name representation of the file format from its id. -@return pointer to the max format name */ -UNIV_INTERN -const char* -trx_sys_file_format_max_get(void) -/*=============================*/ -{ - return(file_format_max.name); -} - -/*****************************************************************//** -Initializes the tablespace tag system. */ -UNIV_INTERN -void -trx_sys_file_format_init(void) -/*==========================*/ -{ - mutex_create(file_format_max_mutex_key, - &file_format_max.mutex, SYNC_FILE_FORMAT_TAG); - - /* We don't need a mutex here, as this function should only - be called once at start up. */ - file_format_max.id = DICT_TF_FORMAT_MIN; - - file_format_max.name = trx_sys_file_format_id_to_name( - file_format_max.id); -} - -/*****************************************************************//** -Closes the tablespace tag system. */ -UNIV_INTERN -void -trx_sys_file_format_close(void) -/*===========================*/ -{ - /* Does nothing at the moment */ -} - -/*****************************************************************//** -Creates and initializes the dummy transaction system page for tablespace. */ -UNIV_INTERN -void -trx_sys_dummy_create( -/*=================*/ - ulint space) -{ - mtr_t mtr; - - /* This function is only for doublewrite file for now */ - ut_a(space == TRX_DOUBLEWRITE_SPACE); - - mtr_start(&mtr); - - trx_sysf_dummy_create(space, &mtr); - - mtr_commit(&mtr); -} - -/********************************************************************* -Creates the rollback segments */ -UNIV_INTERN -void -trx_sys_create_rsegs( -/*=================*/ - ulint n_rsegs) /*!< number of rollback segments to create */ -{ - ulint new_rsegs = 0; - - /* Do not create additional rollback segments if - innodb_force_recovery has been set and the database - was not shutdown cleanly. */ - if (!srv_force_recovery && !recv_needed_recovery) { - ulint i; - - for (i = 0; i < n_rsegs; ++i) { - - if (trx_rseg_create() != NULL) { - ++new_rsegs; - } else { - break; - } - } - } - - if (new_rsegs > 0) { - fprintf(stderr, - "InnoDB: %lu rollback segment(s) active.\n", - new_rsegs); - } -} - -#else /* !UNIV_HOTBACKUP */ -/*****************************************************************//** -Prints to stderr the MySQL binlog info in the system header if the -magic number shows it valid. */ -UNIV_INTERN -void -trx_sys_print_mysql_binlog_offset_from_page( -/*========================================*/ - const byte* page) /*!< in: buffer containing the trx - system header page, i.e., page number - TRX_SYS_PAGE_NO in the tablespace */ -{ - const trx_sysf_t* sys_header; - - sys_header = page + TRX_SYS; - - if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) - == TRX_SYS_MYSQL_LOG_MAGIC_N) { - - fprintf(stderr, - "ibbackup: Last MySQL binlog file position %lu %lu," - " file name %s\n", - (ulong) mach_read_from_4( - sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), - (ulong) mach_read_from_4( - sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_OFFSET_LOW), - sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_NAME); - } -} - - -/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE - (This code duplication should be fixed at some point!) -*/ - -#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */ -/* The offset of the file format tag on the trx system header page */ -#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16) -/* We use these random constants to reduce the probability of reading -garbage (from previous versions) that maps to an actual format id. We -use these as bit masks at the time of reading and writing from/to disk. */ -#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL -#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL - -/* END OF COPIED DEFINITIONS */ - - -/*****************************************************************//** -Reads the file format id from the first system table space file. -Even if the call succeeds and returns TRUE, the returned format id -may be ULINT_UNDEFINED signalling that the format id was not present -in the data file. -@return TRUE if call succeeds */ -UNIV_INTERN -ibool -trx_sys_read_file_format_id( -/*========================*/ - const char *pathname, /*!< in: pathname of the first system - table space file */ - ulint *format_id) /*!< out: file format of the system table - space */ -{ - os_file_t file; - ibool success; - byte buf[UNIV_PAGE_SIZE * 2]; - page_t* page = ut_align(buf, UNIV_PAGE_SIZE); - const byte* ptr; - ib_id_t file_format_id; - - *format_id = ULINT_UNDEFINED; - - file = os_file_create_simple_no_error_handling( - innodb_file_data_key, - pathname, - OS_FILE_OPEN, - OS_FILE_READ_ONLY, - &success - ); - if (!success) { - /* The following call prints an error message */ - os_file_get_last_error(TRUE); - - ut_print_timestamp(stderr); - - fprintf(stderr, -" ibbackup: Error: trying to read system tablespace file format,\n" -" ibbackup: but could not open the tablespace file %s!\n", - pathname - ); - return(FALSE); - } - - /* Read the page on which file format is stored */ - - success = os_file_read_no_error_handling( - file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE - ); - if (!success) { - /* The following call prints an error message */ - os_file_get_last_error(TRUE); - - ut_print_timestamp(stderr); - - fprintf(stderr, -" ibbackup: Error: trying to read system table space file format,\n" -" ibbackup: but failed to read the tablespace file %s!\n", - pathname - ); - os_file_close(file); - return(FALSE); - } - os_file_close(file); - - /* get the file format from the page */ - ptr = page + TRX_SYS_FILE_FORMAT_TAG; - file_format_id = mach_read_from_8(ptr); - file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; - - if (file_format_id >= FILE_FORMAT_NAME_N) { - - /* Either it has never been tagged, or garbage in it. */ - return(TRUE); - } - - *format_id = (ulint) file_format_id; - - return(TRUE); -} - - -/*****************************************************************//** -Reads the file format id from the given per-table data file. -@return TRUE if call succeeds */ -UNIV_INTERN -ibool -trx_sys_read_pertable_file_format_id( -/*=================================*/ - const char *pathname, /*!< in: pathname of a per-table - datafile */ - ulint *format_id) /*!< out: file format of the per-table - data file */ -{ - os_file_t file; - ibool success; - byte buf[UNIV_PAGE_SIZE * 2]; - page_t* page = ut_align(buf, UNIV_PAGE_SIZE); - const byte* ptr; - ib_uint32_t flags; - - *format_id = ULINT_UNDEFINED; - - file = os_file_create_simple_no_error_handling( - innodb_file_data_key, - pathname, - OS_FILE_OPEN, - OS_FILE_READ_ONLY, - &success - ); - if (!success) { - /* The following call prints an error message */ - os_file_get_last_error(TRUE); - - ut_print_timestamp(stderr); - - fprintf(stderr, -" ibbackup: Error: trying to read per-table tablespace format,\n" -" ibbackup: but could not open the tablespace file %s!\n", - pathname - ); - return(FALSE); - } - - /* Read the first page of the per-table datafile */ - - success = os_file_read_no_error_handling( - file, page, 0, 0, UNIV_PAGE_SIZE - ); - if (!success) { - /* The following call prints an error message */ - os_file_get_last_error(TRUE); - - ut_print_timestamp(stderr); - - fprintf(stderr, -" ibbackup: Error: trying to per-table data file format,\n" -" ibbackup: but failed to read the tablespace file %s!\n", - pathname - ); - os_file_close(file); - return(FALSE); - } - os_file_close(file); - - /* get the file format from the page */ - ptr = page + 54; - flags = mach_read_from_4(ptr); - if (flags == 0) { - /* file format is Antelope */ - *format_id = 0; - return (TRUE); - } else if (flags & 1) { - /* tablespace flags are ok */ - *format_id = (flags / 32) % 128; - return (TRUE); - } else { - /* bad tablespace flags */ - return(FALSE); - } -} - - -/*****************************************************************//** -Get the name representation of the file format from its id. -@return pointer to the name */ -UNIV_INTERN -const char* -trx_sys_file_format_id_to_name( -/*===========================*/ - const ulint id) /*!< in: id of the file format */ -{ - if (!(id < FILE_FORMAT_NAME_N)) { - /* unknown id */ - return ("Unknown"); - } - - return(file_format_name_map[id]); -} - -#endif /* !UNIV_HOTBACKUP */ - -#ifndef UNIV_HOTBACKUP -/********************************************************************* -Shutdown/Close the transaction system. */ -UNIV_INTERN -void -trx_sys_close(void) -/*===============*/ -{ - trx_t* trx; - trx_rseg_t* rseg; - read_view_t* view; - - ut_ad(trx_sys != NULL); - ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); - - /* Check that all read views are closed except read view owned - by a purge. */ - - if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) { - fprintf(stderr, - "InnoDB: Error: all read views were not closed" - " before shutdown:\n" - "InnoDB: %lu read views open \n", - UT_LIST_GET_LEN(trx_sys->view_list) - 1); - } - - sess_close(trx_dummy_sess); - trx_dummy_sess = NULL; - - trx_purge_sys_close(); - - mutex_enter(&kernel_mutex); - - /* Free the double write data structures. */ - ut_a(trx_doublewrite != NULL); - ut_free(trx_doublewrite->write_buf_unaligned); - trx_doublewrite->write_buf_unaligned = NULL; - - mem_free(trx_doublewrite->buf_block_arr); - trx_doublewrite->buf_block_arr = NULL; - - mutex_free(&trx_doublewrite->mutex); - mem_free(trx_doublewrite); - trx_doublewrite = NULL; - - /* Only prepared transactions may be left in the system. Free them. */ - ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == trx_n_prepared); - - while ((trx = UT_LIST_GET_FIRST(trx_sys->trx_list)) != NULL) { - trx_free_prepared(trx); - } - - /* There can't be any active transactions. */ - rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); - - while (rseg != NULL) { - trx_rseg_t* prev_rseg = rseg; - - rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg); - UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg); - - trx_rseg_mem_free(prev_rseg); - } - - view = UT_LIST_GET_FIRST(trx_sys->view_list); - - while (view != NULL) { - read_view_t* prev_view = view; - - view = UT_LIST_GET_NEXT(view_list, prev_view); - - /* Views are allocated from the trx_sys->global_read_view_heap. - So, we simply remove the element here. */ - UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view); - } - - ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0); - ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0); - ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0); - ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0); - - ut_ad(trx_sys->descr_n_used == 0); - ut_free(trx_sys->descriptors); - - mem_free(trx_sys); - - trx_sys = NULL; - mutex_exit(&kernel_mutex); -} -#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/trx/trx0sys.cc b/storage/xtradb/trx/trx0sys.cc new file mode 100644 index 00000000000..daa13b8b2c5 --- /dev/null +++ b/storage/xtradb/trx/trx0sys.cc @@ -0,0 +1,1414 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0sys.cc +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0sys.h" + +#ifdef UNIV_NONINL +#include "trx0sys.ic" +#endif + +#ifndef UNIV_HOTBACKUP +#include "fsp0fsp.h" +#include "mtr0log.h" +#include "mtr0log.h" +#include "trx0trx.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0purge.h" +#include "log0log.h" +#include "log0recv.h" +#include "os0file.h" +#include "read0read.h" + +#ifdef WITH_WSREP +#include "ha_prototypes.h" /* wsrep_is_wsrep_xid() */ +#endif /* */ + +/** The file format tag structure with id and name. */ +struct file_format_t { + ulint id; /*!< id of the file format */ + const char* name; /*!< text representation of the + file format */ + ib_mutex_t mutex; /*!< covers changes to the above + fields */ +}; + +/** The transaction system */ +UNIV_INTERN trx_sys_t* trx_sys = NULL; + +/** In a MySQL replication slave, in crash recovery we store the master log +file name and position here. */ +/* @{ */ +/** Master binlog file name */ +UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +/** Master binlog file position. We have successfully got the updates +up to this position. -1 means that no crash recovery was needed, or +there was no master log position info inside InnoDB.*/ +UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1; +/* @} */ + +/** If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. */ +/* @{ */ +/** Binlog file name */ +UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +/** Binlog file position, or -1 if unknown */ +UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1; +/* @} */ +#endif /* !UNIV_HOTBACKUP */ + +/** List of animal names representing file format. */ +static const char* file_format_name_map[] = { + "Antelope", + "Barracuda", + "Cheetah", + "Dragon", + "Elk", + "Fox", + "Gazelle", + "Hornet", + "Impala", + "Jaguar", + "Kangaroo", + "Leopard", + "Moose", + "Nautilus", + "Ocelot", + "Porpoise", + "Quail", + "Rabbit", + "Shark", + "Tiger", + "Urchin", + "Viper", + "Whale", + "Xenops", + "Yak", + "Zebra" +}; + +/** The number of elements in the file format name array. */ +static const ulint FILE_FORMAT_NAME_N + = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]); + +#ifdef UNIV_PFS_MUTEX +/* Key to register the mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key; +UNIV_INTERN mysql_pfs_key_t trx_sys_mutex_key; +#endif /* UNIV_PFS_RWLOCK */ + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_DEBUG +/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ +UNIV_INTERN uint trx_rseg_n_slots_debug = 0; +#endif + +/** This is used to track the maximum file format id known to InnoDB. It's +updated via SET GLOBAL innodb_file_format_max = 'x' or when we open +or create a table. */ +static file_format_t file_format_max; + +#ifdef UNIV_DEBUG +/****************************************************************//** +Checks whether a trx is in one of rw_trx_list or ro_trx_list. +@return TRUE if is in */ +UNIV_INTERN +ibool +trx_in_trx_list( +/*============*/ + const trx_t* in_trx) /*!< in: transaction */ +{ + const trx_t* trx; + trx_list_t* trx_list; + + /* Non-locking autocommits should not hold any locks. */ + assert_trx_in_list(in_trx); + + trx_list = in_trx->read_only + ? &trx_sys->ro_trx_list : &trx_sys->rw_trx_list; + + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_ad(trx_assert_started(in_trx)); + + for (trx = UT_LIST_GET_FIRST(*trx_list); + trx != NULL && trx != in_trx; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + assert_trx_in_list(trx); + ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list)); + } + + return(trx != NULL); +} +#endif /* UNIV_DEBUG */ + +/*****************************************************************//** +Writes the value of max_trx_id to the file based trx system header. */ +UNIV_INTERN +void +trx_sys_flush_max_trx_id(void) +/*==========================*/ +{ + mtr_t mtr; + trx_sysf_t* sys_header; + + ut_ad(mutex_own(&trx_sys->mutex)); + + if (!srv_read_only_mode) { + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + mlog_write_ull( + sys_header + TRX_SYS_TRX_ID_STORE, + trx_sys->max_trx_id, &mtr); + + mtr_commit(&mtr); + } +} + +/*****************************************************************//** +Updates the offset information about the end of the MySQL binlog entry +which corresponds to the transaction just being committed. In a MySQL +replication slave updates the latest master binlog position up to which +replication has proceeded. */ +UNIV_INTERN +void +trx_sys_update_mysql_binlog_offset( +/*===============================*/ + const char* file_name,/*!< in: MySQL log file name */ + ib_int64_t offset, /*!< in: position in that log file */ + ulint field, /*!< in: offset of the MySQL log info field in + the trx sys header */ +#ifdef WITH_WSREP + trx_sysf_t* sys_header, /*!< in: trx sys header */ +#endif /* WITH_WSREP */ + mtr_t* mtr) /*!< in: mtr */ +{ +#ifndef WITH_WSREP + trx_sysf_t* sys_header; +#endif + if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) { + + /* We cannot fit the name to the 512 bytes we have reserved */ + + return; + } + +#ifndef WITH_WSREP + sys_header = trx_sysf_get(mtr); +#endif + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD, + TRX_SYS_MYSQL_LOG_MAGIC_N, + MLOG_4BYTES, mtr); + } + + if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME), + file_name)) { + + mlog_write_string(sys_header + field + + TRX_SYS_MYSQL_LOG_NAME, + (byte*) file_name, 1 + ut_strlen(file_name), + mtr); + } + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0 + || (offset >> 32) > 0) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH, + (ulint)(offset >> 32), + MLOG_4BYTES, mtr); + } + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_LOW, + (ulint)(offset & 0xFFFFFFFFUL), + MLOG_4BYTES, mtr); +} + +/*****************************************************************//** +Stores the MySQL binlog offset info in the trx system header if +the magic number shows it valid, and print the info to stderr */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset(void) +/*===================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + ulint trx_sys_mysql_bin_log_pos_high; + ulint trx_sys_mysql_bin_log_pos_low; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + trx_sys_mysql_bin_log_pos_high = mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH); + trx_sys_mysql_bin_log_pos_low = mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW); + + trx_sys_mysql_bin_log_pos + = (((ib_int64_t) trx_sys_mysql_bin_log_pos_high) << 32) + + (ib_int64_t) trx_sys_mysql_bin_log_pos_low; + + ut_memcpy(trx_sys_mysql_bin_log_name, + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN); + + fprintf(stderr, + "InnoDB: Last MySQL binlog file position %lu %lu," + " file name %s\n", + trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low, + trx_sys_mysql_bin_log_name); + + mtr_commit(&mtr); +} + +#ifdef WITH_WSREP + +void +trx_sys_update_wsrep_checkpoint( + const XID* xid, /*!< in: transaction XID */ + trx_sysf_t* sys_header, /*!< in: sys_header */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(xid && mtr); + ut_a(xid->formatID == -1 || wsrep_is_wsrep_xid((const void *)xid)); + + if (mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD) + != TRX_SYS_WSREP_XID_MAGIC_N) { + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD, + TRX_SYS_WSREP_XID_MAGIC_N, + MLOG_4BYTES, mtr); + } + + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_FORMAT, + (int)xid->formatID, + MLOG_4BYTES, mtr); + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_GTRID_LEN, + (int)xid->gtrid_length, + MLOG_4BYTES, mtr); + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_BQUAL_LEN, + (int)xid->bqual_length, + MLOG_4BYTES, mtr); + mlog_write_string(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_DATA, + (const unsigned char*) xid->data, + XIDDATASIZE, mtr); + +} + +void +trx_sys_read_wsrep_checkpoint(XID* xid) +/*===================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + ulint magic; + + ut_ad(xid); + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if ((magic = mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD)) + != TRX_SYS_WSREP_XID_MAGIC_N) { + memset(xid, 0, sizeof(*xid)); + xid->formatID = -1; + trx_sys_update_wsrep_checkpoint(xid, sys_header, &mtr); + mtr_commit(&mtr); + return; + } + + xid->formatID = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_FORMAT); + xid->gtrid_length = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_GTRID_LEN); + xid->bqual_length = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_BQUAL_LEN); + ut_memcpy(xid->data, + sys_header + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_DATA, + XIDDATASIZE); + + mtr_commit(&mtr); +} + +#endif /* WITH_WSREP */ + +/*****************************************************************//** +Prints to stderr the MySQL master log offset info in the trx system header if +the magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_master_log_pos(void) +/*====================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + fprintf(stderr, + "InnoDB: In a MySQL replication slave the last" + " master binlog file\n" + "InnoDB: position %lu %lu, file name %s\n", + (ulong) mach_read_from_4(sys_header + + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4(sys_header + + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + /* Copy the master log position info to global variables we can + use in ha_innobase.cc to initialize glob_mi to right values */ + + ut_memcpy(trx_sys_mysql_master_log_name, + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, + TRX_SYS_MYSQL_LOG_NAME_LEN); + + trx_sys_mysql_master_log_pos + = (((ib_int64_t) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32) + + ((ib_int64_t) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW)); + mtr_commit(&mtr); +} + +/****************************************************************//** +Looks for a free slot for a rollback segment in the trx system file copy. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +trx_sysf_rseg_find_free( +/*====================*/ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint i; + trx_sysf_t* sys_header; + + sys_header = trx_sysf_get(mtr); + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + ulint page_no; + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/*****************************************************************//** +Creates the file page for the transaction system. This function is called only +at the database creation, before trx_sys_init. */ +static +void +trx_sysf_create( +/*============*/ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_sysf_t* sys_header; + ulint slot_no; + buf_block_t* block; + page_t* page; + ulint page_no; + byte* ptr; + ulint len; + + ut_ad(mtr); + + /* Note that below we first reserve the file space x-latch, and + then enter the kernel: we must do it in this order to conform + to the latching order rules. */ + + mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr); + + /* Create the trx sys file block in a new allocated file segment */ + block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, + mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO); + + page = buf_block_get_frame(block); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS, + MLOG_2BYTES, mtr); + + /* Reset the doublewrite buffer magic number to zero so that we + know that the doublewrite buffer has not yet been created (this + suppresses a Valgrind warning) */ + + mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); + + sys_header = trx_sysf_get(mtr); + + /* Start counting transaction ids from number 1 up */ + mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1); + + /* Reset the rollback segment slots. Old versions of InnoDB + define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect + that the whole array is initialized. */ + ptr = TRX_SYS_RSEGS + sys_header; + len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS) + * TRX_SYS_RSEG_SLOT_SIZE; + memset(ptr, 0xff, len); + ptr += len; + ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END)); + + /* Initialize all of the page. This part used to be uninitialized. */ + memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr); + + mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + + page - sys_header, mtr); + + /* Create the first rollback segment in the SYSTEM tablespace */ + slot_no = trx_sysf_rseg_find_free(mtr); + page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, + mtr); + + ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); + ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO); +} + +/*****************************************************************//** +Compare two trx_rseg_t instances on last_trx_no. */ +static +int +trx_rseg_compare_last_trx_no( +/*=========================*/ + const void* p1, /*!< in: elem to compare */ + const void* p2) /*!< in: elem to compare */ +{ + ib_int64_t cmp; + + const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1; + const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2; + + cmp = rseg_q1->trx_no - rseg_q2->trx_no; + + if (cmp < 0) { + return(-1); + } else if (cmp > 0) { + return(1); + } + + return(0); +} + +/*****************************************************************//** +Creates and initializes the central memory structures for the transaction +system. This is called when the database is started. +@return min binary heap of rsegs to purge */ +UNIV_INTERN +ib_bh_t* +trx_sys_init_at_db_start(void) +/*==========================*/ +{ + mtr_t mtr; + ib_bh_t* ib_bh; + trx_sysf_t* sys_header; + ib_uint64_t rows_to_undo = 0; + const char* unit = ""; + + /* We create the min binary heap here and pass ownership to + purge when we init the purge sub-system. Purge is responsible + for freeing the binary heap. */ + + ib_bh = ib_bh_create( + trx_rseg_compare_last_trx_no, + sizeof(rseg_queue_t), TRX_SYS_N_RSEGS); + + mtr_start(&mtr); + + /* Allocate the trx descriptors array */ + trx_sys->descriptors = static_cast<trx_id_t*>( + ut_malloc(sizeof(trx_id_t) * + TRX_DESCR_ARRAY_INITIAL_SIZE)); + trx_sys->descr_n_max = TRX_DESCR_ARRAY_INITIAL_SIZE; + trx_sys->descr_n_used = 0; + srv_descriptors_memory = TRX_DESCR_ARRAY_INITIAL_SIZE * + sizeof(trx_id_t); + + sys_header = trx_sysf_get(&mtr); + + if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + trx_rseg_array_init(sys_header, ib_bh, &mtr); + } + + /* VERY important: after the database is started, max_trx_id value is + divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in + trx_sys_get_new_trx_id will evaluate to TRUE when the function + is first time called, and the value for trx id will be written + to the disk-based header! Thus trx id values will not overlap when + the database is repeatedly started! */ + + trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN + + ut_uint64_align_up(mach_read_from_8(sys_header + + TRX_SYS_TRX_ID_STORE), + TRX_SYS_TRX_ID_WRITE_MARGIN); + + ut_d(trx_sys->rw_max_trx_id = trx_sys->max_trx_id); + + UT_LIST_INIT(trx_sys->mysql_trx_list); + + trx_dummy_sess = sess_open(); + + trx_lists_init_at_db_start(); + + /* This S lock is not strictly required, it is here only to satisfy + the debug code (assertions). We are still running in single threaded + bootstrap mode. */ + + mutex_enter(&trx_sys->mutex); + + ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0); + + if (UT_LIST_GET_LEN(trx_sys->rw_trx_list) > 0) { + const trx_t* trx; + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + ut_ad(trx->is_recovered); + assert_trx_in_rw_list(trx); + + if (trx_state_eq(trx, TRX_STATE_ACTIVE)) { + rows_to_undo += trx->undo_no; + } + } + + if (rows_to_undo > 1000000000) { + unit = "M"; + rows_to_undo = rows_to_undo / 1000000; + } + + fprintf(stderr, + "InnoDB: %lu transaction(s) which must be" + " rolled back or cleaned up\n" + "InnoDB: in total %lu%s row operations to undo\n", + (ulong) UT_LIST_GET_LEN(trx_sys->rw_trx_list), + (ulong) rows_to_undo, unit); + + fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n", + trx_sys->max_trx_id); + } + + mutex_exit(&trx_sys->mutex); + + UT_LIST_INIT(trx_sys->view_list); + + mtr_commit(&mtr); + + return(ib_bh); +} + +/*****************************************************************//** +Creates the trx_sys instance and initializes ib_bh and mutex. */ +UNIV_INTERN +void +trx_sys_create(void) +/*================*/ +{ + ut_ad(trx_sys == NULL); + + trx_sys = static_cast<trx_sys_t*>(mem_zalloc(sizeof(*trx_sys))); + + mutex_create(trx_sys_mutex_key, &trx_sys->mutex, SYNC_TRX_SYS); +} + +/*****************************************************************//** +Creates and initializes the transaction system at the database creation. */ +UNIV_INTERN +void +trx_sys_create_sys_pages(void) +/*==========================*/ +{ + mtr_t mtr; + + mtr_start(&mtr); + + trx_sysf_create(&mtr); + + mtr_commit(&mtr); +} + +/*****************************************************************//** +Update the file format tag. +@return always TRUE */ +static +ibool +trx_sys_file_format_max_write( +/*==========================*/ + ulint format_id, /*!< in: file format id */ + const char** name) /*!< out: max file format name, can + be NULL */ +{ + mtr_t mtr; + byte* ptr; + buf_block_t* block; + ib_uint64_t tag_value; + + mtr_start(&mtr); + + block = buf_page_get( + TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + + file_format_max.id = format_id; + file_format_max.name = trx_sys_file_format_id_to_name(format_id); + + ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; + tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (name) { + *name = file_format_max.name; + } + + mlog_write_ull(ptr, tag_value, &mtr); + + mtr_commit(&mtr); + + return(TRUE); +} + +/*****************************************************************//** +Read the file format tag. +@return the file format or ULINT_UNDEFINED if not set. */ +static +ulint +trx_sys_file_format_max_read(void) +/*==============================*/ +{ + mtr_t mtr; + const byte* ptr; + const buf_block_t* block; + ib_id_t file_format_id; + + /* Since this is called during the startup phase it's safe to + read the value without a covering mutex. */ + mtr_start(&mtr); + + block = buf_page_get( + TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + + ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; + file_format_id = mach_read_from_8(ptr); + + mtr_commit(&mtr); + + file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (file_format_id >= FILE_FORMAT_NAME_N) { + + /* Either it has never been tagged, or garbage in it. */ + return(ULINT_UNDEFINED); + } + + return((ulint) file_format_id); +} + +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id) /*!< in: id of the file format */ +{ + ut_a(id < FILE_FORMAT_NAME_N); + + return(file_format_name_map[id]); +} + +/*****************************************************************//** +Check for the max file format tag stored on disk. Note: If max_format_id +is == UNIV_FORMAT_MAX + 1 then we only print a warning. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +trx_sys_file_format_max_check( +/*==========================*/ + ulint max_format_id) /*!< in: max format id to check */ +{ + ulint format_id; + + /* Check the file format in the tablespace. Do not try to + recover if the file format is not supported by the engine + unless forced by the user. */ + format_id = trx_sys_file_format_max_read(); + if (format_id == ULINT_UNDEFINED) { + /* Format ID was not set. Set it to minimum possible + value. */ + format_id = UNIV_FORMAT_MIN; + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Highest supported file format is %s.", + trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX)); + + if (format_id > UNIV_FORMAT_MAX) { + + ut_a(format_id < FILE_FORMAT_NAME_N); + + ib_logf(max_format_id <= UNIV_FORMAT_MAX + ? IB_LOG_LEVEL_ERROR : IB_LOG_LEVEL_WARN, + "The system tablespace is in a file " + "format that this version doesn't support - %s.", + trx_sys_file_format_id_to_name(format_id)); + + if (max_format_id <= UNIV_FORMAT_MAX) { + return(DB_ERROR); + } + } + + format_id = (format_id > max_format_id) ? format_id : max_format_id; + + /* We don't need a mutex here, as this function should only + be called once at start up. */ + file_format_max.id = format_id; + file_format_max.name = trx_sys_file_format_id_to_name(format_id); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Set the file format id unconditionally except if it's already the +same value. +@return TRUE if value updated */ +UNIV_INTERN +ibool +trx_sys_file_format_max_set( +/*========================*/ + ulint format_id, /*!< in: file format id */ + const char** name) /*!< out: max file format name or + NULL if not needed. */ +{ + ibool ret = FALSE; + + ut_a(format_id <= UNIV_FORMAT_MAX); + + mutex_enter(&file_format_max.mutex); + + /* Only update if not already same value. */ + if (format_id != file_format_max.id) { + + ret = trx_sys_file_format_max_write(format_id, name); + } + + mutex_exit(&file_format_max.mutex); + + return(ret); +} + +/********************************************************************//** +Tags the system table space with minimum format id if it has not been +tagged yet. +WARNING: This function is only called during the startup and AFTER the +redo log application during recovery has finished. */ +UNIV_INTERN +void +trx_sys_file_format_tag_init(void) +/*==============================*/ +{ + ulint format_id; + + format_id = trx_sys_file_format_max_read(); + + /* If format_id is not set then set it to the minimum. */ + if (format_id == ULINT_UNDEFINED) { + trx_sys_file_format_max_set(UNIV_FORMAT_MIN, NULL); + } +} + +/********************************************************************//** +Update the file format tag in the system tablespace only if the given +format id is greater than the known max id. +@return TRUE if format_id was bigger than the known max id */ +UNIV_INTERN +ibool +trx_sys_file_format_max_upgrade( +/*============================*/ + const char** name, /*!< out: max file format name */ + ulint format_id) /*!< in: file format identifier */ +{ + ibool ret = FALSE; + + ut_a(name); + ut_a(file_format_max.name != NULL); + ut_a(format_id <= UNIV_FORMAT_MAX); + + mutex_enter(&file_format_max.mutex); + + if (format_id > file_format_max.id) { + + ret = trx_sys_file_format_max_write(format_id, name); + } + + mutex_exit(&file_format_max.mutex); + + return(ret); +} + +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the max format name */ +UNIV_INTERN +const char* +trx_sys_file_format_max_get(void) +/*=============================*/ +{ + return(file_format_max.name); +} + +/*****************************************************************//** +Initializes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_init(void) +/*==========================*/ +{ + mutex_create(file_format_max_mutex_key, + &file_format_max.mutex, SYNC_FILE_FORMAT_TAG); + + /* We don't need a mutex here, as this function should only + be called once at start up. */ + file_format_max.id = UNIV_FORMAT_MIN; + + file_format_max.name = trx_sys_file_format_id_to_name( + file_format_max.id); +} + +/*****************************************************************//** +Closes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_close(void) +/*===========================*/ +{ + /* Does nothing at the moment */ +} + +/********************************************************************* +Creates the rollback segments. +@return number of rollback segments that are active. */ +UNIV_INTERN +ulint +trx_sys_create_rsegs( +/*=================*/ + ulint n_spaces, /*!< number of tablespaces for UNDO logs */ + ulint n_rsegs) /*!< number of rollback segments to create */ +{ + mtr_t mtr; + ulint n_used; + + ut_a(n_spaces < TRX_SYS_N_RSEGS); + ut_a(n_rsegs <= TRX_SYS_N_RSEGS); + + if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO || srv_read_only_mode) { + return(ULINT_UNDEFINED); + } + + /* This is executed in single-threaded mode therefore it is not + necessary to use the same mtr in trx_rseg_create(). n_used cannot + change while the function is executing. */ + + mtr_start(&mtr); + n_used = trx_sysf_rseg_find_free(&mtr); + mtr_commit(&mtr); + + if (n_used == ULINT_UNDEFINED) { + n_used = TRX_SYS_N_RSEGS; + } + + /* Do not create additional rollback segments if innodb_force_recovery + has been set and the database was not shutdown cleanly. */ + + if (!srv_force_recovery && !recv_needed_recovery && n_used < n_rsegs) { + ulint i; + ulint new_rsegs = n_rsegs - n_used; + + for (i = 0; i < new_rsegs; ++i) { + ulint space; + + /* Tablespace 0 is the system tablespace. All UNDO + log tablespaces start from 1. */ + + if (n_spaces > 0) { + space = (i % n_spaces) + 1; + } else { + space = 0; /* System tablespace */ + } + + if (trx_rseg_create(space) != NULL) { + ++n_used; + } else { + break; + } + } + } + + ib_logf(IB_LOG_LEVEL_INFO, + "%lu rollback segment(s) are active.", n_used); + + return(n_used); +} + +#else /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Prints to stderr the MySQL binlog info in the system header if the +magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset_from_page( +/*========================================*/ + const byte* page) /*!< in: buffer containing the trx + system header page, i.e., page number + TRX_SYS_PAGE_NO in the tablespace */ +{ + const trx_sysf_t* sys_header; + + sys_header = page + TRX_SYS; + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + == TRX_SYS_MYSQL_LOG_MAGIC_N) { + + fprintf(stderr, + "ibbackup: Last MySQL binlog file position %lu %lu," + " file name %s\n", + (ulong) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + } +} + +/*****************************************************************//** +Reads the file format id from the first system table space file. +Even if the call succeeds and returns TRUE, the returned format id +may be ULINT_UNDEFINED signalling that the format id was not present +in the data file. +@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_file_format_id( +/*========================*/ + const char *pathname, /*!< in: pathname of the first system + table space file */ + ulint *format_id) /*!< out: file format of the system table + space */ +{ + os_file_t file; + ibool success; + byte buf[UNIV_PAGE_SIZE * 2]; + page_t* page = ut_align(buf, UNIV_PAGE_SIZE); + const byte* ptr; + ib_id_t file_format_id; + + *format_id = ULINT_UNDEFINED; + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, + pathname, + OS_FILE_OPEN, + OS_FILE_READ_ONLY, + &success + ); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " ibbackup: Error: trying to read system tablespace " + "file format,\n" + " ibbackup: but could not open the tablespace " + "file %s!\n", pathname); + return(FALSE); + } + + /* Read the page on which file format is stored */ + + success = os_file_read_no_error_handling( + file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, UNIV_PAGE_SIZE); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " ibbackup: Error: trying to read system tablespace " + "file format,\n" + " ibbackup: but failed to read the tablespace " + "file %s!\n", pathname); + + os_file_close(file); + return(FALSE); + } + os_file_close(file); + + /* get the file format from the page */ + ptr = page + TRX_SYS_FILE_FORMAT_TAG; + file_format_id = mach_read_from_8(ptr); + file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (file_format_id >= FILE_FORMAT_NAME_N) { + + /* Either it has never been tagged, or garbage in it. */ + return(TRUE); + } + + *format_id = (ulint) file_format_id; + + return(TRUE); +} + +/*****************************************************************//** +Reads the file format id from the given per-table data file. +@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_pertable_file_format_id( +/*=================================*/ + const char *pathname, /*!< in: pathname of a per-table + datafile */ + ulint *format_id) /*!< out: file format of the per-table + data file */ +{ + os_file_t file; + ibool success; + byte buf[UNIV_PAGE_SIZE * 2]; + page_t* page = ut_align(buf, UNIV_PAGE_SIZE); + const byte* ptr; + ib_uint32_t flags; + + *format_id = ULINT_UNDEFINED; + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, + pathname, + OS_FILE_OPEN, + OS_FILE_READ_ONLY, + &success + ); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " ibbackup: Error: trying to read per-table " + "tablespace format,\n" + " ibbackup: but could not open the tablespace " + "file %s!\n", pathname); + + return(FALSE); + } + + /* Read the first page of the per-table datafile */ + + success = os_file_read_no_error_handling(file, page, 0, UNIV_PAGE_SIZE); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " ibbackup: Error: trying to per-table data file " + "format,\n" + " ibbackup: but failed to read the tablespace " + "file %s!\n", pathname); + + os_file_close(file); + return(FALSE); + } + os_file_close(file); + + /* get the file format from the page */ + ptr = page + 54; + flags = mach_read_from_4(ptr); + if (flags == 0) { + /* file format is Antelope */ + *format_id = 0; + return(TRUE); + } else if (flags & 1) { + /* tablespace flags are ok */ + *format_id = (flags / 32) % 128; + return(TRUE); + } else { + /* bad tablespace flags */ + return(FALSE); + } +} + + +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id) /*!< in: id of the file format */ +{ + if (!(id < FILE_FORMAT_NAME_N)) { + /* unknown id */ + return("Unknown"); + } + + return(file_format_name_map[id]); +} + +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_HOTBACKUP +/********************************************************************* +Shutdown/Close the transaction system. */ +UNIV_INTERN +void +trx_sys_close(void) +/*===============*/ +{ + ulint i; + trx_t* trx; + read_view_t* view; + + ut_ad(trx_sys != NULL); + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + + /* Check that all read views are closed except read view owned + by a purge. */ + + mutex_enter(&trx_sys->mutex); + + if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) { + fprintf(stderr, + "InnoDB: Error: all read views were not closed" + " before shutdown:\n" + "InnoDB: %lu read views open \n", + UT_LIST_GET_LEN(trx_sys->view_list) - 1); + } + + mutex_exit(&trx_sys->mutex); + + sess_close(trx_dummy_sess); + trx_dummy_sess = NULL; + + trx_purge_sys_close(); + + /* Free the double write data structures. */ + buf_dblwr_free(); + + mutex_enter(&trx_sys->mutex); + + ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0); + + /* Only prepared transactions may be left in the system. Free them. */ + ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == trx_sys->n_prepared_trx); + + while ((trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list)) != NULL) { + trx_free_prepared(trx); + } + + /* There can't be any active transactions. */ + for (i = 0; i < TRX_SYS_N_RSEGS; ++i) { + trx_rseg_t* rseg; + + rseg = trx_sys->rseg_array[i]; + + if (rseg != NULL) { + trx_rseg_mem_free(rseg); + } else { + break; + } + } + + view = UT_LIST_GET_FIRST(trx_sys->view_list); + + while (view != NULL) { + read_view_t* prev_view = view; + + view = UT_LIST_GET_NEXT(view_list, prev_view); + + /* Views are allocated from the trx_sys->global_read_view_heap. + So, we simply remove the element here. */ + UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view); + } + + ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0); + + mutex_exit(&trx_sys->mutex); + + mutex_free(&trx_sys->mutex); + + ut_ad(trx_sys->descr_n_used == 0); + ut_free(trx_sys->descriptors); + + mem_free(trx_sys); + + trx_sys = NULL; +} + +/********************************************************************* +Check if there are any active (non-prepared) transactions. +@return total number of active transactions or 0 if none */ +UNIV_INTERN +ulint +trx_sys_any_active_transactions(void) +/*=================================*/ +{ + ulint total_trx = 0; + + mutex_enter(&trx_sys->mutex); + + total_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list) + + UT_LIST_GET_LEN(trx_sys->mysql_trx_list); + + ut_a(total_trx >= trx_sys->n_prepared_trx); + total_trx -= trx_sys->n_prepared_trx; + + mutex_exit(&trx_sys->mutex); + + return(total_trx); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Validate the trx_list_t. +@return TRUE if valid. */ +static +ibool +trx_sys_validate_trx_list_low( +/*===========================*/ + trx_list_t* trx_list) /*!< in: &trx_sys->ro_trx_list + or &trx_sys->rw_trx_list */ +{ + const trx_t* trx; + const trx_t* prev_trx = NULL; + + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_ad(trx_list == &trx_sys->ro_trx_list + || trx_list == &trx_sys->rw_trx_list); + + for (trx = UT_LIST_GET_FIRST(*trx_list); + trx != NULL; + prev_trx = trx, trx = UT_LIST_GET_NEXT(trx_list, prev_trx)) { + + assert_trx_in_list(trx); + ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list)); + + ut_a(prev_trx == NULL || prev_trx->id > trx->id); + } + + return(TRUE); +} + +/*************************************************************//** +Validate the trx_sys_t::ro_trx_list and trx_sys_t::rw_trx_list. +@return TRUE if lists are valid. */ +UNIV_INTERN +ibool +trx_sys_validate_trx_list(void) +/*===========================*/ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_a(trx_sys_validate_trx_list_low(&trx_sys->ro_trx_list)); + ut_a(trx_sys_validate_trx_list_low(&trx_sys->rw_trx_list)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c deleted file mode 100644 index 818ba970118..00000000000 --- a/storage/xtradb/trx/trx0trx.c +++ /dev/null @@ -1,2482 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*****************************************************************************/ - -/**************************************************//** -@file trx/trx0trx.c -The transaction - -Created 3/26/1996 Heikki Tuuri -*******************************************************/ - -#include "trx0trx.h" - -#ifdef UNIV_NONINL -#include "trx0trx.ic" -#endif - -#include "trx0undo.h" -#include "trx0rseg.h" -#include "log0log.h" -#include "que0que.h" -#include "lock0lock.h" -#include "trx0roll.h" -#include "usr0sess.h" -#include "read0read.h" -#include "srv0srv.h" -#include "btr0sea.h" -#include "os0proc.h" -#include "trx0xa.h" -#include "trx0purge.h" -#include "ha_prototypes.h" - -/** Dummy session used currently in MySQL interface */ -UNIV_INTERN sess_t* trx_dummy_sess = NULL; - -/** Number of transactions currently allocated for MySQL: protected by -the kernel mutex */ -UNIV_INTERN ulint trx_n_mysql_transactions = 0; -/** Number of transactions currently in the XA PREPARED state: protected by -the kernel mutex */ -UNIV_INTERN ulint trx_n_prepared = 0; - -#ifdef UNIV_PFS_MUTEX -/* Key to register the mutex with performance schema */ -UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key; -#endif /* UNIV_PFS_MUTEX */ - -/*************************************************************//** -Set detailed error message for the transaction. */ -UNIV_INTERN -void -trx_set_detailed_error( -/*===================*/ - trx_t* trx, /*!< in: transaction struct */ - const char* msg) /*!< in: detailed error message */ -{ - ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error)); -} - -/*************************************************************//** -Set detailed error message for the transaction from a file. Note that the -file is rewinded before reading from it. */ -UNIV_INTERN -void -trx_set_detailed_error_from_file( -/*=============================*/ - trx_t* trx, /*!< in: transaction struct */ - FILE* file) /*!< in: file to read message from */ -{ - os_file_read_string(file, trx->detailed_error, - sizeof(trx->detailed_error)); -} - -/*************************************************************//** -Callback function for trx_find_descriptor() to compare trx IDs. */ -UNIV_INTERN -int -trx_descr_cmp( -/*==========*/ - const void *a, /*!< in: pointer to first comparison argument */ - const void *b) /*!< in: pointer to second comparison argument */ -{ - const trx_id_t* da = (const trx_id_t*) a; - const trx_id_t* db = (const trx_id_t*) b; - - if (*da < *db) { - return -1; - } else if (*da > *db) { - return 1; - } - - return 0; -} - -/*************************************************************//** -Reserve a slot for a given trx in the global descriptors array. */ -UNIV_INLINE -void -trx_reserve_descriptor( -/*===================*/ - const trx_t* trx) /*!< in: trx pointer */ -{ - ulint n_used; - ulint n_max; - trx_id_t* descr; - - ut_ad(mutex_own(&kernel_mutex)); - ut_ad(!trx_find_descriptor(trx_sys->descriptors, - trx_sys->descr_n_used, - trx->id)); - - n_used = trx_sys->descr_n_used + 1; - n_max = trx_sys->descr_n_max; - - if (UNIV_UNLIKELY(n_used > n_max)) { - - n_max = n_max * 2; - - trx_sys->descriptors = - ut_realloc(trx_sys->descriptors, - n_max * sizeof(trx_id_t)); - - trx_sys->descr_n_max = n_max; - srv_descriptors_memory = n_max * sizeof(trx_id_t); - } - - descr = trx_sys->descriptors + n_used - 1; - - if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) { - - /* Find the slot where it should be inserted. We could use a - binary search, but in reality linear search should be faster, - because the slot we are looking for is near the array end. */ - - trx_id_t* tdescr; - - for (tdescr = descr - 1; - tdescr >= trx_sys->descriptors && *tdescr > trx->id; - tdescr--) { - } - - tdescr++; - - ut_memmove(tdescr + 1, tdescr, (descr - tdescr) * - sizeof(trx_id_t)); - - descr = tdescr; - } - - *descr = trx->id; - - trx_sys->descr_n_used = n_used; -} - -/*************************************************************//** -Release a slot for a given trx in the global descriptors array. */ -UNIV_INTERN -void -trx_release_descriptor( -/*===================*/ - trx_t* trx) /*!< in: trx pointer */ -{ - ulint size; - trx_id_t* descr; - - ut_ad(mutex_own(&kernel_mutex)); - - if (UNIV_LIKELY(trx->is_in_trx_serial_list)) { - - UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list, - trx); - trx->is_in_trx_serial_list = 0; - } - - descr = trx_find_descriptor(trx_sys->descriptors, - trx_sys->descr_n_used, - trx->id); - - if (UNIV_UNLIKELY(descr == NULL)) { - - return; - } - - size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) * - sizeof(trx_id_t); - - if (UNIV_LIKELY(size > 0)) { - - ut_memmove(descr, descr + 1, size); - } - - trx_sys->descr_n_used--; -} - -/****************************************************************//** -Creates and initializes a transaction object. -@return own: the transaction */ -UNIV_INTERN -trx_t* -trx_create( -/*=======*/ - sess_t* sess) /*!< in: session */ -{ - trx_t* trx; - - ut_ad(mutex_own(&kernel_mutex)); - ut_ad(sess); - - trx = mem_alloc(sizeof(trx_t)); - - trx->magic_n = TRX_MAGIC_N; - - trx->op_info = ""; - - trx->is_purge = 0; - trx->is_recovered = 0; - trx->state = TRX_NOT_STARTED; - - trx->is_registered = 0; - trx->active_commit_ordered = 0; - - trx->start_time = ut_time(); - - trx->isolation_level = TRX_ISO_REPEATABLE_READ; - - trx->id = 0; - trx->no = IB_ULONGLONG_MAX; - trx->is_in_trx_serial_list = 0; - - trx->support_xa = TRUE; - - trx->fake_changes = FALSE; - - trx->check_foreigns = TRUE; - trx->check_unique_secondary = TRUE; - - trx->flush_log_later = FALSE; - trx->must_flush_log_later = FALSE; - - trx->dict_operation = TRX_DICT_OP_NONE; - trx->table_id = 0; - - trx->mysql_thd = NULL; - trx->duplicates = 0; - - trx->n_mysql_tables_in_use = 0; - trx->mysql_n_tables_locked = 0; - - trx->mysql_log_file_name = NULL; - trx->mysql_log_offset = 0; - trx->mysql_master_log_file_name = ""; - trx->mysql_master_log_pos = 0; - trx->mysql_relay_log_file_name = ""; - trx->mysql_relay_log_pos = 0; - - trx->idle_start = 0; - trx->last_stmt_start = 0; - - mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO); - - trx->rseg = NULL; - - trx->undo_no = 0; - trx->last_sql_stat_start.least_undo_no = 0; - trx->insert_undo = NULL; - trx->update_undo = NULL; - trx->undo_no_arr = NULL; - - trx->error_state = DB_SUCCESS; - trx->error_key_num = 0; - trx->detailed_error[0] = '\0'; - - trx->sess = sess; - trx->que_state = TRX_QUE_RUNNING; - trx->n_active_thrs = 0; - - trx->handling_signals = FALSE; - - UT_LIST_INIT(trx->signals); - UT_LIST_INIT(trx->reply_signals); - - trx->graph = NULL; - - trx->wait_lock = NULL; - trx->was_chosen_as_deadlock_victim = FALSE; - UT_LIST_INIT(trx->wait_thrs); - - trx->lock_heap = mem_heap_create_in_buffer(256); - UT_LIST_INIT(trx->trx_locks); - - UT_LIST_INIT(trx->trx_savepoints); - - trx->dict_operation_lock_mode = 0; - trx->has_search_latch = FALSE; - trx->search_latch_timeout = BTR_SEA_TIMEOUT; - - trx->declared_to_be_inside_innodb = FALSE; - trx->n_tickets_to_enter_innodb = 0; - - trx->global_read_view = NULL; - trx->read_view = NULL; - trx->prebuilt_view = NULL; - - trx->io_reads = 0; - trx->io_read = 0; - trx->io_reads_wait_timer = 0; - trx->lock_que_wait_timer = 0; - trx->innodb_que_wait_timer = 0; - trx->distinct_page_access = 0; - trx->distinct_page_access_hash = NULL; - trx->take_stats = FALSE; - - /* Set X/Open XA transaction identification to NULL */ - memset(&trx->xid, 0, sizeof(trx->xid)); - trx->xid.formatID = -1; - - trx->n_autoinc_rows = 0; - - /* Remember to free the vector explicitly. */ - trx->autoinc_locks = ib_vector_create( - mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4); -#ifdef WITH_WSREP - trx->wsrep_event = NULL; -#endif /* WITH_WSREP */ - - return(trx); -} - -/********************************************************************//** -Creates a transaction object for MySQL. -@return own: transaction object */ -UNIV_INTERN -trx_t* -trx_allocate_for_mysql(void) -/*========================*/ -{ - trx_t* trx; - - mutex_enter(&kernel_mutex); - - trx = trx_create(trx_dummy_sess); - - trx_n_mysql_transactions++; - - UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx); - - mutex_exit(&kernel_mutex); - - if (UNIV_UNLIKELY(trx->take_stats)) { - trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE); - memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); - } - - return(trx); -} - -/********************************************************************//** -Creates a transaction object for background operations by the master thread. -@return own: transaction object */ -UNIV_INTERN -trx_t* -trx_allocate_for_background(void) -/*=============================*/ -{ - trx_t* trx; - - mutex_enter(&kernel_mutex); - - trx = trx_create(trx_dummy_sess); - - mutex_exit(&kernel_mutex); - - return(trx); -} - -/********************************************************************//** -Releases the search latch if trx has reserved it. */ -UNIV_INTERN -void -trx_search_latch_release_if_reserved( -/*=================================*/ - trx_t* trx) /*!< in: transaction */ -{ - ulint i; - - if (trx->has_search_latch) { - for (i = 0; i < btr_search_index_num; i++) { - if (trx->has_search_latch & ((ulint)1 << i)) { - rw_lock_s_unlock(btr_search_latch_part[i]); - } - } - - trx->has_search_latch = FALSE; - } -} - -/********************************************************************//** -Frees a transaction object. */ -UNIV_INTERN -void -trx_free( -/*=====*/ - trx_t* trx) /*!< in, own: trx object */ -{ - ut_ad(mutex_own(&kernel_mutex)); - - if (trx->declared_to_be_inside_innodb) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: Freeing a trx which is declared" - " to be processing\n" - "InnoDB: inside InnoDB.\n", stderr); - trx_print(stderr, trx, 600); - putc('\n', stderr); - - /* This is an error but not a fatal error. We must keep - the counters like srv_conc_n_threads accurate. */ - srv_conc_force_exit_innodb(trx); - } - - if (trx->n_mysql_tables_in_use != 0 - || trx->mysql_n_tables_locked != 0) { - - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: MySQL is freeing a thd\n" - "InnoDB: though trx->n_mysql_tables_in_use is %lu\n" - "InnoDB: and trx->mysql_n_tables_locked is %lu.\n", - (ulong)trx->n_mysql_tables_in_use, - (ulong)trx->mysql_n_tables_locked); - - trx_print(stderr, trx, 600); - - ut_print_buf(stderr, trx, sizeof(trx_t)); - putc('\n', stderr); - } - - ut_a(trx->magic_n == TRX_MAGIC_N); - - trx->magic_n = 11112222; - - ut_a(trx->state == TRX_NOT_STARTED); - - mutex_free(&(trx->undo_mutex)); - - ut_a(trx->insert_undo == NULL); - ut_a(trx->update_undo == NULL); - - if (trx->undo_no_arr) { - trx_undo_arr_free(trx->undo_no_arr); - } - - ut_a(UT_LIST_GET_LEN(trx->signals) == 0); - ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0); - - ut_a(trx->wait_lock == NULL); - ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0); - - ut_a(!trx->has_search_latch); - - ut_a(trx->dict_operation_lock_mode == 0); - - if (trx->lock_heap) { - mem_heap_free(trx->lock_heap); - } - - ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0); - - if (trx->prebuilt_view != NULL) { - read_view_free(trx->prebuilt_view); - } - - ut_a(trx->read_view == NULL); - - ut_a(ib_vector_is_empty(trx->autoinc_locks)); - /* We allocated a dedicated heap for the vector. */ - ib_vector_free(trx->autoinc_locks); - - trx_release_descriptor(trx); - - mem_free(trx); -} - -/********************************************************************//** -At shutdown, frees a transaction object that is in the PREPARED state. */ -UNIV_INTERN -void -trx_free_prepared( -/*==============*/ - trx_t* trx) /*!< in, own: trx object */ -{ - ut_ad(mutex_own(&kernel_mutex)); - ut_a(trx->state == TRX_PREPARED); - ut_a(trx->magic_n == TRX_MAGIC_N); - - /* Prepared transactions are sort of active; they allow - ROLLBACK and COMMIT operations. Because the system does not - contain any other transactions than prepared transactions at - the shutdown stage and because a transaction cannot become - PREPARED while holding locks, it is safe to release the locks - held by PREPARED transactions here at shutdown.*/ - lock_release_off_kernel(trx); - - trx_undo_free_prepared(trx); - - mutex_free(&trx->undo_mutex); - - if (trx->undo_no_arr) { - trx_undo_arr_free(trx->undo_no_arr); - } - - ut_a(UT_LIST_GET_LEN(trx->signals) == 0); - ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0); - - ut_a(trx->wait_lock == NULL); - ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0); - - ut_a(!trx->has_search_latch); - - ut_a(trx->dict_operation_lock_mode == 0); - - if (trx->lock_heap) { - mem_heap_free(trx->lock_heap); - } - - ut_a(ib_vector_is_empty(trx->autoinc_locks)); - ib_vector_free(trx->autoinc_locks); - - trx_release_descriptor(trx); - - if (trx->prebuilt_view != NULL) { - read_view_free(trx->prebuilt_view); - } - - UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); - - ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list)); - - mem_free(trx); -} - -/********************************************************************//** -Frees a transaction object for MySQL. */ -UNIV_INTERN -void -trx_free_for_mysql( -/*===============*/ - trx_t* trx) /*!< in, own: trx object */ -{ - if (trx->distinct_page_access_hash) - { - mem_free(trx->distinct_page_access_hash); - trx->distinct_page_access_hash= NULL; - } - - mutex_enter(&kernel_mutex); - - UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx); - - trx_free(trx); - - ut_a(trx_n_mysql_transactions > 0); - - trx_n_mysql_transactions--; - - mutex_exit(&kernel_mutex); -} - -/********************************************************************//** -Frees a transaction object of a background operation of the master thread. */ -UNIV_INTERN -void -trx_free_for_background( -/*====================*/ - trx_t* trx) /*!< in, own: trx object */ -{ - if (trx->distinct_page_access_hash) - { - mem_free(trx->distinct_page_access_hash); - trx->distinct_page_access_hash= NULL; - } - - mutex_enter(&kernel_mutex); - - trx_free(trx); - - mutex_exit(&kernel_mutex); -} - -/****************************************************************//** -Inserts the trx handle in the trx system trx list in the right position. -The list is sorted on the trx id so that the biggest id is at the list -start. This function is used at the database startup to insert incomplete -transactions to the list. */ -static -void -trx_list_insert_ordered( -/*====================*/ - trx_t* trx) /*!< in: trx handle */ -{ - trx_t* trx2; - - ut_ad(mutex_own(&kernel_mutex)); - - trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list); - - while (trx2 != NULL) { - if (trx->id >= trx2->id) { - - ut_ad(trx->id > trx2->id); - break; - } - trx2 = UT_LIST_GET_NEXT(trx_list, trx2); - } - - if (trx2 != NULL) { - trx2 = UT_LIST_GET_PREV(trx_list, trx2); - - if (trx2 == NULL) { - UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx); - } else { - UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list, - trx2, trx); - } - } else { - UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx); - } -} - -/****************************************************************//** -Creates trx objects for transactions and initializes the trx list of -trx_sys at database start. Rollback segment and undo log lists must -already exist when this function is called, because the lists of -transactions to be rolled back or cleaned up are built based on the -undo log lists. */ -UNIV_INTERN -void -trx_lists_init_at_db_start(void) -/*============================*/ -{ - trx_rseg_t* rseg; - trx_undo_t* undo; - trx_t* trx; - - ut_ad(mutex_own(&kernel_mutex)); - UT_LIST_INIT(trx_sys->trx_list); - UT_LIST_INIT(trx_sys->trx_serial_list); - - /* Look from the rollback segments if there exist undo logs for - transactions */ - - rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); - - while (rseg != NULL) { - undo = UT_LIST_GET_FIRST(rseg->insert_undo_list); - - while (undo != NULL) { - - trx = trx_create(trx_dummy_sess); - - trx->is_recovered = TRUE; - trx->id = undo->trx_id; - trx->xid = undo->xid; - trx->insert_undo = undo; - trx->rseg = rseg; - - if (undo->state != TRX_UNDO_ACTIVE) { - - /* Prepared transactions are left in - the prepared state waiting for a - commit or abort decision from MySQL */ - - if (undo->state == TRX_UNDO_PREPARED) { - - fprintf(stderr, - "InnoDB: Transaction " - TRX_ID_FMT - " was in the" - " XA prepared state.\n", - (ullint) trx->id); - - if (srv_force_recovery == 0) { - - trx->state = TRX_PREPARED; - trx_n_prepared++; - } else { - fprintf(stderr, - "InnoDB: Since" - " innodb_force_recovery" - " > 0, we will" - " rollback it" - " anyway.\n"); - - trx->state = TRX_ACTIVE; - } - - trx_reserve_descriptor(trx); - } else { - trx->state = TRX_COMMITTED_IN_MEMORY; - } - - /* We give a dummy value for the trx no; - this should have no relevance since purge - is not interested in committed transaction - numbers, unless they are in the history - list, in which case it looks the number - from the disk based undo log structure */ - - trx->no = trx->id; - } else { - trx->state = TRX_ACTIVE; - - /* A running transaction always has the number - field inited to IB_ULONGLONG_MAX */ - - trx->no = IB_ULONGLONG_MAX; - - trx_reserve_descriptor(trx); - - } - - if (undo->dict_operation) { - trx_set_dict_operation( - trx, TRX_DICT_OP_TABLE); - trx->table_id = undo->table_id; - } - - if (!undo->empty) { - trx->undo_no = undo->top_undo_no + 1; - } - - trx_list_insert_ordered(trx); - - undo = UT_LIST_GET_NEXT(undo_list, undo); - } - - undo = UT_LIST_GET_FIRST(rseg->update_undo_list); - - while (undo != NULL) { - trx = trx_get_on_id(undo->trx_id); - - if (NULL == trx) { - trx = trx_create(trx_dummy_sess); - - trx->is_recovered = TRUE; - trx->id = undo->trx_id; - trx->xid = undo->xid; - - if (undo->state != TRX_UNDO_ACTIVE) { - - /* Prepared transactions are left in - the prepared state waiting for a - commit or abort decision from MySQL */ - - if (undo->state == TRX_UNDO_PREPARED) { - fprintf(stderr, - "InnoDB: Transaction " - TRX_ID_FMT " was in the" - " XA prepared state.\n", - (ullint) trx->id); - - if (srv_force_recovery == 0) { - - trx->state - = TRX_PREPARED; - trx_n_prepared++; - } else { - fprintf(stderr, - "InnoDB: Since" - " innodb_force_recovery" - " > 0, we will" - " rollback it" - " anyway.\n"); - - trx->state = TRX_ACTIVE; - trx_reserve_descriptor( - trx); - } - } else { - trx->state - = TRX_COMMITTED_IN_MEMORY; - } - - /* We give a dummy value for the trx - number */ - - trx->no = trx->id; - } else { - trx->state = TRX_ACTIVE; - /* A running transaction always has - the number field inited to - IB_ULONGLONG_MAX */ - - trx->no = IB_ULONGLONG_MAX; - - trx_reserve_descriptor(trx); - } - - trx->rseg = rseg; - trx_list_insert_ordered(trx); - - if (undo->dict_operation) { - trx_set_dict_operation( - trx, TRX_DICT_OP_TABLE); - trx->table_id = undo->table_id; - } - } - - trx->update_undo = undo; - - if ((!undo->empty) - && undo->top_undo_no >= trx->undo_no) { - - trx->undo_no = undo->top_undo_no + 1; - } - - undo = UT_LIST_GET_NEXT(undo_list, undo); - } - - rseg = UT_LIST_GET_NEXT(rseg_list, rseg); - } -} - -/******************************************************************//** -Assigns a rollback segment to a transaction in a round-robin fashion. -@return assigned rollback segment instance */ -UNIV_INLINE -trx_rseg_t* -trx_assign_rseg( -/*============*/ - ulint max_undo_logs) /*!< in: maximum number of UNDO logs to use */ -{ - trx_rseg_t* rseg = trx_sys->latest_rseg; - - ut_ad(mutex_own(&kernel_mutex)); - - rseg = UT_LIST_GET_NEXT(rseg_list, rseg); - - if (rseg == NULL || rseg->id == max_undo_logs - 1) { - rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); - } - - trx_sys->latest_rseg = rseg; - - return(rseg); -} - -/****************************************************************//** -Starts a new transaction. -@return TRUE */ -UNIV_INTERN -ibool -trx_start_low( -/*==========*/ - trx_t* trx, /*!< in: transaction */ - ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED - is passed, the system chooses the rollback segment - automatically in a round-robin fashion */ -{ - trx_rseg_t* rseg; - - ut_ad(mutex_own(&kernel_mutex)); - ut_ad(trx->rseg == NULL); - - if (trx->is_purge) { - trx->id = 0; - /* Don't reserve a descriptor, since this trx is not added to - trx_list. */ - trx->state = TRX_ACTIVE; - trx->start_time = time(NULL); - - return(TRUE); - } - - ut_ad(trx->state != TRX_ACTIVE); - - ut_a(rseg_id == ULINT_UNDEFINED); - - rseg = trx_assign_rseg(srv_rollback_segments); - - trx->id = trx_sys_get_new_trx_id(); - -#ifdef WITH_WSREP - memset(&trx->xid, 0, sizeof(trx->xid)); - trx->xid.formatID = -1; -#endif /* WITH_WSREP */ - - /* The initial value for trx->no: IB_ULONGLONG_MAX is used in - read_view_open_now: */ - - trx->no = IB_ULONGLONG_MAX; - - trx->rseg = rseg; - - trx->state = TRX_ACTIVE; - - trx_reserve_descriptor(trx); - - trx->start_time = time(NULL); - - UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx); - - return(TRUE); -} - -/****************************************************************//** -Starts a new transaction. -@return TRUE */ -UNIV_INTERN -ibool -trx_start( -/*======*/ - trx_t* trx, /*!< in: transaction */ - ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED - is passed, the system chooses the rollback segment - automatically in a round-robin fashion */ -{ - ibool ret; - - /* Update the info whether we should skip XA steps that eat CPU time - For the duration of the transaction trx->support_xa is not reread - from thd so any changes in the value take effect in the next - transaction. This is to avoid a scenario where some undo - generated by a transaction, has XA stuff, and other undo, - generated by the same transaction, doesn't. */ - trx->support_xa = thd_supports_xa(trx->mysql_thd); - - mutex_enter(&kernel_mutex); - - ret = trx_start_low(trx, rseg_id); - - mutex_exit(&kernel_mutex); - - return(ret); -} - -/****************************************************************//** -Set the transaction serialisation number. */ -static -void -trx_serialisation_number_get( -/*=========================*/ - trx_t* trx) /*!< in: transaction */ -{ - trx_rseg_t* rseg; - - rseg = trx->rseg; - - ut_ad(mutex_own(&rseg->mutex)); - - mutex_enter(&kernel_mutex); - - trx->no = trx_sys_get_new_trx_id(); - - if (UNIV_LIKELY(trx->is_in_trx_serial_list == 0)) { - - UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list, - trx); - - trx->is_in_trx_serial_list = 1; - } - - /* If the rollack segment is not empty then the - new trx_t::no can't be less than any trx_t::no - already in the rollback segment. User threads only - produce events when a rollback segment is empty. */ - - if (rseg->last_page_no == FIL_NULL) { - void* ptr; - rseg_queue_t rseg_queue; - - rseg_queue.rseg = rseg; - rseg_queue.trx_no = trx->no; - - mutex_enter(&purge_sys->bh_mutex); - - /* This is to reduce the pressure on the kernel mutex, - though in reality it should make very little (read no) - difference because this code path is only taken when the - rbs is empty. */ - - mutex_exit(&kernel_mutex); - - ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); - ut_a(ptr); - - mutex_exit(&purge_sys->bh_mutex); - } else { - mutex_exit(&kernel_mutex); - } -} - -/****************************************************************//** -Assign the transaction its history serialisation number and write the -update UNDO log record to the assigned rollback segment. -@return the LSN of the UNDO log write. */ -static -ib_uint64_t -trx_write_serialisation_history( -/*============================*/ - trx_t* trx) /*!< in: transaction */ -{ - mtr_t mtr; - trx_rseg_t* rseg; - trx_sysf_t* sys_header = NULL; - - ut_ad(!mutex_own(&kernel_mutex)); - - rseg = trx->rseg; - - mtr_start(&mtr); - - /* Change the undo log segment states from TRX_UNDO_ACTIVE - to some other state: these modifications to the file data - structure define the transaction as committed in the file - based domain, at the serialization point of the log sequence - number lsn obtained below. */ - - if (trx->update_undo != NULL) { - page_t* undo_hdr_page; - trx_undo_t* undo = trx->update_undo; - - /* We have to hold the rseg mutex because update - log headers have to be put to the history list in the - (serialisation) order of the UNDO trx number. This is - required for the purge in-memory data structures too. */ - - mutex_enter(&rseg->mutex); - - /* Assign the transaction serialisation number and also - update the purge min binary heap if this is the first - UNDO log being written to the assigned rollback segment. */ - - trx_serialisation_number_get(trx); - - /* It is not necessary to obtain trx->undo_mutex here - because only a single OS thread is allowed to do the - transaction commit for this transaction. */ - - undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr); - - trx_undo_update_cleanup(trx, undo_hdr_page, &mtr); - } else { - mutex_enter(&rseg->mutex); - } - - if (trx->insert_undo != NULL) { - trx_undo_set_state_at_finish(trx->insert_undo, &mtr); - } - - mutex_exit(&rseg->mutex); - -#ifdef WITH_WSREP - /* Update latest MySQL wsrep XID in trx sys header. */ - if (wsrep_is_wsrep_xid(&trx->xid)) - { - trx_sys_update_wsrep_checkpoint(&trx->xid, &mtr); - } -#endif /* WITH_WSREP */ - - /* Update the latest MySQL binlog name and offset info - in trx sys header if MySQL binlogging is on or the database - server is a MySQL replication slave */ - - if (trx->mysql_log_file_name - && trx->mysql_log_file_name[0] != '\0') { - if (!sys_header) { - sys_header = trx_sysf_get(&mtr); - } - - trx_sys_update_mysql_binlog_offset( - sys_header, - trx->mysql_log_file_name, - trx->mysql_log_offset, - TRX_SYS_MYSQL_LOG_INFO, &mtr); - - trx->mysql_log_file_name = NULL; - } - - if (trx->mysql_master_log_file_name[0] != '\0') { - /* This database server is a MySQL replication slave */ - if (!sys_header) { - sys_header = trx_sysf_get(&mtr); - } - - trx_sys_update_mysql_binlog_offset( - sys_header, - trx->mysql_relay_log_file_name, - trx->mysql_relay_log_pos, - TRX_SYS_COMMIT_RELAY_LOG_INFO, &mtr); - - trx_sys_update_mysql_binlog_offset( - sys_header, - trx->mysql_master_log_file_name, - trx->mysql_master_log_pos, - TRX_SYS_COMMIT_MASTER_LOG_INFO, &mtr); - - trx_sys_update_mysql_binlog_offset( - sys_header, - trx->mysql_relay_log_file_name, - trx->mysql_relay_log_pos, - TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr); - - trx_sys_update_mysql_binlog_offset( - sys_header, - trx->mysql_master_log_file_name, - trx->mysql_master_log_pos, - TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr); - - trx->mysql_master_log_file_name = ""; - } - - /* The following call commits the mini-transaction, making the - whole transaction committed in the file-based world, at this - log sequence number. The transaction becomes 'durable' when - we write the log to disk, but in the logical sense the commit - in the file-based data structures (undo logs etc.) happens - here. - - NOTE that transaction numbers, which are assigned only to - transactions with an update undo log, do not necessarily come - in exactly the same order as commit lsn's, if the transactions - have different rollback segments. To get exactly the same - order we should hold the kernel mutex up to this point, - adding to the contention of the kernel mutex. However, if - a transaction T2 is able to see modifications made by - a transaction T1, T2 will always get a bigger transaction - number and a bigger commit lsn than T1. */ - - /*--------------*/ - mtr_commit(&mtr); - /*--------------*/ - - return(mtr.end_lsn); -} - -/****************************************************************//** -Commits a transaction. */ -UNIV_INTERN -void -trx_commit_off_kernel( -/*==================*/ - trx_t* trx) /*!< in: transaction */ -{ - ib_uint64_t lsn; - - ut_ad(mutex_own(&kernel_mutex)); - - trx->must_flush_log_later = FALSE; - - /* If the transaction made any updates then we need to write the - UNDO logs for the updates to the assigned rollback segment. */ - - if (trx->insert_undo != NULL || trx->update_undo != NULL) { - mutex_exit(&kernel_mutex); - - lsn = trx_write_serialisation_history(trx); - - mutex_enter(&kernel_mutex); - } else { - lsn = 0; - } - - ut_ad(trx->state == TRX_ACTIVE || trx->state == TRX_PREPARED); - ut_ad(mutex_own(&kernel_mutex)); - - if (UNIV_UNLIKELY(trx->state == TRX_PREPARED)) { - ut_a(trx_n_prepared > 0); - trx_n_prepared--; - } - - /* The following assignment makes the transaction committed in memory - and makes its changes to data visible to other transactions. - NOTE that there is a small discrepancy from the strict formal - visibility rules here: a human user of the database can see - modifications made by another transaction T even before the necessary - log segment has been flushed to the disk. If the database happens to - crash before the flush, the user has seen modifications from T which - will never be a committed transaction. However, any transaction T2 - which sees the modifications of the committing transaction T, and - which also itself makes modifications to the database, will get an lsn - larger than the committing transaction T. In the case where the log - flush fails, and T never gets committed, also T2 will never get - committed. */ - - /*--------------------------------------*/ - trx->state = TRX_COMMITTED_IN_MEMORY; - /* The following also removes trx from trx_serial_list */ - trx_release_descriptor(trx); - /*--------------------------------------*/ - - /* If we release kernel_mutex below and we are still doing - recovery i.e.: back ground rollback thread is still active - then there is a chance that the rollback thread may see - this trx as COMMITTED_IN_MEMORY and goes adhead to clean it - up calling trx_cleanup_at_db_startup(). This can happen - in the case we are committing a trx here that is left in - PREPARED state during the crash. Note that commit of the - rollback of a PREPARED trx happens in the recovery thread - while the rollback of other transactions happen in the - background thread. To avoid this race we unconditionally - unset the is_recovered flag from the trx. */ - - trx->is_recovered = FALSE; - - lock_release_off_kernel(trx); - - if (trx->global_read_view) { - read_view_close(trx->global_read_view); - trx->global_read_view = NULL; - } - - trx->read_view = NULL; - - if (lsn) { - ulint flush_log_at_trx_commit; - - mutex_exit(&kernel_mutex); - - if (trx->insert_undo != NULL) { - - trx_undo_insert_cleanup(trx); - } - - if (srv_use_global_flush_log_at_trx_commit) { - flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); - } else { - flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); - } - - /* NOTE that we could possibly make a group commit more - efficient here: call os_thread_yield here to allow also other - trxs to come to commit! */ - - /*-------------------------------------*/ - - /* Depending on the my.cnf options, we may now write the log - buffer to the log files, making the transaction durable if - the OS does not crash. We may also flush the log files to - disk, making the transaction durable also at an OS crash or a - power outage. - - The idea in InnoDB's group commit is that a group of - transactions gather behind a trx doing a physical disk write - to log files, and when that physical write has been completed, - one of those transactions does a write which commits the whole - group. Note that this group commit will only bring benefit if - there are > 2 users in the database. Then at least 2 users can - gather behind one doing the physical log write to disk. - - If we are calling trx_commit() under prepare_commit_mutex, we - will delay possible log write and flush to a separate function - trx_commit_complete_for_mysql(), which is only called when the - thread has released the mutex. This is to make the - group commit algorithm to work. Otherwise, the prepare_commit - mutex would serialize all commits and prevent a group of - transactions from gathering. */ - - if (trx->flush_log_later) { - /* Do nothing yet */ - trx->must_flush_log_later = TRUE; - } else if (flush_log_at_trx_commit == 0) { - /* Do nothing */ - } else if (flush_log_at_trx_commit == 1 || - flush_log_at_trx_commit == 3) { - if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, - FALSE); - } else { - /* Write the log to the log files AND flush - them to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); - } - } else if (flush_log_at_trx_commit == 2) { - - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } else { - ut_error; - } - - trx->commit_lsn = lsn; - - /*-------------------------------------*/ - - mutex_enter(&kernel_mutex); - } - - /* Free all savepoints */ - trx_roll_free_all_savepoints(trx); - - trx->state = TRX_NOT_STARTED; - trx->rseg = NULL; - trx->undo_no = 0; - trx->last_sql_stat_start.least_undo_no = 0; - - ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); - ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0); - -#ifdef WITH_WSREP - if (wsrep_on(trx->mysql_thd) && - trx->was_chosen_as_deadlock_victim) { - trx->was_chosen_as_deadlock_victim = FALSE; - } -#endif - UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); - - ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list)); - - trx->error_state = DB_SUCCESS; -} - -/****************************************************************//** -Cleans up a transaction at database startup. The cleanup is needed if -the transaction already got to the middle of a commit when the database -crashed, and we cannot roll it back. */ -UNIV_INTERN -void -trx_cleanup_at_db_startup( -/*======================*/ - trx_t* trx) /*!< in: transaction */ -{ - if (trx->insert_undo != NULL) { - - trx_undo_insert_cleanup(trx); - } - - trx->state = TRX_NOT_STARTED; - trx_release_descriptor(trx); - trx->rseg = NULL; - trx->undo_no = 0; - trx->last_sql_stat_start.least_undo_no = 0; - - UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); - - ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list)); -} - -/********************************************************************//** -Assigns a read view for a consistent read query. All the consistent reads -within the same transaction will get the same read view, which is created -when this function is first called for a new started transaction. -@return consistent read view */ -UNIV_INTERN -read_view_t* -trx_assign_read_view( -/*=================*/ - trx_t* trx) /*!< in: active transaction */ -{ - ut_ad(trx->state == TRX_ACTIVE); - - if (trx->read_view) { - return(trx->read_view); - } - - mutex_enter(&kernel_mutex); - - trx->read_view = read_view_open_now(trx->id, trx->prebuilt_view, TRUE); - trx->prebuilt_view = trx->read_view; - trx->global_read_view = trx->read_view; - - mutex_exit(&kernel_mutex); - - return(trx->read_view); -} - -/****************************************************************//** -Commits a transaction. NOTE that the kernel mutex is temporarily released. */ -static -void -trx_handle_commit_sig_off_kernel( -/*=============================*/ - trx_t* trx, /*!< in: transaction */ - que_thr_t** next_thr) /*!< in/out: next query thread to run; - if the value which is passed in is - a pointer to a NULL pointer, then the - calling function can start running - a new query thread */ -{ - trx_sig_t* sig; - trx_sig_t* next_sig; - - ut_ad(mutex_own(&kernel_mutex)); - - trx->que_state = TRX_QUE_COMMITTING; - - trx_commit_off_kernel(trx); - - ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); - - /* Remove all TRX_SIG_COMMIT signals from the signal queue and send - reply messages to them */ - - sig = UT_LIST_GET_FIRST(trx->signals); - - while (sig != NULL) { - next_sig = UT_LIST_GET_NEXT(signals, sig); - - if (sig->type == TRX_SIG_COMMIT) { - - trx_sig_reply(sig, next_thr); - trx_sig_remove(trx, sig); - } - - sig = next_sig; - } - - trx->que_state = TRX_QUE_RUNNING; -} - -/***********************************************************//** -The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to -the TRX_QUE_RUNNING state and releases query threads which were -waiting for a lock in the wait_thrs list. */ -UNIV_INTERN -void -trx_end_lock_wait( -/*==============*/ - trx_t* trx) /*!< in: transaction */ -{ - que_thr_t* thr; - ulint sec; - ulint ms; - ib_uint64_t now; - - ut_ad(mutex_own(&kernel_mutex)); - ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT); - - thr = UT_LIST_GET_FIRST(trx->wait_thrs); - - while (thr != NULL) { - que_thr_end_wait_no_next_thr(thr); - - UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr); - - thr = UT_LIST_GET_FIRST(trx->wait_thrs); - } - - if (UNIV_UNLIKELY(trx->take_stats)) { - ut_usectime(&sec, &ms); - now = (ib_uint64_t)sec * 1000000 + ms; - trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted); - } - trx->que_state = TRX_QUE_RUNNING; -} - -/***********************************************************//** -Moves the query threads in the lock wait list to the SUSPENDED state and puts -the transaction to the TRX_QUE_RUNNING state. */ -static -void -trx_lock_wait_to_suspended( -/*=======================*/ - trx_t* trx) /*!< in: transaction in the TRX_QUE_LOCK_WAIT state */ -{ - que_thr_t* thr; - ulint sec; - ulint ms; - ib_uint64_t now; - - ut_ad(mutex_own(&kernel_mutex)); - ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT); - - thr = UT_LIST_GET_FIRST(trx->wait_thrs); - - while (thr != NULL) { - thr->state = QUE_THR_SUSPENDED; - - UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr); - - thr = UT_LIST_GET_FIRST(trx->wait_thrs); - } - - if (UNIV_UNLIKELY(trx->take_stats)) { - ut_usectime(&sec, &ms); - now = (ib_uint64_t)sec * 1000000 + ms; - trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted); - } - trx->que_state = TRX_QUE_RUNNING; -} - -/***********************************************************//** -Moves the query threads in the sig reply wait list of trx to the SUSPENDED -state. */ -static -void -trx_sig_reply_wait_to_suspended( -/*============================*/ - trx_t* trx) /*!< in: transaction */ -{ - trx_sig_t* sig; - que_thr_t* thr; - - ut_ad(mutex_own(&kernel_mutex)); - - sig = UT_LIST_GET_FIRST(trx->reply_signals); - - while (sig != NULL) { - thr = sig->receiver; - - ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT); - - thr->state = QUE_THR_SUSPENDED; - - sig->receiver = NULL; - - UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig); - - sig = UT_LIST_GET_FIRST(trx->reply_signals); - } -} - -/*****************************************************************//** -Checks the compatibility of a new signal with the other signals in the -queue. -@return TRUE if the signal can be queued */ -static -ibool -trx_sig_is_compatible( -/*==================*/ - trx_t* trx, /*!< in: trx handle */ - ulint type, /*!< in: signal type */ - ulint sender) /*!< in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */ -{ - trx_sig_t* sig; - - ut_ad(mutex_own(&kernel_mutex)); - - if (UT_LIST_GET_LEN(trx->signals) == 0) { - - return(TRUE); - } - - if (sender == TRX_SIG_SELF) { - if (type == TRX_SIG_ERROR_OCCURRED) { - - return(TRUE); - - } else if (type == TRX_SIG_BREAK_EXECUTION) { - - return(TRUE); - } else { - return(FALSE); - } - } - - ut_ad(sender == TRX_SIG_OTHER_SESS); - - sig = UT_LIST_GET_FIRST(trx->signals); - - if (type == TRX_SIG_COMMIT) { - while (sig != NULL) { - - if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { - - return(FALSE); - } - - sig = UT_LIST_GET_NEXT(signals, sig); - } - - return(TRUE); - - } else if (type == TRX_SIG_TOTAL_ROLLBACK) { - while (sig != NULL) { - - if (sig->type == TRX_SIG_COMMIT) { - - return(FALSE); - } - - sig = UT_LIST_GET_NEXT(signals, sig); - } - - return(TRUE); - - } else if (type == TRX_SIG_BREAK_EXECUTION) { - - return(TRUE); - } else { - ut_error; - - return(FALSE); - } -} - -/****************************************************************//** -Sends a signal to a trx object. */ -UNIV_INTERN -void -trx_sig_send( -/*=========*/ - trx_t* trx, /*!< in: trx handle */ - ulint type, /*!< in: signal type */ - ulint sender, /*!< in: TRX_SIG_SELF or - TRX_SIG_OTHER_SESS */ - que_thr_t* receiver_thr, /*!< in: query thread which wants the - reply, or NULL; if type is - TRX_SIG_END_WAIT, this must be NULL */ - trx_savept_t* savept, /*!< in: possible rollback savepoint, or - NULL */ - que_thr_t** next_thr) /*!< in/out: next query thread to run; - if the value which is passed in is - a pointer to a NULL pointer, then the - calling function can start running - a new query thread; if the parameter - is NULL, it is ignored */ -{ - trx_sig_t* sig; - trx_t* receiver_trx; - - ut_ad(trx); - ut_ad(mutex_own(&kernel_mutex)); - - if (!trx_sig_is_compatible(trx, type, sender)) { - /* The signal is not compatible with the other signals in - the queue: die */ - - ut_error; - } - - /* Queue the signal object */ - - if (UT_LIST_GET_LEN(trx->signals) == 0) { - - /* The signal list is empty: the 'sig' slot must be unused - (we improve performance a bit by avoiding mem_alloc) */ - sig = &(trx->sig); - } else { - /* It might be that the 'sig' slot is unused also in this - case, but we choose the easy way of using mem_alloc */ - - sig = mem_alloc(sizeof(trx_sig_t)); - } - - UT_LIST_ADD_LAST(signals, trx->signals, sig); - - sig->type = type; - sig->sender = sender; - sig->receiver = receiver_thr; - - if (savept) { - sig->savept = *savept; - } - - if (receiver_thr) { - receiver_trx = thr_get_trx(receiver_thr); - - UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals, - sig); - } - - if (trx->sess->state == SESS_ERROR) { - - trx_sig_reply_wait_to_suspended(trx); - } - - if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) { - ut_error; - } - - /* If there were no other signals ahead in the queue, try to start - handling of the signal */ - - if (UT_LIST_GET_FIRST(trx->signals) == sig) { - - trx_sig_start_handle(trx, next_thr); - } -} - -/****************************************************************//** -Ends signal handling. If the session is in the error state, and -trx->graph_before_signal_handling != NULL, then returns control to the error -handling routine of the graph (currently just returns the control to the -graph root which then will send an error message to the client). */ -UNIV_INTERN -void -trx_end_signal_handling( -/*====================*/ - trx_t* trx) /*!< in: trx */ -{ - ut_ad(mutex_own(&kernel_mutex)); - ut_ad(trx->handling_signals == TRUE); - - trx->handling_signals = FALSE; - - trx->graph = trx->graph_before_signal_handling; - - if (trx->graph && (trx->sess->state == SESS_ERROR)) { - - que_fork_error_handle(trx, trx->graph); - } -} - -/****************************************************************//** -Starts handling of a trx signal. */ -UNIV_INTERN -void -trx_sig_start_handle( -/*=================*/ - trx_t* trx, /*!< in: trx handle */ - que_thr_t** next_thr) /*!< in/out: next query thread to run; - if the value which is passed in is - a pointer to a NULL pointer, then the - calling function can start running - a new query thread; if the parameter - is NULL, it is ignored */ -{ - trx_sig_t* sig; - ulint type; -loop: - /* We loop in this function body as long as there are queued signals - we can process immediately */ - - ut_ad(trx); - ut_ad(mutex_own(&kernel_mutex)); - - if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) { - - trx_end_signal_handling(trx); - - return; - } - - if (trx->state == TRX_NOT_STARTED) { - - trx_start_low(trx, ULINT_UNDEFINED); - } - - /* If the trx is in a lock wait state, moves the waiting query threads - to the suspended state */ - - if (trx->que_state == TRX_QUE_LOCK_WAIT) { - - trx_lock_wait_to_suspended(trx); - } - - /* If the session is in the error state and this trx has threads - waiting for reply from signals, moves these threads to the suspended - state, canceling wait reservations; note that if the transaction has - sent a commit or rollback signal to itself, and its session is not in - the error state, then nothing is done here. */ - - if (trx->sess->state == SESS_ERROR) { - trx_sig_reply_wait_to_suspended(trx); - } - - /* If there are no running query threads, we can start processing of a - signal, otherwise we have to wait until all query threads of this - transaction are aware of the arrival of the signal. */ - - if (trx->n_active_thrs > 0) { - - return; - } - - if (trx->handling_signals == FALSE) { - trx->graph_before_signal_handling = trx->graph; - - trx->handling_signals = TRUE; - } - - sig = UT_LIST_GET_FIRST(trx->signals); - type = sig->type; - - if (type == TRX_SIG_COMMIT) { - - trx_handle_commit_sig_off_kernel(trx, next_thr); - - } else if ((type == TRX_SIG_TOTAL_ROLLBACK) - || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) { - - trx_rollback(trx, sig, next_thr); - - /* No further signals can be handled until the rollback - completes, therefore we return */ - - return; - - } else if (type == TRX_SIG_ERROR_OCCURRED) { - - trx_rollback(trx, sig, next_thr); - - /* No further signals can be handled until the rollback - completes, therefore we return */ - - return; - - } else if (type == TRX_SIG_BREAK_EXECUTION) { - - trx_sig_reply(sig, next_thr); - trx_sig_remove(trx, sig); - } else { - ut_error; - } - - goto loop; -} - -/****************************************************************//** -Send the reply message when a signal in the queue of the trx has been -handled. */ -UNIV_INTERN -void -trx_sig_reply( -/*==========*/ - trx_sig_t* sig, /*!< in: signal */ - que_thr_t** next_thr) /*!< in/out: next query thread to run; - if the value which is passed in is - a pointer to a NULL pointer, then the - calling function can start running - a new query thread */ -{ - trx_t* receiver_trx; - - ut_ad(sig); - ut_ad(mutex_own(&kernel_mutex)); - - if (sig->receiver != NULL) { - ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT); - - receiver_trx = thr_get_trx(sig->receiver); - - UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals, - sig); - ut_ad(receiver_trx->sess->state != SESS_ERROR); - - que_thr_end_wait(sig->receiver, next_thr); - - sig->receiver = NULL; - - } -} - -/****************************************************************//** -Removes a signal object from the trx signal queue. */ -UNIV_INTERN -void -trx_sig_remove( -/*===========*/ - trx_t* trx, /*!< in: trx handle */ - trx_sig_t* sig) /*!< in, own: signal */ -{ - ut_ad(trx && sig); - ut_ad(mutex_own(&kernel_mutex)); - - ut_ad(sig->receiver == NULL); - - UT_LIST_REMOVE(signals, trx->signals, sig); - sig->type = 0; /* reset the field to catch possible bugs */ - - if (sig != &(trx->sig)) { - mem_free(sig); - } -} - -/*********************************************************************//** -Creates a commit command node struct. -@return own: commit node struct */ -UNIV_INTERN -commit_node_t* -commit_node_create( -/*===============*/ - mem_heap_t* heap) /*!< in: mem heap where created */ -{ - commit_node_t* node; - - node = mem_heap_alloc(heap, sizeof(commit_node_t)); - node->common.type = QUE_NODE_COMMIT; - node->state = COMMIT_NODE_SEND; - - return(node); -} - -/***********************************************************//** -Performs an execution step for a commit type node in a query graph. -@return query thread to run next, or NULL */ -UNIV_INTERN -que_thr_t* -trx_commit_step( -/*============*/ - que_thr_t* thr) /*!< in: query thread */ -{ - commit_node_t* node; - que_thr_t* next_thr; - - node = thr->run_node; - - ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT); - - if (thr->prev_node == que_node_get_parent(node)) { - node->state = COMMIT_NODE_SEND; - } - - if (node->state == COMMIT_NODE_SEND) { - mutex_enter(&kernel_mutex); - - node->state = COMMIT_NODE_WAIT; - - next_thr = NULL; - - thr->state = QUE_THR_SIG_REPLY_WAIT; - - /* Send the commit signal to the transaction */ - - trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF, - thr, NULL, &next_thr); - - mutex_exit(&kernel_mutex); - - return(next_thr); - } - - ut_ad(node->state == COMMIT_NODE_WAIT); - - node->state = COMMIT_NODE_SEND; - - thr->run_node = que_node_get_parent(node); - - return(thr); -} - -/**********************************************************************//** -Does the transaction commit for MySQL. -@return DB_SUCCESS or error number */ -UNIV_INTERN -ulint -trx_commit_for_mysql( -/*=================*/ - trx_t* trx) /*!< in: trx handle */ -{ - /* Because we do not do the commit by sending an Innobase - sig to the transaction, we must here make sure that trx has been - started. */ - - ut_a(trx); - - trx_start_if_not_started(trx); - - trx->op_info = "committing"; - - mutex_enter(&kernel_mutex); - - trx_commit_off_kernel(trx); - - mutex_exit(&kernel_mutex); - - trx->op_info = ""; - - return(DB_SUCCESS); -} - -/**********************************************************************//** -If required, flushes the log to disk if we called trx_commit_for_mysql() -with trx->flush_log_later == TRUE. -@return 0 or error number */ -UNIV_INTERN -ulint -trx_commit_complete_for_mysql( -/*==========================*/ - trx_t* trx) /*!< in: trx handle */ -{ - ib_uint64_t lsn = trx->commit_lsn; - ulint flush_log_at_trx_commit; - - ut_a(trx); - - trx->op_info = "flushing log"; - - if (srv_use_global_flush_log_at_trx_commit) { - flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); - } else { - flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); - } - - if (!trx->must_flush_log_later) { - /* Do nothing */ - } else if (flush_log_at_trx_commit == 0) { - /* Do nothing */ - } else if (flush_log_at_trx_commit == 1 && trx->active_commit_ordered) { - /* Do nothing - we already flushed the prepare and binlog write - to disk, so transaction is durable (will be recovered from - binlog if necessary) */ - } else if (flush_log_at_trx_commit == 1 || flush_log_at_trx_commit == 3) { - if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } else { - /* Write the log to the log files AND flush them to - disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); - } - } else if (flush_log_at_trx_commit == 2) { - - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } else { - ut_error; - } - - trx->must_flush_log_later = FALSE; - - trx->op_info = ""; - - return(0); -} - -/**********************************************************************//** -Marks the latest SQL statement ended. */ -UNIV_INTERN -void -trx_mark_sql_stat_end( -/*==================*/ - trx_t* trx) /*!< in: trx handle */ -{ - ut_a(trx); - - if (trx->state == TRX_NOT_STARTED) { - trx->undo_no = 0; - } - - trx->last_sql_stat_start.least_undo_no = trx->undo_no; -} - -/**********************************************************************//** -Prints info about a transaction to the given file. The caller must own the -kernel mutex. */ -UNIV_INTERN -void -trx_print( -/*======*/ - FILE* f, /*!< in: output stream */ - trx_t* trx, /*!< in: transaction */ - ulint max_query_len) /*!< in: max query length to print, or 0 to - use the default max length */ -{ - ibool newline; - - fprintf(f, "TRANSACTION " TRX_ID_FMT, (ullint) trx->id); - - switch (trx->state) { - case TRX_NOT_STARTED: - fputs(", not started", f); - break; - case TRX_ACTIVE: - fprintf(f, ", ACTIVE %lu sec", - (ulong)difftime(time(NULL), trx->start_time)); - break; - case TRX_PREPARED: - fprintf(f, ", ACTIVE (PREPARED) %lu sec", - (ulong)difftime(time(NULL), trx->start_time)); - break; - case TRX_COMMITTED_IN_MEMORY: - fputs(", COMMITTED IN MEMORY", f); - break; - default: - fprintf(f, " state %lu", (ulong) trx->state); - } - - if (*trx->op_info) { - putc(' ', f); - fputs(trx->op_info, f); - } - - if (trx->is_recovered) { - fputs(" recovered trx", f); - } - - if (trx->is_purge) { - fputs(" purge trx", f); - } - - if (trx->declared_to_be_inside_innodb) { - fprintf(f, ", thread declared inside InnoDB %lu", - (ulong) trx->n_tickets_to_enter_innodb); - } - - putc('\n', f); - - if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { - fprintf(f, "mysql tables in use %lu, locked %lu\n", - (ulong) trx->n_mysql_tables_in_use, - (ulong) trx->mysql_n_tables_locked); - } - - newline = TRUE; - - switch (trx->que_state) { - case TRX_QUE_RUNNING: - newline = FALSE; break; - case TRX_QUE_LOCK_WAIT: - fputs("LOCK WAIT ", f); break; - case TRX_QUE_ROLLING_BACK: - fputs("ROLLING BACK ", f); break; - case TRX_QUE_COMMITTING: - fputs("COMMITTING ", f); break; - default: - fprintf(f, "que state %lu ", (ulong) trx->que_state); - } - - if (0 < UT_LIST_GET_LEN(trx->trx_locks) - || mem_heap_get_size(trx->lock_heap) > 400) { - newline = TRUE; - - fprintf(f, "%lu lock struct(s), heap size %lu," - " %lu row lock(s)", - (ulong) UT_LIST_GET_LEN(trx->trx_locks), - (ulong) mem_heap_get_size(trx->lock_heap), - (ulong) lock_number_of_rows_locked(trx)); - } - - if (trx->has_search_latch) { - newline = TRUE; - fputs(", holds adaptive hash latch", f); - } - - if (trx->undo_no != 0) { - newline = TRUE; - fprintf(f, ", undo log entries %llu", - (ullint) trx->undo_no); - } - - if (newline) { - putc('\n', f); - } - - if (trx->mysql_thd != NULL) { - innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len); - } -} - -/*******************************************************************//** -Compares the "weight" (or size) of two transactions. Transactions that -have edited non-transactional tables are considered heavier than ones -that have not. -@return TRUE if weight(a) >= weight(b) */ -UNIV_INTERN -ibool -trx_weight_ge( -/*==========*/ - const trx_t* a, /*!< in: the first transaction to be compared */ - const trx_t* b) /*!< in: the second transaction to be compared */ -{ - ibool a_notrans_edit; - ibool b_notrans_edit; - - /* If mysql_thd is NULL for a transaction we assume that it has - not edited non-transactional tables. */ - - a_notrans_edit = a->mysql_thd != NULL - && thd_has_edited_nontrans_tables(a->mysql_thd); - - b_notrans_edit = b->mysql_thd != NULL - && thd_has_edited_nontrans_tables(b->mysql_thd); - - if (a_notrans_edit != b_notrans_edit) { - - return(a_notrans_edit); - } - - /* Either both had edited non-transactional tables or both had - not, we fall back to comparing the number of altered/locked - rows. */ - -#if 0 - fprintf(stderr, - "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n", - __func__, - a->undo_no, UT_LIST_GET_LEN(a->trx_locks), - b->undo_no, UT_LIST_GET_LEN(b->trx_locks)); -#endif - - return(TRX_WEIGHT(a) >= TRX_WEIGHT(b)); -} - -/****************************************************************//** -Prepares a transaction. */ -UNIV_INTERN -void -trx_prepare_off_kernel( -/*===================*/ - trx_t* trx) /*!< in: transaction */ -{ - trx_rseg_t* rseg; - ib_uint64_t lsn = 0; - mtr_t mtr; - - ut_ad(mutex_own(&kernel_mutex)); - - rseg = trx->rseg; - - if (trx->insert_undo != NULL || trx->update_undo != NULL) { - - mutex_exit(&kernel_mutex); - - mtr_start(&mtr); - - /* Change the undo log segment states from TRX_UNDO_ACTIVE - to TRX_UNDO_PREPARED: these modifications to the file data - structure define the transaction as prepared in the - file-based world, at the serialization point of lsn. */ - - mutex_enter(&(rseg->mutex)); - - if (trx->insert_undo != NULL) { - - /* It is not necessary to obtain trx->undo_mutex here - because only a single OS thread is allowed to do the - transaction prepare for this transaction. */ - - trx_undo_set_state_at_prepare(trx, trx->insert_undo, - &mtr); - } - - if (trx->update_undo) { - trx_undo_set_state_at_prepare( - trx, trx->update_undo, &mtr); - } - - mutex_exit(&(rseg->mutex)); - - if (trx->mysql_master_log_file_name[0] != '\0') { - /* This database server is a MySQL replication slave */ - trx_sysf_t* sys_header = trx_sysf_get(&mtr); - - trx_sys_update_mysql_binlog_offset( - sys_header, - trx->mysql_relay_log_file_name, - trx->mysql_relay_log_pos, - TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr); - trx_sys_update_mysql_binlog_offset( - sys_header, - trx->mysql_master_log_file_name, - trx->mysql_master_log_pos, - TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr); - trx->mysql_master_log_file_name = ""; - } - - /*--------------*/ - mtr_commit(&mtr); /* This mtr commit makes the - transaction prepared in the file-based - world */ - /*--------------*/ - lsn = mtr.end_lsn; - - mutex_enter(&kernel_mutex); - } - - ut_ad(mutex_own(&kernel_mutex)); - - /*--------------------------------------*/ - if (UNIV_UNLIKELY(trx->state != TRX_ACTIVE)) { - - trx_reserve_descriptor(trx); - } - trx->state = TRX_PREPARED; - trx_n_prepared++; - /*--------------------------------------*/ - - if (lsn) { - ulint flush_log_at_trx_commit; - - /* Depending on the my.cnf options, we may now write the log - buffer to the log files, making the prepared state of the - transaction durable if the OS does not crash. We may also - flush the log files to disk, making the prepared state of the - transaction durable also at an OS crash or a power outage. - - The idea in InnoDB's group prepare is that a group of - transactions gather behind a trx doing a physical disk write - to log files, and when that physical write has been completed, - one of those transactions does a write which prepares the whole - group. Note that this group prepare will only bring benefit if - there are > 2 users in the database. Then at least 2 users can - gather behind one doing the physical log write to disk. - - TODO: find out if MySQL holds some mutex when calling this. - That would spoil our group prepare algorithm. */ - - mutex_exit(&kernel_mutex); - - if (srv_use_global_flush_log_at_trx_commit) { - flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); - } else { - flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); - } - - if (flush_log_at_trx_commit == 0) { - /* Do nothing */ - } else if (flush_log_at_trx_commit == 1 || flush_log_at_trx_commit == 3) { - if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, - FALSE); - } else { - /* Write the log to the log files AND flush - them to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); - } - } else if (flush_log_at_trx_commit == 2) { - - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } else { - ut_error; - } - - mutex_enter(&kernel_mutex); - } -} - -/**********************************************************************//** -Does the transaction prepare for MySQL. -@return 0 or error number */ -UNIV_INTERN -ulint -trx_prepare_for_mysql( -/*==================*/ - trx_t* trx) /*!< in: trx handle */ -{ - /* Because we do not do the prepare by sending an Innobase - sig to the transaction, we must here make sure that trx has been - started. */ - - ut_a(trx); - - trx->op_info = "preparing"; - - trx_start_if_not_started(trx); - - mutex_enter(&kernel_mutex); - - trx_prepare_off_kernel(trx); - - mutex_exit(&kernel_mutex); - - trx->op_info = ""; - - return(0); -} - -/**********************************************************************//** -This function is used to find number of prepared transactions and -their transaction objects for a recovery. -@return number of prepared transactions stored in xid_list */ -UNIV_INTERN -int -trx_recover_for_mysql( -/*==================*/ - XID* xid_list, /*!< in/out: prepared transactions */ - ulint len) /*!< in: number of slots in xid_list */ -{ - trx_t* trx; - ulint count = 0; - - ut_ad(xid_list); - ut_ad(len); - - /* We should set those transactions which are in the prepared state - to the xid_list */ - - mutex_enter(&kernel_mutex); - - trx = UT_LIST_GET_FIRST(trx_sys->trx_list); - - while (trx) { - if (trx->state == TRX_PREPARED) { - xid_list[count] = trx->xid; - - if (count == 0) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Starting recovery for" - " XA transactions...\n"); - } - - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Transaction " TRX_ID_FMT " in" - " prepared state after recovery\n", - (ullint) trx->id); - - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Transaction contains changes" - " to %llu rows\n", - (ullint) trx->undo_no); - - count++; - - if (count == len) { - break; - } - } - - trx = UT_LIST_GET_NEXT(trx_list, trx); - } - - mutex_exit(&kernel_mutex); - - if (count > 0){ - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: %lu transactions in prepared state" - " after recovery\n", - (ulong) count); - } - - return ((int) count); -} - -/*******************************************************************//** -This function is used to find one X/Open XA distributed transaction -which is in the prepared state -@return trx or NULL; on match, the trx->xid will be invalidated */ -UNIV_INTERN -trx_t* -trx_get_trx_by_xid( -/*===============*/ - const XID* xid) /*!< in: X/Open XA transaction identifier */ -{ - trx_t* trx; - - if (xid == NULL) { - - return(NULL); - } - - mutex_enter(&kernel_mutex); - - trx = UT_LIST_GET_FIRST(trx_sys->trx_list); - - while (trx) { - /* Compare two X/Open XA transaction id's: their - length should be the same and binary comparison - of gtrid_length+bqual_length bytes should be - the same */ - - if (trx->is_recovered - && trx->state == TRX_PREPARED - && xid->gtrid_length == trx->xid.gtrid_length - && xid->bqual_length == trx->xid.bqual_length - && memcmp(xid->data, trx->xid.data, - xid->gtrid_length + xid->bqual_length) == 0) { - - /* Invalidate the XID, so that subsequent calls - will not find it. */ - memset(&trx->xid, 0, sizeof(trx->xid)); - trx->xid.formatID = -1; - break; - } - - trx = UT_LIST_GET_NEXT(trx_list, trx); - } - - mutex_exit(&kernel_mutex); - - return(trx); -} diff --git a/storage/xtradb/trx/trx0trx.cc b/storage/xtradb/trx/trx0trx.cc new file mode 100644 index 00000000000..bbf76effc52 --- /dev/null +++ b/storage/xtradb/trx/trx0trx.cc @@ -0,0 +1,2543 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0trx.cc +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "btr0types.h" +#include "trx0trx.h" + +#ifdef UNIV_NONINL +#include "trx0trx.ic" +#endif + +#include "trx0undo.h" +#include "trx0rseg.h" +#include "log0log.h" +#include "que0que.h" +#include "lock0lock.h" +#include "trx0roll.h" +#include "usr0sess.h" +#include "read0read.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "btr0sea.h" +#include "os0proc.h" +#include "trx0xa.h" +#include "trx0rec.h" +#include "trx0purge.h" +#include "ha_prototypes.h" +#include "srv0mon.h" +#include "ut0vec.h" + +#include<set> + +/** Set of table_id */ +typedef std::set<table_id_t> table_id_set; + +/** Dummy session used currently in MySQL interface */ +UNIV_INTERN sess_t* trx_dummy_sess = NULL; + +#ifdef UNIV_PFS_MUTEX +/* Key to register the mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t trx_mutex_key; +/* Key to register the mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/*************************************************************//** +Set detailed error message for the transaction. */ +UNIV_INTERN +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /*!< in: transaction struct */ + const char* msg) /*!< in: detailed error message */ +{ + ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error)); +} + +/*************************************************************//** +Set detailed error message for the transaction from a file. Note that the +file is rewinded before reading from it. */ +UNIV_INTERN +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /*!< in: transaction struct */ + FILE* file) /*!< in: file to read message from */ +{ + os_file_read_string(file, trx->detailed_error, + sizeof(trx->detailed_error)); +} + +/*************************************************************//** +Callback function for trx_find_descriptor() to compare trx IDs. */ +UNIV_INTERN +int +trx_descr_cmp( +/*==========*/ + const void *a, /*!< in: pointer to first comparison argument */ + const void *b) /*!< in: pointer to second comparison argument */ +{ + const trx_id_t* da = (const trx_id_t*) a; + const trx_id_t* db = (const trx_id_t*) b; + + if (*da < *db) { + return -1; + } else if (*da > *db) { + return 1; + } + + return 0; +} + +/*************************************************************//** +Reserve a slot for a given trx in the global descriptors array. */ +UNIV_INLINE +void +trx_reserve_descriptor( +/*===================*/ + const trx_t* trx) /*!< in: trx pointer */ +{ + ulint n_used; + ulint n_max; + trx_id_t* descr; + + ut_ad(mutex_own(&trx_sys->mutex) || srv_is_being_started); + ut_ad(srv_is_being_started || + !trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + trx->id)); + + n_used = trx_sys->descr_n_used + 1; + n_max = trx_sys->descr_n_max; + + if (UNIV_UNLIKELY(n_used > n_max)) { + + n_max = n_max * 2; + + trx_sys->descriptors = static_cast<trx_id_t*>( + ut_realloc(trx_sys->descriptors, + n_max * sizeof(trx_id_t))); + + trx_sys->descr_n_max = n_max; + srv_descriptors_memory = n_max * sizeof(trx_id_t); + } + + descr = trx_sys->descriptors + n_used - 1; + + if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) { + + /* Find the slot where it should be inserted. We could use a + binary search, but in reality linear search should be faster, + because the slot we are looking for is near the array end. */ + + trx_id_t* tdescr; + + for (tdescr = descr - 1; + tdescr >= trx_sys->descriptors && *tdescr > trx->id; + tdescr--) { + } + + tdescr++; + + ut_memmove(tdescr + 1, tdescr, (descr - tdescr) * + sizeof(trx_id_t)); + + descr = tdescr; + } + + *descr = trx->id; + + trx_sys->descr_n_used = n_used; +} + +/*************************************************************//** +Release a slot for a given trx in the global descriptors array. */ +UNIV_INTERN +void +trx_release_descriptor( +/*===================*/ + trx_t* trx) /*!< in: trx pointer */ +{ + ulint size; + trx_id_t* descr; + + ut_ad(mutex_own(&trx_sys->mutex)); + + if (UNIV_LIKELY(trx->in_trx_serial_list)) { + + UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list, + trx); + trx->in_trx_serial_list = false; + } + + descr = trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + trx->id); + + if (UNIV_UNLIKELY(descr == NULL)) { + + return; + } + + size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) * + sizeof(trx_id_t); + + if (UNIV_LIKELY(size > 0)) { + + ut_memmove(descr, descr + 1, size); + } + + trx_sys->descr_n_used--; +} + +/****************************************************************//** +Creates and initializes a transaction object. It must be explicitly +started with trx_start_if_not_started() before using it. The default +isolation level is TRX_ISO_REPEATABLE_READ. +@return transaction instance, should never be NULL */ +static +trx_t* +trx_create(void) +/*============*/ +{ + trx_t* trx; + mem_heap_t* heap; + ib_alloc_t* heap_alloc; + + trx = static_cast<trx_t*>(mem_zalloc(sizeof(*trx))); + + mutex_create(trx_mutex_key, &trx->mutex, SYNC_TRX); + + trx->magic_n = TRX_MAGIC_N; + + trx->active_commit_ordered = 0; + trx->state = TRX_STATE_NOT_STARTED; + + trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->no = TRX_ID_MAX; + trx->in_trx_serial_list = false; + + trx->support_xa = TRUE; + + trx->fake_changes = FALSE; + + trx->check_foreigns = TRUE; + trx->check_unique_secondary = TRUE; + + trx->dict_operation = TRX_DICT_OP_NONE; + + trx->idle_start = 0; + trx->last_stmt_start = 0; + + mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO); + + trx->error_state = DB_SUCCESS; + + trx->lock.que_state = TRX_QUE_RUNNING; + + trx->lock.lock_heap = mem_heap_create_typed( + 256, MEM_HEAP_FOR_LOCK_HEAP); + + trx->search_latch_timeout = BTR_SEA_TIMEOUT; + + trx->io_reads = 0; + trx->io_read = 0; + trx->io_reads_wait_timer = 0; + trx->lock_que_wait_timer = 0; + trx->innodb_que_wait_timer = 0; + trx->distinct_page_access = 0; + trx->distinct_page_access_hash = NULL; + trx->take_stats = FALSE; + + trx->xid.formatID = -1; + + trx->op_info = ""; + + heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8); + heap_alloc = ib_heap_allocator_create(heap); + + /* Remember to free the vector explicitly in trx_free(). */ + trx->autoinc_locks = ib_vector_create(heap_alloc, sizeof(void**), 4); + + /* Remember to free the vector explicitly in trx_free(). */ + heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 128); + heap_alloc = ib_heap_allocator_create(heap); + + trx->lock.table_locks = ib_vector_create( + heap_alloc, sizeof(void**), 32); +#ifdef WITH_WSREP + trx->wsrep_event = NULL; +#endif /* WITH_WSREP */ + + return(trx); +} + +/********************************************************************//** +Creates a transaction object for background operations by the master thread. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_background(void) +/*=============================*/ +{ + trx_t* trx; + + trx = trx_create(); + + trx->sess = trx_dummy_sess; + + return(trx); +} + +/********************************************************************//** +Creates a transaction object for MySQL. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_mysql(void) +/*========================*/ +{ + trx_t* trx; + + trx = trx_allocate_for_background(); + + mutex_enter(&trx_sys->mutex); + + ut_d(trx->in_mysql_trx_list = TRUE); + UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + mutex_exit(&trx_sys->mutex); + + if (UNIV_UNLIKELY(trx->take_stats)) { + trx->distinct_page_access_hash + = static_cast<byte *>(mem_alloc(DPAH_SIZE)); + memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); + } + + return(trx); +} + +/********************************************************************//** +Frees a transaction object without releasing the corresponding descriptor. +Should be used by callers that already own trx_sys->mutex. */ +static +void +trx_free_low( +/*=========*/ + trx_t* trx) /*!< in, own: trx object */ +{ + ut_a(trx->magic_n == TRX_MAGIC_N); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + ut_ad(!trx->in_mysql_trx_list); + + mutex_free(&trx->undo_mutex); + + if (trx->undo_no_arr != NULL) { + trx_undo_arr_free(trx->undo_no_arr); + } + + ut_a(trx->lock.wait_lock == NULL); + ut_a(trx->lock.wait_thr == NULL); + + ut_a(!trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); +#endif + + ut_a(trx->dict_operation_lock_mode == 0); + + if (trx->lock.lock_heap) { + mem_heap_free(trx->lock.lock_heap); + } + + ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + /* We allocated a dedicated heap for the vector. */ + ib_vector_free(trx->autoinc_locks); + + if (trx->lock.table_locks != NULL) { + /* We allocated a dedicated heap for the vector. */ + ib_vector_free(trx->lock.table_locks); + } + + mutex_free(&trx->mutex); + + read_view_free(trx->prebuilt_view); + + mem_free(trx); +} + +/********************************************************************//** +Frees a transaction object. */ +static +void +trx_free( +/*=========*/ + trx_t* trx) /*!< in, own: trx object */ +{ + mutex_enter(&trx_sys->mutex); + trx_release_descriptor(trx); + mutex_exit(&trx_sys->mutex); + + trx_free_low(trx); +} + +/********************************************************************//** +Frees a transaction object of a background operation of the master thread. */ +UNIV_INTERN +void +trx_free_for_background( +/*====================*/ + trx_t* trx) /*!< in, own: trx object */ +{ + + if (trx->distinct_page_access_hash) + { + mem_free(trx->distinct_page_access_hash); + trx->distinct_page_access_hash= NULL; + } + + if (trx->declared_to_be_inside_innodb) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Freeing a trx (%p, " TRX_ID_FMT ") which is declared " + "to be processing inside InnoDB", trx, trx->id); + + trx_print(stderr, trx, 600); + putc('\n', stderr); + + /* This is an error but not a fatal error. We must keep + the counters like srv_conc_n_threads accurate. */ + srv_conc_force_exit_innodb(trx); + } + + if (trx->n_mysql_tables_in_use != 0 + || trx->mysql_n_tables_locked != 0) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "MySQL is freeing a thd though " + "trx->n_mysql_tables_in_use is %lu and " + "trx->mysql_n_tables_locked is %lu.", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + + trx_print(stderr, trx, 600); + ut_print_buf(stderr, trx, sizeof(trx_t)); + putc('\n', stderr); + } + + ut_a(trx->state == TRX_STATE_NOT_STARTED); + ut_a(trx->insert_undo == NULL); + ut_a(trx->update_undo == NULL); + ut_a(trx->read_view == NULL); + + trx_free(trx); +} + +/********************************************************************//** +At shutdown, frees a transaction object that is in the PREPARED state. */ +UNIV_INTERN +void +trx_free_prepared( +/*==============*/ + trx_t* trx) /*!< in, own: trx object */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_a(trx->magic_n == TRX_MAGIC_N); + + trx_undo_free_prepared(trx); + + assert_trx_in_rw_list(trx); + + ut_a(!trx->read_only); + + UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx); + ut_d(trx->in_rw_trx_list = FALSE); + + trx_release_descriptor(trx); + + /* Undo trx_resurrect_table_locks(). */ + UT_LIST_INIT(trx->lock.trx_locks); + + trx_free_low(trx); + + ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list)); +} + +/********************************************************************//** +Frees a transaction object for MySQL. */ +UNIV_INTERN +void +trx_free_for_mysql( +/*===============*/ + trx_t* trx) /*!< in, own: trx object */ +{ + if (trx->distinct_page_access_hash) + { + mem_free(trx->distinct_page_access_hash); + trx->distinct_page_access_hash= NULL; + } + + mutex_enter(&trx_sys->mutex); + + ut_ad(trx->in_mysql_trx_list); + ut_d(trx->in_mysql_trx_list = FALSE); + UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + ut_ad(trx_sys_validate_trx_list()); + + mutex_exit(&trx_sys->mutex); + + trx_free_for_background(trx); +} + +/****************************************************************//** +Inserts the trx handle in the trx system trx list in the right position. +The list is sorted on the trx id so that the biggest id is at the list +start. This function is used at the database startup to insert incomplete +transactions to the list. */ +static +void +trx_list_rw_insert_ordered( +/*=======================*/ + trx_t* trx) /*!< in: trx handle */ +{ + trx_t* trx2; + + ut_ad(!trx->read_only); + + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); + + ut_a(srv_is_being_started); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + ut_ad(trx->is_recovered); + + for (trx2 = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx2 != NULL; + trx2 = UT_LIST_GET_NEXT(trx_list, trx2)) { + + assert_trx_in_rw_list(trx2); + + if (trx->id >= trx2->id) { + + ut_ad(trx->id > trx2->id); + break; + } + } + + if (trx2 != NULL) { + trx2 = UT_LIST_GET_PREV(trx_list, trx2); + + if (trx2 == NULL) { + UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx); + ut_d(trx_sys->rw_max_trx_id = trx->id); + } else { + UT_LIST_INSERT_AFTER( + trx_list, trx_sys->rw_trx_list, trx2, trx); + } + } else { + UT_LIST_ADD_LAST(trx_list, trx_sys->rw_trx_list, trx); + } + + ut_ad(!trx->in_rw_trx_list); + ut_d(trx->in_rw_trx_list = TRUE); +} + +/****************************************************************//** +Resurrect the table locks for a resurrected transaction. */ +static +void +trx_resurrect_table_locks( +/*======================*/ + trx_t* trx, /*!< in/out: transaction */ + const trx_undo_t* undo) /*!< in: undo log */ +{ + mtr_t mtr; + page_t* undo_page; + trx_undo_rec_t* undo_rec; + table_id_set tables; + + ut_ad(undo == trx->insert_undo || undo == trx->update_undo); + + if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) + || undo->empty) { + return; + } + + mtr_start(&mtr); + /* trx_rseg_mem_create() may have acquired an X-latch on this + page, so we cannot acquire an S-latch. */ + undo_page = trx_undo_page_get( + undo->space, undo->zip_size, undo->top_page_no, &mtr); + undo_rec = undo_page + undo->top_offset; + + do { + ulint type; + ulint cmpl_info; + bool updated_extern; + undo_no_t undo_no; + table_id_t table_id; + + page_t* undo_rec_page = page_align(undo_rec); + + if (undo_rec_page != undo_page) { + if (!mtr_memo_release(&mtr, + buf_block_align(undo_page), + MTR_MEMO_PAGE_X_FIX)) { + /* The page of the previous undo_rec + should have been latched by + trx_undo_page_get() or + trx_undo_get_prev_rec(). */ + ut_ad(0); + } + + undo_page = undo_rec_page; + } + + trx_undo_rec_get_pars( + undo_rec, &type, &cmpl_info, + &updated_extern, &undo_no, &table_id); + tables.insert(table_id); + + undo_rec = trx_undo_get_prev_rec( + undo_rec, undo->hdr_page_no, + undo->hdr_offset, false, &mtr); + } while (undo_rec); + + mtr_commit(&mtr); + + for (table_id_set::const_iterator i = tables.begin(); + i != tables.end(); i++) { + if (dict_table_t* table = dict_table_open_on_id( + *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) { + if (table->ibd_file_missing + || dict_table_is_temporary(table)) { + mutex_enter(&dict_sys->mutex); + dict_table_close(table, TRUE, FALSE); + dict_table_remove_from_cache(table); + mutex_exit(&dict_sys->mutex); + continue; + } + + lock_table_ix_resurrect(table, trx); + + DBUG_PRINT("ib_trx", + ("resurrect" TRX_ID_FMT + " table '%s' IX lock from %s undo", + trx->id, table->name, + undo == trx->insert_undo + ? "insert" : "update")); + + dict_table_close(table, FALSE, FALSE); + } + } +} + +/****************************************************************//** +Resurrect the transactions that were doing inserts the time of the +crash, they need to be undone. +@return trx_t instance */ +static +trx_t* +trx_resurrect_insert( +/*=================*/ + trx_undo_t* undo, /*!< in: entry to UNDO */ + trx_rseg_t* rseg) /*!< in: rollback segment */ +{ + trx_t* trx; + + trx = trx_allocate_for_background(); + + trx->rseg = rseg; + trx->xid = undo->xid; + trx->id = undo->trx_id; + trx->insert_undo = undo; + trx->is_recovered = TRUE; + + /* This is single-threaded startup code, we do not need the + protection of trx->mutex or trx_sys->mutex here. */ + + if (undo->state != TRX_UNDO_ACTIVE) { + + /* Prepared transactions are left in the prepared state + waiting for a commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + + fprintf(stderr, + "InnoDB: Transaction " TRX_ID_FMT " was in the" + " XA prepared state.\n", trx->id); + + if (srv_force_recovery == 0) { + + trx->state = TRX_STATE_PREPARED; + trx_sys->n_prepared_trx++; + trx_sys->n_prepared_recovered_trx++; + } else { + fprintf(stderr, + "InnoDB: Since innodb_force_recovery" + " > 0, we will rollback it anyway.\n"); + + trx->state = TRX_STATE_ACTIVE; + } + } else { + trx->state = TRX_STATE_COMMITTED_IN_MEMORY; + } + + /* We give a dummy value for the trx no; this should have no + relevance since purge is not interested in committed + transaction numbers, unless they are in the history + list, in which case it looks the number from the disk based + undo log structure */ + + trx->no = trx->id; + } else { + trx->state = TRX_STATE_ACTIVE; + + /* A running transaction always has the number + field inited to TRX_ID_MAX */ + + trx->no = TRX_ID_MAX; + } + + if (undo->dict_operation) { + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->table_id = undo->table_id; + } + + if (!undo->empty) { + trx->undo_no = undo->top_undo_no + 1; + } + + return(trx); +} + +/****************************************************************//** +Prepared transactions are left in the prepared state waiting for a +commit or abort decision from MySQL */ +static +void +trx_resurrect_update_in_prepared_state( +/*===================================*/ + trx_t* trx, /*!< in,out: transaction */ + const trx_undo_t* undo) /*!< in: update UNDO record */ +{ + /* This is single-threaded startup code, we do not need the + protection of trx->mutex or trx_sys->mutex here. */ + + if (undo->state == TRX_UNDO_PREPARED) { + fprintf(stderr, + "InnoDB: Transaction " TRX_ID_FMT + " was in the XA prepared state.\n", trx->id); + + if (srv_force_recovery == 0) { + if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) { + trx_sys->n_prepared_trx++; + trx_sys->n_prepared_recovered_trx++; + } else { + ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED)); + } + + trx->state = TRX_STATE_PREPARED; + } else { + fprintf(stderr, + "InnoDB: Since innodb_force_recovery" + " > 0, we will rollback it anyway.\n"); + + trx->state = TRX_STATE_ACTIVE; + } + } else { + trx->state = TRX_STATE_COMMITTED_IN_MEMORY; + } +} + +/****************************************************************//** +Resurrect the transactions that were doing updates the time of the +crash, they need to be undone. */ +static +void +trx_resurrect_update( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_undo_t* undo, /*!< in/out: update UNDO record */ + trx_rseg_t* rseg) /*!< in/out: rollback segment */ +{ + trx->rseg = rseg; + trx->xid = undo->xid; + trx->id = undo->trx_id; + trx->update_undo = undo; + trx->is_recovered = TRUE; + + /* This is single-threaded startup code, we do not need the + protection of trx->mutex or trx_sys->mutex here. */ + + if (undo->state != TRX_UNDO_ACTIVE) { + trx_resurrect_update_in_prepared_state(trx, undo); + + /* We give a dummy value for the trx number */ + + trx->no = trx->id; + + } else { + trx->state = TRX_STATE_ACTIVE; + + /* A running transaction always has the number field inited to + TRX_ID_MAX */ + + trx->no = TRX_ID_MAX; + } + + if (undo->dict_operation) { + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->table_id = undo->table_id; + } + + if (!undo->empty && undo->top_undo_no >= trx->undo_no) { + + trx->undo_no = undo->top_undo_no + 1; + } +} + +/****************************************************************//** +Creates trx objects for transactions and initializes the trx list of +trx_sys at database start. Rollback segment and undo log lists must +already exist when this function is called, because the lists of +transactions to be rolled back or cleaned up are built based on the +undo log lists. */ +UNIV_INTERN +void +trx_lists_init_at_db_start(void) +/*============================*/ +{ + ulint i; + + ut_a(srv_is_being_started); + + UT_LIST_INIT(trx_sys->ro_trx_list); + UT_LIST_INIT(trx_sys->rw_trx_list); + UT_LIST_INIT(trx_sys->trx_serial_list); + + /* Look from the rollback segments if there exist undo logs for + transactions */ + + for (i = 0; i < TRX_SYS_N_RSEGS; ++i) { + trx_undo_t* undo; + trx_rseg_t* rseg; + + rseg = trx_sys->rseg_array[i]; + + if (rseg == NULL) { + continue; + } + + /* Resurrect transactions that were doing inserts. */ + for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_list); + undo != NULL; + undo = UT_LIST_GET_NEXT(undo_list, undo)) { + trx_t* trx; + + trx = trx_resurrect_insert(undo, rseg); + + if (trx->state == TRX_STATE_ACTIVE || + trx->state == TRX_STATE_PREPARED) { + + trx_reserve_descriptor(trx); + } + trx_list_rw_insert_ordered(trx); + + trx_resurrect_table_locks(trx, undo); + } + + /* Ressurrect transactions that were doing updates. */ + for (undo = UT_LIST_GET_FIRST(rseg->update_undo_list); + undo != NULL; + undo = UT_LIST_GET_NEXT(undo_list, undo)) { + trx_t* trx; + ibool trx_created; + + /* Check the trx_sys->rw_trx_list first. */ + mutex_enter(&trx_sys->mutex); + trx = trx_get_rw_trx_by_id(undo->trx_id); + mutex_exit(&trx_sys->mutex); + + if (trx == NULL) { + trx = trx_allocate_for_background(); + trx_created = TRUE; + } else { + trx_created = FALSE; + } + + trx_resurrect_update(trx, undo, rseg); + + if (trx_created) { + if (trx->state == TRX_STATE_ACTIVE || + trx->state == TRX_STATE_PREPARED) { + + trx_reserve_descriptor(trx); + } + trx_list_rw_insert_ordered(trx); + } + + trx_resurrect_table_locks(trx, undo); + } + } +} + +/******************************************************************//** +Assigns a rollback segment to a transaction in a round-robin fashion. +@return assigned rollback segment instance */ +static +trx_rseg_t* +trx_assign_rseg_low( +/*================*/ + ulong max_undo_logs, /*!< in: maximum number of UNDO logs to use */ + ulint n_tablespaces) /*!< in: number of rollback tablespaces */ +{ + ulint i; + trx_rseg_t* rseg; + static ulint latest_rseg = 0; + + if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO || srv_read_only_mode) { + ut_a(max_undo_logs == ULONG_UNDEFINED); + return(NULL); + } + + /* This breaks true round robin but that should be OK. */ + + ut_a(max_undo_logs > 0 && max_undo_logs <= TRX_SYS_N_RSEGS); + + i = latest_rseg++; + i %= max_undo_logs; + + /* Note: The assumption here is that there can't be any gaps in + the array. Once we implement more flexible rollback segment + management this may not hold. The assertion checks for that case. */ + + ut_a(trx_sys->rseg_array[0] != NULL); + + /* Skip the system tablespace if we have more than one tablespace + defined for rollback segments. We want all UNDO records to be in + the non-system tablespaces. */ + + do { + rseg = trx_sys->rseg_array[i]; + ut_a(rseg == NULL || i == rseg->id); + + i = (rseg == NULL) ? 0 : i + 1; + + } while (rseg == NULL + || (rseg->space == 0 + && n_tablespaces > 0 + && trx_sys->rseg_array[1] != NULL)); + + return(rseg); +} + +/****************************************************************//** +Assign a read-only transaction a rollback-segment, if it is attempting +to write to a TEMPORARY table. */ +UNIV_INTERN +void +trx_assign_rseg( +/*============*/ + trx_t* trx) /*!< A read-only transaction that + needs to be assigned a RBS. */ +{ + ut_a(trx->rseg == 0); + ut_a(trx->read_only); + ut_a(!srv_read_only_mode); + ut_a(!trx_is_autocommit_non_locking(trx)); + + trx->rseg = trx_assign_rseg_low(srv_undo_logs, srv_undo_tablespaces); +} + +/****************************************************************//** +Starts a transaction. */ +static +void +trx_start_low( +/*==========*/ + trx_t* trx) /*!< in: transaction */ +{ + ut_ad(trx->rseg == NULL); + + ut_ad(trx->start_file != 0); + ut_ad(trx->start_line != 0); + ut_ad(!trx->is_recovered); + ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + + /* Check whether it is an AUTOCOMMIT SELECT */ + trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd); + + trx->read_only = + (!trx->ddl && thd_trx_is_read_only(trx->mysql_thd)) + || srv_read_only_mode; + + if (!trx->auto_commit) { + ++trx->will_lock; + } else if (trx->will_lock == 0) { + trx->read_only = TRUE; + } + + if (!trx->read_only) { + trx->rseg = trx_assign_rseg_low( + srv_undo_logs, srv_undo_tablespaces); + } + +#ifdef WITH_WSREP + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; +#endif /* WITH_WSREP */ + + /* The initial value for trx->no: TRX_ID_MAX is used in + read_view_open_now: */ + + trx->no = TRX_ID_MAX; + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + ut_a(ib_vector_is_empty(trx->lock.table_locks)); + + mutex_enter(&trx_sys->mutex); + + /* If this transaction came from trx_allocate_for_mysql(), + trx->in_mysql_trx_list would hold. In that case, the trx->state + change must be protected by the trx_sys->mutex, so that + lock_print_info_all_transactions() will have a consistent view. */ + + trx->state = TRX_STATE_ACTIVE; + + trx->id = trx_sys_get_new_trx_id(); + + ut_ad(!trx->in_rw_trx_list); + ut_ad(!trx->in_ro_trx_list); + + if (trx->read_only) { + + /* Note: The trx_sys_t::ro_trx_list doesn't really need to + be ordered, we should exploit this using a list type that + doesn't need a list wide lock to increase concurrency. */ + + if (!trx_is_autocommit_non_locking(trx)) { + UT_LIST_ADD_FIRST(trx_list, trx_sys->ro_trx_list, trx); + ut_d(trx->in_ro_trx_list = TRUE); + } + } else { + + ut_ad(trx->rseg != NULL + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); + + ut_ad(!trx_is_autocommit_non_locking(trx)); + UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx); + ut_d(trx->in_rw_trx_list = TRUE); + ut_d(trx_sys->rw_max_trx_id = trx->id); + + trx_reserve_descriptor(trx); + } + + ut_ad(trx_sys_validate_trx_list()); + + mutex_exit(&trx_sys->mutex); + + trx->start_time = ut_time(); + + MONITOR_INC(MONITOR_TRX_ACTIVE); +} + +/****************************************************************//** +Set the transaction serialisation number. */ +static +void +trx_serialisation_number_get( +/*=========================*/ + trx_t* trx) /*!< in: transaction */ +{ + trx_rseg_t* rseg; + + rseg = trx->rseg; + + ut_ad(mutex_own(&rseg->mutex)); + + mutex_enter(&trx_sys->mutex); + + trx->no = trx_sys_get_new_trx_id(); + + if (UNIV_LIKELY(!trx->in_trx_serial_list)) { + + UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list, + trx); + + trx->in_trx_serial_list = true; + } + + /* If the rollack segment is not empty then the + new trx_t::no can't be less than any trx_t::no + already in the rollback segment. User threads only + produce events when a rollback segment is empty. */ + + if (rseg->last_page_no == FIL_NULL) { + void* ptr; + rseg_queue_t rseg_queue; + + rseg_queue.rseg = rseg; + rseg_queue.trx_no = trx->no; + + mutex_enter(&purge_sys->bh_mutex); + + /* This is to reduce the pressure on the trx_sys_t::mutex + though in reality it should make very little (read no) + difference because this code path is only taken when the + rbs is empty. */ + + mutex_exit(&trx_sys->mutex); + + ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); + ut_a(ptr); + + mutex_exit(&purge_sys->bh_mutex); + } else { + mutex_exit(&trx_sys->mutex); + } +} + +/****************************************************************//** +Assign the transaction its history serialisation number and write the +update UNDO log record to the assigned rollback segment. */ +static __attribute__((nonnull)) +void +trx_write_serialisation_history( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ +#ifdef WITH_WSREP + trx_sysf_t* sys_header; +#endif /* WITH_WSREP */ + trx_rseg_t* rseg; + + rseg = trx->rseg; + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to some other state: these modifications to the file data + structure define the transaction as committed in the file + based domain, at the serialization point of the log sequence + number lsn obtained below. */ + + if (trx->update_undo != NULL) { + page_t* undo_hdr_page; + trx_undo_t* undo = trx->update_undo; + + /* We have to hold the rseg mutex because update + log headers have to be put to the history list in the + (serialisation) order of the UNDO trx number. This is + required for the purge in-memory data structures too. */ + + mutex_enter(&rseg->mutex); + + /* Assign the transaction serialisation number and also + update the purge min binary heap if this is the first + UNDO log being written to the assigned rollback segment. */ + + trx_serialisation_number_get(trx); + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction commit for this transaction. */ + + undo_hdr_page = trx_undo_set_state_at_finish(undo, mtr); + + trx_undo_update_cleanup(trx, undo_hdr_page, mtr); + } else { + mutex_enter(&rseg->mutex); + } + + if (trx->insert_undo != NULL) { + trx_undo_set_state_at_finish(trx->insert_undo, mtr); + } + + mutex_exit(&rseg->mutex); + + MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); + +#ifdef WITH_WSREP + sys_header = trx_sysf_get(mtr); + /* Update latest MySQL wsrep XID in trx sys header. */ + if (wsrep_is_wsrep_xid((const void *)&trx->xid)) + { + trx_sys_update_wsrep_checkpoint(&trx->xid, sys_header, mtr); + } +#endif /* WITH_WSREP */ + + /* Update the latest MySQL binlog name and offset info + in trx sys header if MySQL binlogging is on or the database + server is a MySQL replication slave */ + + if (trx->mysql_log_file_name + && trx->mysql_log_file_name[0] != '\0') { + + trx_sys_update_mysql_binlog_offset( + trx->mysql_log_file_name, + trx->mysql_log_offset, + TRX_SYS_MYSQL_LOG_INFO, +#ifdef WITH_WSREP + sys_header, +#endif /* WITH_WSREP */ + mtr); + + trx->mysql_log_file_name = NULL; + } +} + +/******************************************************************** +Finalize a transaction containing updates for a FTS table. */ +static __attribute__((nonnull)) +void +trx_finalize_for_fts_table( +/*=======================*/ + fts_trx_table_t* ftt) /* in: FTS trx table */ +{ + fts_t* fts = ftt->table->fts; + fts_doc_ids_t* doc_ids = ftt->added_doc_ids; + + mutex_enter(&fts->bg_threads_mutex); + + if (fts->fts_status & BG_THREAD_STOP) { + /* The table is about to be dropped, no use + adding anything to its work queue. */ + + mutex_exit(&fts->bg_threads_mutex); + } else { + mem_heap_t* heap; + mutex_exit(&fts->bg_threads_mutex); + + ut_a(fts->add_wq); + + heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg); + + ib_wqueue_add(fts->add_wq, doc_ids, heap); + + /* fts_trx_table_t no longer owns the list. */ + ftt->added_doc_ids = NULL; + } +} + +/******************************************************************//** +Finalize a transaction containing updates to FTS tables. */ +static __attribute__((nonnull)) +void +trx_finalize_for_fts( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + bool is_commit) /*!< in: true if the transaction was + committed, false if it was rolled back. */ +{ + if (is_commit) { + const ib_rbt_node_t* node; + ib_rbt_t* tables; + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_last(trx->fts_trx->savepoints)); + + tables = savepoint->tables; + + for (node = rbt_first(tables); + node; + node = rbt_next(tables, node)) { + fts_trx_table_t** ftt; + + ftt = rbt_value(fts_trx_table_t*, node); + + if ((*ftt)->added_doc_ids) { + trx_finalize_for_fts_table(*ftt); + } + } + } + + fts_trx_free(trx->fts_trx); + trx->fts_trx = NULL; +} + +/**********************************************************************//** +If required, flushes the log to disk based on the value of +innodb_flush_log_at_trx_commit. */ +static +void +trx_flush_log_if_needed_low( +/*========================*/ + lsn_t lsn, /*!< in: lsn up to which logs are to be + flushed. */ + trx_t* trx) /*!< in: transaction */ +{ + ulint flush_log_at_trx_commit; + + flush_log_at_trx_commit = srv_use_global_flush_log_at_trx_commit + ? thd_flush_log_at_trx_commit(NULL) + : thd_flush_log_at_trx_commit(trx->mysql_thd); + + switch (flush_log_at_trx_commit) { + case 0: + /* Do nothing */ + break; + case 1: + case 3: + /* Write the log and optionally flush it to disk */ + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + srv_unix_file_flush_method != SRV_UNIX_NOSYNC); + break; + case 2: + /* Write the log but do not flush it to disk */ + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + + break; + default: + ut_error; + } +} + +/**********************************************************************//** +If required, flushes the log to disk based on the value of +innodb_flush_log_at_trx_commit. */ +static __attribute__((nonnull)) +void +trx_flush_log_if_needed( +/*====================*/ + lsn_t lsn, /*!< in: lsn up to which logs are to be + flushed. */ + trx_t* trx) /*!< in/out: transaction */ +{ + trx->op_info = "flushing log"; + trx_flush_log_if_needed_low(lsn, trx); + trx->op_info = ""; +} + +/****************************************************************//** +Commits a transaction in memory. */ +static __attribute__((nonnull)) +void +trx_commit_in_memory( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + lsn_t lsn) /*!< in: log sequence number of the mini-transaction + commit of trx_write_serialisation_history(), or 0 + if the transaction did not modify anything */ +{ + trx->must_flush_log_later = FALSE; + + if (trx_is_autocommit_non_locking(trx)) { + ut_ad(trx->read_only); + ut_a(!trx->is_recovered); + ut_ad(trx->rseg == NULL); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + + /* Note: We are asserting without holding the lock mutex. But + that is OK because this transaction is not waiting and cannot + be rolled back and no new locks can (or should not) be added + becuase it is flagged as a non-locking read-only transaction. */ + + ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + + /* This state change is not protected by any mutex, therefore + there is an inherent race here around state transition during + printouts. We ignore this race for the sake of efficiency. + However, the trx_sys_t::mutex will protect the trx_t instance + and it cannot be removed from the mysql_trx_list and freed + without first acquiring the trx_sys_t::mutex. */ + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + + trx->state = TRX_STATE_NOT_STARTED; + + read_view_remove(trx->global_read_view, false); + + MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT); + } else { + lock_trx_release_locks(trx); + + /* Remove the transaction from the list of active + transactions now that it no longer holds any user locks. */ + + ut_ad(trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)); + + mutex_enter(&trx_sys->mutex); + + assert_trx_in_list(trx); + + if (trx->read_only) { + UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx); + ut_d(trx->in_ro_trx_list = FALSE); + MONITOR_INC(MONITOR_TRX_RO_COMMIT); + } else { + UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx); + ut_d(trx->in_rw_trx_list = FALSE); + ut_ad(trx_sys->descr_n_used <= + UT_LIST_GET_LEN(trx_sys->rw_trx_list)); + MONITOR_INC(MONITOR_TRX_RW_COMMIT); + } + + /* If this transaction came from trx_allocate_for_mysql(), + trx->in_mysql_trx_list would hold. In that case, the + trx->state change must be protected by trx_sys->mutex, so that + lock_print_info_all_transactions() will have a consistent + view. */ + + trx->state = TRX_STATE_NOT_STARTED; + + /* We already own the trx_sys_t::mutex, by doing it here we + avoid a potential context switch later. */ + read_view_remove(trx->global_read_view, true); + + ut_ad(trx_sys_validate_trx_list()); + + mutex_exit(&trx_sys->mutex); + } + + if (trx->global_read_view != NULL) { + + trx->global_read_view = NULL; + } + + trx->read_view = NULL; + + if (lsn) { + ulint flush_log_at_trx_commit; + + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + + /* NOTE that we could possibly make a group commit more + efficient here: call os_thread_yield here to allow also other + trxs to come to commit! */ + + /*-------------------------------------*/ + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if + the OS does not crash. We may also flush the log files to + disk, making the transaction durable also at an OS crash or a + power outage. + + The idea in InnoDB's group commit is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which commits the whole + group. Note that this group commit will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + If we are calling trx_commit() under prepare_commit_mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the mutex. This is to make the + group commit algorithm to work. Otherwise, the prepare_commit + mutex would serialize all commits and prevent a group of + transactions from gathering. */ + + if (trx->flush_log_later) { + /* Do nothing yet */ + trx->must_flush_log_later = TRUE; + } else if (flush_log_at_trx_commit == 0 + || thd_requested_durability(trx->mysql_thd) + == HA_IGNORE_DURABILITY) { + /* Do nothing */ + } else { + trx_flush_log_if_needed(lsn, trx); + } + + trx->commit_lsn = lsn; + } + + /* undo_no is non-zero if we're doing the final commit. */ + bool not_rollback = trx->undo_no != 0; + /* Free all savepoints, starting from the first. */ + trx_named_savept_t* savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + trx_roll_savepoints_free(trx, savep); + + trx->rseg = NULL; + trx->undo_no = 0; + trx->last_sql_stat_start.least_undo_no = 0; + + trx->ddl = false; +#ifdef UNIV_DEBUG + ut_ad(trx->start_file != 0); + ut_ad(trx->start_line != 0); + trx->start_file = 0; + trx->start_line = 0; +#endif /* UNIV_DEBUG */ + + trx->will_lock = 0; + trx->read_only = FALSE; + trx->auto_commit = FALSE; + + if (trx->fts_trx) { + trx_finalize_for_fts(trx, not_rollback); + } + + ut_ad(trx->lock.wait_thr == NULL); + ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + + trx->dict_operation = TRX_DICT_OP_NONE; + +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd)) { + trx->lock.was_chosen_as_deadlock_victim = FALSE; + } +#endif + ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list)); + + trx->error_state = DB_SUCCESS; + + /* trx->in_mysql_trx_list would hold between + trx_allocate_for_mysql() and trx_free_for_mysql(). It does not + hold for recovered transactions or system transactions. */ +} + +/****************************************************************//** +Commits a transaction and a mini-transaction. */ +UNIV_INTERN +void +trx_commit_low( +/*===========*/ + trx_t* trx, /*!< in/out: transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction (will be committed), + or NULL if trx made no modifications */ +{ + lsn_t lsn; + + assert_trx_nonlocking_or_in_list(trx); + ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)); + ut_ad(!mtr || mtr->state == MTR_ACTIVE); + ut_ad(!mtr == !(trx->insert_undo || trx->update_undo)); + + /* undo_no is non-zero if we're doing the final commit. */ + if (trx->fts_trx && trx->undo_no != 0) { + dberr_t error; + + ut_a(!trx_is_autocommit_non_locking(trx)); + + error = fts_commit(trx); + + /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY + instead of dying. This is a possible scenario if there + is a crash between insert to DELETED table committing + and transaction committing. The fix would be able to + return error from this function */ + if (error != DB_SUCCESS && error != DB_DUPLICATE_KEY) { + /* FTS-FIXME: once we can return values from this + function, we should do so and signal an error + instead of just dying. */ + + ut_error; + } + } + + if (mtr) { + trx_write_serialisation_history(trx, mtr); + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this + log sequence number. The transaction becomes 'durable' when + we write the log to disk, but in the logical sense the commit + in the file-based data structures (undo logs etc.) happens + here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come + in exactly the same order as commit lsn's, if the transactions + have different rollback segments. To get exactly the same + order we should hold the kernel mutex up to this point, + adding to the contention of the kernel mutex. However, if + a transaction T2 is able to see modifications made by + a transaction T1, T2 will always get a bigger transaction + number and a bigger commit lsn than T1. */ + + /*--------------*/ + mtr_commit(mtr); + /*--------------*/ + lsn = mtr->end_lsn; + } else { + lsn = 0; + } + + trx_commit_in_memory(trx, lsn); +} + +/****************************************************************//** +Commits a transaction. */ +UNIV_INTERN +void +trx_commit( +/*=======*/ + trx_t* trx) /*!< in/out: transaction */ +{ + mtr_t local_mtr; + mtr_t* mtr; + + if (trx->insert_undo || trx->update_undo) { + mtr = &local_mtr; + mtr_start(mtr); + } else { + mtr = NULL; + } + + trx_commit_low(trx, mtr); +} + +/****************************************************************//** +Cleans up a transaction at database startup. The cleanup is needed if +the transaction already got to the middle of a commit when the database +crashed, and we cannot roll it back. */ +UNIV_INTERN +void +trx_cleanup_at_db_startup( +/*======================*/ + trx_t* trx) /*!< in: transaction */ +{ + ut_ad(trx->is_recovered); + + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + trx->rseg = NULL; + trx->undo_no = 0; + trx->last_sql_stat_start.least_undo_no = 0; + + mutex_enter(&trx_sys->mutex); + + ut_a(!trx->read_only); + + UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx); + ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list)); + + assert_trx_in_rw_list(trx); + ut_d(trx->in_rw_trx_list = FALSE); + + trx->state = TRX_STATE_NOT_STARTED; + trx_release_descriptor(trx); + + mutex_exit(&trx_sys->mutex); + + /* Change the transaction state without mutex protection, now + that it no longer is in the trx_list. Recovered transactions + are never placed in the mysql_trx_list. */ + ut_ad(trx->is_recovered); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + ut_ad(!trx->in_mysql_trx_list); +} + +/********************************************************************//** +Assigns a read view for a consistent read query. All the consistent reads +within the same transaction will get the same read view, which is created +when this function is first called for a new started transaction. +@return consistent read view */ +UNIV_INTERN +read_view_t* +trx_assign_read_view( +/*=================*/ + trx_t* trx) /*!< in: active transaction */ +{ + ut_ad(trx->state == TRX_STATE_ACTIVE); + + if (trx->read_view != NULL) { + return(trx->read_view); + } + + trx->read_view = read_view_open_now(trx->id, trx->prebuilt_view); + trx->global_read_view = trx->read_view; + + return(trx->read_view); +} + +/****************************************************************//** +Prepares a transaction for commit/rollback. */ +UNIV_INTERN +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* We are reading trx->state without holding trx_sys->mutex + here, because the commit or rollback should be invoked for a + running (or recovered prepared) transaction that is associated + with the current thread. */ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: +#ifdef WITH_WSREP + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); +#endif /* WITH_WSREP */ + trx_start_low(trx); + /* fall through */ + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + /* If the trx is in a lock wait state, moves the waiting + query thread to the suspended state */ + + if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + ulint sec; + ulint ms; + ib_uint64_t now; + + ut_a(trx->lock.wait_thr != NULL); + trx->lock.wait_thr->state = QUE_THR_SUSPENDED; + trx->lock.wait_thr = NULL; + + if (UNIV_UNLIKELY(trx->take_stats)) { + ut_usectime(&sec, &ms); + now = (ib_uint64_t)sec * 1000000 + ms; + trx->lock_que_wait_timer + += (ulint) + (now - trx->lock_que_wait_ustarted); + } + + trx->lock.que_state = TRX_QUE_RUNNING; + } + + ut_a(trx->lock.n_active_thrs == 1); + return; + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*********************************************************************//** +Creates a commit command node struct. +@return own: commit node struct */ +UNIV_INTERN +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + commit_node_t* node; + + node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node))); + node->common.type = QUE_NODE_COMMIT; + node->state = COMMIT_NODE_SEND; + + return(node); +} + +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + commit_node_t* node; + + node = static_cast<commit_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = COMMIT_NODE_SEND; + } + + if (node->state == COMMIT_NODE_SEND) { + trx_t* trx; + + node->state = COMMIT_NODE_WAIT; + + trx = thr_get_trx(thr); + + ut_a(trx->lock.wait_thr == NULL); + ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT); + + trx_commit_or_rollback_prepare(trx); + + trx->lock.que_state = TRX_QUE_COMMITTING; + + trx_commit(trx); + + ut_ad(trx->lock.wait_thr == NULL); + + trx->lock.que_state = TRX_QUE_RUNNING; + + thr = NULL; + } else { + ut_ad(node->state == COMMIT_NODE_WAIT); + + node->state = COMMIT_NODE_SEND; + + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* Because we do not do the commit by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + /* Update the info whether we should skip XA steps that eat + CPU time. + + For the duration of the transaction trx->support_xa is + not reread from thd so any changes in the value take + effect in the next transaction. This is to avoid a + scenario where some undo log records generated by a + transaction contain XA information and other undo log + records, generated by the same transaction do not. */ + trx->support_xa = thd_supports_xa(trx->mysql_thd); + + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); + + trx_start_low(trx); + /* fall through */ + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + trx->op_info = "committing"; + trx_commit(trx); + MONITOR_DEC(MONITOR_TRX_ACTIVE); + trx->op_info = ""; + return(DB_SUCCESS); + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + ut_error; + return(DB_CORRUPTION); +} + +/**********************************************************************//** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ +UNIV_INTERN +void +trx_commit_complete_for_mysql( +/*==========================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_a(trx); + + if (!trx->must_flush_log_later + || thd_requested_durability(trx->mysql_thd) + == HA_IGNORE_DURABILITY) { + return; + } + + ulint flush_log_at_trx_commit; + + flush_log_at_trx_commit = srv_use_global_flush_log_at_trx_commit + ? thd_flush_log_at_trx_commit(NULL) + : thd_flush_log_at_trx_commit(trx->mysql_thd); + + if (flush_log_at_trx_commit == 1 && trx->active_commit_ordered) { + return; + } + + trx_flush_log_if_needed(trx->commit_lsn, trx); + + trx->must_flush_log_later = FALSE; +} + +/**********************************************************************//** +Marks the latest SQL statement ended. */ +UNIV_INTERN +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx) /*!< in: trx handle */ +{ + ut_a(trx); + + switch (trx->state) { + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + case TRX_STATE_NOT_STARTED: + trx->undo_no = 0; + /* fall through */ + case TRX_STATE_ACTIVE: + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + + if (trx->fts_trx) { + fts_savepoint_laststmt_refresh(trx); + } + + return; + } + + ut_error; +} + +/**********************************************************************//** +Prints info about a transaction. +Caller must hold trx_sys->mutex. */ +UNIV_INTERN +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: lock_number_of_rows_locked(&trx->lock) */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size) + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ +{ + ibool newline; + const char* op_info; + + ut_ad(mutex_own(&trx_sys->mutex)); + + fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id); + + /* trx->state cannot change from or to NOT_STARTED while we + are holding the trx_sys->mutex. It may change from ACTIVE to + PREPARED or COMMITTED. */ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + fputs(", not started", f); + goto state_ok; + case TRX_STATE_ACTIVE: + fprintf(f, ", ACTIVE %lu sec", + (ulong) difftime(time(NULL), trx->start_time)); + goto state_ok; + case TRX_STATE_PREPARED: + fprintf(f, ", ACTIVE (PREPARED) %lu sec", + (ulong) difftime(time(NULL), trx->start_time)); + goto state_ok; + case TRX_STATE_COMMITTED_IN_MEMORY: + fputs(", COMMITTED IN MEMORY", f); + goto state_ok; + } + fprintf(f, ", state %lu", (ulong) trx->state); + ut_ad(0); +state_ok: + + /* prevent a race condition */ + op_info = trx->op_info; + + if (*op_info) { + putc(' ', f); + fputs(op_info, f); + } + + if (trx->is_recovered) { + fputs(" recovered trx", f); + } + + if (trx->declared_to_be_inside_innodb) { + fprintf(f, ", thread declared inside InnoDB %lu", + (ulong) trx->n_tickets_to_enter_innodb); + } + + putc('\n', f); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } + + newline = TRUE; + + /* trx->lock.que_state of an ACTIVE transaction may change + while we are not holding trx->mutex. We perform a dirty read + for performance reasons. */ + + switch (trx->lock.que_state) { + case TRX_QUE_RUNNING: + newline = FALSE; break; + case TRX_QUE_LOCK_WAIT: + fputs("LOCK WAIT ", f); break; + case TRX_QUE_ROLLING_BACK: + fputs("ROLLING BACK ", f); break; + case TRX_QUE_COMMITTING: + fputs("COMMITTING ", f); break; + default: + fprintf(f, "que state %lu ", (ulong) trx->lock.que_state); + } + + if (n_trx_locks > 0 || heap_size > 400) { + newline = TRUE; + + fprintf(f, "%lu lock struct(s), heap size %lu," + " %lu row lock(s)", + (ulong) n_trx_locks, + (ulong) heap_size, + (ulong) n_rec_locks); + } + + if (trx->has_search_latch) { + newline = TRUE; + fputs(", holds adaptive hash latch", f); + } + + if (trx->undo_no != 0) { + newline = TRUE; + fprintf(f, ", undo log entries "TRX_ID_FMT, trx->undo_no); + } + + if (newline) { + putc('\n', f); + } + + if (trx->mysql_thd != NULL) { + innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len); + } +} + +/**********************************************************************//** +Prints info about a transaction. +The caller must hold lock_sys->mutex and trx_sys->mutex. +When possible, use trx_print() instead. */ +UNIV_INTERN +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + ut_ad(lock_mutex_own()); + ut_ad(mutex_own(&trx_sys->mutex)); + + trx_print_low(f, trx, max_query_len, + lock_number_of_rows_locked(&trx->lock), + UT_LIST_GET_LEN(trx->lock.trx_locks), + mem_heap_get_size(trx->lock.lock_heap)); +} + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys->mutex and trx_sys->mutex. */ +UNIV_INTERN +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + ulint n_rec_locks; + ulint n_trx_locks; + ulint heap_size; + + lock_mutex_enter(); + n_rec_locks = lock_number_of_rows_locked(&trx->lock); + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + heap_size = mem_heap_get_size(trx->lock.lock_heap); + lock_mutex_exit(); + + mutex_enter(&trx_sys->mutex); + trx_print_low(f, trx, max_query_len, + n_rec_locks, n_trx_locks, heap_size); + mutex_exit(&trx_sys->mutex); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Asserts that a transaction has been started. +The caller must hold trx_sys->mutex. +@return TRUE if started */ +UNIV_INTERN +ibool +trx_assert_started( +/*===============*/ + const trx_t* trx) /*!< in: transaction */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + /* Non-locking autocommits should not hold any locks and this + function is only called from the locking code. */ + assert_trx_in_list(trx); + + /* trx->state can change from or to NOT_STARTED while we are holding + trx_sys->mutex for non-locking autocommit selects but not for other + types of transactions. It may change from ACTIVE to PREPARED. Unless + we are holding lock_sys->mutex, it may also change to COMMITTED. */ + + switch (trx->state) { + case TRX_STATE_PREPARED: + return(TRUE); + + case TRX_STATE_ACTIVE: + case TRX_STATE_COMMITTED_IN_MEMORY: + return(TRUE); + + case TRX_STATE_NOT_STARTED: + break; + } + + ut_error; + return(FALSE); +} +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Compares the "weight" (or size) of two transactions. Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. +@return TRUE if weight(a) >= weight(b) */ +UNIV_INTERN +ibool +trx_weight_ge( +/*==========*/ + const trx_t* a, /*!< in: the first transaction to be compared */ + const trx_t* b) /*!< in: the second transaction to be compared */ +{ + ibool a_notrans_edit; + ibool b_notrans_edit; + + /* If mysql_thd is NULL for a transaction we assume that it has + not edited non-transactional tables. */ + + a_notrans_edit = a->mysql_thd != NULL + && thd_has_edited_nontrans_tables(a->mysql_thd); + + b_notrans_edit = b->mysql_thd != NULL + && thd_has_edited_nontrans_tables(b->mysql_thd); + + if (a_notrans_edit != b_notrans_edit) { + + return(a_notrans_edit); + } + + /* Either both had edited non-transactional tables or both had + not, we fall back to comparing the number of altered/locked + rows. */ + +#if 0 + fprintf(stderr, + "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n", + __func__, + a->undo_no, UT_LIST_GET_LEN(a->lock.trx_locks), + b->undo_no, UT_LIST_GET_LEN(b->lock.trx_locks)); +#endif + + return(TRX_WEIGHT(a) >= TRX_WEIGHT(b)); +} + +/****************************************************************//** +Prepares a transaction. */ +static +void +trx_prepare( +/*========*/ + trx_t* trx) /*!< in/out: transaction */ +{ + trx_rseg_t* rseg; + lsn_t lsn; + mtr_t mtr; + + rseg = trx->rseg; + /* Only fresh user transactions can be prepared. + Recovered transactions cannot. */ + ut_a(!trx->is_recovered); + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mtr_start(&mtr); + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to TRX_UNDO_PREPARED: these modifications to the file data + structure define the transaction as prepared in the + file-based world, at the serialization point of lsn. */ + + mutex_enter(&rseg->mutex); + + if (trx->insert_undo != NULL) { + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction prepare for this transaction. */ + + trx_undo_set_state_at_prepare(trx, trx->insert_undo, + &mtr); + } + + if (trx->update_undo) { + trx_undo_set_state_at_prepare( + trx, trx->update_undo, &mtr); + } + + mutex_exit(&rseg->mutex); + + /*--------------*/ + mtr_commit(&mtr); /* This mtr commit makes the + transaction prepared in the file-based + world */ + /*--------------*/ + lsn = mtr.end_lsn; + ut_ad(lsn); + } else { + lsn = 0; + } + + /*--------------------------------------*/ + ut_a(trx->state == TRX_STATE_ACTIVE); + mutex_enter(&trx_sys->mutex); + trx->state = TRX_STATE_PREPARED; + trx_sys->n_prepared_trx++; + mutex_exit(&trx_sys->mutex); + /*--------------------------------------*/ + + if (lsn) { + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the prepared state of the + transaction durable if the OS does not crash. We may also + flush the log files to disk, making the prepared state of the + transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group prepare is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which prepares the whole + group. Note that this group prepare will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + TODO: find out if MySQL holds some mutex when calling this. + That would spoil our group prepare algorithm. */ + + trx_flush_log_if_needed(lsn, trx); + } +} + +/**********************************************************************//** +Does the transaction prepare for MySQL. */ +UNIV_INTERN +void +trx_prepare_for_mysql( +/*==================*/ + trx_t* trx) /*!< in/out: trx handle */ +{ + trx_start_if_not_started_xa(trx); + + trx->op_info = "preparing"; + + trx_prepare(trx); + + trx->op_info = ""; +} + +/**********************************************************************//** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. +@return number of prepared transactions stored in xid_list */ +UNIV_INTERN +int +trx_recover_for_mysql( +/*==================*/ + XID* xid_list, /*!< in/out: prepared transactions */ + ulint len) /*!< in: number of slots in xid_list */ +{ + const trx_t* trx; + ulint count = 0; + + ut_ad(xid_list); + ut_ad(len); + + /* We should set those transactions which are in the prepared state + to the xid_list */ + + mutex_enter(&trx_sys->mutex); + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + assert_trx_in_rw_list(trx); + + /* The state of a read-write transaction cannot change + from or to NOT_STARTED while we are holding the + trx_sys->mutex. It may change to PREPARED, but not if + trx->is_recovered. It may also change to COMMITTED. */ + if (trx_state_eq(trx, TRX_STATE_PREPARED)) { + xid_list[count] = trx->xid; + + if (count == 0) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Starting recovery for" + " XA transactions...\n"); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Transaction " TRX_ID_FMT " in" + " prepared state after recovery\n", + trx->id); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Transaction contains changes" + " to "TRX_ID_FMT" rows\n", + trx->undo_no); + + count++; + + if (count == len) { + break; + } + } + } + + mutex_exit(&trx_sys->mutex); + + if (count > 0){ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: %d transactions in prepared state" + " after recovery\n", + int (count)); + } + + return(int (count)); +} + +/*******************************************************************//** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state +@return trx on match, the trx->xid will be invalidated; +note that the trx may have been committed, unless the caller is +holding lock_sys->mutex */ +static __attribute__((nonnull, warn_unused_result)) +trx_t* +trx_get_trx_by_xid_low( +/*===================*/ + const XID* xid) /*!< in: X/Open XA transaction + identifier */ +{ + trx_t* trx; + + ut_ad(mutex_own(&trx_sys->mutex)); + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + assert_trx_in_rw_list(trx); + + /* Compare two X/Open XA transaction id's: their + length should be the same and binary comparison + of gtrid_length+bqual_length bytes should be + the same */ + + if (trx->is_recovered + && trx_state_eq(trx, TRX_STATE_PREPARED) + && xid->gtrid_length == trx->xid.gtrid_length + && xid->bqual_length == trx->xid.bqual_length + && memcmp(xid->data, trx->xid.data, + xid->gtrid_length + xid->bqual_length) == 0) { + + /* Invalidate the XID, so that subsequent calls + will not find it. */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; + break; + } + } + + return(trx); +} + +/*******************************************************************//** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state +@return trx or NULL; on match, the trx->xid will be invalidated; +note that the trx may have been committed, unless the caller is +holding lock_sys->mutex */ +UNIV_INTERN +trx_t* +trx_get_trx_by_xid( +/*===============*/ + const XID* xid) /*!< in: X/Open XA transaction identifier */ +{ + trx_t* trx; + + if (xid == NULL) { + + return(NULL); + } + + mutex_enter(&trx_sys->mutex); + + /* Recovered/Resurrected transactions are always only on the + trx_sys_t::rw_trx_list. */ + trx = trx_get_trx_by_xid_low(xid); + + mutex_exit(&trx_sys->mutex); + + return(trx); +} + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +UNIV_INTERN +void +trx_start_if_not_started_xa_low( +/*============================*/ + trx_t* trx) /*!< in: transaction */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + + /* Update the info whether we should skip XA steps + that eat CPU time. + + For the duration of the transaction trx->support_xa is + not reread from thd so any changes in the value take + effect in the next transaction. This is to avoid a + scenario where some undo generated by a transaction, + has XA stuff, and other undo, generated by the same + transaction, doesn't. */ + trx->support_xa = thd_supports_xa(trx->mysql_thd); + + trx_start_low(trx); + /* fall through */ + case TRX_STATE_ACTIVE: + return; + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +UNIV_INTERN +void +trx_start_if_not_started_low( +/*=========================*/ + trx_t* trx) /*!< in: transaction */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: +#ifdef WITH_WSREP + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); +#endif /* WITH_WSREP */ + trx_start_low(trx); + /* fall through */ + case TRX_STATE_ACTIVE: + return; + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*************************************************************//** +Starts the transaction for a DDL operation. */ +UNIV_INTERN +void +trx_start_for_ddl_low( +/*==================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_dict_op_t op) /*!< in: dictionary operation type */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + /* Flag this transaction as a dictionary operation, so that + the data dictionary will be locked in crash recovery. */ + + trx_set_dict_operation(trx, op); + + /* Ensure it is not flagged as an auto-commit-non-locking + transation. */ + trx->will_lock = 1; + + trx->ddl = true; + +#ifdef WITH_WSREP + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); +#endif /* WITH_WSREP */ + trx_start_low(trx); + return; + + case TRX_STATE_ACTIVE: + /* We have this start if not started idiom, therefore we + can't add stronger checks here. */ + trx->ddl = true; + + ut_ad(trx->dict_operation != TRX_DICT_OP_NONE); + ut_ad(trx->will_lock > 0); + return; + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.cc index 3d794c69c8b..290271c6cab 100644 --- a/storage/xtradb/trx/trx0undo.c +++ b/storage/xtradb/trx/trx0undo.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc., *****************************************************************************/ /**************************************************//** -@file trx/trx0undo.c +@file trx/trx0undo.cc Transaction undo log Created 3/26/1996 Heikki Tuuri @@ -39,6 +39,7 @@ Created 3/26/1996 Heikki Tuuri #include "srv0start.h" #include "trx0rec.h" #include "trx0purge.h" +#include "srv0mon.h" /* How should the old versions in the history list be managed? ---------------------------------------------------------- @@ -79,7 +80,7 @@ can still remove old versions from the bottom of the stack. */ ------------------------------------------------------------------- latches? ------- -The contention of the kernel mutex should be minimized. When a transaction +The contention of the trx_sys_t::mutex should be minimized. When a transaction does its first insert or modify in an index, an undo log is assigned for it. Then we must have an x-latch to the rollback segment header. When the transaction does more modifys or rolls back, the undo log is @@ -158,6 +159,7 @@ trx_undo_get_prev_rec_from_prev_page( trx_undo_rec_t* rec, /*!< in: undo record */ ulint page_no,/*!< in: undo log header page number */ ulint offset, /*!< in: undo log header offset on page */ + bool shared, /*!< in: true=S-latch, false=X-latch */ mtr_t* mtr) /*!< in: mtr */ { ulint space; @@ -180,8 +182,12 @@ trx_undo_get_prev_rec_from_prev_page( space = page_get_space_id(undo_page); zip_size = fil_space_get_zip_size(space); - prev_page = trx_undo_page_get_s_latched(space, zip_size, - prev_page_no, mtr); + buf_block_t* block = buf_page_get(space, zip_size, prev_page_no, + shared ? RW_S_LATCH : RW_X_LATCH, + mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + prev_page = buf_block_get_frame(block); return(trx_undo_page_get_last_rec(prev_page, page_no, offset)); } @@ -196,6 +202,7 @@ trx_undo_get_prev_rec( trx_undo_rec_t* rec, /*!< in: undo record */ ulint page_no,/*!< in: undo log header page number */ ulint offset, /*!< in: undo log header offset on page */ + bool shared, /*!< in: true=S-latch, false=X-latch */ mtr_t* mtr) /*!< in: mtr */ { trx_undo_rec_t* prev_rec; @@ -211,7 +218,7 @@ trx_undo_get_prev_rec( previous record */ return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset, - mtr)); + shared, mtr)); } /***********************************************************************//** @@ -412,8 +419,8 @@ trx_undo_page_init( Creates a new undo log segment in file. @return DB_SUCCESS if page creation OK possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t trx_undo_seg_create( /*================*/ trx_rseg_t* rseg __attribute__((unused)),/*!< in: rollback segment */ @@ -434,7 +441,7 @@ trx_undo_seg_create( trx_usegf_t* seg_hdr; ulint n_reserved; ibool success; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(mtr && id && rseg_hdr); ut_ad(mutex_own(&(rseg->mutex))); @@ -501,6 +508,8 @@ trx_undo_seg_create( page_get_page_no(*undo_page), mtr); *id = slot_no; + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); + return(err); } @@ -607,13 +616,13 @@ trx_undo_write_xid( mtr_t* mtr) /*!< in: mtr */ { mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT, - (ulint)xid->formatID, MLOG_4BYTES, mtr); + (ulint) xid->formatID, MLOG_4BYTES, mtr); mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN, - (ulint)xid->gtrid_length, MLOG_4BYTES, mtr); + (ulint) xid->gtrid_length, MLOG_4BYTES, mtr); mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN, - (ulint)xid->bqual_length, MLOG_4BYTES, mtr); + (ulint) xid->bqual_length, MLOG_4BYTES, mtr); mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data, XIDDATASIZE, mtr); @@ -628,7 +637,7 @@ trx_undo_read_xid( trx_ulogf_t* log_hdr,/*!< in: undo log header */ XID* xid) /*!< out: X/Open XA Transaction Identification */ { - xid->formatID = (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT); + xid->formatID = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT); xid->gtrid_length = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN); @@ -894,7 +903,6 @@ trx_undo_add_page( ulint n_reserved; ut_ad(mutex_own(&(trx->undo_mutex))); - ut_ad(!mutex_own(&kernel_mutex)); ut_ad(mutex_own(&(trx->rseg->mutex))); rseg = trx->rseg; @@ -969,7 +977,6 @@ trx_undo_free_page( ulint zip_size; ut_a(hdr_page_no != page_no); - ut_ad(!mutex_own(&kernel_mutex)); ut_ad(mutex_own(&(rseg->mutex))); zip_size = rseg->zip_size; @@ -1218,8 +1225,6 @@ trx_undo_seg_free( mtr_start(&mtr); - ut_ad(!mutex_own(&kernel_mutex)); - mutex_enter(&(rseg->mutex)); seg_header = trx_undo_page_get(undo->space, undo->zip_size, @@ -1237,6 +1242,8 @@ trx_undo_seg_free( &mtr); trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, &mtr); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED); } mutex_exit(&(rseg->mutex)); @@ -1355,6 +1362,7 @@ add_to_list: } else { UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached, undo); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); } } else { ut_ad(type == TRX_UNDO_UPDATE); @@ -1364,6 +1372,7 @@ add_to_list: } else { UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached, undo); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); } } @@ -1381,8 +1390,6 @@ trx_undo_lists_init( /*================*/ trx_rseg_t* rseg) /*!< in: rollback segment memory object */ { - ulint page_no; - trx_undo_t* undo; ulint size = 0; trx_rsegf_t* rseg_header; ulint i; @@ -1395,10 +1402,12 @@ trx_undo_lists_init( mtr_start(&mtr); - rseg_header = trx_rsegf_get_new(rseg->space, rseg->zip_size, - rseg->page_no, &mtr); + rseg_header = trx_rsegf_get_new( + rseg->space, rseg->zip_size, rseg->page_no, &mtr); for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + ulint page_no; + page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr); /* In forced recovery: try to avoid operations which look @@ -1409,8 +1418,11 @@ trx_undo_lists_init( if (page_no != FIL_NULL && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { - undo = trx_undo_mem_create_at_db_start(rseg, i, - page_no, &mtr); + trx_undo_t* undo; + + undo = trx_undo_mem_create_at_db_start( + rseg, i, page_no, &mtr); + size += undo->size; mtr_commit(&mtr); @@ -1420,6 +1432,9 @@ trx_undo_lists_init( rseg_header = trx_rsegf_get( rseg->space, rseg->zip_size, rseg->page_no, &mtr); + + /* Found a used slot */ + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); } } @@ -1455,11 +1470,11 @@ trx_undo_mem_create( ut_error; } - undo = mem_alloc(sizeof(trx_undo_t)); + undo = static_cast<trx_undo_t*>(mem_alloc(sizeof(*undo))); if (undo == NULL) { - return NULL; + return(NULL); } undo->id = id; @@ -1542,8 +1557,8 @@ Creates a new undo log. @return DB_SUCCESS if successful in creating the new undo lob object, possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t trx_undo_create( /*============*/ trx_t* trx, /*!< in: transaction */ @@ -1562,7 +1577,7 @@ trx_undo_create( ulint offset; ulint id; page_t* undo_page; - ulint err; + dberr_t err; ut_ad(mutex_own(&(rseg->mutex))); @@ -1639,6 +1654,8 @@ trx_undo_reuse_cached( } UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); } else { ut_ad(type == TRX_UNDO_UPDATE); @@ -1649,6 +1666,8 @@ trx_undo_reuse_cached( } UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); } ut_ad(undo->size == 1); @@ -1730,10 +1749,10 @@ trx_undo_mark_as_dict_operation( Assigns an undo log for a transaction. A new undo log is created or a cached undo log reused. @return DB_SUCCESS if undo log assign successful, possible error codes -are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE +are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY DB_OUT_OF_MEMORY */ UNIV_INTERN -ulint +dberr_t trx_undo_assign_undo( /*=================*/ trx_t* trx, /*!< in: transaction */ @@ -1742,10 +1761,13 @@ trx_undo_assign_undo( trx_rseg_t* rseg; trx_undo_t* undo; mtr_t mtr; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(trx); - ut_ad(trx->rseg); + + if (trx->rseg == NULL) { + return(DB_READ_ONLY); + } rseg = trx->rseg; @@ -1753,15 +1775,19 @@ trx_undo_assign_undo( mtr_start(&mtr); - ut_ad(!mutex_own(&kernel_mutex)); + mutex_enter(&rseg->mutex); - mutex_enter(&(rseg->mutex)); + DBUG_EXECUTE_IF( + "ib_create_table_fail_too_many_trx", + err = DB_TOO_MANY_CONCURRENT_TRXS; + goto func_exit; + ); undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid, &mtr); if (undo == NULL) { err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid, - &undo, &mtr); + &undo, &mtr); if (err != DB_SUCCESS) { goto func_exit; @@ -1786,7 +1812,7 @@ func_exit: mutex_exit(&(rseg->mutex)); mtr_commit(&mtr); - return err; + return(err); } /******************************************************************//** @@ -1804,9 +1830,6 @@ trx_undo_set_state_at_finish( page_t* undo_page; ulint state; - ut_ad(undo); - ut_ad(mtr); - if (undo->id >= TRX_RSEG_N_SLOTS) { fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); @@ -1919,9 +1942,10 @@ trx_undo_update_cleanup( if (undo->state == TRX_UNDO_CACHED) { UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo); + + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); } else { - ut_ad(undo->state == TRX_UNDO_TO_PURGE - || undo->state == TRX_UNDO_TO_FREE); + ut_ad(undo->state == TRX_UNDO_TO_PURGE); trx_undo_mem_free(undo); } @@ -1953,6 +1977,8 @@ trx_undo_insert_cleanup( if (undo->state == TRX_UNDO_CACHED) { UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo); + + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); } else { ut_ad(undo->state == TRX_UNDO_TO_FREE); |