diff options
Diffstat (limited to 'storage/innobase/trx')
-rw-r--r-- | storage/innobase/trx/Makefile.am | 25 | ||||
-rw-r--r-- | storage/innobase/trx/makefilewin | 26 | ||||
-rw-r--r-- | storage/innobase/trx/trx0purge.c | 1164 | ||||
-rw-r--r-- | storage/innobase/trx/trx0rec.c | 1417 | ||||
-rw-r--r-- | storage/innobase/trx/trx0roll.c | 1344 | ||||
-rw-r--r-- | storage/innobase/trx/trx0rseg.c | 261 | ||||
-rw-r--r-- | storage/innobase/trx/trx0sys.c | 964 | ||||
-rw-r--r-- | storage/innobase/trx/trx0trx.c | 2025 | ||||
-rw-r--r-- | storage/innobase/trx/trx0undo.c | 1906 |
9 files changed, 9132 insertions, 0 deletions
diff --git a/storage/innobase/trx/Makefile.am b/storage/innobase/trx/Makefile.am new file mode 100644 index 00000000000..9e2b3c398e3 --- /dev/null +++ b/storage/innobase/trx/Makefile.am @@ -0,0 +1,25 @@ +# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB +# & Innobase Oy +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +include ../include/Makefile.i + +noinst_LIBRARIES = libtrx.a + +libtrx_a_SOURCES = trx0purge.c trx0rec.c trx0roll.c trx0rseg.c\ + trx0sys.c trx0trx.c trx0undo.c + +EXTRA_PROGRAMS = diff --git a/storage/innobase/trx/makefilewin b/storage/innobase/trx/makefilewin new file mode 100644 index 00000000000..35588779d66 --- /dev/null +++ b/storage/innobase/trx/makefilewin @@ -0,0 +1,26 @@ +include ..\include\makefile.i + +trx.lib: trx0sys.obj trx0trx.obj trx0rseg.obj trx0undo.obj trx0rec.obj trx0roll.obj trx0purge.obj + lib -out:..\libs\trx.lib trx0sys.obj trx0trx.obj trx0rseg.obj trx0undo.obj trx0rec.obj trx0roll.obj trx0purge.obj + +trx0trx.obj: trx0trx.c + $(CCOM) $(CFL) -c -I.. trx0trx.c + +trx0sys.obj: trx0sys.c + $(CCOM) $(CFL) -c -I.. trx0sys.c + +trx0rseg.obj: trx0rseg.c + $(CCOM) $(CFL) -c -I.. trx0rseg.c + +trx0undo.obj: trx0undo.c + $(CCOM) $(CFL) -c -I.. trx0undo.c + +trx0rec.obj: trx0rec.c + $(CCOM) $(CFL) -c -I.. trx0rec.c + +trx0roll.obj: trx0roll.c + $(CCOM) $(CFL) -c -I.. trx0roll.c + +trx0purge.obj: trx0purge.c + $(CCOM) $(CFL) -c -I.. trx0purge.c + diff --git a/storage/innobase/trx/trx0purge.c b/storage/innobase/trx/trx0purge.c new file mode 100644 index 00000000000..3df34111281 --- /dev/null +++ b/storage/innobase/trx/trx0purge.c @@ -0,0 +1,1164 @@ +/****************************************************** +Purge old versions + +(c) 1996 Innobase Oy + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0purge.h" + +#ifdef UNIV_NONINL +#include "trx0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "read0read.h" +#include "fut0fut.h" +#include "que0que.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "srv0que.h" +#include "os0thread.h" + +/* The global data structure coordinating a purge */ +trx_purge_t* purge_sys = NULL; + +/* A dummy undo record used as a return value when we have a whole undo log +which needs no purge */ +trx_undo_rec_t trx_purge_dummy_rec; + +/********************************************************************* +Checks if trx_id is >= purge_view: then it is guaranteed that its update +undo log still exists in the system. */ + +ibool +trx_purge_update_undo_must_exist( +/*=============================*/ + /* out: TRUE if is sure that it is preserved, also + if the function returns FALSE, it is possible that + the undo log still exists in the system */ + dulint trx_id) /* in: transaction id */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!read_view_sees_trx_id(purge_sys->view, trx_id)) { + + return(TRUE); + } + + return(FALSE); +} + +/*=================== PURGE RECORD ARRAY =============================*/ + +/*********************************************************************** +Stores info of an undo log record during a purge. */ +static +trx_undo_inf_t* +trx_purge_arr_store_info( +/*=====================*/ + /* out: pointer to the storage cell */ + dulint trx_no, /* in: transaction number */ + dulint undo_no)/* in: undo number */ +{ + trx_undo_inf_t* cell; + trx_undo_arr_t* arr; + ulint i; + + arr = purge_sys->arr; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (!(cell->in_use)) { + /* Not in use, we may store here */ + cell->undo_no = undo_no; + cell->trx_no = trx_no; + cell->in_use = TRUE; + + arr->n_used++; + + return(cell); + } + } +} + +/*********************************************************************** +Removes info of an undo log record during a purge. */ +UNIV_INLINE +void +trx_purge_arr_remove_info( +/*======================*/ + trx_undo_inf_t* cell) /* in: pointer to the storage cell */ +{ + trx_undo_arr_t* arr; + + arr = purge_sys->arr; + + cell->in_use = FALSE; + + ut_ad(arr->n_used > 0); + + arr->n_used--; +} + +/*********************************************************************** +Gets the biggest pair of a trx number and an undo number in a purge array. */ +static +void +trx_purge_arr_get_biggest( +/*======================*/ + trx_undo_arr_t* arr, /* in: purge array */ + dulint* trx_no, /* out: transaction number: ut_dulint_zero + if array is empty */ + dulint* undo_no)/* out: undo number */ +{ + trx_undo_inf_t* cell; + dulint pair_trx_no; + dulint pair_undo_no; + int trx_cmp; + ulint n_used; + ulint i; + ulint n; + + n = 0; + n_used = arr->n_used; + pair_trx_no = ut_dulint_zero; + pair_undo_no = ut_dulint_zero; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use) { + n++; + trx_cmp = ut_dulint_cmp(cell->trx_no, pair_trx_no); + + if ((trx_cmp > 0) + || ((trx_cmp == 0) + && (ut_dulint_cmp(cell->undo_no, + pair_undo_no) >= 0))) { + + pair_trx_no = cell->trx_no; + pair_undo_no = cell->undo_no; + } + } + + if (n == n_used) { + *trx_no = pair_trx_no; + *undo_no = pair_undo_no; + + return; + } + } +} + +/******************************************************************** +Builds a purge 'query' graph. The actual purge is performed by executing +this query graph. */ +static +que_t* +trx_purge_graph_build(void) +/*=======================*/ + /* out, own: the query graph */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; +/* que_thr_t* thr2; */ + + heap = mem_heap_create(512); + fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap); + fork->trx = purge_sys->trx; + + thr = que_thr_create(fork, heap); + + thr->child = row_purge_node_create(thr, heap); + +/* thr2 = que_thr_create(fork, fork, heap); + + thr2->child = row_purge_node_create(fork, thr2, heap); */ + + return(fork); +} + +/************************************************************************ +Creates the global purge system control structure and inits the history +mutex. */ + +void +trx_purge_sys_create(void) +/*======================*/ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + purge_sys = mem_alloc(sizeof(trx_purge_t)); + + purge_sys->state = TRX_STOP_PURGE; + + purge_sys->n_pages_handled = 0; + + purge_sys->purge_trx_no = ut_dulint_zero; + purge_sys->purge_undo_no = ut_dulint_zero; + purge_sys->next_stored = FALSE; + + rw_lock_create(&(purge_sys->latch)); + rw_lock_set_level(&(purge_sys->latch), SYNC_PURGE_LATCH); + + mutex_create(&(purge_sys->mutex)); + mutex_set_level(&(purge_sys->mutex), SYNC_PURGE_SYS); + + purge_sys->heap = mem_heap_create(256); + + purge_sys->arr = trx_undo_arr_create(); + + purge_sys->sess = sess_open(); + + purge_sys->trx = purge_sys->sess->trx; + + purge_sys->trx->type = TRX_PURGE; + + ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED)); + + purge_sys->query = trx_purge_graph_build(); + + purge_sys->view = read_view_oldest_copy_or_open_new(NULL, + purge_sys->heap); +} + +/*================ UNDO LOG HISTORY LIST =============================*/ + +/************************************************************************ +Adds the update undo log as the first log in the history list. Removes the +update undo log segment from the rseg slot if it is too big for reuse. */ + +void +trx_purge_add_update_undo_to_history( +/*=================================*/ + trx_t* trx, /* in: transaction */ + page_t* undo_page, /* in: update undo log header page, + x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + trx_undo_t* undo; + trx_rseg_t* rseg; + trx_rsegf_t* rseg_header; + trx_usegf_t* seg_header; + trx_ulogf_t* undo_header; + trx_upagef_t* page_header; + ulint hist_size; + + undo = trx->update_undo; + + ut_ad(undo); + + rseg = undo->rseg; +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + rseg_header = trx_rsegf_get(rseg->space, rseg->page_no, mtr); + + undo_header = undo_page + undo->hdr_offset; + seg_header = undo_page + TRX_UNDO_SEG_HDR; + page_header = undo_page + TRX_UNDO_PAGE_HDR; + + if (undo->state != TRX_UNDO_CACHED) { + /* The undo log segment will not be reused */ + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); + ut_error; + } + + trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr); + + hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr); + ut_ad(undo->size == + flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr)); + + mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size + undo->size, MLOG_4BYTES, mtr); + } + + /* Add the log as the first in the history list */ + flst_add_first(rseg_header + TRX_RSEG_HISTORY, + undo_header + TRX_UNDO_HISTORY_NODE, mtr); + mutex_enter(&kernel_mutex); + trx_sys->rseg_history_len++; + mutex_exit(&kernel_mutex); + + /* Write the trx number to the undo log header */ + mlog_write_dulint(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr); + /* Write information about delete markings to the undo log header */ + + if (!undo->del_marks) { + mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, mtr); + } + + if (rseg->last_page_no == FIL_NULL) { + + rseg->last_page_no = undo->hdr_page_no; + rseg->last_offset = undo->hdr_offset; + rseg->last_trx_no = trx->no; + rseg->last_del_marks = undo->del_marks; + } +} + +/************************************************************************** +Frees an undo log segment which is in the history list. Cuts the end of the +history list at the youngest undo log in this segment. */ +static +void +trx_purge_free_segment( +/*===================*/ + trx_rseg_t* rseg, /* in: rollback segment */ + fil_addr_t hdr_addr, /* in: the file address of log_hdr */ + ulint n_removed_logs) /* in: count of how many undo logs we + will cut off from the end of the + history list */ +{ + page_t* undo_page; + trx_rsegf_t* rseg_hdr; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + ibool freed; + ulint seg_size; + ulint hist_size; + ibool marked = FALSE; + mtr_t mtr; + +/* fputs("Freeing an update undo log segment\n", stderr); */ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(purge_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ +loop: + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); + + undo_page = trx_undo_page_get(rseg->space, hdr_addr.page, &mtr); + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + log_hdr = undo_page + hdr_addr.boffset; + + /* Mark the last undo log totally purged, so that if the system + crashes, the tail of the undo log will not get accessed again. The + list of pages in the undo log tail gets inconsistent during the + freeing of the segment, and therefore purge should not try to access + them again. */ + + if (!marked) { + mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, &mtr); + marked = TRUE; + } + + freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER, + &mtr); + if (!freed) { + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + goto loop; + } + + /* The page list may now be inconsistent, but the length field + stored in the list base node tells us how big it was before we + started the freeing. */ + + seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr); + + /* We may free the undo log segment header page; it must be freed + within the same mtr as the undo log header is removed from the + history list: otherwise, in case of a database crash, the segment + could become inaccessible garbage in the file space. */ + + flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY, + log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr); + + mutex_enter(&kernel_mutex); + ut_ad(trx_sys->rseg_history_len >= n_removed_logs); + trx_sys->rseg_history_len -= n_removed_logs; + mutex_exit(&kernel_mutex); + + freed = FALSE; + + while (!freed) { + /* Here we assume that a file segment with just the header + page can be freed in a few steps, so that the buffer pool + is not flooded with bufferfixed pages: see the note in + fsp0fsp.c. */ + + freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, + &mtr); + } + + hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, &mtr); + ut_ad(hist_size >= seg_size); + + mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, + hist_size - seg_size, MLOG_4BYTES, &mtr); + + ut_ad(rseg->curr_size >= seg_size); + + rseg->curr_size -= seg_size; + + mutex_exit(&(rseg->mutex)); + + mtr_commit(&mtr); +} + +/************************************************************************ +Removes unnecessary history data from a rollback segment. */ +static +void +trx_purge_truncate_rseg_history( +/*============================*/ + trx_rseg_t* rseg, /* in: rollback segment */ + dulint limit_trx_no, /* in: remove update undo logs whose + trx number is < limit_trx_no */ + dulint limit_undo_no) /* in: if transaction number is equal + to limit_trx_no, truncate undo records + with undo number < limit_undo_no */ +{ + fil_addr_t hdr_addr; + fil_addr_t prev_hdr_addr; + trx_rsegf_t* rseg_hdr; + page_t* undo_page; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + int cmp; + ulint n_removed_logs = 0; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(purge_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); + + hdr_addr = trx_purge_get_log_from_hist( + flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr)); +loop: + if (hdr_addr.page == FIL_NULL) { + + mutex_exit(&(rseg->mutex)); + + mtr_commit(&mtr); + + return; + } + + undo_page = trx_undo_page_get(rseg->space, hdr_addr.page, &mtr); + + log_hdr = undo_page + hdr_addr.boffset; + + cmp = ut_dulint_cmp(mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO), + limit_trx_no); + if (cmp == 0) { + trx_undo_truncate_start(rseg, rseg->space, hdr_addr.page, + hdr_addr.boffset, limit_undo_no); + } + + if (cmp >= 0) { + mutex_enter(&kernel_mutex); + ut_a(trx_sys->rseg_history_len >= n_removed_logs); + trx_sys->rseg_history_len -= n_removed_logs; + mutex_exit(&kernel_mutex); + + flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY, + log_hdr + TRX_UNDO_HISTORY_NODE, + n_removed_logs, &mtr); + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return; + } + + prev_hdr_addr = trx_purge_get_log_from_hist( + flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, + &mtr)); + n_removed_logs++; + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE) + && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) { + + /* We can free the whole log segment */ + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + trx_purge_free_segment(rseg, hdr_addr, n_removed_logs); + + n_removed_logs = 0; + } else { + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + } + + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); + + hdr_addr = prev_hdr_addr; + + goto loop; +} + +/************************************************************************ +Removes unnecessary history data from rollback segments. NOTE that when this +function is called, the caller must not have any latches on undo log pages! */ +static +void +trx_purge_truncate_history(void) +/*============================*/ +{ + trx_rseg_t* rseg; + dulint limit_trx_no; + dulint limit_undo_no; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(purge_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + trx_purge_arr_get_biggest(purge_sys->arr, &limit_trx_no, + &limit_undo_no); + + if (ut_dulint_cmp(limit_trx_no, ut_dulint_zero) == 0) { + + limit_trx_no = purge_sys->purge_trx_no; + limit_undo_no = purge_sys->purge_undo_no; + } + + /* We play safe and set the truncate limit at most to the purge view + low_limit number, though this is not necessary */ + + if (ut_dulint_cmp(limit_trx_no, purge_sys->view->low_limit_no) >= 0) { + limit_trx_no = purge_sys->view->low_limit_no; + limit_undo_no = ut_dulint_zero; + } + + ut_ad((ut_dulint_cmp(limit_trx_no, + purge_sys->view->low_limit_no) <= 0)); + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + while (rseg) { + trx_purge_truncate_rseg_history(rseg, limit_trx_no, + limit_undo_no); + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + } +} + +/************************************************************************ +Does a truncate if the purge array is empty. NOTE that when this function is +called, the caller must not have any latches on undo log pages! */ +UNIV_INLINE +ibool +trx_purge_truncate_if_arr_empty(void) +/*=================================*/ + /* out: TRUE if array empty */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(purge_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + if (purge_sys->arr->n_used == 0) { + + trx_purge_truncate_history(); + + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************************** +Updates the last not yet purged history log info in rseg when we have purged +a whole undo log. Advances also purge_sys->purge_trx_no past the purged log. */ +static +void +trx_purge_rseg_get_next_history_log( +/*================================*/ + trx_rseg_t* rseg) /* in: rollback segment */ +{ + page_t* undo_page; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + fil_addr_t prev_log_addr; + dulint trx_no; + ibool del_marks; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(purge_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + mutex_enter(&(rseg->mutex)); + + ut_a(rseg->last_page_no != FIL_NULL); + + purge_sys->purge_trx_no = ut_dulint_add(rseg->last_trx_no, 1); + purge_sys->purge_undo_no = ut_dulint_zero; + purge_sys->next_stored = FALSE; + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(rseg->space, + rseg->last_page_no, &mtr); + log_hdr = undo_page + rseg->last_offset; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + /* Increase the purge page count by one for every handled log */ + + purge_sys->n_pages_handled++; + + prev_log_addr = trx_purge_get_log_from_hist( + flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, + &mtr)); + if (prev_log_addr.page == FIL_NULL) { + /* No logs left in the history list */ + + rseg->last_page_no = FIL_NULL; + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + mutex_enter(&kernel_mutex); + + /* Add debug code to track history list corruption reported + on the MySQL mailing list on Nov 9, 2004. The fut0lst.c + file-based list was corrupt. The prev node pointer was + FIL_NULL, even though the list length was over 8 million nodes! + We assume that purge truncates the history list in moderate + size pieces, and if we here reach the head of the list, the + list cannot be longer than 20 000 undo logs now. */ + + if (trx_sys->rseg_history_len > 20000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: purge reached the head of the history list,\n" +"InnoDB: but its length is still reported as %lu! Make a detailed bug\n" +"InnoDB: report, and post it to bugs.mysql.com\n", + (ulong)trx_sys->rseg_history_len); + } + + mutex_exit(&kernel_mutex); + + return; + } + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + /* Read the trx number and del marks from the previous log header */ + mtr_start(&mtr); + + log_hdr = trx_undo_page_get_s_latched(rseg->space, + prev_log_addr.page, &mtr) + + prev_log_addr.boffset; + + trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); + + del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS); + + mtr_commit(&mtr); + + mutex_enter(&(rseg->mutex)); + + rseg->last_page_no = prev_log_addr.page; + rseg->last_offset = prev_log_addr.boffset; + rseg->last_trx_no = trx_no; + rseg->last_del_marks = del_marks; + + mutex_exit(&(rseg->mutex)); +} + +/*************************************************************************** +Chooses the next undo log to purge and updates the info in purge_sys. This +function is used to initialize purge_sys when the next record to purge is +not known, and also to update the purge system info on the next record when +purge has handled the whole undo log for a transaction. */ +static +void +trx_purge_choose_next_log(void) +/*===========================*/ +{ + trx_undo_rec_t* rec; + trx_rseg_t* rseg; + trx_rseg_t* min_rseg; + dulint min_trx_no; + ulint space = 0; /* remove warning (??? bug ???) */ + ulint page_no = 0; /* remove warning (??? bug ???) */ + ulint offset = 0; /* remove warning (??? bug ???) */ + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(purge_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(purge_sys->next_stored == FALSE); + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + min_trx_no = ut_dulint_max; + + min_rseg = NULL; + + while (rseg) { + mutex_enter(&(rseg->mutex)); + + if (rseg->last_page_no != FIL_NULL) { + + if ((min_rseg == NULL) + || (ut_dulint_cmp(min_trx_no, rseg->last_trx_no) + > 0)) { + + min_rseg = rseg; + min_trx_no = rseg->last_trx_no; + space = rseg->space; + ut_a(space == 0); /* We assume in purge of + externally stored fields + that space id == 0 */ + page_no = rseg->last_page_no; + offset = rseg->last_offset; + } + } + + mutex_exit(&(rseg->mutex)); + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + } + + if (min_rseg == NULL) { + + return; + } + + mtr_start(&mtr); + + if (!min_rseg->last_del_marks) { + /* No need to purge this log */ + + rec = &trx_purge_dummy_rec; + } else { + rec = trx_undo_get_first_rec(space, page_no, offset, + RW_S_LATCH, &mtr); + if (rec == NULL) { + /* Undo log empty */ + + rec = &trx_purge_dummy_rec; + } + } + + purge_sys->next_stored = TRUE; + purge_sys->rseg = min_rseg; + + purge_sys->hdr_page_no = page_no; + purge_sys->hdr_offset = offset; + + purge_sys->purge_trx_no = min_trx_no; + + if (rec == &trx_purge_dummy_rec) { + + purge_sys->purge_undo_no = ut_dulint_zero; + purge_sys->page_no = page_no; + purge_sys->offset = 0; + } else { + purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec); + + purge_sys->page_no = buf_frame_get_page_no(rec); + purge_sys->offset = rec - buf_frame_align(rec); + } + + mtr_commit(&mtr); +} + +/*************************************************************************** +Gets the next record to purge and updates the info in the purge system. */ +static +trx_undo_rec_t* +trx_purge_get_next_rec( +/*===================*/ + /* out: copy of an undo log record or + pointer to the dummy undo log record */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ + trx_undo_rec_t* rec; + trx_undo_rec_t* rec_copy; + trx_undo_rec_t* rec2; + trx_undo_rec_t* next_rec; + page_t* undo_page; + page_t* page; + ulint offset; + ulint page_no; + ulint space; + ulint type; + ulint cmpl_info; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(purge_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(purge_sys->next_stored); + + space = purge_sys->rseg->space; + page_no = purge_sys->page_no; + offset = purge_sys->offset; + + if (offset == 0) { + /* It is the dummy undo log record, which means that there is + no need to purge this undo log */ + + trx_purge_rseg_get_next_history_log(purge_sys->rseg); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + return(&trx_purge_dummy_rec); + } + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(space, page_no, &mtr); + rec = undo_page + offset; + + rec2 = rec; + + for (;;) { + /* Try first to find the next record which requires a purge + operation from the same page of the same undo log */ + + next_rec = trx_undo_page_get_next_rec(rec2, + purge_sys->hdr_page_no, + purge_sys->hdr_offset); + if (next_rec == NULL) { + rec2 = trx_undo_get_next_rec(rec2, + purge_sys->hdr_page_no, + purge_sys->hdr_offset, &mtr); + break; + } + + rec2 = next_rec; + + type = trx_undo_rec_get_type(rec2); + + if (type == TRX_UNDO_DEL_MARK_REC) { + + break; + } + + cmpl_info = trx_undo_rec_get_cmpl_info(rec2); + + if (trx_undo_rec_get_extern_storage(rec2)) { + break; + } + + if ((type == TRX_UNDO_UPD_EXIST_REC) + && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + break; + } + } + + if (rec2 == NULL) { + mtr_commit(&mtr); + + trx_purge_rseg_get_next_history_log(purge_sys->rseg); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(space, page_no, &mtr); + + rec = undo_page + offset; + } else { + page = buf_frame_align(rec2); + + purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2); + purge_sys->page_no = buf_frame_get_page_no(page); + purge_sys->offset = rec2 - page; + + if (undo_page != page) { + /* We advance to a new page of the undo log: */ + purge_sys->n_pages_handled++; + } + } + + rec_copy = trx_undo_rec_copy(rec, heap); + + mtr_commit(&mtr); + + return(rec_copy); +} + +/************************************************************************ +Fetches the next undo log record from the history list to purge. It must be +released with the corresponding release function. */ + +trx_undo_rec_t* +trx_purge_fetch_next_rec( +/*=====================*/ + /* out: copy of an undo log record or + pointer to the dummy undo log record + &trx_purge_dummy_rec, if the whole undo log + can skipped in purge; NULL if none left */ + dulint* roll_ptr,/* out: roll pointer to undo record */ + trx_undo_inf_t** cell, /* out: storage cell for the record in the + purge array */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ + trx_undo_rec_t* undo_rec; + + mutex_enter(&(purge_sys->mutex)); + + if (purge_sys->state == TRX_STOP_PURGE) { + trx_purge_truncate_if_arr_empty(); + + mutex_exit(&(purge_sys->mutex)); + + return(NULL); + } + + if (!purge_sys->next_stored) { + trx_purge_choose_next_log(); + + if (!purge_sys->next_stored) { + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + if (srv_print_thread_releases) { + fprintf(stderr, + "Purge: No logs left in the history list; pages handled %lu\n", + (ulong) purge_sys->n_pages_handled); + } + + mutex_exit(&(purge_sys->mutex)); + + return(NULL); + } + } + + if (purge_sys->n_pages_handled >= purge_sys->handle_limit) { + + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + mutex_exit(&(purge_sys->mutex)); + + return(NULL); + } + + if (ut_dulint_cmp(purge_sys->purge_trx_no, + purge_sys->view->low_limit_no) >= 0) { + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + mutex_exit(&(purge_sys->mutex)); + + return(NULL); + } + +/* fprintf(stderr, "Thread %lu purging trx %lu undo record %lu\n", + os_thread_get_curr_id(), + ut_dulint_get_low(purge_sys->purge_trx_no), + ut_dulint_get_low(purge_sys->purge_undo_no)); */ + + *roll_ptr = trx_undo_build_roll_ptr(FALSE, (purge_sys->rseg)->id, + purge_sys->page_no, + purge_sys->offset); + + *cell = trx_purge_arr_store_info(purge_sys->purge_trx_no, + purge_sys->purge_undo_no); + + ut_ad(ut_dulint_cmp(purge_sys->purge_trx_no, + (purge_sys->view)->low_limit_no) < 0); + + /* The following call will advance the stored values of purge_trx_no + and purge_undo_no, therefore we had to store them first */ + + undo_rec = trx_purge_get_next_rec(heap); + + mutex_exit(&(purge_sys->mutex)); + + return(undo_rec); +} + +/*********************************************************************** +Releases a reserved purge undo record. */ + +void +trx_purge_rec_release( +/*==================*/ + trx_undo_inf_t* cell) /* in: storage cell */ +{ + trx_undo_arr_t* arr; + + mutex_enter(&(purge_sys->mutex)); + + arr = purge_sys->arr; + + trx_purge_arr_remove_info(cell); + + mutex_exit(&(purge_sys->mutex)); +} + +/*********************************************************************** +This function runs a purge batch. */ + +ulint +trx_purge(void) +/*===========*/ + /* out: number of undo log pages handled in + the batch */ +{ + que_thr_t* thr; +/* que_thr_t* thr2; */ + ulint old_pages_handled; + + mutex_enter(&(purge_sys->mutex)); + + if (purge_sys->trx->n_active_thrs > 0) { + + mutex_exit(&(purge_sys->mutex)); + + /* Should not happen */ + + ut_error; + + return(0); + } + + rw_lock_x_lock(&(purge_sys->latch)); + + mutex_enter(&kernel_mutex); + + /* Close and free the old purge view */ + + read_view_close(purge_sys->view); + purge_sys->view = NULL; + mem_heap_empty(purge_sys->heap); + + /* Determine how much data manipulation language (DML) statements + need to be delayed in order to reduce the lagging of the purge + thread. */ + srv_dml_needed_delay = 0; /* in microseconds; default: no delay */ + + /* If we cannot advance the 'purge view' because of an old + 'consistent read view', then the DML statements cannot be delayed. + Also, srv_max_purge_lag <= 0 means 'infinity'. */ + if (srv_max_purge_lag > 0 + && !UT_LIST_GET_LAST(trx_sys->view_list)) { + float ratio = (float) trx_sys->rseg_history_len + / srv_max_purge_lag; + if (ratio > ULINT_MAX / 10000) { + /* Avoid overflow: maximum delay is 4295 seconds */ + srv_dml_needed_delay = ULINT_MAX; + } else if (ratio > 1) { + /* If the history list length exceeds the + innodb_max_purge_lag, the + data manipulation statements are delayed + by at least 5000 microseconds. */ + srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000); + } + } + + purge_sys->view = read_view_oldest_copy_or_open_new(NULL, + purge_sys->heap); + mutex_exit(&kernel_mutex); + + rw_lock_x_unlock(&(purge_sys->latch)); + + purge_sys->state = TRX_PURGE_ON; + + /* Handle at most 20 undo log pages in one purge batch */ + + purge_sys->handle_limit = purge_sys->n_pages_handled + 20; + + old_pages_handled = purge_sys->n_pages_handled; + + mutex_exit(&(purge_sys->mutex)); + + mutex_enter(&kernel_mutex); + + thr = que_fork_start_command(purge_sys->query); + + ut_ad(thr); + +/* thr2 = que_fork_start_command(purge_sys->query); + + ut_ad(thr2); */ + + + mutex_exit(&kernel_mutex); + +/* srv_que_task_enqueue(thr2); */ + + if (srv_print_thread_releases) { + + fputs("Starting purge\n", stderr); + } + + que_run_threads(thr); + + if (srv_print_thread_releases) { + + fprintf(stderr, + "Purge ends; pages handled %lu\n", + (ulong) purge_sys->n_pages_handled); + } + + return(purge_sys->n_pages_handled - old_pages_handled); +} + +/********************************************************************** +Prints information of the purge system to stderr. */ + +void +trx_purge_sys_print(void) +/*=====================*/ +{ + fprintf(stderr, "InnoDB: Purge system view:\n"); + read_view_print(purge_sys->view); + + fprintf(stderr, "InnoDB: Purge trx n:o %lu %lu, undo n_o %lu %lu\n", + (ulong) ut_dulint_get_high(purge_sys->purge_trx_no), + (ulong) ut_dulint_get_low(purge_sys->purge_trx_no), + (ulong) ut_dulint_get_high(purge_sys->purge_undo_no), + (ulong) ut_dulint_get_low(purge_sys->purge_undo_no)); + fprintf(stderr, + "InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n" + "InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n", + (ulong) purge_sys->next_stored, + (ulong) purge_sys->page_no, + (ulong) purge_sys->offset, + (ulong) purge_sys->hdr_page_no, + (ulong) purge_sys->hdr_offset); +} diff --git a/storage/innobase/trx/trx0rec.c b/storage/innobase/trx/trx0rec.c new file mode 100644 index 00000000000..fcb7582ce73 --- /dev/null +++ b/storage/innobase/trx/trx0rec.c @@ -0,0 +1,1417 @@ +/****************************************************** +Transaction undo log record + +(c) 1996 Innobase Oy + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rec.h" + +#ifdef UNIV_NONINL +#include "trx0rec.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0undo.h" +#include "dict0dict.h" +#include "ut0mem.h" +#include "row0upd.h" +#include "que0que.h" +#include "trx0purge.h" +#include "row0row.h" + +/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/ + +/************************************************************************** +Writes the mtr log entry of the inserted undo log record on the undo log +page. */ +UNIV_INLINE +void +trx_undof_page_add_undo_rec_log( +/*============================*/ + page_t* undo_page, /* in: undo log page */ + ulint old_free, /* in: start offset of the inserted entry */ + ulint new_free, /* in: end offset of the entry */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + const byte* log_end; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN); + + if (log_ptr == NULL) { + + return; + } + + log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN]; + log_ptr = mlog_write_initial_log_record_fast(undo_page, + MLOG_UNDO_INSERT, log_ptr, mtr); + len = new_free - old_free - 4; + + mach_write_to_2(log_ptr, len); + log_ptr += 2; + + if (log_ptr + len <= log_end) { + memcpy(log_ptr, undo_page + old_free + 2, len); + mlog_close(mtr, log_ptr + len); + } else { + mlog_close(mtr, log_ptr); + mlog_catenate_string(mtr, undo_page + old_free + 2, len); + } +} + +/*************************************************************** +Parses a redo log record of adding an undo log record. */ + +byte* +trx_undo_parse_add_undo_rec( +/*========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page) /* in: page or NULL */ +{ + ulint len; + byte* rec; + ulint first_free; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + len = mach_read_from_2(ptr); + ptr += 2; + + if (end_ptr < ptr + len) { + + return(NULL); + } + + if (page == NULL) { + + return(ptr + len); + } + + first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + rec = page + first_free; + + mach_write_to_2(rec, first_free + 4 + len); + mach_write_to_2(rec + 2 + len, first_free); + + mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + first_free + 4 + len); + ut_memcpy(rec + 2, ptr, len); + + return(ptr + len); +} + +/************************************************************************** +Calculates the free space left for extending an undo log record. */ +UNIV_INLINE +ulint +trx_undo_left( +/*==========*/ + /* out: bytes left */ + page_t* page, /* in: undo log page */ + byte* ptr) /* in: pointer to page */ +{ + /* The '- 10' is a safety margin, in case we have some small + calculation error below */ + + return(UNIV_PAGE_SIZE - (ptr - page) - 10 - FIL_PAGE_DATA_END); +} + +/************************************************************************** +Reports in the undo log of an insert of a clustered index record. */ +static +ulint +trx_undo_page_report_insert( +/*========================*/ + /* out: offset of the inserted entry + on the page if succeed, 0 if fail */ + page_t* undo_page, /* in: undo log page */ + trx_t* trx, /* in: transaction */ + dict_index_t* index, /* in: clustered index */ + dtuple_t* clust_entry, /* in: index entry which will be + inserted to the clustered index */ + mtr_t* mtr) /* in: mtr */ +{ + ulint first_free; + byte* ptr; + ulint len; + dfield_t* field; + ulint flen; + ulint i; + + ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT); + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + ptr = undo_page + first_free; + + ut_ad(first_free <= UNIV_PAGE_SIZE); + + if (trx_undo_left(undo_page, ptr) < 30) { + + /* NOTE: the value 30 must be big enough such that the general + fields written below fit on the undo log page */ + + return(0); + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + /* Store first some general parameters to the undo log */ + mach_write_to_1(ptr, TRX_UNDO_INSERT_REC); + ptr++; + + len = mach_dulint_write_much_compressed(ptr, trx->undo_no); + ptr += len; + + len = mach_dulint_write_much_compressed(ptr, (index->table)->id); + ptr += len; + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the record + to be inserted in the clustered index */ + + for (i = 0; i < dict_index_get_n_unique(index); i++) { + + field = dtuple_get_nth_field(clust_entry, i); + + flen = dfield_get_len(field); + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + len = mach_write_compressed(ptr, flen); + ptr += len; + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, dfield_get_data(field), flen); + ptr += flen; + } + } + + if (trx_undo_left(undo_page, ptr) < 2) { + + return(0); + } + + /*----------------------------------------*/ + /* Write pointers to the previous and the next undo log records */ + + if (trx_undo_left(undo_page, ptr) < 2) { + + return(0); + } + + mach_write_to_2(ptr, first_free); + ptr += 2; + + mach_write_to_2(undo_page + first_free, ptr - undo_page); + + mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + ptr - undo_page); + + /* Write the log entry to the REDO log of this change in the UNDO + log */ + trx_undof_page_add_undo_rec_log(undo_page, first_free, + ptr - undo_page, mtr); + return(first_free); +} + +/************************************************************************** +Reads from an undo log record the general parameters. */ + +byte* +trx_undo_rec_get_pars( +/*==================*/ + /* out: remaining part of undo log + record after reading these values */ + trx_undo_rec_t* undo_rec, /* in: undo log record */ + ulint* type, /* out: undo record type: + TRX_UNDO_INSERT_REC, ... */ + ulint* cmpl_info, /* out: compiler info, relevant only + for update type records */ + ibool* updated_extern, /* out: TRUE if we updated an + externally stored fild */ + dulint* undo_no, /* out: undo log record number */ + dulint* table_id) /* out: table id */ +{ + byte* ptr; + ulint len; + ulint type_cmpl; + + ptr = undo_rec + 2; + + type_cmpl = mach_read_from_1(ptr); + ptr++; + + if (type_cmpl & TRX_UNDO_UPD_EXTERN) { + *updated_extern = TRUE; + type_cmpl -= TRX_UNDO_UPD_EXTERN; + } else { + *updated_extern = FALSE; + } + + *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1); + *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT; + + *undo_no = mach_dulint_read_much_compressed(ptr); + len = mach_dulint_get_much_compressed_size(*undo_no); + ptr += len; + + *table_id = mach_dulint_read_much_compressed(ptr); + len = mach_dulint_get_much_compressed_size(*table_id); + ptr += len; + + return(ptr); +} + +/************************************************************************** +Reads from an undo log record a stored column value. */ +static +byte* +trx_undo_rec_get_col_val( +/*=====================*/ + /* out: remaining part of undo log record after + reading these values */ + byte* ptr, /* in: pointer to remaining part of undo log record */ + byte** field, /* out: pointer to stored field */ + ulint* len) /* out: length of the field, or UNIV_SQL_NULL */ +{ + *len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*len); + + *field = ptr; + + if (*len != UNIV_SQL_NULL) { + if (*len >= UNIV_EXTERN_STORAGE_FIELD) { + ptr += (*len - UNIV_EXTERN_STORAGE_FIELD); + } else { + ptr += *len; + } + } + + return(ptr); +} + +/*********************************************************************** +Builds a row reference from an undo log record. */ + +byte* +trx_undo_rec_get_row_ref( +/*=====================*/ + /* out: pointer to remaining part of undo + record */ + byte* ptr, /* in: remaining part of a copy of an undo log + record, at the start of the row reference; + NOTE that this copy of the undo log record must + be preserved as long as the row reference is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /* in: clustered index */ + dtuple_t** ref, /* out, own: row reference */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dfield_t* dfield; + byte* field; + ulint len; + ulint ref_len; + ulint i; + + ut_ad(index && ptr && ref && heap); + ut_a(index->type & DICT_CLUSTERED); + + ref_len = dict_index_get_n_unique(index); + + *ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(*ref, index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(*ref, i); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len); + + dfield_set_data(dfield, field, len); + } + + return(ptr); +} + +/*********************************************************************** +Skips a row reference from an undo log record. */ + +byte* +trx_undo_rec_skip_row_ref( +/*======================*/ + /* out: pointer to remaining part of undo + record */ + byte* ptr, /* in: remaining part in update undo log + record, at the start of the row reference */ + dict_index_t* index) /* in: clustered index */ +{ + byte* field; + ulint len; + ulint ref_len; + ulint i; + + ut_ad(index && ptr); + ut_a(index->type & DICT_CLUSTERED); + + ref_len = dict_index_get_n_unique(index); + + for (i = 0; i < ref_len; i++) { + ptr = trx_undo_rec_get_col_val(ptr, &field, &len); + } + + return(ptr); +} + +/************************************************************************** +Reports in the undo log of an update or delete marking of a clustered index +record. */ +static +ulint +trx_undo_page_report_modify( +/*========================*/ + /* out: byte offset of the inserted + undo log entry on the page if succeed, + 0 if fail */ + page_t* undo_page, /* in: undo log page */ + trx_t* trx, /* in: transaction */ + dict_index_t* index, /* in: clustered index where update or + delete marking is done */ + rec_t* rec, /* in: clustered index record which + has NOT yet been modified */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ + upd_t* update, /* in: update vector which tells the + columns to be updated; in the case of + a delete, this should be set to NULL */ + ulint cmpl_info, /* in: compiler info on secondary + index updates */ + mtr_t* mtr) /* in: mtr */ +{ + dict_table_t* table; + upd_field_t* upd_field; + dict_col_t* col; + ulint first_free; + byte* ptr; + ulint len; + byte* field; + ulint flen; + ulint pos; + dulint roll_ptr; + dulint trx_id; + ulint bits; + ulint col_no; + byte* old_ptr; + ulint type_cmpl; + byte* type_cmpl_ptr; + ulint i; + + ut_a(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); + table = index->table; + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + ptr = undo_page + first_free; + + ut_ad(first_free <= UNIV_PAGE_SIZE); + + if (trx_undo_left(undo_page, ptr) < 50) { + + /* NOTE: the value 50 must be big enough so that the general + fields written below fit on the undo log page */ + + return(0); + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + /* Store first some general parameters to the undo log */ + + if (update) { + if (rec_get_deleted_flag(rec, table->comp)) { + type_cmpl = TRX_UNDO_UPD_DEL_REC; + } else { + type_cmpl = TRX_UNDO_UPD_EXIST_REC; + } + } else { + type_cmpl = TRX_UNDO_DEL_MARK_REC; + } + + type_cmpl = type_cmpl | (cmpl_info * TRX_UNDO_CMPL_INFO_MULT); + + mach_write_to_1(ptr, type_cmpl); + + type_cmpl_ptr = ptr; + + ptr++; + len = mach_dulint_write_much_compressed(ptr, trx->undo_no); + ptr += len; + + len = mach_dulint_write_much_compressed(ptr, table->id); + ptr += len; + + /*----------------------------------------*/ + /* Store the state of the info bits */ + + bits = rec_get_info_bits(rec, table->comp); + mach_write_to_1(ptr, bits); + ptr += 1; + + /* Store the values of the system columns */ + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos(index, DATA_TRX_ID), &len); + ut_ad(len == DATA_TRX_ID_LEN); + trx_id = trx_read_trx_id(field); + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), &len); + ut_ad(len == DATA_ROLL_PTR_LEN); + roll_ptr = trx_read_roll_ptr(field); + + len = mach_dulint_write_compressed(ptr, trx_id); + ptr += len; + + len = mach_dulint_write_compressed(ptr, roll_ptr); + ptr += len; + + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the + record which will be modified in the clustered index */ + + for (i = 0; i < dict_index_get_n_unique(index); i++) { + + field = rec_get_nth_field(rec, offsets, i, &flen); + + if (trx_undo_left(undo_page, ptr) < 4) { + + return(0); + } + + len = mach_write_compressed(ptr, flen); + ptr += len; + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + + /*----------------------------------------*/ + /* Save to the undo log the old values of the columns to be updated. */ + + if (update) { + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + len = mach_write_compressed(ptr, upd_get_n_fields(update)); + ptr += len; + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + pos = upd_field->field_no; + + /* Write field number to undo log */ + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + len = mach_write_compressed(ptr, pos); + ptr += len; + + /* Save the old value of field */ + field = rec_get_nth_field(rec, offsets, pos, &flen); + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + if (rec_offs_nth_extern(offsets, pos)) { + /* If a field has external storage, we add to + flen the flag */ + + len = mach_write_compressed(ptr, + UNIV_EXTERN_STORAGE_FIELD + flen); + + /* Notify purge that it eventually has to free the old + externally stored field */ + + trx->update_undo->del_marks = TRUE; + + *type_cmpl_ptr = *type_cmpl_ptr | TRX_UNDO_UPD_EXTERN; + } else { + len = mach_write_compressed(ptr, flen); + } + + ptr += len; + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + /*----------------------------------------*/ + /* In the case of a delete marking, and also in the case of an update + where any ordering field of any index changes, store the values of all + columns which occur as ordering fields in any index. This info is used + in the purge of old versions where we use it to build and search the + delete marked index records, to look if we can remove them from the + index tree. Note that starting from 4.0.14 also externally stored + fields can be ordering in some index. But we always store at least + 384 first bytes locally to the clustered index record, which means + we can construct the column prefix fields in the index from the + stored data. */ + + if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + + trx->update_undo->del_marks = TRUE; + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + old_ptr = ptr; + + /* Reserve 2 bytes to write the number of bytes the stored fields + take in this undo record */ + + ptr += 2; + + for (col_no = 0; col_no < dict_table_get_n_cols(table); col_no++) { + + col = dict_table_get_nth_col(table, col_no); + + if (col->ord_part > 0) { + + pos = dict_index_get_nth_col_pos(index, col_no); + + /* Write field number to undo log */ + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + len = mach_write_compressed(ptr, pos); + ptr += len; + + /* Save the old value of field */ + field = rec_get_nth_field(rec, offsets, pos, &flen); + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + len = mach_write_compressed(ptr, flen); + ptr += len; + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + mach_write_to_2(old_ptr, ptr - old_ptr); + } + + /*----------------------------------------*/ + /* Write pointers to the previous and the next undo log records */ + if (trx_undo_left(undo_page, ptr) < 2) { + + return(0); + } + + mach_write_to_2(ptr, first_free); + ptr += 2; + mach_write_to_2(undo_page + first_free, ptr - undo_page); + + mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + ptr - undo_page); + + /* Write to the REDO log about this change in the UNDO log */ + + trx_undof_page_add_undo_rec_log(undo_page, first_free, + ptr - undo_page, mtr); + return(first_free); +} + +/************************************************************************** +Reads from an undo log update record the system field values of the old +version. */ + +byte* +trx_undo_update_rec_get_sys_cols( +/*=============================*/ + /* out: remaining part of undo log + record after reading these values */ + byte* ptr, /* in: remaining part of undo log + record after reading general + parameters */ + dulint* trx_id, /* out: trx id */ + dulint* roll_ptr, /* out: roll ptr */ + ulint* info_bits) /* out: info bits state */ +{ + ulint len; + + /* Read the state of the info bits */ + *info_bits = mach_read_from_1(ptr); + ptr += 1; + + /* Read the values of the system columns */ + + *trx_id = mach_dulint_read_compressed(ptr); + len = mach_dulint_get_compressed_size(*trx_id); + ptr += len; + + *roll_ptr = mach_dulint_read_compressed(ptr); + len = mach_dulint_get_compressed_size(*roll_ptr); + ptr += len; + + return(ptr); +} + +/************************************************************************** +Reads from an update undo log record the number of updated fields. */ +UNIV_INLINE +byte* +trx_undo_update_rec_get_n_upd_fields( +/*=================================*/ + /* out: remaining part of undo log record after + reading this value */ + byte* ptr, /* in: pointer to remaining part of undo log record */ + ulint* n) /* out: number of fields */ +{ + *n = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*n); + + return(ptr); +} + +/************************************************************************** +Reads from an update undo log record a stored field number. */ +UNIV_INLINE +byte* +trx_undo_update_rec_get_field_no( +/*=============================*/ + /* out: remaining part of undo log record after + reading this value */ + byte* ptr, /* in: pointer to remaining part of undo log record */ + ulint* field_no)/* out: field number */ +{ + *field_no = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*field_no); + + return(ptr); +} + +/*********************************************************************** +Builds an update vector based on a remaining part of an undo log record. */ + +byte* +trx_undo_update_rec_get_update( +/*===========================*/ + /* out: remaining part of the record, + NULL if an error detected, which means that + the record is corrupted */ + byte* ptr, /* in: remaining part in update undo log + record, after reading the row reference + NOTE that this copy of the undo log record must + be preserved as long as the update vector is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /* in: clustered index */ + ulint type, /* in: TRX_UNDO_UPD_EXIST_REC, + TRX_UNDO_UPD_DEL_REC, or + TRX_UNDO_DEL_MARK_REC; in the last case, + only trx id and roll ptr fields are added to + the update vector */ + dulint trx_id, /* in: transaction id from this undo record */ + dulint roll_ptr,/* in: roll pointer from this undo record */ + ulint info_bits,/* in: info bits from this undo record */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap, /* in: memory heap from which the memory + needed is allocated */ + upd_t** upd) /* out, own: update vector */ +{ + upd_field_t* upd_field; + upd_t* update; + ulint n_fields; + byte* buf; + byte* field; + ulint len; + ulint field_no; + ulint i; + + ut_a(index->type & DICT_CLUSTERED); + + if (type != TRX_UNDO_DEL_MARK_REC) { + ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields); + } else { + n_fields = 0; + } + + update = upd_create(n_fields + 2, heap); + + update->info_bits = info_bits; + + /* Store first trx id and roll ptr to update vector */ + + upd_field = upd_get_nth_field(update, n_fields); + buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN); + trx_write_trx_id(buf, trx_id); + + upd_field_set_field_no(upd_field, + dict_index_get_sys_col_pos(index, DATA_TRX_ID), + index, trx); + dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN); + + upd_field = upd_get_nth_field(update, n_fields + 1); + buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(buf, roll_ptr); + + upd_field_set_field_no(upd_field, + dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), + index, trx); + dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN); + + /* Store then the updated ordinary columns to the update vector */ + + for (i = 0; i < n_fields; i++) { + + ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + if (field_no >= dict_index_get_n_fields(index)) { + fprintf(stderr, +"InnoDB: Error: trying to access update undo rec field %lu in ", (ulong) field_no); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, "\n" +"InnoDB: but index has only %lu fields\n" +"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n" +"InnoDB: Run also CHECK TABLE ", + (ulong) dict_index_get_n_fields(index)); + ut_print_name(stderr, trx, index->table_name); + fprintf(stderr, "\n" +"InnoDB: n_fields = %lu, i = %lu, ptr %p\n", + (ulong) n_fields, (ulong) i, ptr); + return(NULL); + } + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len); + + upd_field = upd_get_nth_field(update, i); + + upd_field_set_field_no(upd_field, field_no, index, trx); + + if (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD) { + + upd_field->extern_storage = TRUE; + + len -= UNIV_EXTERN_STORAGE_FIELD; + } + + dfield_set_data(&(upd_field->new_val), field, len); + } + + *upd = update; + + return(ptr); +} + +/*********************************************************************** +Builds a partial row from an update undo log record. It contains the +columns which occur as ordering in any index of the table. */ + +byte* +trx_undo_rec_get_partial_row( +/*=========================*/ + /* out: pointer to remaining part of undo + record */ + byte* ptr, /* in: remaining part in update undo log + record of a suitable type, at the start of + the stored index columns; + NOTE that this copy of the undo log record must + be preserved as long as the partial row is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /* in: clustered index */ + dtuple_t** row, /* out, own: partial row */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dfield_t* dfield; + byte* field; + ulint len; + ulint field_no; + ulint col_no; + ulint row_len; + ulint total_len; + byte* start_ptr; + ulint i; + + ut_ad(index && ptr && row && heap); + + row_len = dict_table_get_n_cols(index->table); + + *row = dtuple_create(heap, row_len); + + dict_table_copy_types(*row, index->table); + + start_ptr = ptr; + + total_len = mach_read_from_2(ptr); + ptr += 2; + + for (i = 0;; i++) { + + if (ptr == start_ptr + total_len) { + + break; + } + + ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + col_no = dict_index_get_nth_col_no(index, field_no); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len); + + dfield = dtuple_get_nth_field(*row, col_no); + + dfield_set_data(dfield, field, len); + } + + return(ptr); +} + +/*************************************************************************** +Erases the unused undo log page end. */ +static +void +trx_undo_erase_page_end( +/*====================*/ + page_t* undo_page, /* in: undo page whose end to erase */ + mtr_t* mtr) /* in: mtr */ +{ + ulint first_free; + ulint i; + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + for (i = first_free; i < UNIV_PAGE_SIZE - FIL_PAGE_DATA_END; i++) { + undo_page[i] = 0xFF; + } + + mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr); +} + +/*************************************************************** +Parses a redo log record of erasing of an undo page end. */ + +byte* +trx_undo_parse_erase_page_end( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), /* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + if (page == NULL) { + + return(ptr); + } + + trx_undo_erase_page_end(page, mtr); + + return(ptr); +} + +/*************************************************************************** +Writes information to an undo log about an insert, update, or a delete marking +of a clustered index record. This information is used in a rollback of the +transaction and in consistent reads that must look to the history of this +transaction. */ + +ulint +trx_undo_report_row_operation( +/*==========================*/ + /* out: DB_SUCCESS or error code */ + ulint flags, /* in: if BTR_NO_UNDO_LOG_FLAG bit is + set, does nothing */ + ulint op_type, /* in: TRX_UNDO_INSERT_OP or + TRX_UNDO_MODIFY_OP */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: clustered index */ + dtuple_t* clust_entry, /* in: in the case of an insert, + index entry to insert into the + clustered index, otherwise NULL */ + upd_t* update, /* in: in the case of an update, + the update vector, otherwise NULL */ + ulint cmpl_info, /* in: compiler info on secondary + index updates */ + rec_t* rec, /* in: in case of an update or delete + marking, the record in the clustered + index, otherwise NULL */ + dulint* roll_ptr) /* out: rollback pointer to the + inserted undo log record, + ut_dulint_zero if BTR_NO_UNDO_LOG + flag was specified */ +{ + trx_t* trx; + trx_undo_t* undo; + page_t* undo_page; + ulint offset; + ulint page_no; + ibool is_insert; + trx_rseg_t* rseg; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_a(index->type & DICT_CLUSTERED); + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + + *roll_ptr = ut_dulint_zero; + + return(DB_SUCCESS); + } + + ut_ad(thr); + ut_ad((op_type != TRX_UNDO_INSERT_OP) + || (clust_entry && !update && !rec)); + + trx = thr_get_trx(thr); + rseg = trx->rseg; + + mutex_enter(&(trx->undo_mutex)); + + /* If the undo log is not assigned yet, assign one */ + + if (op_type == TRX_UNDO_INSERT_OP) { + + if (trx->insert_undo == NULL) { + + trx_undo_assign_undo(trx, TRX_UNDO_INSERT); + } + + undo = trx->insert_undo; + is_insert = TRUE; + } else { + ut_ad(op_type == TRX_UNDO_MODIFY_OP); + + if (trx->update_undo == NULL) { + + trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + + } + + undo = trx->update_undo; + is_insert = FALSE; + } + + if (undo == NULL) { + /* Did not succeed: out of space */ + mutex_exit(&(trx->undo_mutex)); + + return(DB_OUT_OF_FILE_SPACE); + } + + page_no = undo->last_page_no; + + mtr_start(&mtr); + + for (;;) { + undo_page = buf_page_get_gen(undo->space, page_no, + RW_X_LATCH, undo->guess_page, + BUF_GET, + __FILE__, __LINE__, + &mtr); + +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(undo_page, SYNC_TRX_UNDO_PAGE); +#endif /* UNIV_SYNC_DEBUG */ + + if (op_type == TRX_UNDO_INSERT_OP) { + offset = trx_undo_page_report_insert(undo_page, trx, + index, clust_entry, + &mtr); + } else { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + offset = trx_undo_page_report_modify(undo_page, trx, + index, rec, offsets, update, cmpl_info, &mtr); + } + + if (offset == 0) { + /* The record did not fit on the page. We erase the + end segment of the undo log page and write a log + record of it: this is to ensure that in the debug + version the replicate page constructed using the log + records stays identical to the original page */ + + trx_undo_erase_page_end(undo_page, &mtr); + } + + mtr_commit(&mtr); + + if (offset != 0) { + /* Success */ + + break; + } + + ut_ad(page_no == undo->last_page_no); + + /* We have to extend the undo log by one page */ + + mtr_start(&mtr); + + /* When we add a page to an undo log, this is analogous to + a pessimistic insert in a B-tree, and we must reserve the + counterpart of the tree latch, which is the rseg mutex. */ + + mutex_enter(&(rseg->mutex)); + + page_no = trx_undo_add_page(trx, undo, &mtr); + + mutex_exit(&(rseg->mutex)); + + if (page_no == FIL_NULL) { + /* Did not succeed: out of space */ + + mutex_exit(&(trx->undo_mutex)); + mtr_commit(&mtr); + if (heap) { + mem_heap_free(heap); + } + return(DB_OUT_OF_FILE_SPACE); + } + } + + undo->empty = FALSE; + undo->top_page_no = page_no; + undo->top_offset = offset; + undo->top_undo_no = trx->undo_no; + undo->guess_page = undo_page; + + UT_DULINT_INC(trx->undo_no); + + mutex_exit(&(trx->undo_mutex)); + + *roll_ptr = trx_undo_build_roll_ptr(is_insert, rseg->id, page_no, + offset); + if (heap) { + mem_heap_free(heap); + } + return(DB_SUCCESS); +} + +/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/ + +/********************************************************************** +Copies an undo record to heap. This function can be called if we know that +the undo log record exists. */ + +trx_undo_rec_t* +trx_undo_get_undo_rec_low( +/*======================*/ + /* out, own: copy of the record */ + dulint roll_ptr, /* in: roll pointer to record */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ + trx_undo_rec_t* undo_rec; + ulint rseg_id; + ulint page_no; + ulint offset; + page_t* undo_page; + trx_rseg_t* rseg; + ibool is_insert; + mtr_t mtr; + + trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, + &offset); + rseg = trx_rseg_get_on_id(rseg_id); + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(rseg->space, page_no, &mtr); + + undo_rec = trx_undo_rec_copy(undo_page + offset, heap); + + mtr_commit(&mtr); + + return(undo_rec); +} + +/********************************************************************** +Copies an undo record to heap. */ + +ulint +trx_undo_get_undo_rec( +/*==================*/ + /* out: DB_SUCCESS, or + DB_MISSING_HISTORY if the undo log + has been truncated and we cannot + fetch the old version; NOTE: the + caller must have latches on the + clustered index page and purge_view */ + dulint roll_ptr, /* in: roll pointer to record */ + dulint trx_id, /* in: id of the trx that generated + the roll pointer: it points to an + undo log of this transaction */ + trx_undo_rec_t** undo_rec, /* out, own: copy of the record */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!trx_purge_update_undo_must_exist(trx_id)) { + + /* It may be that the necessary undo log has already been + deleted */ + + return(DB_MISSING_HISTORY); + } + + *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); + + return(DB_SUCCESS); +} + +/*********************************************************************** +Build a previous version of a clustered index record. This function checks +that the caller has a latch on the index page of the clustered index record +and an s-latch on the purge_view. This guarantees that the stack of versions +is locked. */ + +ulint +trx_undo_prev_version_build( +/*========================*/ + /* out: DB_SUCCESS, or DB_MISSING_HISTORY if + the previous version is not >= purge_view, + which means that it may have been removed, + DB_ERROR if corrupted record */ + rec_t* index_rec,/* in: clustered index record in the + index tree */ + mtr_t* index_mtr __attribute__((unused)), + /* in: mtr which contains the latch to + index_rec page and purge_view */ + rec_t* rec, /* in: version of a clustered index record */ + dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ + mem_heap_t* heap, /* in: memory heap from which the memory + needed is allocated */ + rec_t** old_vers)/* out, own: previous version, or NULL if + rec is the first inserted version, or if + history data has been deleted */ +{ + trx_undo_rec_t* undo_rec; + dtuple_t* entry; + dulint rec_trx_id; + ulint type; + dulint undo_no; + dulint table_id; + dulint trx_id; + dulint roll_ptr; + dulint old_roll_ptr; + upd_t* update; + byte* ptr; + ulint info_bits; + ulint cmpl_info; + ibool dummy_extern; + byte* buf; + ulint err; +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mtr_memo_contains(index_mtr, buf_block_align(index_rec), + MTR_MEMO_PAGE_S_FIX) || + mtr_memo_contains(index_mtr, buf_block_align(index_rec), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!(index->type & DICT_CLUSTERED)) { + fprintf(stderr, "InnoDB: Error: trying to access" + " update undo rec for non-clustered index %s\n" + "InnoDB: Submit a detailed bug report to" + " http://bugs.mysql.com\n" + "InnoDB: index record ", index->name); + rec_print(stderr, index_rec, index); + fputs("\n" + "InnoDB: record version ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + return(DB_ERROR); + } + + roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); + old_roll_ptr = roll_ptr; + + *old_vers = NULL; + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + /* The record rec is the first inserted version */ + + return(DB_SUCCESS); + } + + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + + err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap); + + if (err != DB_SUCCESS) { + + return(err); + } + + ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + ptr = trx_undo_rec_skip_row_ref(ptr, index); + + ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, + roll_ptr, info_bits, NULL, heap, &update); + + if (ut_dulint_cmp(table_id, index->table->id) != 0) { + ptr = NULL; + + fprintf(stderr, +"InnoDB: Error: trying to access update undo rec for table %s\n" +"InnoDB: but the table id in the undo record is wrong\n" +"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n" +"InnoDB: Run also CHECK TABLE %s\n", + index->table_name, index->table_name); + } + + if (ptr == NULL) { + /* The record was corrupted, return an error; these printfs + should catch an elusive bug in row_vers_old_has_index_entry */ + + fprintf(stderr, + "InnoDB: table %s, index %s, n_uniq %lu\n" + "InnoDB: undo rec address %p, type %lu cmpl_info %lu\n" + "InnoDB: undo rec table id %lu %lu, index table id %lu %lu\n" + "InnoDB: dump of 150 bytes in undo rec: ", + index->table_name, index->name, + (ulong) dict_index_get_n_unique(index), + undo_rec, (ulong) type, (ulong) cmpl_info, + (ulong) ut_dulint_get_high(table_id), + (ulong) ut_dulint_get_low(table_id), + (ulong) ut_dulint_get_high(index->table->id), + (ulong) ut_dulint_get_low(index->table->id)); + ut_print_buf(stderr, undo_rec, 150); + fputs("\n" + "InnoDB: index record ", stderr); + rec_print(stderr, index_rec, index); + fputs("\n" + "InnoDB: record version ", stderr); + rec_print_new(stderr, rec, offsets); + fprintf(stderr, "\n" + "InnoDB: Record trx id %lu %lu, update rec trx id %lu %lu\n" + "InnoDB: Roll ptr in rec %lu %lu, in update rec %lu %lu\n", + (ulong) ut_dulint_get_high(rec_trx_id), + (ulong) ut_dulint_get_low(rec_trx_id), + (ulong) ut_dulint_get_high(trx_id), + (ulong) ut_dulint_get_low(trx_id), + (ulong) ut_dulint_get_high(old_roll_ptr), + (ulong) ut_dulint_get_low(old_roll_ptr), + (ulong) ut_dulint_get_high(roll_ptr), + (ulong) ut_dulint_get_low(roll_ptr)); + + trx_purge_sys_print(); + return(DB_ERROR); + } + + if (row_upd_changes_field_size_or_external(index, offsets, update)) { + ulint* ext_vect; + ulint n_ext_vect; + + /* We have to set the appropriate extern storage bits in the + old version of the record: the extern bits in rec for those + fields that update does NOT update, as well as the the bits for + those fields that update updates to become externally stored + fields. Store the info to ext_vect: */ + + ext_vect = mem_alloc(sizeof(ulint) + * rec_offs_n_fields(offsets)); + n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets, + update); + entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, + heap); + row_upd_index_replace_new_col_vals(entry, index, update, heap); + + buf = mem_heap_alloc(heap, + rec_get_converted_size(index, entry)); + + *old_vers = rec_convert_dtuple_to_rec(buf, index, entry); + + /* Now set the extern bits in the old version of the record */ + rec_set_field_extern_bits(*old_vers, index, + ext_vect, n_ext_vect, NULL); + mem_free(ext_vect); + } else { + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + *old_vers = rec_copy(buf, rec, offsets); + rec_offs_make_valid(*old_vers, index, offsets); + row_upd_rec_in_place(*old_vers, offsets, update); + } + + return(DB_SUCCESS); +} diff --git a/storage/innobase/trx/trx0roll.c b/storage/innobase/trx/trx0roll.c new file mode 100644 index 00000000000..69f7a99187f --- /dev/null +++ b/storage/innobase/trx/trx0roll.c @@ -0,0 +1,1344 @@ +/****************************************************** +Transaction rollback + +(c) 1996 Innobase Oy + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0roll.h" + +#ifdef UNIV_NONINL +#include "trx0roll.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0undo.h" +#include "trx0rec.h" +#include "que0que.h" +#include "usr0sess.h" +#include "srv0que.h" +#include "srv0start.h" +#include "row0undo.h" +#include "row0mysql.h" +#include "lock0lock.h" +#include "pars0pars.h" + +/* This many pages must be undone before a truncate is tried within rollback */ +#define TRX_ROLL_TRUNC_THRESHOLD 1 + +/* In crash recovery, the current trx to be rolled back */ +trx_t* trx_roll_crash_recv_trx = NULL; + +/* In crash recovery we set this to the undo n:o of the current trx to be +rolled back. Then we can print how many % the rollback has progressed. */ +ib_longlong trx_roll_max_undo_no; + +/* Auxiliary variable which tells the previous progress % we printed */ +ulint trx_roll_progress_printed_pct; + +/*********************************************************************** +Rollback a transaction used in MySQL. */ + +int +trx_general_rollback_for_mysql( +/*===========================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + ibool partial,/* in: TRUE if partial rollback requested */ + trx_savept_t* savept) /* in: pointer to savepoint undo number, if + partial rollback requested */ +{ +#ifndef UNIV_HOTBACKUP + mem_heap_t* heap; + que_thr_t* thr; + roll_node_t* roll_node; + + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + trx_start_if_not_started(trx); + + heap = mem_heap_create(512); + + roll_node = roll_node_create(heap); + + roll_node->partial = partial; + + if (partial) { + roll_node->savept = *savept; + } + + trx->error_state = DB_SUCCESS; + + thr = pars_complete_graph_for_exec(roll_node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + que_run_threads(thr); + + mutex_enter(&kernel_mutex); + + while (trx->que_state != TRX_QUE_RUNNING) { + + mutex_exit(&kernel_mutex); + + os_thread_sleep(100000); + + mutex_enter(&kernel_mutex); + } + + mutex_exit(&kernel_mutex); + + mem_heap_free(heap); + + ut_a(trx->error_state == DB_SUCCESS); + + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + return((int) trx->error_state); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ +} + +/*********************************************************************** +Rollback a transaction used in MySQL. */ + +int +trx_rollback_for_mysql( +/*===================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx) /* in: transaction handle */ +{ + int err; + + if (trx->conc_state == TRX_NOT_STARTED) { + + return(DB_SUCCESS); + } + + trx->op_info = "rollback"; + + err = trx_general_rollback_for_mysql(trx, FALSE, NULL); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************** +Rollback the latest SQL statement for MySQL. */ + +int +trx_rollback_last_sql_stat_for_mysql( +/*=================================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx) /* in: transaction handle */ +{ + int err; + + if (trx->conc_state == TRX_NOT_STARTED) { + + return(DB_SUCCESS); + } + + trx->op_info = "rollback of SQL statement"; + + err = trx_general_rollback_for_mysql(trx, TRUE, + &(trx->last_sql_stat_start)); + /* The following call should not be needed, but we play safe: */ + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************** +Frees savepoint structs. */ + +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /* in: transaction handle */ + trx_named_savept_t* savep) /* in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ +{ + trx_named_savept_t* next_savep; + + if (savep == NULL) { + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + } else { + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + while (savep != NULL) { + next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + mem_free(savep->name); + mem_free(savep); + + savep = next_savep; + } +} + +/*********************************************************************** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. */ + +ulint +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + const char* savepoint_name, /* in: savepoint name */ + ib_longlong* mysql_binlog_cache_pos) /* out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + trx_named_savept_t* savep; + ulint err; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep == NULL) { + + return(DB_NO_SAVEPOINT); + } + + if (trx->conc_state == TRX_NOT_STARTED) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: transaction has a savepoint ", stderr); + ut_print_name(stderr, trx, savep->name); + fputs(" though it is not started\n", stderr); + return(DB_ERROR); + } + + /* We can now free all savepoints strictly later than this one */ + + trx_roll_savepoints_free(trx, savep); + + *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; + + trx->op_info = "rollback to a savepoint"; + + err = trx_general_rollback_for_mysql(trx, TRUE, &(savep->savept)); + + /* Store the current undo_no of the transaction so that we know where + to roll back if we have to roll back the next SQL statement: */ + + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. */ + +ulint +trx_savepoint_for_mysql( +/*====================*/ + /* out: always DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + const char* savepoint_name, /* in: savepoint name */ + ib_longlong binlog_cache_pos) /* in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ +{ + trx_named_savept_t* savep; + + ut_a(trx); + ut_a(savepoint_name); + + trx_start_if_not_started(trx); + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep) { + /* There is a savepoint with the same name: free that */ + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + + mem_free(savep->name); + mem_free(savep); + } + + /* Create a new savepoint and add it as the last in the list */ + + savep = mem_alloc(sizeof(trx_named_savept_t)); + + savep->name = mem_strdup(savepoint_name); + + savep->savept = trx_savept_take(trx); + + savep->mysql_binlog_cache_pos = binlog_cache_pos; + + UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep); + + return(DB_SUCCESS); +} + +/*********************************************************************** +Releases a named savepoint. Savepoints which +were set after this savepoint are deleted. */ + +ulint +trx_release_savepoint_for_mysql( +/*============================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + const char* savepoint_name) /* in: savepoint name */ +{ + trx_named_savept_t* savep; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep == NULL) { + + return(DB_NO_SAVEPOINT); + } + + /* We can now free all savepoints strictly later than this one */ + + trx_roll_savepoints_free(trx, savep); + + /* Now we can free this savepoint too */ + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + + mem_free(savep->name); + mem_free(savep); + + return(DB_SUCCESS); +} + +/*********************************************************************** +Returns a transaction savepoint taken at this point in time. */ + +trx_savept_t +trx_savept_take( +/*============*/ + /* out: savepoint */ + trx_t* trx) /* in: transaction */ +{ + trx_savept_t savept; + + savept.least_undo_no = trx->undo_no; + + return(savept); +} + +/*********************************************************************** +Rollback or clean up transactions which have no user session. If the +transaction already was committed, then we clean up a possible insert +undo log. If the transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. */ + +#ifndef __WIN__ +void* +#else +ulint +#endif +trx_rollback_or_clean_all_without_sess( +/*===================================*/ + /* out: a dummy parameter */ + void* arg __attribute__((unused))) + /* in: a dummy parameter required by + os_thread_create */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + roll_node_t* roll_node; + trx_t* trx; + dict_table_t* table; + ib_longlong rows_to_undo; + const char* unit = ""; + int err; + + mutex_enter(&kernel_mutex); + + /* Open a dummy session */ + + if (!trx_dummy_sess) { + trx_dummy_sess = sess_open(); + } + + mutex_exit(&kernel_mutex); + + if (UT_LIST_GET_FIRST(trx_sys->trx_list)) { + + fprintf(stderr, +"InnoDB: Starting in background the rollback of uncommitted transactions\n"); + } else { + goto leave_function; + } +loop: + heap = mem_heap_create(512); + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + if ((trx->sess || (trx->conc_state == TRX_NOT_STARTED))) { + trx = UT_LIST_GET_NEXT(trx_list, trx); + } else if (trx->conc_state == TRX_PREPARED) { + + trx->sess = trx_dummy_sess; + trx = UT_LIST_GET_NEXT(trx_list, trx); + } else { + break; + } + } + + mutex_exit(&kernel_mutex); + + if (trx == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Rollback of non-prepared transactions completed\n"); + + mem_heap_free(heap); + + goto leave_function; + } + + trx->sess = trx_dummy_sess; + + if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) { + fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n", + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); + + trx_cleanup_at_db_startup(trx); + + mem_heap_free(heap); + + goto loop; + } + + fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + + roll_node = roll_node_create(heap); + + thr->child = roll_node; + roll_node->common.parent = thr; + + mutex_enter(&kernel_mutex); + + trx->graph = fork; + + ut_a(thr == que_fork_start_command(fork)); + + trx_roll_crash_recv_trx = trx; + trx_roll_max_undo_no = ut_conv_dulint_to_longlong(trx->undo_no); + trx_roll_progress_printed_pct = 0; + rows_to_undo = trx_roll_max_undo_no; + + if (rows_to_undo > 1000000000) { + rows_to_undo = rows_to_undo / 1000000; + unit = "M"; + } + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Rolling back trx with id %lu %lu, %lu%s rows to undo\n", + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id), + (ulong) rows_to_undo, unit); + mutex_exit(&kernel_mutex); + + trx->mysql_thread_id = os_thread_get_curr_id(); + + trx->mysql_process_no = os_proc_get_number(); + + if (trx->dict_operation) { + row_mysql_lock_data_dictionary(trx); + } + + que_run_threads(thr); + + mutex_enter(&kernel_mutex); + + while (trx->que_state != TRX_QUE_RUNNING) { + + mutex_exit(&kernel_mutex); + + fprintf(stderr, + "InnoDB: Waiting for rollback of trx id %lu to end\n", + (ulong) ut_dulint_get_low(trx->id)); + os_thread_sleep(100000); + + mutex_enter(&kernel_mutex); + } + + mutex_exit(&kernel_mutex); + + if (trx->dict_operation) { + /* If the transaction was for a dictionary operation, we + drop the relevant table, if it still exists */ + + fprintf(stderr, +"InnoDB: Dropping table with id %lu %lu in recovery if it exists\n", + (ulong) ut_dulint_get_high(trx->table_id), + (ulong) ut_dulint_get_low(trx->table_id)); + + table = dict_table_get_on_id_low(trx->table_id, trx); + + if (table) { + fputs("InnoDB: Table found: dropping table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs(" in recovery\n", stderr); + + err = row_drop_table_for_mysql(table->name, trx, TRUE); + + ut_a(err == (int) DB_SUCCESS); + } + } + + if (trx->dict_operation) { + row_mysql_unlock_data_dictionary(trx); + } + + fprintf(stderr, "\nInnoDB: Rolling back of trx id %lu %lu completed\n", + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); + mem_heap_free(heap); + + trx_roll_crash_recv_trx = NULL; + + goto loop; + +leave_function: + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + + os_thread_exit(NULL); + + /* The following is dummy code to keep the compiler happy: */ + +#ifndef __WIN__ + return(NULL); +#else + return(0); +#endif +} + +/*********************************************************************** +Creates an undo number array. */ + +trx_undo_arr_t* +trx_undo_arr_create(void) +/*=====================*/ +{ + trx_undo_arr_t* arr; + mem_heap_t* heap; + ulint i; + + heap = mem_heap_create(1024); + + arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t)); + + arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t) + * UNIV_MAX_PARALLELISM); + arr->n_cells = UNIV_MAX_PARALLELISM; + arr->n_used = 0; + + arr->heap = heap; + + for (i = 0; i < UNIV_MAX_PARALLELISM; i++) { + + (trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE; + } + + return(arr); +} + +/*********************************************************************** +Frees an undo number array. */ + +void +trx_undo_arr_free( +/*==============*/ + trx_undo_arr_t* arr) /* in: undo number array */ +{ + ut_ad(arr->n_used == 0); + + mem_heap_free(arr->heap); +} + +/*********************************************************************** +Stores info of an undo log record to the array if it is not stored yet. */ +static +ibool +trx_undo_arr_store_info( +/*====================*/ + /* out: FALSE if the record already existed in the + array */ + trx_t* trx, /* in: transaction */ + dulint undo_no)/* in: undo number */ +{ + trx_undo_inf_t* cell; + trx_undo_inf_t* stored_here; + trx_undo_arr_t* arr; + ulint n_used; + ulint n; + ulint i; + + n = 0; + arr = trx->undo_no_arr; + n_used = arr->n_used; + stored_here = NULL; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (!cell->in_use) { + if (!stored_here) { + /* Not in use, we may store here */ + cell->undo_no = undo_no; + cell->in_use = TRUE; + + arr->n_used++; + + stored_here = cell; + } + } else { + n++; + + if (0 == ut_dulint_cmp(cell->undo_no, undo_no)) { + + if (stored_here) { + stored_here->in_use = FALSE; + ut_ad(arr->n_used > 0); + arr->n_used--; + } + + ut_ad(arr->n_used == n_used); + + return(FALSE); + } + } + + if (n == n_used && stored_here) { + + ut_ad(arr->n_used == 1 + n_used); + + return(TRUE); + } + } +} + +/*********************************************************************** +Removes an undo number from the array. */ +static +void +trx_undo_arr_remove_info( +/*=====================*/ + trx_undo_arr_t* arr, /* in: undo number array */ + dulint undo_no)/* in: undo number */ +{ + trx_undo_inf_t* cell; + ulint n_used; + ulint n; + ulint i; + + n_used = arr->n_used; + n = 0; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use + && 0 == ut_dulint_cmp(cell->undo_no, undo_no)) { + + cell->in_use = FALSE; + + ut_ad(arr->n_used > 0); + + arr->n_used--; + + return; + } + } +} + +/*********************************************************************** +Gets the biggest undo number in an array. */ +static +dulint +trx_undo_arr_get_biggest( +/*=====================*/ + /* out: biggest value, ut_dulint_zero if + the array is empty */ + trx_undo_arr_t* arr) /* in: undo number array */ +{ + trx_undo_inf_t* cell; + ulint n_used; + dulint biggest; + ulint n; + ulint i; + + n = 0; + n_used = arr->n_used; + biggest = ut_dulint_zero; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use) { + n++; + if (ut_dulint_cmp(cell->undo_no, biggest) > 0) { + + biggest = cell->undo_no; + } + } + + if (n == n_used) { + return(biggest); + } + } +} + +/*************************************************************************** +Tries truncate the undo logs. */ + +void +trx_roll_try_truncate( +/*==================*/ + trx_t* trx) /* in: transaction */ +{ + trx_undo_arr_t* arr; + dulint limit; + dulint biggest; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&((trx->rseg)->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + trx->pages_undone = 0; + + arr = trx->undo_no_arr; + + limit = trx->undo_no; + + if (arr->n_used > 0) { + biggest = trx_undo_arr_get_biggest(arr); + + if (ut_dulint_cmp(biggest, limit) >= 0) { + + limit = ut_dulint_add(biggest, 1); + } + } + + if (trx->insert_undo) { + trx_undo_truncate_end(trx, trx->insert_undo, limit); + } + + if (trx->update_undo) { + trx_undo_truncate_end(trx, trx->update_undo, limit); + } +} + +/*************************************************************************** +Pops the topmost undo log record in a single undo log and updates the info +about the topmost record in the undo log memory struct. */ +static +trx_undo_rec_t* +trx_roll_pop_top_rec( +/*=================*/ + /* out: undo log record, the page s-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* undo_page; + ulint offset; + trx_undo_rec_t* prev_rec; + page_t* prev_rec_page; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(trx->undo_mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + undo_page = trx_undo_page_get_s_latched(undo->space, + undo->top_page_no, mtr); + offset = undo->top_offset; + +/* fprintf(stderr, "Thread %lu undoing trx %lu undo record %lu\n", + os_thread_get_curr_id(), ut_dulint_get_low(trx->id), + ut_dulint_get_low(undo->top_undo_no)); */ + + prev_rec = trx_undo_get_prev_rec(undo_page + offset, + undo->hdr_page_no, undo->hdr_offset, + mtr); + if (prev_rec == NULL) { + + undo->empty = TRUE; + } else { + prev_rec_page = buf_frame_align(prev_rec); + + if (prev_rec_page != undo_page) { + + trx->pages_undone++; + } + + undo->top_page_no = buf_frame_get_page_no(prev_rec_page); + undo->top_offset = prev_rec - prev_rec_page; + undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); + } + + return(undo_page + offset); +} + +/************************************************************************ +Pops the topmost record when the two undo logs of a transaction are seen +as a single stack of records ordered by their undo numbers. Inserts the +undo number of the popped undo record to the array of currently processed +undo numbers in the transaction. When the query thread finishes processing +of this undo record, it must be released with trx_undo_rec_release. */ + +trx_undo_rec_t* +trx_roll_pop_top_rec_of_trx( +/*========================*/ + /* out: undo log record copied to heap, NULL + if none left, or if the undo number of the + top record would be less than the limit */ + trx_t* trx, /* in: transaction */ + dulint limit, /* in: least undo number we need */ + dulint* roll_ptr,/* out: roll pointer to undo record */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ + trx_undo_t* undo; + trx_undo_t* ins_undo; + trx_undo_t* upd_undo; + trx_undo_rec_t* undo_rec; + trx_undo_rec_t* undo_rec_copy; + dulint undo_no; + ibool is_insert; + trx_rseg_t* rseg; + ulint progress_pct; + mtr_t mtr; + + rseg = trx->rseg; +try_again: + mutex_enter(&(trx->undo_mutex)); + + if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) { + mutex_enter(&(rseg->mutex)); + + trx_roll_try_truncate(trx); + + mutex_exit(&(rseg->mutex)); + } + + ins_undo = trx->insert_undo; + upd_undo = trx->update_undo; + + if (!ins_undo || ins_undo->empty) { + undo = upd_undo; + } else if (!upd_undo || upd_undo->empty) { + undo = ins_undo; + } else if (ut_dulint_cmp(upd_undo->top_undo_no, + ins_undo->top_undo_no) > 0) { + undo = upd_undo; + } else { + undo = ins_undo; + } + + if (!undo || undo->empty + || (ut_dulint_cmp(limit, undo->top_undo_no) > 0)) { + + if ((trx->undo_no_arr)->n_used == 0) { + /* Rollback is ending */ + + mutex_enter(&(rseg->mutex)); + + trx_roll_try_truncate(trx); + + mutex_exit(&(rseg->mutex)); + } + + mutex_exit(&(trx->undo_mutex)); + + return(NULL); + } + + if (undo == ins_undo) { + is_insert = TRUE; + } else { + is_insert = FALSE; + } + + *roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id, + undo->top_page_no, undo->top_offset); + mtr_start(&mtr); + + undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr); + + undo_no = trx_undo_rec_get_undo_no(undo_rec); + + ut_ad(ut_dulint_cmp(ut_dulint_add(undo_no, 1), trx->undo_no) == 0); + + /* We print rollback progress info if we are in a crash recovery + and the transaction has at least 1000 row operations to undo. */ + + if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) { + + progress_pct = 100 - (ulint) + ((ut_conv_dulint_to_longlong(undo_no) * 100) + / trx_roll_max_undo_no); + if (progress_pct != trx_roll_progress_printed_pct) { + if (trx_roll_progress_printed_pct == 0) { + fprintf(stderr, +"\nInnoDB: Progress in percents: %lu", (ulong) progress_pct); + } else { + fprintf(stderr, + " %lu", (ulong) progress_pct); + } + fflush(stderr); + trx_roll_progress_printed_pct = progress_pct; + } + } + + trx->undo_no = undo_no; + + if (!trx_undo_arr_store_info(trx, undo_no)) { + /* A query thread is already processing this undo log record */ + + mutex_exit(&(trx->undo_mutex)); + + mtr_commit(&mtr); + + goto try_again; + } + + undo_rec_copy = trx_undo_rec_copy(undo_rec, heap); + + mutex_exit(&(trx->undo_mutex)); + + mtr_commit(&mtr); + + return(undo_rec_copy); +} + +/************************************************************************ +Reserves an undo log record for a query thread to undo. This should be +called if the query thread gets the undo log record not using the pop +function above. */ + +ibool +trx_undo_rec_reserve( +/*=================*/ + /* out: TRUE if succeeded */ + trx_t* trx, /* in: transaction */ + dulint undo_no)/* in: undo number of the record */ +{ + ibool ret; + + mutex_enter(&(trx->undo_mutex)); + + ret = trx_undo_arr_store_info(trx, undo_no); + + mutex_exit(&(trx->undo_mutex)); + + return(ret); +} + +/*********************************************************************** +Releases a reserved undo record. */ + +void +trx_undo_rec_release( +/*=================*/ + trx_t* trx, /* in: transaction */ + dulint undo_no)/* in: undo number */ +{ + trx_undo_arr_t* arr; + + mutex_enter(&(trx->undo_mutex)); + + arr = trx->undo_no_arr; + + trx_undo_arr_remove_info(arr, undo_no); + + mutex_exit(&(trx->undo_mutex)); +} + +/************************************************************************* +Starts a rollback operation. */ + +void +trx_rollback( +/*=========*/ + trx_t* trx, /* in: transaction */ + trx_sig_t* sig, /* in: signal starting the rollback */ + que_thr_t** next_thr)/* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the passed value is + NULL, the parameter is ignored */ +{ + que_t* roll_graph; + que_thr_t* thr; +/* que_thr_t* thr2; */ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0)); + + /* Initialize the rollback field in the transaction */ + + if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { + + trx->roll_limit = ut_dulint_zero; + + } else if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) { + + trx->roll_limit = (sig->savept).least_undo_no; + + } else if (sig->type == TRX_SIG_ERROR_OCCURRED) { + + trx->roll_limit = trx->last_sql_stat_start.least_undo_no; + } else { + ut_error; + } + + ut_a(ut_dulint_cmp(trx->roll_limit, trx->undo_no) <= 0); + + trx->pages_undone = 0; + + if (trx->undo_no_arr == NULL) { + trx->undo_no_arr = trx_undo_arr_create(); + } + + /* Build a 'query' graph which will perform the undo operations */ + + roll_graph = trx_roll_graph_build(trx); + + trx->graph = roll_graph; + trx->que_state = TRX_QUE_ROLLING_BACK; + + thr = que_fork_start_command(roll_graph); + + ut_ad(thr); + +/* thr2 = que_fork_start_command(roll_graph); + + ut_ad(thr2); */ + + if (next_thr && (*next_thr == NULL)) { + *next_thr = thr; +/* srv_que_task_enqueue_low(thr2); */ + } else { + srv_que_task_enqueue_low(thr); +/* srv_que_task_enqueue_low(thr2); */ + } +} + +/******************************************************************** +Builds an undo 'query' graph for a transaction. The actual rollback is +performed by executing this query graph like a query subprocedure call. +The reply about the completion of the rollback will be sent by this +graph. */ + +que_t* +trx_roll_graph_build( +/*=================*/ + /* out, own: the query graph */ + trx_t* trx) /* in: trx handle */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; +/* que_thr_t* thr2; */ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + heap = mem_heap_create(512); + fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); +/* thr2 = que_thr_create(fork, heap); */ + + thr->child = row_undo_node_create(trx, thr, heap); +/* thr2->child = row_undo_node_create(trx, thr2, heap); */ + + return(fork); +} + +/************************************************************************* +Finishes error processing after the necessary partial rollback has been +done. */ +static +void +trx_finish_error_processing( +/*========================*/ + trx_t* trx) /* in: transaction */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + sig = UT_LIST_GET_FIRST(trx->signals); + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_ERROR_OCCURRED) { + + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/************************************************************************* +Finishes a partial rollback operation. */ +static +void +trx_finish_partial_rollback_off_kernel( +/*===================================*/ + trx_t* trx, /* in: transaction */ + que_thr_t** next_thr)/* in/out: next query thread to run; + if the value which is passed in is a pointer + to a NULL pointer, then the calling function + can start running a new query thread; if this + parameter is NULL, it is ignored */ +{ + trx_sig_t* sig; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + sig = UT_LIST_GET_FIRST(trx->signals); + + /* Remove the signal from the signal queue and send reply message + to it */ + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + + trx->que_state = TRX_QUE_RUNNING; +} + +/******************************************************************** +Finishes a transaction rollback. */ + +void +trx_finish_rollback_off_kernel( +/*===========================*/ + que_t* graph, /* in: undo graph which can now be freed */ + trx_t* trx, /* in: transaction */ + que_thr_t** next_thr)/* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if this parameter is + NULL, it is ignored */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0); + + /* Free the memory reserved by the undo graph */ + que_graph_free(graph); + + sig = UT_LIST_GET_FIRST(trx->signals); + + if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) { + + trx_finish_partial_rollback_off_kernel(trx, next_thr); + + return; + + } else if (sig->type == TRX_SIG_ERROR_OCCURRED) { + + trx_finish_error_processing(trx); + + return; + } + + if (lock_print_waits) { + fprintf(stderr, "Trx %lu rollback finished\n", + (ulong) ut_dulint_get_low(trx->id)); + } + + trx_commit_off_kernel(trx); + + /* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and + send reply messages to them */ + + trx->que_state = TRX_QUE_RUNNING; + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { + + trx_sig_reply(sig, next_thr); + + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } +} + +/************************************************************************* +Creates a rollback command node struct. */ + +roll_node_t* +roll_node_create( +/*=============*/ + /* out, own: rollback node struct */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + roll_node_t* node; + + node = mem_heap_alloc(heap, sizeof(roll_node_t)); + node->common.type = QUE_NODE_ROLLBACK; + node->state = ROLL_NODE_SEND; + + node->partial = FALSE; + + return(node); +} + +/*************************************************************** +Performs an execution step for a rollback command node in a query graph. */ + +que_thr_t* +trx_rollback_step( +/*==============*/ + /* out: query thread to run next, or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + roll_node_t* node; + ibool success; + ulint sig_no; + trx_savept_t* savept; + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = ROLL_NODE_SEND; + } + + if (node->state == ROLL_NODE_SEND) { + mutex_enter(&kernel_mutex); + + node->state = ROLL_NODE_WAIT; + + if (node->partial) { + sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT; + savept = &(node->savept); + } else { + sig_no = TRX_SIG_TOTAL_ROLLBACK; + savept = NULL; + } + + /* Send a rollback signal to the transaction */ + + success = trx_sig_send(thr_get_trx(thr), + sig_no, TRX_SIG_SELF, + thr, savept, NULL); + + thr->state = QUE_THR_SIG_REPLY_WAIT; + + mutex_exit(&kernel_mutex); + + if (!success) { + /* Error in delivering the rollback signal */ + que_thr_handle_error(thr, DB_ERROR, NULL, 0); + } + + return(NULL); + } + + ut_ad(node->state == ROLL_NODE_WAIT); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/innobase/trx/trx0rseg.c b/storage/innobase/trx/trx0rseg.c new file mode 100644 index 00000000000..a01d4bb835d --- /dev/null +++ b/storage/innobase/trx/trx0rseg.c @@ -0,0 +1,261 @@ +/****************************************************** +Rollback segment + +(c) 1996 Innobase Oy + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rseg.h" + +#ifdef UNIV_NONINL +#include "trx0rseg.ic" +#endif + +#include "trx0undo.h" +#include "fut0lst.h" +#include "srv0srv.h" +#include "trx0purge.h" + +/********************************************************************** +Looks for a rollback segment, based on the rollback segment id. */ + +trx_rseg_t* +trx_rseg_get_on_id( +/*===============*/ + /* out: rollback segment */ + ulint id) /* in: rollback segment id */ +{ + trx_rseg_t* rseg; + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + ut_ad(rseg); + + while (rseg->id != id) { + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + ut_ad(rseg); + } + + return(rseg); +} + +/******************************************************************** +Creates a rollback segment header. This function is called only when +a new rollback segment is created in the database. */ + +ulint +trx_rseg_header_create( +/*===================*/ + /* out: page number of the created segment, + FIL_NULL if fail */ + ulint space, /* in: space id */ + ulint max_size, /* in: max size in pages */ + ulint* slot_no, /* out: rseg id == slot number in trx sys */ + mtr_t* mtr) /* in: mtr */ +{ + ulint page_no; + trx_rsegf_t* rsegf; + trx_sysf_t* sys_header; + ulint i; + page_t* page; + + ut_ad(mtr); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), + MTR_MEMO_X_LOCK)); + sys_header = trx_sysf_get(mtr); + + *slot_no = trx_sysf_rseg_find_free(mtr); + + if (*slot_no == ULINT_UNDEFINED) { + + return(FIL_NULL); + } + + /* Allocate a new file segment for the rollback segment */ + page = fseg_create(space, 0, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr); + + if (page == NULL) { + /* No space left */ + + return(FIL_NULL); + } + +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(page, SYNC_RSEG_HEADER_NEW); +#endif /* UNIV_SYNC_DEBUG */ + + page_no = buf_frame_get_page_no(page); + + /* Get the rollback segment file page */ + rsegf = trx_rsegf_get_new(space, page_no, mtr); + + /* Initialize max size field */ + mlog_write_ulint(rsegf + TRX_RSEG_MAX_SIZE, max_size, MLOG_4BYTES, mtr); + + /* Initialize the history list */ + + mlog_write_ulint(rsegf + TRX_RSEG_HISTORY_SIZE, 0, MLOG_4BYTES, mtr); + flst_init(rsegf + TRX_RSEG_HISTORY, mtr); + + /* Reset the undo log slots */ + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + + trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr); + } + + /* Add the rollback segment info to the free slot in the trx system + header */ + + trx_sysf_rseg_set_space(sys_header, *slot_no, space, mtr); + trx_sysf_rseg_set_page_no(sys_header, *slot_no, page_no, mtr); + + return(page_no); +} + +/*************************************************************************** +Creates and initializes a rollback segment object. The values for the +fields are read from the header. The object is inserted to the rseg +list of the trx system object and a pointer is inserted in the rseg +array in the trx system object. */ +static +trx_rseg_t* +trx_rseg_mem_create( +/*================*/ + /* out, own: rollback segment object */ + ulint id, /* in: rollback segment id */ + ulint space, /* in: space where the segment placed */ + ulint page_no, /* in: page number of the segment header */ + mtr_t* mtr) /* in: mtr */ +{ + trx_rsegf_t* rseg_header; + trx_rseg_t* rseg; + trx_ulogf_t* undo_log_hdr; + fil_addr_t node_addr; + ulint sum_of_undo_sizes; + ulint len; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + rseg = mem_alloc(sizeof(trx_rseg_t)); + + rseg->id = id; + rseg->space = space; + rseg->page_no = page_no; + + mutex_create(&(rseg->mutex)); + mutex_set_level(&(rseg->mutex), SYNC_RSEG); + + UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg); + + trx_sys_set_nth_rseg(trx_sys, id, rseg); + + rseg_header = trx_rsegf_get_new(space, page_no, mtr); + + rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE, + MLOG_4BYTES, mtr); + + /* Initialize the undo log lists according to the rseg header */ + + sum_of_undo_sizes = trx_undo_lists_init(rseg); + + rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr) + + 1 + sum_of_undo_sizes; + + len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr); + if (len > 0) { + trx_sys->rseg_history_len += len; + + node_addr = trx_purge_get_log_from_hist( + flst_get_last(rseg_header + TRX_RSEG_HISTORY, + mtr)); + rseg->last_page_no = node_addr.page; + rseg->last_offset = node_addr.boffset; + + undo_log_hdr = trx_undo_page_get(rseg->space, node_addr.page, + mtr) + + node_addr.boffset; + + rseg->last_trx_no = mtr_read_dulint( + undo_log_hdr + TRX_UNDO_TRX_NO, mtr); + rseg->last_del_marks = mtr_read_ulint( + undo_log_hdr + TRX_UNDO_DEL_MARKS, + MLOG_2BYTES, mtr); + } else { + rseg->last_page_no = FIL_NULL; + } + + return(rseg); +} + +/************************************************************************* +Creates the memory copies for rollback segments and initializes the +rseg list and array in trx_sys at a database startup. */ + +void +trx_rseg_list_and_array_init( +/*=========================*/ + trx_sysf_t* sys_header, /* in: trx system header */ + mtr_t* mtr) /* in: mtr */ +{ + ulint i; + ulint page_no; + ulint space; + + UT_LIST_INIT(trx_sys->rseg_list); + + trx_sys->rseg_history_len = 0; + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no == FIL_NULL) { + + trx_sys_set_nth_rseg(trx_sys, i, NULL); + } else { + space = trx_sysf_rseg_get_space(sys_header, i, mtr); + + trx_rseg_mem_create(i, space, page_no, mtr); + } + } +} + +/******************************************************************** +Creates a new rollback segment to the database. */ + +trx_rseg_t* +trx_rseg_create( +/*============*/ + /* out: the created segment object, NULL if + fail */ + ulint space, /* in: space id */ + ulint max_size, /* in: max size in pages */ + ulint* id, /* out: rseg id */ + mtr_t* mtr) /* in: mtr */ +{ + ulint page_no; + trx_rseg_t* rseg; + + mtr_x_lock(fil_space_get_latch(space), mtr); + mutex_enter(&kernel_mutex); + + page_no = trx_rseg_header_create(space, max_size, id, mtr); + + if (page_no == FIL_NULL) { + + mutex_exit(&kernel_mutex); + return(NULL); + } + + rseg = trx_rseg_mem_create(*id, space, page_no, mtr); + + mutex_exit(&kernel_mutex); + + return(rseg); +} diff --git a/storage/innobase/trx/trx0sys.c b/storage/innobase/trx/trx0sys.c new file mode 100644 index 00000000000..68fe6d5079a --- /dev/null +++ b/storage/innobase/trx/trx0sys.c @@ -0,0 +1,964 @@ +/****************************************************** +Transaction system + +(c) 1996 Innobase Oy + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0sys.h" + +#ifdef UNIV_NONINL +#include "trx0sys.ic" +#endif + +#include "fsp0fsp.h" +#include "mtr0mtr.h" +#include "trx0trx.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "srv0srv.h" +#include "trx0purge.h" +#include "log0log.h" +#include "os0file.h" + +/* The transaction system */ +trx_sys_t* trx_sys = NULL; +trx_doublewrite_t* trx_doublewrite = NULL; + +/* The following is set to TRUE when we are upgrading from the old format data +files to the new >= 4.1.x format multiple tablespaces format data files */ + +ibool trx_doublewrite_must_reset_space_ids = FALSE; + +/* The following is TRUE when we are using the database in the new format, +i.e., we have successfully upgraded, or have created a new database +installation */ + +ibool trx_sys_multiple_tablespace_format = FALSE; + +/* In a MySQL replication slave, in crash recovery we store the master log +file name and position here. We have successfully got the updates to InnoDB +up to this position. If .._pos is -1, it means no crash recovery was needed, +or there was no master log position info inside InnoDB. */ + +char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +ib_longlong trx_sys_mysql_master_log_pos = -1; + +/* If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. If .._pos is -1, it means there was no binlog position info inside +InnoDB. */ + +char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +ib_longlong trx_sys_mysql_bin_log_pos = -1; + + +/******************************************************************** +Determines if a page number is located inside the doublewrite buffer. */ + +ibool +trx_doublewrite_page_inside( +/*========================*/ + /* out: TRUE if the location is inside + the two blocks of the doublewrite buffer */ + ulint page_no) /* in: page number */ +{ + if (trx_doublewrite == NULL) { + + return(FALSE); + } + + if (page_no >= trx_doublewrite->block1 + && page_no < trx_doublewrite->block1 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + if (page_no >= trx_doublewrite->block2 + && page_no < trx_doublewrite->block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + return(FALSE); +} + +/******************************************************************** +Creates or initialializes the doublewrite buffer at a database start. */ +static +void +trx_doublewrite_init( +/*=================*/ + byte* doublewrite) /* in: pointer to the doublewrite buf + header on trx sys page */ +{ + trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t)); + + /* Since we now start to use the doublewrite buffer, no need to call + fsync() after every write to a data file */ + + os_do_not_call_flush_at_each_write = TRUE; + + mutex_create(&(trx_doublewrite->mutex)); + mutex_set_level(&(trx_doublewrite->mutex), SYNC_DOUBLEWRITE); + + trx_doublewrite->first_free = 0; + + trx_doublewrite->block1 = mach_read_from_4( + doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1); + trx_doublewrite->block2 = mach_read_from_4( + doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2); + trx_doublewrite->write_buf_unaligned = + ut_malloc( + (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + * UNIV_PAGE_SIZE); + + trx_doublewrite->write_buf = ut_align( + trx_doublewrite->write_buf_unaligned, + UNIV_PAGE_SIZE); + trx_doublewrite->buf_block_arr = mem_alloc( + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + * sizeof(void*)); +} + +/******************************************************************** +Frees the doublewrite buffer. */ +static +void +trx_doublewrite_free(void) +/*======================*/ +{ + mutex_free(&(trx_doublewrite->mutex)); + + mem_free(trx_doublewrite->buf_block_arr); + ut_free(trx_doublewrite->write_buf_unaligned); + + mem_free(trx_doublewrite); + trx_doublewrite = NULL; +} + +/******************************************************************** +Marks the trx sys header when we have successfully upgraded to the >= 4.1.x +multiple tablespace format. */ + +void +trx_sys_mark_upgraded_to_multiple_tablespaces(void) +/*===============================================*/ +{ + page_t* page; + byte* doublewrite; + mtr_t mtr; + + /* We upgraded to 4.1.x and reset the space id fields in the + doublewrite buffer. Let us mark to the trx_sys header that the upgrade + has been done. */ + + mtr_start(&mtr); + + page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ + + doublewrite = page + TRX_SYS_DOUBLEWRITE; + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(ut_dulint_max, TRUE); + + trx_sys_multiple_tablespace_format = TRUE; +} + +/******************************************************************** +Creates the doublewrite buffer to a new InnoDB installation. The header of the +doublewrite buffer is placed on the trx system header page. */ + +void +trx_sys_create_doublewrite_buf(void) +/*================================*/ +{ + page_t* page; + page_t* page2; + page_t* new_page; + byte* doublewrite; + byte* fseg_header; + ulint page_no; + ulint prev_page_no; + ulint i; + mtr_t mtr; + + if (trx_doublewrite) { + /* Already inited */ + + return; + } + +start_again: + mtr_start(&mtr); + + page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ + + doublewrite = page + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has already been created: + just read in some numbers */ + + trx_doublewrite_init(doublewrite); + + mtr_commit(&mtr); + } else { + fprintf(stderr, + "InnoDB: Doublewrite buffer not found: creating new\n"); + + if (buf_pool_get_curr_size() < + (2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2 + 100) + * UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer: you must\n" + "InnoDB: increase your buffer pool size.\n" + "InnoDB: Cannot continue operation.\n"); + + exit(1); + } + + page2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); + + /* fseg_create acquires a second latch on the page, + therefore we must declare it: */ + +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(page2, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ + + if (page2 == NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer: you must\n" + "InnoDB: increase your tablespace size.\n" + "InnoDB: Cannot continue operation.\n"); + + /* We exit without committing the mtr to prevent + its modifications to the database getting to disk */ + + exit(1); + } + + fseg_header = page + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_FSEG; + prev_page_no = 0; + + for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2; i++) { + page_no = fseg_alloc_free_page(fseg_header, + prev_page_no + 1, + FSP_UP, &mtr); + if (page_no == FIL_NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer: you must\n" + "InnoDB: increase your tablespace size.\n" + "InnoDB: Cannot continue operation.\n"); + + exit(1); + } + + /* We read the allocated pages to the buffer pool; + when they are written to disk in a flush, the space + id and page number fields are also written to the + pages. When we at database startup read pages + from the doublewrite buffer, we know that if the + space id and page number in them are the same as + the page position in the tablespace, then the page + has not been written to in doublewrite. */ + + new_page = buf_page_get(TRX_SYS_SPACE, page_no, + RW_X_LATCH, &mtr); +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(new_page, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ + + /* Make a dummy change to the page to ensure it will + be written to disk in a flush */ + + mlog_write_ulint(new_page + FIL_PAGE_DATA, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + + if (i == FSP_EXTENT_SIZE / 2) { + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + } else if (i == FSP_EXTENT_SIZE / 2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + } else if (i > FSP_EXTENT_SIZE / 2) { + ut_a(page_no == prev_page_no + 1); + } + + prev_page_no = page_no; + } + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT, + TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(ut_dulint_max, TRUE); + + fprintf(stderr, "InnoDB: Doublewrite buffer created\n"); + + trx_sys_multiple_tablespace_format = TRUE; + + goto start_again; + } +} + +/******************************************************************** +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. If we are in a crash +recovery, this function uses a possible doublewrite buffer to restore +half-written pages in the data files. */ + +void +trx_sys_doublewrite_init_or_restore_pages( +/*======================================*/ + ibool restore_corrupt_pages) +{ + byte* buf; + byte* read_buf; + byte* unaligned_read_buf; + ulint block1; + ulint block2; + ulint source_page_no; + byte* page; + byte* doublewrite; + ulint space_id; + ulint page_no; + ulint i; + + /* We do the file i/o past the buffer pool */ + + unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE); + read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE); + + /* Read the trx sys header to check if we are using the doublewrite + buffer */ + + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, TRX_SYS_PAGE_NO, 0, + UNIV_PAGE_SIZE, read_buf, NULL); + doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has been created */ + + trx_doublewrite_init(doublewrite); + + block1 = trx_doublewrite->block1; + block2 = trx_doublewrite->block2; + + buf = trx_doublewrite->write_buf; + } else { + goto leave_func; + } + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) + != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { + + /* We are upgrading from a version < 4.1.x to a version where + multiple tablespaces are supported. We must reset the space id + field in the pages in the doublewrite buffer because starting + from this version the space id is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ + + trx_doublewrite_must_reset_space_ids = TRUE; + + fprintf(stderr, +"InnoDB: Resetting space id's in the doublewrite buffer\n"); + } else { + trx_sys_multiple_tablespace_format = TRUE; + } + + /* Read the pages from the doublewrite buffer to memory */ + + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block1, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf, NULL); + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block2, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + NULL); + /* Check if any of these pages is half-written in data files, in the + intended position */ + + page = buf; + + for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { + + page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + + if (trx_doublewrite_must_reset_space_ids) { + + space_id = 0; + mach_write_to_4(page + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0); + /* We do not need to calculate new checksums for the + pages because the field .._SPACE_ID does not affect + them. Write the page back to where we read it from. */ + + if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + source_page_no = block1 + i; + } else { + source_page_no = block2 + + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + } + + fil_io(OS_FILE_WRITE, TRUE, 0, source_page_no, 0, + UNIV_PAGE_SIZE, page, NULL); + /* printf("Resetting space id in page %lu\n", + source_page_no); */ + } else { + space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + } + + if (!restore_corrupt_pages) { + /* The database was shut down gracefully: no need to + restore pages */ + + } else if (!fil_tablespace_exists_in_mem(space_id)) { + /* Maybe we have dropped the single-table tablespace + and this page once belonged to it: do nothing */ + + } else if (!fil_check_adress_in_tablespace(space_id, + page_no)) { + fprintf(stderr, +"InnoDB: Warning: a page in the doublewrite buffer is not within space\n" +"InnoDB: bounds; space id %lu page number %lu, page %lu in doublewrite buf.\n", + (ulong) space_id, (ulong) page_no, (ulong) i); + + } else if (space_id == TRX_SYS_SPACE + && ( (page_no >= block1 + && page_no + < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + || (page_no >= block2 + && page_no + < block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) { + + /* It is an unwritten doublewrite buffer page: + do nothing */ + } else { + /* Read in the actual page from the data files */ + + fil_io(OS_FILE_READ, TRUE, space_id, page_no, 0, + UNIV_PAGE_SIZE, read_buf, NULL); + /* Check if the page is corrupt */ + + if (buf_page_is_corrupted(read_buf)) { + + fprintf(stderr, + "InnoDB: Warning: database page corruption or a failed\n" + "InnoDB: file read of page %lu.\n", (ulong) page_no); + fprintf(stderr, + "InnoDB: Trying to recover it from the doublewrite buffer.\n"); + + if (buf_page_is_corrupted(page)) { + fprintf(stderr, + "InnoDB: Dump of the page:\n"); + buf_page_print(read_buf); + fprintf(stderr, + "InnoDB: Dump of corresponding page in doublewrite buffer:\n"); + buf_page_print(page); + + fprintf(stderr, + "InnoDB: Also the page in the doublewrite buffer is corrupt.\n" + "InnoDB: Cannot continue operation.\n" + "InnoDB: You can try to recover the database with the my.cnf\n" + "InnoDB: option:\n" + "InnoDB: set-variable=innodb_force_recovery=6\n"); + exit(1); + } + + /* Write the good page from the + doublewrite buffer to the intended + position */ + + fil_io(OS_FILE_WRITE, TRUE, space_id, + page_no, 0, + UNIV_PAGE_SIZE, page, NULL); + fprintf(stderr, + "InnoDB: Recovered the page from the doublewrite buffer.\n"); + } + } + + page += UNIV_PAGE_SIZE; + } + + fil_flush_file_spaces(FIL_TABLESPACE); + + if (!srv_use_doublewrite_buf) + trx_doublewrite_free(); + +leave_func: + ut_free(unaligned_read_buf); +} + +/******************************************************************** +Checks that trx is in the trx list. */ + +ibool +trx_in_trx_list( +/*============*/ + /* out: TRUE if is in */ + trx_t* in_trx) /* in: trx */ +{ + trx_t* trx; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(kernel_mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx != NULL) { + + if (trx == in_trx) { + + return(TRUE); + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + return(FALSE); +} + +/********************************************************************* +Writes the value of max_trx_id to the file based trx system header. */ + +void +trx_sys_flush_max_trx_id(void) +/*==========================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE, + trx_sys->max_trx_id, &mtr); + mtr_commit(&mtr); +} + +/********************************************************************* +Updates the offset information about the end of the MySQL binlog entry +which corresponds to the transaction just being committed. In a MySQL +replication slave updates the latest master binlog position up to which +replication has proceeded. */ + +void +trx_sys_update_mysql_binlog_offset( +/*===============================*/ + const char* file_name,/* in: MySQL log file name */ + ib_longlong offset, /* in: position in that log file */ + ulint field, /* in: offset of the MySQL log info field in + the trx sys header */ + mtr_t* mtr) /* in: mtr */ +{ + trx_sysf_t* sys_header; + + if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) { + + /* We cannot fit the name to the 512 bytes we have reserved */ + + return; + } + + sys_header = trx_sysf_get(mtr); + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD, + TRX_SYS_MYSQL_LOG_MAGIC_N, + MLOG_4BYTES, mtr); + } + + if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME), file_name)) { + + mlog_write_string(sys_header + field + + TRX_SYS_MYSQL_LOG_NAME, + (byte*) file_name, 1 + ut_strlen(file_name), mtr); + } + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0 + || (offset >> 32) > 0) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH, + (ulint)(offset >> 32), + MLOG_4BYTES, mtr); + } + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_LOW, + (ulint)(offset & 0xFFFFFFFFUL), + MLOG_4BYTES, mtr); +} + +/********************************************************************* +Prints to stderr the MySQL binlog info in the system header if the +magic number shows it valid. */ + +void +trx_sys_print_mysql_binlog_offset_from_page( +/*========================================*/ + byte* page) /* in: buffer containing the trx system header page, + i.e., page number TRX_SYS_PAGE_NO in the tablespace */ +{ + trx_sysf_t* sys_header; + + sys_header = page + TRX_SYS; + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + == TRX_SYS_MYSQL_LOG_MAGIC_N) { + + fprintf(stderr, + "ibbackup: Last MySQL binlog file position %lu %lu, file name %s\n", + (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME); + } +} + +/********************************************************************* +Stores the MySQL binlog offset info in the trx system header if +the magic number shows it valid, and print the info to stderr */ + +void +trx_sys_print_mysql_binlog_offset(void) +/*===================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + ulint trx_sys_mysql_bin_log_pos_high; + ulint trx_sys_mysql_bin_log_pos_low; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + trx_sys_mysql_bin_log_pos_high = mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH); + trx_sys_mysql_bin_log_pos_low = mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW); + + trx_sys_mysql_bin_log_pos = (((ib_longlong)trx_sys_mysql_bin_log_pos_high) << 32) + + (ib_longlong)trx_sys_mysql_bin_log_pos_low; + + ut_memcpy(trx_sys_mysql_bin_log_name, sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN); + + fprintf(stderr, + "InnoDB: Last MySQL binlog file position %lu %lu, file name %s\n", + trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low, + trx_sys_mysql_bin_log_name); + + mtr_commit(&mtr); +} + +/********************************************************************* +Prints to stderr the MySQL master log offset info in the trx system header if +the magic number shows it valid. */ + +void +trx_sys_print_mysql_master_log_pos(void) +/*====================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + fprintf(stderr, +"InnoDB: In a MySQL replication slave the last master binlog file\n" +"InnoDB: position %lu %lu, file name %s\n", + (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + /* Copy the master log position info to global variables we can + use in ha_innobase.cc to initialize glob_mi to right values */ + + ut_memcpy(trx_sys_mysql_master_log_name, + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, + TRX_SYS_MYSQL_LOG_NAME_LEN); + + trx_sys_mysql_master_log_pos = + (((ib_longlong)mach_read_from_4( + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) + << 32) + + (ib_longlong) + mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW); + mtr_commit(&mtr); +} + +/******************************************************************** +Looks for a free slot for a rollback segment in the trx system file copy. */ + +ulint +trx_sysf_rseg_find_free( +/*====================*/ + /* out: slot index or ULINT_UNDEFINED if not found */ + mtr_t* mtr) /* in: mtr */ +{ + trx_sysf_t* sys_header; + ulint page_no; + ulint i; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(kernel_mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + sys_header = trx_sysf_get(mtr); + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/********************************************************************* +Creates the file page for the transaction system. This function is called only +at the database creation, before trx_sys_init. */ +static +void +trx_sysf_create( +/*============*/ + mtr_t* mtr) /* in: mtr */ +{ + trx_sysf_t* sys_header; + ulint slot_no; + page_t* page; + ulint page_no; + ulint i; + + ut_ad(mtr); + + /* Note that below we first reserve the file space x-latch, and + then enter the kernel: we must do it in this order to conform + to the latching order rules. */ + + mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE), mtr); + mutex_enter(&kernel_mutex); + + /* Create the trx sys file block in a new allocated file segment */ + page = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, + mtr); + ut_a(buf_frame_get_page_no(page) == TRX_SYS_PAGE_NO); + +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(page, SYNC_TRX_SYS_HEADER); +#endif /* UNIV_SYNC_DEBUG */ + + sys_header = trx_sysf_get(mtr); + + /* Start counting transaction ids from number 1 up */ + mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE, + ut_dulint_create(0, 1), mtr); + + /* Reset the rollback segment slots */ + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr); + } + + /* Create the first rollback segment in the SYSTEM tablespace */ + page_no = trx_rseg_header_create(TRX_SYS_SPACE, ULINT_MAX, &slot_no, + mtr); + ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); + ut_a(page_no != FIL_NULL); + + mutex_exit(&kernel_mutex); +} + +/********************************************************************* +Creates and initializes the central memory structures for the transaction +system. This is called when the database is started. */ + +void +trx_sys_init_at_db_start(void) +/*==========================*/ +{ + trx_sysf_t* sys_header; + ib_longlong rows_to_undo = 0; + const char* unit = ""; + trx_t* trx; + mtr_t mtr; + + mtr_start(&mtr); + + ut_ad(trx_sys == NULL); + + mutex_enter(&kernel_mutex); + + trx_sys = mem_alloc(sizeof(trx_sys_t)); + + sys_header = trx_sysf_get(&mtr); + + trx_rseg_list_and_array_init(sys_header, &mtr); + + trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + /* VERY important: after the database is started, max_trx_id value is + divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in + trx_sys_get_new_trx_id will evaluate to TRUE when the function + is first time called, and the value for trx id will be written + to the disk-based header! Thus trx id values will not overlap when + the database is repeatedly started! */ + + trx_sys->max_trx_id = ut_dulint_add( + ut_dulint_align_up( + mtr_read_dulint(sys_header + + TRX_SYS_TRX_ID_STORE, &mtr), + TRX_SYS_TRX_ID_WRITE_MARGIN), + 2 * TRX_SYS_TRX_ID_WRITE_MARGIN); + + UT_LIST_INIT(trx_sys->mysql_trx_list); + trx_lists_init_at_db_start(); + + if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + for (;;) { + + if ( trx->conc_state != TRX_PREPARED) { + rows_to_undo += + ut_conv_dulint_to_longlong(trx->undo_no); + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + + if (!trx) { + break; + } + } + + if (rows_to_undo > 1000000000) { + unit = "M"; + rows_to_undo = rows_to_undo / 1000000; + } + + fprintf(stderr, +"InnoDB: %lu transaction(s) which must be rolled back or cleaned up\n" +"InnoDB: in total %lu%s row operations to undo\n", + (ulong) UT_LIST_GET_LEN(trx_sys->trx_list), + (ulong) rows_to_undo, unit); + + fprintf(stderr, "InnoDB: Trx id counter is %lu %lu\n", + (ulong) ut_dulint_get_high(trx_sys->max_trx_id), + (ulong) ut_dulint_get_low(trx_sys->max_trx_id)); + } + + UT_LIST_INIT(trx_sys->view_list); + + trx_purge_sys_create(); + + mutex_exit(&kernel_mutex); + + mtr_commit(&mtr); +} + +/********************************************************************* +Creates and initializes the transaction system at the database creation. */ + +void +trx_sys_create(void) +/*================*/ +{ + mtr_t mtr; + + mtr_start(&mtr); + + trx_sysf_create(&mtr); + + mtr_commit(&mtr); + + trx_sys_init_at_db_start(); +} diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c new file mode 100644 index 00000000000..cdda1dd4dee --- /dev/null +++ b/storage/innobase/trx/trx0trx.c @@ -0,0 +1,2025 @@ +/****************************************************** +The transaction + +(c) 1996 Innobase Oy + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0trx.h" + +#ifdef UNIV_NONINL +#include "trx0trx.ic" +#endif + +#include "trx0undo.h" +#include "trx0rseg.h" +#include "log0log.h" +#include "que0que.h" +#include "lock0lock.h" +#include "trx0roll.h" +#include "usr0sess.h" +#include "read0read.h" +#include "srv0srv.h" +#include "thr0loc.h" +#include "btr0sea.h" +#include "os0proc.h" +#include "trx0xa.h" + +/* Copy of the prototype for innobase_mysql_print_thd: this +copy MUST be equal to the one in mysql/sql/ha_innodb.cc ! */ + +void innobase_mysql_print_thd( + FILE* f, + void* thd); + +/* Dummy session used currently in MySQL interface */ +sess_t* trx_dummy_sess = NULL; + +/* Number of transactions currently allocated for MySQL: protected by +the kernel mutex */ +ulint trx_n_mysql_transactions = 0; + +/***************************************************************** +Starts the transaction if it is not yet started. */ + +void +trx_start_if_not_started_noninline( +/*===============================*/ + trx_t* trx) /* in: transaction */ +{ + trx_start_if_not_started(trx); +} + +/******************************************************************** +Retrieves the error_info field from a trx. */ + +void* +trx_get_error_info( +/*===============*/ + /* out: the error info */ + trx_t* trx) /* in: trx object */ +{ + return(trx->error_info); +} + +/******************************************************************** +Creates and initializes a transaction object. */ + +trx_t* +trx_create( +/*=======*/ + /* out, own: the transaction */ + sess_t* sess) /* in: session or NULL */ +{ + trx_t* trx; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + trx = mem_alloc(sizeof(trx_t)); + + trx->magic_n = TRX_MAGIC_N; + + trx->op_info = ""; + + trx->type = TRX_USER; + trx->conc_state = TRX_NOT_STARTED; + trx->start_time = time(NULL); + + trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->id = ut_dulint_zero; + trx->no = ut_dulint_max; + + trx->support_xa = TRUE; + + trx->check_foreigns = TRUE; + trx->check_unique_secondary = TRUE; + + trx->flush_log_later = FALSE; + trx->must_flush_log_later = FALSE; + + trx->dict_operation = FALSE; + + trx->mysql_thd = NULL; + trx->mysql_query_str = NULL; + + trx->n_mysql_tables_in_use = 0; + trx->mysql_n_tables_locked = 0; + + trx->mysql_log_file_name = NULL; + trx->mysql_log_offset = 0; + trx->mysql_master_log_file_name = ""; + trx->mysql_master_log_pos = 0; + + trx->repl_wait_binlog_name = NULL; + trx->repl_wait_binlog_pos = 0; + + mutex_create(&(trx->undo_mutex)); + mutex_set_level(&(trx->undo_mutex), SYNC_TRX_UNDO); + + trx->rseg = NULL; + + trx->undo_no = ut_dulint_zero; + trx->last_sql_stat_start.least_undo_no = ut_dulint_zero; + trx->insert_undo = NULL; + trx->update_undo = NULL; + trx->undo_no_arr = NULL; + + trx->error_state = DB_SUCCESS; + + trx->sess = sess; + trx->que_state = TRX_QUE_RUNNING; + trx->n_active_thrs = 0; + + trx->handling_signals = FALSE; + + UT_LIST_INIT(trx->signals); + UT_LIST_INIT(trx->reply_signals); + + trx->graph = NULL; + + trx->wait_lock = NULL; + trx->was_chosen_as_deadlock_victim = FALSE; + UT_LIST_INIT(trx->wait_thrs); + + trx->lock_heap = mem_heap_create_in_buffer(256); + UT_LIST_INIT(trx->trx_locks); + + UT_LIST_INIT(trx->trx_savepoints); + + trx->dict_operation_lock_mode = 0; + trx->has_search_latch = FALSE; + trx->search_latch_timeout = BTR_SEA_TIMEOUT; + + trx->declared_to_be_inside_innodb = FALSE; + trx->n_tickets_to_enter_innodb = 0; + + trx->auto_inc_lock = NULL; + trx->n_lock_table_exp = 0; + trx->n_lock_table_transactional = 0; + + trx->read_view_heap = mem_heap_create(256); + trx->read_view = NULL; + + /* Set X/Open XA transaction identification to NULL */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; + + return(trx); +} + +/************************************************************************ +Creates a transaction object for MySQL. */ + +trx_t* +trx_allocate_for_mysql(void) +/*========================*/ + /* out, own: transaction object */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + + /* Open a dummy session */ + + if (!trx_dummy_sess) { + trx_dummy_sess = sess_open(); + } + + trx = trx_create(trx_dummy_sess); + + trx_n_mysql_transactions++; + + UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + mutex_exit(&kernel_mutex); + + trx->mysql_thread_id = os_thread_get_curr_id(); + + trx->mysql_process_no = os_proc_get_number(); + + return(trx); +} + +/************************************************************************ +Creates a transaction object for background operations by the master thread. */ + +trx_t* +trx_allocate_for_background(void) +/*=============================*/ + /* out, own: transaction object */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + + /* Open a dummy session */ + + if (!trx_dummy_sess) { + trx_dummy_sess = sess_open(); + } + + trx = trx_create(trx_dummy_sess); + + mutex_exit(&kernel_mutex); + + return(trx); +} + +/************************************************************************ +Releases the search latch if trx has reserved it. */ + +void +trx_search_latch_release_if_reserved( +/*=================================*/ + trx_t* trx) /* in: transaction */ +{ + if (trx->has_search_latch) { + rw_lock_s_unlock(&btr_search_latch); + + trx->has_search_latch = FALSE; + } +} + +/************************************************************************ +Frees a transaction object. */ + +void +trx_free( +/*=====*/ + trx_t* trx) /* in, own: trx object */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + if (trx->declared_to_be_inside_innodb) { + ut_print_timestamp(stderr); + fputs( +" InnoDB: Error: Freeing a trx which is declared to be processing\n" +"InnoDB: inside InnoDB.\n", stderr); + trx_print(stderr, trx); + putc('\n', stderr); + } + + if (trx->n_mysql_tables_in_use != 0 + || trx->mysql_n_tables_locked != 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: MySQL is freeing a thd\n" +"InnoDB: though trx->n_mysql_tables_in_use is %lu\n" +"InnoDB: and trx->mysql_n_tables_locked is %lu.\n", + (ulong)trx->n_mysql_tables_in_use, + (ulong)trx->mysql_n_tables_locked); + + trx_print(stderr, trx); + + ut_print_buf(stderr, (byte*)trx, sizeof(trx_t)); + } + + ut_a(trx->magic_n == TRX_MAGIC_N); + + trx->magic_n = 11112222; + + ut_a(trx->conc_state == TRX_NOT_STARTED); + + mutex_free(&(trx->undo_mutex)); + + ut_a(trx->insert_undo == NULL); + ut_a(trx->update_undo == NULL); + + if (trx->undo_no_arr) { + trx_undo_arr_free(trx->undo_no_arr); + } + + if (trx->repl_wait_binlog_name != NULL) { + + mem_free(trx->repl_wait_binlog_name); + } + + ut_a(UT_LIST_GET_LEN(trx->signals) == 0); + ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0); + + ut_a(trx->wait_lock == NULL); + ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + ut_a(!trx->has_search_latch); + ut_a(!trx->auto_inc_lock); + ut_a(!trx->n_lock_table_exp); + ut_a(!trx->n_lock_table_transactional); + + ut_a(trx->dict_operation_lock_mode == 0); + + if (trx->lock_heap) { + mem_heap_free(trx->lock_heap); + } + + ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0); + + if (trx->read_view_heap) { + mem_heap_free(trx->read_view_heap); + } + + ut_a(trx->read_view == NULL); + + mem_free(trx); +} + +/************************************************************************ +Frees a transaction object for MySQL. */ + +void +trx_free_for_mysql( +/*===============*/ + trx_t* trx) /* in, own: trx object */ +{ + thr_local_free(trx->mysql_thread_id); + + mutex_enter(&kernel_mutex); + + UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + trx_free(trx); + + ut_a(trx_n_mysql_transactions > 0); + + trx_n_mysql_transactions--; + + mutex_exit(&kernel_mutex); +} + +/************************************************************************ +Frees a transaction object of a background operation of the master thread. */ + +void +trx_free_for_background( +/*====================*/ + trx_t* trx) /* in, own: trx object */ +{ + mutex_enter(&kernel_mutex); + + trx_free(trx); + + mutex_exit(&kernel_mutex); +} + +/******************************************************************** +Inserts the trx handle in the trx system trx list in the right position. +The list is sorted on the trx id so that the biggest id is at the list +start. This function is used at the database startup to insert incomplete +transactions to the list. */ +static +void +trx_list_insert_ordered( +/*====================*/ + trx_t* trx) /* in: trx handle */ +{ + trx_t* trx2; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx2 != NULL) { + if (ut_dulint_cmp(trx->id, trx2->id) >= 0) { + + ut_ad(ut_dulint_cmp(trx->id, trx2->id) == 1); + break; + } + trx2 = UT_LIST_GET_NEXT(trx_list, trx2); + } + + if (trx2 != NULL) { + trx2 = UT_LIST_GET_PREV(trx_list, trx2); + + if (trx2 == NULL) { + UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx); + } else { + UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list, + trx2, trx); + } + } else { + UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx); + } +} + +/******************************************************************** +Creates trx objects for transactions and initializes the trx list of +trx_sys at database start. Rollback segment and undo log lists must +already exist when this function is called, because the lists of +transactions to be rolled back or cleaned up are built based on the +undo log lists. */ + +void +trx_lists_init_at_db_start(void) +/*============================*/ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + trx_t* trx; + + UT_LIST_INIT(trx_sys->trx_list); + + /* Look from the rollback segments if there exist undo logs for + transactions */ + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + while (rseg != NULL) { + undo = UT_LIST_GET_FIRST(rseg->insert_undo_list); + + while (undo != NULL) { + + trx = trx_create(NULL); + + trx->id = undo->trx_id; + trx->xid = undo->xid; + trx->insert_undo = undo; + trx->rseg = rseg; + + if (undo->state != TRX_UNDO_ACTIVE) { + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + + fprintf(stderr, +"InnoDB: Transaction %lu %lu was in the XA prepared state.\n", + ut_dulint_get_high(trx->id), + ut_dulint_get_low(trx->id)); + + if (srv_force_recovery == 0) { + + trx->conc_state = TRX_PREPARED; + } else { + fprintf(stderr, +"InnoDB: Since innodb_force_recovery > 0, we will rollback it anyway.\n"); + + trx->conc_state = TRX_ACTIVE; + } + } else { + trx->conc_state = + TRX_COMMITTED_IN_MEMORY; + } + + /* We give a dummy value for the trx no; + this should have no relevance since purge + is not interested in committed transaction + numbers, unless they are in the history + list, in which case it looks the number + from the disk based undo log structure */ + + trx->no = trx->id; + } else { + trx->conc_state = TRX_ACTIVE; + + /* A running transaction always has the number + field inited to ut_dulint_max */ + + trx->no = ut_dulint_max; + } + + if (undo->dict_operation) { + trx->dict_operation = undo->dict_operation; + trx->table_id = undo->table_id; + } + + if (!undo->empty) { + trx->undo_no = ut_dulint_add(undo->top_undo_no, + 1); + } + + trx_list_insert_ordered(trx); + + undo = UT_LIST_GET_NEXT(undo_list, undo); + } + + undo = UT_LIST_GET_FIRST(rseg->update_undo_list); + + while (undo != NULL) { + trx = trx_get_on_id(undo->trx_id); + + if (NULL == trx) { + trx = trx_create(NULL); + + trx->id = undo->trx_id; + trx->xid = undo->xid; + + if (undo->state != TRX_UNDO_ACTIVE) { + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + fprintf(stderr, +"InnoDB: Transaction %lu %lu was in the XA prepared state.\n", + ut_dulint_get_high(trx->id), + ut_dulint_get_low(trx->id)); + + if (srv_force_recovery == 0) { + + trx->conc_state = TRX_PREPARED; + } else { + fprintf(stderr, +"InnoDB: Since innodb_force_recovery > 0, we will rollback it anyway.\n"); + + trx->conc_state = TRX_ACTIVE; + } + } else { + trx->conc_state = + TRX_COMMITTED_IN_MEMORY; + } + + /* We give a dummy value for the trx + number */ + + trx->no = trx->id; + } else { + trx->conc_state = TRX_ACTIVE; + + /* A running transaction always has + the number field inited to + ut_dulint_max */ + + trx->no = ut_dulint_max; + } + + trx->rseg = rseg; + trx_list_insert_ordered(trx); + + if (undo->dict_operation) { + trx->dict_operation = + undo->dict_operation; + trx->table_id = undo->table_id; + } + } + + trx->update_undo = undo; + + if ((!undo->empty) + && (ut_dulint_cmp(undo->top_undo_no, trx->undo_no) + >= 0)) { + + trx->undo_no = ut_dulint_add(undo->top_undo_no, + 1); + } + + undo = UT_LIST_GET_NEXT(undo_list, undo); + } + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + } +} + +/********************************************************************** +Assigns a rollback segment to a transaction in a round-robin fashion. +Skips the SYSTEM rollback segment if another is available. */ +UNIV_INLINE +ulint +trx_assign_rseg(void) +/*=================*/ + /* out: assigned rollback segment id */ +{ + trx_rseg_t* rseg = trx_sys->latest_rseg; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ +loop: + /* Get next rseg in a round-robin fashion */ + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + + if (rseg == NULL) { + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + } + + /* If it is the SYSTEM rollback segment, and there exist others, skip + it */ + + if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID) + && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) { + goto loop; + } + + trx_sys->latest_rseg = rseg; + + return(rseg->id); +} + +/******************************************************************** +Starts a new transaction. */ + +ibool +trx_start_low( +/*==========*/ + /* out: TRUE */ + trx_t* trx, /* in: transaction */ + ulint rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED + is passed, the system chooses the rollback segment + automatically in a round-robin fashion */ +{ + trx_rseg_t* rseg; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(trx->rseg == NULL); + + if (trx->type == TRX_PURGE) { + trx->id = ut_dulint_zero; + trx->conc_state = TRX_ACTIVE; + trx->start_time = time(NULL); + + return(TRUE); + } + + ut_ad(trx->conc_state != TRX_ACTIVE); + + if (rseg_id == ULINT_UNDEFINED) { + + rseg_id = trx_assign_rseg(); + } + + rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id); + + trx->id = trx_sys_get_new_trx_id(); + + /* The initial value for trx->no: ut_dulint_max is used in + read_view_open_now: */ + + trx->no = ut_dulint_max; + + trx->rseg = rseg; + + trx->conc_state = TRX_ACTIVE; + trx->start_time = time(NULL); + + UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx); + + return(TRUE); +} + +/******************************************************************** +Starts a new transaction. */ + +ibool +trx_start( +/*======*/ + /* out: TRUE */ + trx_t* trx, /* in: transaction */ + ulint rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED + is passed, the system chooses the rollback segment + automatically in a round-robin fashion */ +{ + ibool ret; + + mutex_enter(&kernel_mutex); + + ret = trx_start_low(trx, rseg_id); + + mutex_exit(&kernel_mutex); + + return(ret); +} + +/******************************************************************** +Commits a transaction. */ + +void +trx_commit_off_kernel( +/*==================*/ + trx_t* trx) /* in: transaction */ +{ + page_t* update_hdr_page; + dulint lsn; + trx_rseg_t* rseg; + trx_undo_t* undo; + ibool must_flush_log = FALSE; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + trx->must_flush_log_later = FALSE; + + rseg = trx->rseg; + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + must_flush_log = TRUE; + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to some other state: these modifications to the file data + structure define the transaction as committed in the file + based world, at the serialization point of the log sequence + number lsn obtained below. */ + + mutex_enter(&(rseg->mutex)); + + if (trx->insert_undo != NULL) { + trx_undo_set_state_at_finish(trx, trx->insert_undo, + &mtr); + } + + undo = trx->update_undo; + + if (undo) { + mutex_enter(&kernel_mutex); + trx->no = trx_sys_get_new_trx_no(); + + mutex_exit(&kernel_mutex); + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction commit for this transaction. */ + + update_hdr_page = trx_undo_set_state_at_finish(trx, + undo, &mtr); + + /* We have to do the cleanup for the update log while + holding the rseg mutex because update log headers + have to be put to the history list in the order of + the trx number. */ + + trx_undo_update_cleanup(trx, update_hdr_page, &mtr); + } + + mutex_exit(&(rseg->mutex)); + + /* Update the latest MySQL binlog name and offset info + in trx sys header if MySQL binlogging is on or the database + server is a MySQL replication slave */ + + if (trx->mysql_log_file_name) { + trx_sys_update_mysql_binlog_offset( + trx->mysql_log_file_name, + trx->mysql_log_offset, + TRX_SYS_MYSQL_LOG_INFO, &mtr); + trx->mysql_log_file_name = NULL; + } + + if (trx->mysql_master_log_file_name[0] != '\0') { + /* This database server is a MySQL replication slave */ + trx_sys_update_mysql_binlog_offset( + trx->mysql_master_log_file_name, + trx->mysql_master_log_pos, + TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr); + } + + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this + log sequence number. The transaction becomes 'durable' when + we write the log to disk, but in the logical sense the commit + in the file-based data structures (undo logs etc.) happens + here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come + in exactly the same order as commit lsn's, if the transactions + have different rollback segments. To get exactly the same + order we should hold the kernel mutex up to this point, + adding to to the contention of the kernel mutex. However, if + a transaction T2 is able to see modifications made by + a transaction T1, T2 will always get a bigger transaction + number and a bigger commit lsn than T1. */ + + /*--------------*/ + mtr_commit(&mtr); + /*--------------*/ + lsn = mtr.end_lsn; + + mutex_enter(&kernel_mutex); + } + + ut_ad(trx->conc_state == TRX_ACTIVE + || trx->conc_state == TRX_PREPARED); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + /* The following assignment makes the transaction committed in memory + and makes its changes to data visible to other transactions. + NOTE that there is a small discrepancy from the strict formal + visibility rules here: a human user of the database can see + modifications made by another transaction T even before the necessary + log segment has been flushed to the disk. If the database happens to + crash before the flush, the user has seen modifications from T which + will never be a committed transaction. However, any transaction T2 + which sees the modifications of the committing transaction T, and + which also itself makes modifications to the database, will get an lsn + larger than the committing transaction T. In the case where the log + flush fails, and T never gets committed, also T2 will never get + committed. */ + + /*--------------------------------------*/ + trx->conc_state = TRX_COMMITTED_IN_MEMORY; + /*--------------------------------------*/ + + lock_release_off_kernel(trx); + + if (trx->read_view) { + read_view_close(trx->read_view); + + mem_heap_empty(trx->read_view_heap); + trx->read_view = NULL; + } + + if (must_flush_log) { + + mutex_exit(&kernel_mutex); + + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + /* NOTE that we could possibly make a group commit more + efficient here: call os_thread_yield here to allow also other + trxs to come to commit! */ + + /*-------------------------------------*/ + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if + the OS does not crash. We may also flush the log files to + disk, making the transaction durable also at an OS crash or a + power outage. + + The idea in InnoDB's group commit is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which commits the whole + group. Note that this group commit will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + If we are calling trx_commit() under MySQL's binlog mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the binlog mutex. This is to make the + group commit algorithm to work. Otherwise, the MySQL binlog + mutex would serialize all commits and prevent a group of + transactions from gathering. */ + + if (trx->flush_log_later) { + /* Do nothing yet */ + trx->must_flush_log_later = TRUE; + } else if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + trx->commit_lsn = lsn; + + /*-------------------------------------*/ + + mutex_enter(&kernel_mutex); + } + + /* Free savepoints */ + trx_roll_savepoints_free(trx, NULL); + + trx->conc_state = TRX_NOT_STARTED; + trx->rseg = NULL; + trx->undo_no = ut_dulint_zero; + trx->last_sql_stat_start.least_undo_no = ut_dulint_zero; + + ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0); + + UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); +} + +/******************************************************************** +Cleans up a transaction at database startup. The cleanup is needed if +the transaction already got to the middle of a commit when the database +crashed, andf we cannot roll it back. */ + +void +trx_cleanup_at_db_startup( +/*======================*/ + trx_t* trx) /* in: transaction */ +{ + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + trx->conc_state = TRX_NOT_STARTED; + trx->rseg = NULL; + trx->undo_no = ut_dulint_zero; + trx->last_sql_stat_start.least_undo_no = ut_dulint_zero; + + UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); +} + +/************************************************************************ +Assigns a read view for a consistent read query. All the consistent reads +within the same transaction will get the same read view, which is created +when this function is first called for a new started transaction. */ + +read_view_t* +trx_assign_read_view( +/*=================*/ + /* out: consistent read view */ + trx_t* trx) /* in: active transaction */ +{ + ut_ad(trx->conc_state == TRX_ACTIVE); + + if (trx->read_view) { + return(trx->read_view); + } + + mutex_enter(&kernel_mutex); + + if (!trx->read_view) { + trx->read_view = read_view_open_now(trx, trx->read_view_heap); + } + + mutex_exit(&kernel_mutex); + + return(trx->read_view); +} + +/******************************************************************** +Commits a transaction. NOTE that the kernel mutex is temporarily released. */ +static +void +trx_handle_commit_sig_off_kernel( +/*=============================*/ + trx_t* trx, /* in: transaction */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + trx->que_state = TRX_QUE_COMMITTING; + + trx_commit_off_kernel(trx); + + ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + /* Remove all TRX_SIG_COMMIT signals from the signal queue and send + reply messages to them */ + + sig = UT_LIST_GET_FIRST(trx->signals); + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_COMMIT) { + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/*************************************************************** +The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to +the TRX_QUE_RUNNING state and releases query threads which were +waiting for a lock in the wait_thrs list. */ + +void +trx_end_lock_wait( +/*==============*/ + trx_t* trx) /* in: transaction */ +{ + que_thr_t* thr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + + while (thr != NULL) { + que_thr_end_wait_no_next_thr(thr); + + UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/*************************************************************** +Moves the query threads in the lock wait list to the SUSPENDED state and puts +the transaction to the TRX_QUE_RUNNING state. */ +static +void +trx_lock_wait_to_suspended( +/*=======================*/ + trx_t* trx) /* in: transaction in the TRX_QUE_LOCK_WAIT state */ +{ + que_thr_t* thr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + + while (thr != NULL) { + thr->state = QUE_THR_SUSPENDED; + + UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/*************************************************************** +Moves the query threads in the sig reply wait list of trx to the SUSPENDED +state. */ +static +void +trx_sig_reply_wait_to_suspended( +/*============================*/ + trx_t* trx) /* in: transaction */ +{ + trx_sig_t* sig; + que_thr_t* thr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + sig = UT_LIST_GET_FIRST(trx->reply_signals); + + while (sig != NULL) { + thr = sig->receiver; + + ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT); + + thr->state = QUE_THR_SUSPENDED; + + sig->receiver = NULL; + + UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig); + + sig = UT_LIST_GET_FIRST(trx->reply_signals); + } +} + +/********************************************************************* +Checks the compatibility of a new signal with the other signals in the +queue. */ +static +ibool +trx_sig_is_compatible( +/*==================*/ + /* out: TRUE if the signal can be queued */ + trx_t* trx, /* in: trx handle */ + ulint type, /* in: signal type */ + ulint sender) /* in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */ +{ + trx_sig_t* sig; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + if (UT_LIST_GET_LEN(trx->signals) == 0) { + + return(TRUE); + } + + if (sender == TRX_SIG_SELF) { + if (type == TRX_SIG_ERROR_OCCURRED) { + + return(TRUE); + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + return(TRUE); + } else { + return(FALSE); + } + } + + ut_ad(sender == TRX_SIG_OTHER_SESS); + + sig = UT_LIST_GET_FIRST(trx->signals); + + if (type == TRX_SIG_COMMIT) { + while (sig != NULL) { + + if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { + + return(FALSE); + } + + sig = UT_LIST_GET_NEXT(signals, sig); + } + + return(TRUE); + + } else if (type == TRX_SIG_TOTAL_ROLLBACK) { + while (sig != NULL) { + + if (sig->type == TRX_SIG_COMMIT) { + + return(FALSE); + } + + sig = UT_LIST_GET_NEXT(signals, sig); + } + + return(TRUE); + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + return(TRUE); + } else { + ut_error; + + return(FALSE); + } +} + +/******************************************************************** +Sends a signal to a trx object. */ + +ibool +trx_sig_send( +/*=========*/ + /* out: TRUE if the signal was + successfully delivered */ + trx_t* trx, /* in: trx handle */ + ulint type, /* in: signal type */ + ulint sender, /* in: TRX_SIG_SELF or + TRX_SIG_OTHER_SESS */ + que_thr_t* receiver_thr, /* in: query thread which wants the + reply, or NULL; if type is + TRX_SIG_END_WAIT, this must be NULL */ + trx_savept_t* savept, /* in: possible rollback savepoint, or + NULL */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the parameter + is NULL, it is ignored */ +{ + trx_sig_t* sig; + trx_t* receiver_trx; + + ut_ad(trx); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!trx_sig_is_compatible(trx, type, sender)) { + /* The signal is not compatible with the other signals in + the queue: do nothing */ + + ut_error; + + return(FALSE); + } + + /* Queue the signal object */ + + if (UT_LIST_GET_LEN(trx->signals) == 0) { + + /* The signal list is empty: the 'sig' slot must be unused + (we improve performance a bit by avoiding mem_alloc) */ + sig = &(trx->sig); + } else { + /* It might be that the 'sig' slot is unused also in this + case, but we choose the easy way of using mem_alloc */ + + sig = mem_alloc(sizeof(trx_sig_t)); + } + + UT_LIST_ADD_LAST(signals, trx->signals, sig); + + sig->type = type; + sig->state = TRX_SIG_WAITING; + sig->sender = sender; + sig->receiver = receiver_thr; + + if (savept) { + sig->savept = *savept; + } + + if (receiver_thr) { + receiver_trx = thr_get_trx(receiver_thr); + + UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals, + sig); + } + + if (trx->sess->state == SESS_ERROR) { + + trx_sig_reply_wait_to_suspended(trx); + } + + if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) { + + /* The following call will add a TRX_SIG_ERROR_OCCURRED + signal to the end of the queue, if the session is not yet + in the error state: */ + + ut_error; + } + + /* If there were no other signals ahead in the queue, try to start + handling of the signal */ + + if (UT_LIST_GET_FIRST(trx->signals) == sig) { + + trx_sig_start_handle(trx, next_thr); + } + + return(TRUE); +} + +/******************************************************************** +Ends signal handling. If the session is in the error state, and +trx->graph_before_signal_handling != NULL, then returns control to the error +handling routine of the graph (currently just returns the control to the +graph root which then will send an error message to the client). */ + +void +trx_end_signal_handling( +/*====================*/ + trx_t* trx) /* in: trx */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(trx->handling_signals == TRUE); + + trx->handling_signals = FALSE; + + trx->graph = trx->graph_before_signal_handling; + + if (trx->graph && (trx->sess->state == SESS_ERROR)) { + + que_fork_error_handle(trx, trx->graph); + } +} + +/******************************************************************** +Starts handling of a trx signal. */ + +void +trx_sig_start_handle( +/*=================*/ + trx_t* trx, /* in: trx handle */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the parameter + is NULL, it is ignored */ +{ + trx_sig_t* sig; + ulint type; +loop: + /* We loop in this function body as long as there are queued signals + we can process immediately */ + + ut_ad(trx); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) { + + trx_end_signal_handling(trx); + + return; + } + + if (trx->conc_state == TRX_NOT_STARTED) { + + trx_start_low(trx, ULINT_UNDEFINED); + } + + /* If the trx is in a lock wait state, moves the waiting query threads + to the suspended state */ + + if (trx->que_state == TRX_QUE_LOCK_WAIT) { + + trx_lock_wait_to_suspended(trx); + } + + /* If the session is in the error state and this trx has threads + waiting for reply from signals, moves these threads to the suspended + state, canceling wait reservations; note that if the transaction has + sent a commit or rollback signal to itself, and its session is not in + the error state, then nothing is done here. */ + + if (trx->sess->state == SESS_ERROR) { + trx_sig_reply_wait_to_suspended(trx); + } + + /* If there are no running query threads, we can start processing of a + signal, otherwise we have to wait until all query threads of this + transaction are aware of the arrival of the signal. */ + + if (trx->n_active_thrs > 0) { + + return; + } + + if (trx->handling_signals == FALSE) { + trx->graph_before_signal_handling = trx->graph; + + trx->handling_signals = TRUE; + } + + sig = UT_LIST_GET_FIRST(trx->signals); + type = sig->type; + + if (type == TRX_SIG_COMMIT) { + + trx_handle_commit_sig_off_kernel(trx, next_thr); + + } else if ((type == TRX_SIG_TOTAL_ROLLBACK) + || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) { + + trx_rollback(trx, sig, next_thr); + + /* No further signals can be handled until the rollback + completes, therefore we return */ + + return; + + } else if (type == TRX_SIG_ERROR_OCCURRED) { + + trx_rollback(trx, sig, next_thr); + + /* No further signals can be handled until the rollback + completes, therefore we return */ + + return; + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + } else { + ut_error; + } + + goto loop; +} + +/******************************************************************** +Send the reply message when a signal in the queue of the trx has been +handled. */ + +void +trx_sig_reply( +/*==========*/ + trx_sig_t* sig, /* in: signal */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +{ + trx_t* receiver_trx; + + ut_ad(sig); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + if (sig->receiver != NULL) { + ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT); + + receiver_trx = thr_get_trx(sig->receiver); + + UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals, + sig); + ut_ad(receiver_trx->sess->state != SESS_ERROR); + + que_thr_end_wait(sig->receiver, next_thr); + + sig->receiver = NULL; + + } +} + +/******************************************************************** +Removes a signal object from the trx signal queue. */ + +void +trx_sig_remove( +/*===========*/ + trx_t* trx, /* in: trx handle */ + trx_sig_t* sig) /* in, own: signal */ +{ + ut_ad(trx && sig); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(sig->receiver == NULL); + + UT_LIST_REMOVE(signals, trx->signals, sig); + sig->type = 0; /* reset the field to catch possible bugs */ + + if (sig != &(trx->sig)) { + mem_free(sig); + } +} + +/************************************************************************* +Creates a commit command node struct. */ + +commit_node_t* +commit_node_create( +/*===============*/ + /* out, own: commit node struct */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + commit_node_t* node; + + node = mem_heap_alloc(heap, sizeof(commit_node_t)); + node->common.type = QUE_NODE_COMMIT; + node->state = COMMIT_NODE_SEND; + + return(node); +} + +/*************************************************************** +Performs an execution step for a commit type node in a query graph. */ + +que_thr_t* +trx_commit_step( +/*============*/ + /* out: query thread to run next, or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + commit_node_t* node; + que_thr_t* next_thr; + ibool success; + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = COMMIT_NODE_SEND; + } + + if (node->state == COMMIT_NODE_SEND) { + mutex_enter(&kernel_mutex); + + node->state = COMMIT_NODE_WAIT; + + next_thr = NULL; + + thr->state = QUE_THR_SIG_REPLY_WAIT; + + /* Send the commit signal to the transaction */ + + success = trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, + TRX_SIG_SELF, thr, NULL, &next_thr); + + mutex_exit(&kernel_mutex); + + if (!success) { + /* Error in delivering the commit signal */ + que_thr_handle_error(thr, DB_ERROR, NULL, 0); + } + + return(next_thr); + } + + ut_ad(node->state == COMMIT_NODE_WAIT); + + node->state = COMMIT_NODE_SEND; + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +/************************************************************************** +Does the transaction commit for MySQL. */ + +ulint +trx_commit_for_mysql( +/*=================*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + /* Because we do not do the commit by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx->op_info = "committing"; + + trx_start_if_not_started(trx); + + mutex_enter(&kernel_mutex); + + trx_commit_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(0); +} + +/************************************************************************** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ + +ulint +trx_commit_complete_for_mysql( +/*==========================*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + dulint lsn = trx->commit_lsn; + + ut_a(trx); + + trx->op_info = "flushing log"; + + if (!trx->must_flush_log_later) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + /* Write the log to the log files AND flush them to + disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + trx->must_flush_log_later = FALSE; + + trx->op_info = ""; + + return(0); +} + +/************************************************************************** +Marks the latest SQL statement ended. */ + +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx) /* in: trx handle */ +{ + ut_a(trx); + + if (trx->conc_state == TRX_NOT_STARTED) { + trx->undo_no = ut_dulint_zero; + } + + trx->last_sql_stat_start.least_undo_no = trx->undo_no; +} + +/************************************************************************** +Prints info about a transaction to the standard output. The caller must +own the kernel mutex and must have called +innobase_mysql_prepare_print_arbitrary_thd(), unless he knows that MySQL or +InnoDB cannot meanwhile change the info printed here. */ + +void +trx_print( +/*======*/ + FILE* f, /* in: output stream */ + trx_t* trx) /* in: transaction */ +{ + ibool newline; + + fprintf(f, "TRANSACTION %lu %lu", + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); + + switch (trx->conc_state) { + case TRX_NOT_STARTED: + fputs(", not started", f); + break; + case TRX_ACTIVE: + fprintf(f, ", ACTIVE %lu sec", + (ulong)difftime(time(NULL), trx->start_time)); + break; + case TRX_PREPARED: + fprintf(f, ", ACTIVE (PREPARED) %lu sec", + (ulong)difftime(time(NULL), trx->start_time)); + break; + case TRX_COMMITTED_IN_MEMORY: + fputs(", COMMITTED IN MEMORY", f); + break; + default: + fprintf(f, " state %lu", (ulong) trx->conc_state); + } + +#ifdef UNIV_LINUX + fprintf(f, ", process no %lu", trx->mysql_process_no); +#endif + fprintf(f, ", OS thread id %lu", + (ulong) os_thread_pf(trx->mysql_thread_id)); + + if (*trx->op_info) { + putc(' ', f); + fputs(trx->op_info, f); + } + + if (trx->type != TRX_USER) { + fputs(" purge trx", f); + } + + if (trx->declared_to_be_inside_innodb) { + fprintf(f, ", thread declared inside InnoDB %lu", + (ulong) trx->n_tickets_to_enter_innodb); + } + + putc('\n', f); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } + + if (trx->n_lock_table_transactional > 0 || trx->n_lock_table_exp > 0) { +fprintf(f, "mysql explicit table locks %lu, transactional table locks %lu\n", + (ulong) trx->n_lock_table_exp, + (ulong) trx->n_lock_table_transactional); + } + + newline = TRUE; + + switch (trx->que_state) { + case TRX_QUE_RUNNING: + newline = FALSE; break; + case TRX_QUE_LOCK_WAIT: + fputs("LOCK WAIT ", f); break; + case TRX_QUE_ROLLING_BACK: + fputs("ROLLING BACK ", f); break; + case TRX_QUE_COMMITTING: + fputs("COMMITTING ", f); break; + default: + fprintf(f, "que state %lu ", (ulong) trx->que_state); + } + + if (0 < UT_LIST_GET_LEN(trx->trx_locks) || + mem_heap_get_size(trx->lock_heap) > 400) { + newline = TRUE; + + fprintf(f, "%lu lock struct(s), heap size %lu", + (ulong) UT_LIST_GET_LEN(trx->trx_locks), + (ulong) mem_heap_get_size(trx->lock_heap)); + } + + if (trx->has_search_latch) { + newline = TRUE; + fputs(", holds adaptive hash latch", f); + } + + if (ut_dulint_cmp(trx->undo_no, ut_dulint_zero) != 0) { + newline = TRUE; + fprintf(f, ", undo log entries %lu", + (ulong) ut_dulint_get_low(trx->undo_no)); + } + + if (newline) { + putc('\n', f); + } + + if (trx->mysql_thd != NULL) { + innobase_mysql_print_thd(f, trx->mysql_thd); + } +} + +/******************************************************************** +Prepares a transaction. */ + +void +trx_prepare_off_kernel( +/*===================*/ + trx_t* trx) /* in: transaction */ +{ + page_t* update_hdr_page; + trx_rseg_t* rseg; + ibool must_flush_log = FALSE; + dulint lsn; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + rseg = trx->rseg; + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + must_flush_log = TRUE; + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to TRX_UNDO_PREPARED: these modifications to the file data + structure define the transaction as prepared in the + file-based world, at the serialization point of lsn. */ + + mutex_enter(&(rseg->mutex)); + + if (trx->insert_undo != NULL) { + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction prepare for this transaction. */ + + trx_undo_set_state_at_prepare(trx, trx->insert_undo, + &mtr); + } + + if (trx->update_undo) { + update_hdr_page = trx_undo_set_state_at_prepare(trx, + trx->update_undo, &mtr); + } + + mutex_exit(&(rseg->mutex)); + + /*--------------*/ + mtr_commit(&mtr); /* This mtr commit makes the + transaction prepared in the file-based + world */ + /*--------------*/ + lsn = mtr.end_lsn; + + mutex_enter(&kernel_mutex); + } + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + /*--------------------------------------*/ + trx->conc_state = TRX_PREPARED; + /*--------------------------------------*/ + + if (must_flush_log) { + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the prepared state of the + transaction durable if the OS does not crash. We may also + flush the log files to disk, making the prepared state of the + transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group prepare is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which prepares the whole + group. Note that this group prepare will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + TODO: find out if MySQL holds some mutex when calling this. + That would spoil our group prepare algorithm. */ + + mutex_exit(&kernel_mutex); + + if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + mutex_enter(&kernel_mutex); + } +} + +/************************************************************************** +Does the transaction prepare for MySQL. */ + +ulint +trx_prepare_for_mysql( +/*====-=============*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + /* Because we do not do the prepare by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx->op_info = "preparing"; + + trx_start_if_not_started(trx); + + mutex_enter(&kernel_mutex); + + trx_prepare_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(0); +} + +/************************************************************************** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. */ + +int +trx_recover_for_mysql( +/*==================*/ + /* out: number of prepared transactions + stored in xid_list */ + XID* xid_list, /* in/out: prepared transactions */ + ulint len) /* in: number of slots in xid_list */ +{ + trx_t* trx; + int count = 0; + + ut_ad(xid_list); + ut_ad(len); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Starting recovery for XA transactions...\n"); + + /* We should set those transactions which are in the prepared state + to the xid_list */ + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + if (trx->conc_state == TRX_PREPARED) { + xid_list[count] = trx->xid; + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Transaction %lu %lu in prepared state after recovery\n", + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Transaction contains changes to %lu rows\n", + (ulong)ut_conv_dulint_to_longlong(trx->undo_no)); + + count++; + + if ((uint)count == len ) { + break; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: %d transactions in prepared state after recovery\n", + count); + + return (count); +} + +/*********************************************************************** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state */ + +trx_t* +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid) /* in: X/Open XA transaction identification */ +{ + trx_t* trx; + + if (xid == NULL) { + + return (NULL); + } + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + /* Compare two X/Open XA transaction id's: their + length should be the same and binary comparison + of gtrid_lenght+bqual_length bytes should be + the same */ + + if (xid->gtrid_length == trx->xid.gtrid_length && + xid->bqual_length == trx->xid.bqual_length && + memcmp(xid->data, trx->xid.data, + xid->gtrid_length + + xid->bqual_length) == 0) { + break; + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + if (trx) { + if (trx->conc_state != TRX_PREPARED) { + + return(NULL); + } + + return(trx); + } else { + return(NULL); + } +} diff --git a/storage/innobase/trx/trx0undo.c b/storage/innobase/trx/trx0undo.c new file mode 100644 index 00000000000..bb314dd35e9 --- /dev/null +++ b/storage/innobase/trx/trx0undo.c @@ -0,0 +1,1906 @@ +/****************************************************** +Transaction undo log + +(c) 1996 Innobase Oy + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0undo.h" + +#ifdef UNIV_NONINL +#include "trx0undo.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "trx0rec.h" +#include "trx0purge.h" +#include "trx0xa.h" + +/* How should the old versions in the history list be managed? + ---------------------------------------------------------- +If each transaction is given a whole page for its update undo log, file +space consumption can be 10 times higher than necessary. Therefore, +partly filled update undo log pages should be reusable. But then there +is no way individual pages can be ordered so that the ordering agrees +with the serialization numbers of the transactions on the pages. Thus, +the history list must be formed of undo logs, not their header pages as +it was in the old implementation. + However, on a single header page the transactions are placed in +the order of their serialization numbers. As old versions are purged, we +may free the page when the last transaction on the page has been purged. + A problem is that the purge has to go through the transactions +in the serialization order. This means that we have to look through all +rollback segments for the one that has the smallest transaction number +in its history list. + When should we do a purge? A purge is necessary when space is +running out in any of the rollback segments. Then we may have to purge +also old version which might be needed by some consistent read. How do +we trigger the start of a purge? When a transaction writes to an undo log, +it may notice that the space is running out. When a read view is closed, +it may make some history superfluous. The server can have an utility which +periodically checks if it can purge some history. + In a parallellized purge we have the problem that a query thread +can remove a delete marked clustered index record before another query +thread has processed an earlier version of the record, which cannot then +be done because the row cannot be constructed from the clustered index +record. To avoid this problem, we will store in the update and delete mark +undo record also the columns necessary to construct the secondary index +entries which are modified. + We can latch the stack of versions of a single clustered index record +by taking a latch on the clustered index page. As long as the latch is held, +no new versions can be added and no versions removed by undo. But, a purge +can still remove old versions from the bottom of the stack. */ + +/* How to protect rollback segments, undo logs, and history lists with + ------------------------------------------------------------------- +latches? +------- +The contention of the kernel mutex should be minimized. When a transaction +does its first insert or modify in an index, an undo log is assigned for it. +Then we must have an x-latch to the rollback segment header. + When the transaction does more modifys or rolls back, the undo log is +protected with undo_mutex in the transaction. + When the transaction commits, its insert undo log is either reset and +cached for a fast reuse, or freed. In these cases we must have an x-latch on +the rollback segment page. The update undo log is put to the history list. If +it is not suitable for reuse, its slot in the rollback segment is reset. In +both cases, an x-latch must be acquired on the rollback segment. + The purge operation steps through the history list without modifying +it until a truncate operation occurs, which can remove undo logs from the end +of the list and release undo log segments. In stepping through the list, +s-latches on the undo log pages are enough, but in a truncate, x-latches must +be obtained on the rollback segment and individual pages. */ + +/************************************************************************ +Initializes the fields in an undo log segment page. */ +static +void +trx_undo_page_init( +/*================*/ + page_t* undo_page, /* in: undo log segment page */ + ulint type, /* in: undo log segment type */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************ +Creates and initializes an undo log memory object. */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + /* out, own: the undo log memory object */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + ulint id, /* in: slot index within rseg */ + ulint type, /* in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + dulint trx_id, /* in: id of the trx for which the undo log + is created */ + XID* xid, /* in: X/Open XA transaction identification*/ + ulint page_no,/* in: undo log header page number */ + ulint offset);/* in: undo log header byte offset on page */ +/******************************************************************* +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! */ +static +ulint +trx_undo_insert_header_reuse( +/*=========================*/ + /* out: undo log header byte offset on page */ + page_t* undo_page, /* in: insert undo log segment header page, + x-latched */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +If an update undo log can be discarded immediately, this function frees the +space, resetting the page to the proper state for caching. */ +static +void +trx_undo_discard_latest_update_undo( +/*================================*/ + page_t* undo_page, /* in: header page of an undo log of size 1 */ + mtr_t* mtr); /* in: mtr */ + + +/*************************************************************************** +Gets the previous record in an undo log from the previous page. */ +static +trx_undo_rec_t* +trx_undo_get_prev_rec_from_prev_page( +/*=================================*/ + /* out: undo log record, the page s-latched, + NULL if none */ + trx_undo_rec_t* rec, /* in: undo record */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + mtr_t* mtr) /* in: mtr */ +{ + ulint prev_page_no; + page_t* prev_page; + page_t* undo_page; + + undo_page = buf_frame_align(rec); + + prev_page_no = flst_get_prev_addr(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE, mtr) + .page; + + if (prev_page_no == FIL_NULL) { + + return(NULL); + } + + prev_page = trx_undo_page_get_s_latched( + buf_frame_get_space_id(undo_page), + prev_page_no, mtr); + + return(trx_undo_page_get_last_rec(prev_page, page_no, offset)); +} + +/*************************************************************************** +Gets the previous record in an undo log. */ + +trx_undo_rec_t* +trx_undo_get_prev_rec( +/*==================*/ + /* out: undo log record, the page s-latched, + NULL if none */ + trx_undo_rec_t* rec, /* in: undo record */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + mtr_t* mtr) /* in: mtr */ +{ + trx_undo_rec_t* prev_rec; + + prev_rec = trx_undo_page_get_prev_rec(rec, page_no, offset); + + if (prev_rec) { + + return(prev_rec); + } + + /* We have to go to the previous undo log page to look for the + previous record */ + + return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset, + mtr)); +} + +/*************************************************************************** +Gets the next record in an undo log from the next page. */ +static +trx_undo_rec_t* +trx_undo_get_next_rec_from_next_page( +/*=================================*/ + /* out: undo log record, the page latched, NULL if + none */ + page_t* undo_page, /* in: undo log page */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + ulint mode, /* in: latch mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr) /* in: mtr */ +{ + trx_ulogf_t* log_hdr; + ulint next_page_no; + page_t* next_page; + ulint space; + ulint next; + + if (page_no == buf_frame_get_page_no(undo_page)) { + + log_hdr = undo_page + offset; + next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); + + if (next != 0) { + + return(NULL); + } + } + + space = buf_frame_get_space_id(undo_page); + + next_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE, mtr) + .page; + if (next_page_no == FIL_NULL) { + + return(NULL); + } + + if (mode == RW_S_LATCH) { + next_page = trx_undo_page_get_s_latched(space, next_page_no, + mtr); + } else { + ut_ad(mode == RW_X_LATCH); + next_page = trx_undo_page_get(space, next_page_no, mtr); + } + + return(trx_undo_page_get_first_rec(next_page, page_no, offset)); +} + +/*************************************************************************** +Gets the next record in an undo log. */ + +trx_undo_rec_t* +trx_undo_get_next_rec( +/*==================*/ + /* out: undo log record, the page s-latched, + NULL if none */ + trx_undo_rec_t* rec, /* in: undo record */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + mtr_t* mtr) /* in: mtr */ +{ + trx_undo_rec_t* next_rec; + + next_rec = trx_undo_page_get_next_rec(rec, page_no, offset); + + if (next_rec) { + return(next_rec); + } + + return(trx_undo_get_next_rec_from_next_page(buf_frame_align(rec), + page_no, offset, + RW_S_LATCH, mtr)); +} + +/*************************************************************************** +Gets the first record in an undo log. */ + +trx_undo_rec_t* +trx_undo_get_first_rec( +/*===================*/ + /* out: undo log record, the page latched, NULL if + none */ + ulint space, /* in: undo log header space */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + ulint mode, /* in: latching mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* undo_page; + trx_undo_rec_t* rec; + + if (mode == RW_S_LATCH) { + undo_page = trx_undo_page_get_s_latched(space, page_no, mtr); + } else { + undo_page = trx_undo_page_get(space, page_no, mtr); + } + + rec = trx_undo_page_get_first_rec(undo_page, page_no, offset); + + if (rec) { + return(rec); + } + + return(trx_undo_get_next_rec_from_next_page(undo_page, page_no, offset, + mode, mtr)); +} + +/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/ + +/************************************************************************** +Writes the mtr log entry of an undo log page initialization. */ +UNIV_INLINE +void +trx_undo_page_init_log( +/*====================*/ + page_t* undo_page, /* in: undo log page */ + ulint type, /* in: undo log type */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr); + + mlog_catenate_ulint_compressed(mtr, type); +} + +/*************************************************************** +Parses the redo log entry of an undo log page initialization. */ + +byte* +trx_undo_parse_page_init( +/*======================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ulint type; + + ptr = mach_parse_compressed(ptr, end_ptr, &type); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + trx_undo_page_init(page, type, mtr); + } + + return(ptr); +} + +/************************************************************************ +Initializes the fields in an undo log segment page. */ +static +void +trx_undo_page_init( +/*================*/ + page_t* undo_page, /* in: undo log segment page */ + ulint type, /* in: undo log segment type */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + + fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG); + + trx_undo_page_init_log(undo_page, type, mtr); +} + +/******************************************************************* +Creates a new undo log segment in file. */ +static +page_t* +trx_undo_seg_create( +/*================*/ + /* out: segment header page x-latched, NULL + if no space left */ + trx_rseg_t* rseg __attribute__((unused)),/* in: rollback segment */ + trx_rsegf_t* rseg_hdr,/* in: rollback segment header, page + x-latched */ + ulint type, /* in: type of the segment: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + ulint* id, /* out: slot index within rseg header */ + mtr_t* mtr) /* in: mtr */ +{ + ulint slot_no; + ulint space; + page_t* undo_page; + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + ulint n_reserved; + ibool success; + + ut_ad(mtr && id && rseg_hdr); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + +/* fputs(type == TRX_UNDO_INSERT + ? "Creating insert undo log segment\n" + : "Creating update undo log segment\n", stderr); */ + slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr); + + if (slot_no == ULINT_UNDEFINED) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Warning: cannot find a free slot for an undo log. Do you have too\n" +"InnoDB: many active transactions running concurrently?\n"); + + return(NULL); + } + + space = buf_frame_get_space_id(rseg_hdr); + + success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, + mtr); + if (!success) { + + return(NULL); + } + + /* Allocate a new file segment for the undo log */ + undo_page = fseg_create_general(space, 0, + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, TRUE, mtr); + + fil_space_release_free_extents(space, n_reserved); + + if (undo_page == NULL) { + /* No space left */ + + return(NULL); + } + +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(undo_page, SYNC_TRX_UNDO_PAGE); +#endif /* UNIV_SYNC_DEBUG */ + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + trx_undo_page_init(undo_page, type, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, + TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE, + MLOG_2BYTES, mtr); + + mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr); + + flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr); + + flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST, + page_hdr + TRX_UNDO_PAGE_NODE, mtr); + + trx_rsegf_set_nth_undo(rseg_hdr, slot_no, + buf_frame_get_page_no(undo_page), mtr); + *id = slot_no; + + return(undo_page); +} + +/************************************************************************** +Writes the mtr log entry of an undo log header initialization. */ +UNIV_INLINE +void +trx_undo_header_create_log( +/*=======================*/ + page_t* undo_page, /* in: undo log header page */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr); + + mlog_catenate_dulint_compressed(mtr, trx_id); +} + +/******************************************************************* +Creates a new undo log header in file. NOTE that this function has its own +log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of +this function! */ +static +ulint +trx_undo_header_create( +/*===================*/ + /* out: header byte offset on page */ + page_t* undo_page, /* in: undo log segment header page, + x-latched; it is assumed that there are + TRX_UNDO_LOG_XA_HDR_SIZE bytes free space + on it */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + trx_ulogf_t* log_hdr; + trx_ulogf_t* prev_log_hdr; + ulint prev_log; + ulint free; + ulint new_free; + + ut_ad(mtr && undo_page); + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + log_hdr = undo_page + free; + + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; + + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); + + prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + + if (prev_log != 0) { + prev_log_hdr = undo_page + prev_log; + + mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free); + } + + mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free); + + log_hdr = undo_page + free; + + mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE); + + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0); + mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log); + + /* Write the log record about the header creation */ + trx_undo_header_create_log(undo_page, trx_id, mtr); + + return(free); +} + +/************************************************************************ +Write X/Open XA Transaction Identification (XID) to undo log header */ +static +void +trx_undo_write_xid( +/*===============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid, /* in: X/Open XA Transaction Identification */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT, xid->formatID, + MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN, xid->gtrid_length, + MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN, xid->bqual_length, + MLOG_4BYTES, mtr); + + mlog_write_string(log_hdr + TRX_UNDO_XA_XID, xid->data, + XIDDATASIZE, mtr); +} + +/************************************************************************ +Read X/Open XA Transaction Identification (XID) from undo log header */ +static +void +trx_undo_read_xid( +/*==============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid) /* out: X/Open XA Transaction Identification */ +{ + ulint i; + + xid->formatID = mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT); + + xid->gtrid_length = mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN); + + xid->bqual_length = mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN); + + for (i = 0; i < XIDDATASIZE; i++) { + xid->data[i] = (char)mach_read_from_1(log_hdr + + TRX_UNDO_XA_XID + i); + } +} + +/******************************************************************* +Adds space for the XA XID after an undo log old-style header. */ +static +void +trx_undo_header_add_space_for_xid( +/*==============================*/ + page_t* undo_page,/* in: undo log segment header page */ + trx_ulogf_t* log_hdr,/* in: undo log header */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + ulint free; + ulint new_free; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + /* free is now the end offset of the old style undo log header */ + + ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE); + + new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE + - TRX_UNDO_LOG_OLD_HDR_SIZE); + + /* Add space for a XID after the header, update the free offset + fields on the undo log page and in the undo log header */ + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free, + MLOG_2BYTES, mtr); +} + +/************************************************************************** +Writes the mtr log entry of an undo log header reuse. */ +UNIV_INLINE +void +trx_undo_insert_header_reuse_log( +/*=============================*/ + page_t* undo_page, /* in: undo log header page */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr); + + mlog_catenate_dulint_compressed(mtr, trx_id); +} + +/*************************************************************** +Parses the redo log entry of an undo log page header create or reuse. */ + +byte* +trx_undo_parse_page_header( +/*=======================*/ + /* out: end of log record or NULL */ + ulint type, /* in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + dulint trx_id; + + ptr = mach_dulint_parse_compressed(ptr, end_ptr, &trx_id); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + if (type == MLOG_UNDO_HDR_CREATE) { + trx_undo_header_create(page, trx_id, mtr); + } else { + ut_ad(type == MLOG_UNDO_HDR_REUSE); + trx_undo_insert_header_reuse(page, trx_id, mtr); + } + } + + return(ptr); +} + +/******************************************************************* +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! */ +static +ulint +trx_undo_insert_header_reuse( +/*=========================*/ + /* out: undo log header byte offset on page */ + page_t* undo_page, /* in: insert undo log segment header page, + x-latched */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + trx_ulogf_t* log_hdr; + ulint free; + ulint new_free; + + ut_ad(mtr && undo_page); + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE; + + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + + log_hdr = undo_page + free; + + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; + + /* Insert undo data is not needed after commit: we may free all + the space on the page */ + + ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_INSERT); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); + + log_hdr = undo_page + free; + + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + /* Write the log record MLOG_UNDO_HDR_REUSE */ + trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr); + + return(free); +} + +/************************************************************************** +Writes the redo log entry of an update undo log header discard. */ +UNIV_INLINE +void +trx_undo_discard_latest_log( +/*========================*/ + page_t* undo_page, /* in: undo log header page */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr); +} + +/*************************************************************** +Parses the redo log entry of an undo log page header discard. */ + +byte* +trx_undo_parse_discard_latest( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), /* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ut_ad(end_ptr); + + if (page) { + trx_undo_discard_latest_update_undo(page, mtr); + } + + return(ptr); +} + +/************************************************************************** +If an update undo log can be discarded immediately, this function frees the +space, resetting the page to the proper state for caching. */ +static +void +trx_undo_discard_latest_update_undo( +/*================================*/ + page_t* undo_page, /* in: header page of an undo log of size 1 */ + mtr_t* mtr) /* in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + trx_ulogf_t* log_hdr; + trx_ulogf_t* prev_log_hdr; + ulint free; + ulint prev_hdr_offset; + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + log_hdr = undo_page + free; + + prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG); + + if (prev_hdr_offset != 0) { + prev_log_hdr = undo_page + prev_hdr_offset; + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, + mach_read_from_2(prev_log_hdr + TRX_UNDO_LOG_START)); + mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0); + } + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED); + mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset); + + trx_undo_discard_latest_log(undo_page, mtr); +} + +/************************************************************************ +Tries to add a page to the undo log segment where the undo log is placed. */ + +ulint +trx_undo_add_page( +/*==============*/ + /* out: page number if success, else + FIL_NULL */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory object */ + mtr_t* mtr) /* in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + page_t* header_page; + page_t* new_page; + trx_rseg_t* rseg; + ulint page_no; + ulint n_reserved; + ibool success; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(!mutex_own(&kernel_mutex)); + ut_ad(mutex_own(&(trx->rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + rseg = trx->rseg; + + if (rseg->curr_size == rseg->max_size) { + + return(FIL_NULL); + } + + header_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); + + success = fsp_reserve_free_extents(&n_reserved, undo->space, 1, + FSP_UNDO, mtr); + if (!success) { + + return(FIL_NULL); + } + + page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR + + TRX_UNDO_FSEG_HEADER, + undo->top_page_no + 1, FSP_UP, + TRUE, mtr); + + fil_space_release_free_extents(undo->space, n_reserved); + + if (page_no == FIL_NULL) { + + /* No space left */ + + return(FIL_NULL); + } + + undo->last_page_no = page_no; + + new_page = trx_undo_page_get(undo->space, page_no, mtr); + + trx_undo_page_init(new_page, undo->type, mtr); + + flst_add_last(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + new_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + undo->size++; + rseg->curr_size++; + + return(page_no); +} + +/************************************************************************ +Frees an undo log page that is not the header page. */ +static +ulint +trx_undo_free_page( +/*===============*/ + /* out: last page number in remaining log */ + trx_rseg_t* rseg, /* in: rollback segment */ + ibool in_history, /* in: TRUE if the undo log is in the history + list */ + ulint space, /* in: space */ + ulint hdr_page_no, /* in: header page number */ + ulint page_no, /* in: page number to free: must not be the + header page */ + mtr_t* mtr) /* in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + page_t* header_page; + page_t* undo_page; + fil_addr_t last_addr; + trx_rsegf_t* rseg_header; + ulint hist_size; + + ut_a(hdr_page_no != page_no); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!mutex_own(&kernel_mutex)); + ut_ad(mutex_own(&(rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + undo_page = trx_undo_page_get(space, page_no, mtr); + + header_page = trx_undo_page_get(space, hdr_page_no, mtr); + + flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + + fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, + space, page_no, mtr); + + last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR + + TRX_UNDO_PAGE_LIST, mtr); + rseg->curr_size--; + + if (in_history) { + rseg_header = trx_rsegf_get(space, rseg->page_no, mtr); + + hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr); + ut_ad(hist_size > 0); + mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size - 1, MLOG_4BYTES, mtr); + } + + return(last_addr.page); +} + +/************************************************************************ +Frees an undo log page when there is also the memory object for the undo +log. */ +static +void +trx_undo_free_page_in_rollback( +/*===========================*/ + trx_t* trx __attribute__((unused)), /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + ulint page_no,/* in: page number to free: must not be the + header page */ + mtr_t* mtr) /* in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + ulint last_page_no; + + ut_ad(undo->hdr_page_no != page_no); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(trx->undo_mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space, + undo->hdr_page_no, page_no, mtr); + + undo->last_page_no = last_page_no; + undo->size--; +} + +/************************************************************************ +Empties an undo log header page of undo records for that undo log. Other +undo logs may still have records on that page, if it is an update undo log. */ +static +void +trx_undo_empty_header_page( +/*=======================*/ + ulint space, /* in: space */ + ulint hdr_page_no, /* in: header page number */ + ulint hdr_offset, /* in: header offset */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* header_page; + trx_ulogf_t* log_hdr; + ulint end; + + header_page = trx_undo_page_get(space, hdr_page_no, mtr); + + log_hdr = header_page + hdr_offset; + + end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr); +} + +/*************************************************************************** +Truncates an undo log from the end. This function is used during a rollback +to free space from an undo log. */ + +void +trx_undo_truncate_end( +/*==================*/ + trx_t* trx, /* in: transaction whose undo log it is */ + trx_undo_t* undo, /* in: undo log */ + dulint limit) /* in: all undo records with undo number + >= this value should be truncated */ +{ + page_t* undo_page; + ulint last_page_no; + trx_undo_rec_t* rec; + trx_undo_rec_t* trunc_here; + trx_rseg_t* rseg; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&(trx->rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + rseg = trx->rseg; + + for (;;) { + mtr_start(&mtr); + + trunc_here = NULL; + + last_page_no = undo->last_page_no; + + undo_page = trx_undo_page_get(undo->space, last_page_no, &mtr); + + rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no, + undo->hdr_offset); + for (;;) { + if (rec == NULL) { + if (last_page_no == undo->hdr_page_no) { + + goto function_exit; + } + + trx_undo_free_page_in_rollback(trx, undo, + last_page_no, &mtr); + break; + } + + if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit) + >= 0) { + /* Truncate at least this record off, maybe + more */ + trunc_here = rec; + } else { + goto function_exit; + } + + rec = trx_undo_page_get_prev_rec(rec, + undo->hdr_page_no, + undo->hdr_offset); + } + + mtr_commit(&mtr); + } + +function_exit: + if (trunc_here) { + mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE, + trunc_here - undo_page, MLOG_2BYTES, &mtr); + } + + mtr_commit(&mtr); +} + +/*************************************************************************** +Truncates an undo log from the start. This function is used during a purge +operation. */ + +void +trx_undo_truncate_start( +/*====================*/ + trx_rseg_t* rseg, /* in: rollback segment */ + ulint space, /* in: space id of the log */ + ulint hdr_page_no, /* in: header page number */ + ulint hdr_offset, /* in: header offset on the page */ + dulint limit) /* in: all undo pages with undo numbers < + this value should be truncated; NOTE that + the function only frees whole pages; the + header page is not freed, but emptied, if + all the records there are < limit */ +{ + page_t* undo_page; + trx_undo_rec_t* rec; + trx_undo_rec_t* last_rec; + ulint page_no; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + if (0 == ut_dulint_cmp(limit, ut_dulint_zero)) { + + return; + } +loop: + mtr_start(&mtr); + + rec = trx_undo_get_first_rec(space, hdr_page_no, hdr_offset, + RW_X_LATCH, &mtr); + if (rec == NULL) { + /* Already empty */ + + mtr_commit(&mtr); + + return; + } + + undo_page = buf_frame_align(rec); + + last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no, + hdr_offset); + if (ut_dulint_cmp(trx_undo_rec_get_undo_no(last_rec), limit) >= 0) { + + mtr_commit(&mtr); + + return; + } + + page_no = buf_frame_get_page_no(undo_page); + + if (page_no == hdr_page_no) { + trx_undo_empty_header_page(space, hdr_page_no, hdr_offset, + &mtr); + } else { + trx_undo_free_page(rseg, TRUE, space, hdr_page_no, + page_no, &mtr); + } + + mtr_commit(&mtr); + + goto loop; +} + +/************************************************************************** +Frees an undo log segment which is not in the history list. */ +static +void +trx_undo_seg_free( +/*==============*/ + trx_undo_t* undo) /* in: undo log */ +{ + trx_rseg_t* rseg; + fseg_header_t* file_seg; + trx_rsegf_t* rseg_header; + trx_usegf_t* seg_header; + ibool finished; + mtr_t mtr; + + finished = FALSE; + rseg = undo->rseg; + + while (!finished) { + + mtr_start(&mtr); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + mutex_enter(&(rseg->mutex)); + + seg_header = trx_undo_page_get(undo->space, undo->hdr_page_no, + &mtr) + + TRX_UNDO_SEG_HDR; + + file_seg = seg_header + TRX_UNDO_FSEG_HEADER; + + finished = fseg_free_step(file_seg, &mtr); + + if (finished) { + /* Update the rseg header */ + rseg_header = trx_rsegf_get(rseg->space, rseg->page_no, + &mtr); + trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, + &mtr); + } + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + } +} + +/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/ + +/************************************************************************ +Creates and initializes an undo log memory object according to the values +in the header in file, when the database is started. The memory object is +inserted in the appropriate list of rseg. */ +static +trx_undo_t* +trx_undo_mem_create_at_db_start( +/*============================*/ + /* out, own: the undo log memory object */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + ulint id, /* in: slot index within rseg */ + ulint page_no,/* in: undo log segment page number */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* undo_page; + trx_upagef_t* page_header; + trx_usegf_t* seg_header; + trx_ulogf_t* undo_header; + trx_undo_t* undo; + ulint type; + ulint state; + dulint trx_id; + ulint offset; + fil_addr_t last_addr; + page_t* last_page; + trx_undo_rec_t* rec; + XID xid; + ibool xid_exists = FALSE; + + if (id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) id); + ut_error; + } + + undo_page = trx_undo_page_get(rseg->space, page_no, mtr); + + page_header = undo_page + TRX_UNDO_PAGE_HDR; + + type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES, + mtr); + seg_header = undo_page + TRX_UNDO_SEG_HDR; + + state = mach_read_from_2(seg_header + TRX_UNDO_STATE); + + offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG); + + undo_header = undo_page + offset; + + trx_id = mtr_read_dulint(undo_header + TRX_UNDO_TRX_ID, mtr); + + xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS, + MLOG_1BYTE, mtr); + + /* Read X/Open XA transaction identification if it exists, or + set it to NULL. */ + + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; + + if (xid_exists == TRUE) { + trx_undo_read_xid(undo_header, &xid); + } + + mutex_enter(&(rseg->mutex)); + + undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid, + page_no, offset); + mutex_exit(&(rseg->mutex)); + + undo->dict_operation = mtr_read_ulint( + undo_header + TRX_UNDO_DICT_TRANS, + MLOG_1BYTE, mtr); + + undo->table_id = mtr_read_dulint(undo_header + TRX_UNDO_TABLE_ID, mtr); + undo->state = state; + undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr); + + /* If the log segment is being freed, the page list is inconsistent! */ + if (state == TRX_UNDO_TO_FREE) { + + goto add_to_list; + } + + last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr); + + undo->last_page_no = last_addr.page; + undo->top_page_no = last_addr.page; + + last_page = trx_undo_page_get(rseg->space, undo->last_page_no, mtr); + + rec = trx_undo_page_get_last_rec(last_page, page_no, offset); + + if (rec == NULL) { + undo->empty = TRUE; + } else { + undo->empty = FALSE; + undo->top_offset = rec - last_page; + undo->top_undo_no = trx_undo_rec_get_undo_no(rec); + } +add_to_list: + if (type == TRX_UNDO_INSERT) { + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list, + undo); + } else { + UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached, + undo); + } + } else { + ut_ad(type == TRX_UNDO_UPDATE); + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list, + undo); + } else { + UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached, + undo); + } + } + + return(undo); +} + +/************************************************************************ +Initializes the undo log lists for a rollback segment memory copy. This +function is only called when the database is started or a new rollback +segment is created. */ + +ulint +trx_undo_lists_init( +/*================*/ + /* out: the combined size of undo log segments + in pages */ + trx_rseg_t* rseg) /* in: rollback segment memory object */ +{ + ulint page_no; + trx_undo_t* undo; + ulint size = 0; + trx_rsegf_t* rseg_header; + ulint i; + mtr_t mtr; + + UT_LIST_INIT(rseg->update_undo_list); + UT_LIST_INIT(rseg->update_undo_cached); + UT_LIST_INIT(rseg->insert_undo_list); + UT_LIST_INIT(rseg->insert_undo_cached); + + mtr_start(&mtr); + + rseg_header = trx_rsegf_get_new(rseg->space, rseg->page_no, &mtr); + + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr); + + /* In forced recovery: try to avoid operations which look + at database pages; undo logs are rapidly changing data, and + the probability that they are in an inconsistent state is + high */ + + if (page_no != FIL_NULL + && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + + undo = trx_undo_mem_create_at_db_start(rseg, i, + page_no, &mtr); + size += undo->size; + + mtr_commit(&mtr); + + mtr_start(&mtr); + + rseg_header = trx_rsegf_get(rseg->space, + rseg->page_no, &mtr); + } + } + + mtr_commit(&mtr); + + return(size); +} + +/************************************************************************ +Creates and initializes an undo log memory object. */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + /* out, own: the undo log memory object */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + ulint id, /* in: slot index within rseg */ + ulint type, /* in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + dulint trx_id, /* in: id of the trx for which the undo log + is created */ + XID* xid, /* in: X/Open transaction identification */ + ulint page_no,/* in: undo log header page number */ + ulint offset) /* in: undo log header byte offset on page */ +{ + trx_undo_t* undo; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + if (id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) id); + ut_error; + } + + undo = mem_alloc(sizeof(trx_undo_t)); + + undo->id = id; + undo->type = type; + undo->state = TRX_UNDO_ACTIVE; + undo->del_marks = FALSE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->rseg = rseg; + + undo->space = rseg->space; + undo->hdr_page_no = page_no; + undo->hdr_offset = offset; + undo->last_page_no = page_no; + undo->size = 1; + + undo->empty = TRUE; + undo->top_page_no = page_no; + undo->guess_page = NULL; + + return(undo); +} + +/************************************************************************ +Initializes a cached undo log object for new use. */ +static +void +trx_undo_mem_init_for_reuse( +/*========================*/ + trx_undo_t* undo, /* in: undo log to init */ + dulint trx_id, /* in: id of the trx for which the undo log + is created */ + XID* xid, /* in: X/Open XA transaction identification*/ + ulint offset) /* in: undo log header byte offset on page */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&((undo->rseg)->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + + mem_analyze_corruption((byte*)undo); + ut_error; + } + + undo->state = TRX_UNDO_ACTIVE; + undo->del_marks = FALSE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->hdr_offset = offset; + undo->empty = TRUE; +} + +/************************************************************************ +Frees an undo log memory copy. */ +static +void +trx_undo_mem_free( +/*==============*/ + trx_undo_t* undo) /* in: the undo object to be freed */ +{ + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); + ut_error; + } + + mem_free(undo); +} + +/************************************************************************** +Creates a new undo log. */ +static +trx_undo_t* +trx_undo_create( +/*============*/ + /* out: undo log object, NULL if did not + succeed: out of space */ + trx_t* trx, /* in: transaction */ + trx_rseg_t* rseg, /* in: rollback segment memory copy */ + ulint type, /* in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + dulint trx_id, /* in: id of the trx for which the undo log + is created */ + XID* xid, /* in: X/Open transaction identification*/ + mtr_t* mtr) /* in: mtr */ +{ + trx_rsegf_t* rseg_header; + ulint page_no; + ulint offset; + ulint id; + trx_undo_t* undo; + page_t* undo_page; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + if (rseg->curr_size == rseg->max_size) { + + return(NULL); + } + + rseg->curr_size++; + + rseg_header = trx_rsegf_get(rseg->space, rseg->page_no, mtr); + + undo_page = trx_undo_seg_create(rseg, rseg_header, type, &id, mtr); + + if (undo_page == NULL) { + /* Did not succeed */ + + rseg->curr_size--; + + return(NULL); + } + + page_no = buf_frame_get_page_no(undo_page); + + offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } + + undo = trx_undo_mem_create(rseg, id, type, trx_id, xid, + page_no, offset); + return(undo); +} + +/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/ + +/************************************************************************ +Reuses a cached undo log. */ +static +trx_undo_t* +trx_undo_reuse_cached( +/*==================*/ + /* out: the undo log memory object, NULL if + none cached */ + trx_t* trx, /* in: transaction */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + ulint type, /* in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + dulint trx_id, /* in: id of the trx for which the undo log + is used */ + XID* xid, /* in: X/Open XA transaction identification */ + mtr_t* mtr) /* in: mtr */ +{ + trx_undo_t* undo; + page_t* undo_page; + ulint offset; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + if (type == TRX_UNDO_INSERT) { + + undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); + if (undo == NULL) { + + return(NULL); + } + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo); + } else { + ut_ad(type == TRX_UNDO_UPDATE); + + undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + if (undo == NULL) { + + return(NULL); + } + + UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo); + } + + ut_ad(undo->size == 1); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption((byte*)undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); + + if (type == TRX_UNDO_INSERT) { + offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } + } else { + ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_UPDATE); + + offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } + } + + trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset); + + return(undo); +} + +/************************************************************************** +Marks an undo log header as a header of a data dictionary operation +transaction. */ +static +void +trx_undo_mark_as_dict_operation( +/*============================*/ + trx_t* trx, /* in: dict op transaction */ + trx_undo_t* undo, /* in: assigned undo log */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* hdr_page; + + ut_a(trx->dict_operation); + + hdr_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); + + mlog_write_ulint(hdr_page + undo->hdr_offset + + TRX_UNDO_DICT_TRANS, + trx->dict_operation, MLOG_1BYTE, mtr); + + mlog_write_dulint(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID, + trx->table_id, mtr); + + undo->dict_operation = trx->dict_operation; + undo->table_id = trx->table_id; +} + +/************************************************************************** +Assigns an undo log for a transaction. A new undo log is created or a cached +undo log reused. */ + +trx_undo_t* +trx_undo_assign_undo( +/*=================*/ + /* out: the undo log, NULL if did not succeed: out of + space */ + trx_t* trx, /* in: transaction */ + ulint type) /* in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + mtr_t mtr; + + ut_ad(trx); + ut_ad(trx->rseg); + + rseg = trx->rseg; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(trx->undo_mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + mtr_start(&mtr); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + mutex_enter(&(rseg->mutex)); + + undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid, + &mtr); + if (undo == NULL) { + undo = trx_undo_create(trx, rseg, type, trx->id, &trx->xid, + &mtr); + if (undo == NULL) { + /* Did not succeed */ + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return(NULL); + } + } + + if (type == TRX_UNDO_INSERT) { + UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list, undo); + ut_ad(trx->insert_undo == NULL); + trx->insert_undo = undo; + } else { + UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list, undo); + ut_ad(trx->update_undo == NULL); + trx->update_undo = undo; + } + + if (trx->dict_operation) { + trx_undo_mark_as_dict_operation(trx, undo, &mtr); + } + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return(undo); +} + +/********************************************************************** +Sets the state of the undo log segment at a transaction finish. */ + +page_t* +trx_undo_set_state_at_finish( +/*=========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx __attribute__((unused)), /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr) /* in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + page_t* undo_page; + ulint state; + + ut_ad(trx && undo && mtr); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption((byte*)undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + if (undo->size == 1 && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE) + < TRX_UNDO_PAGE_REUSE_LIMIT) { + state = TRX_UNDO_CACHED; + + } else if (undo->type == TRX_UNDO_INSERT) { + + state = TRX_UNDO_TO_FREE; + } else { + state = TRX_UNDO_TO_PURGE; + } + + undo->state = state; + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr); + + return(undo_page); +} + +/********************************************************************** +Sets the state of the undo log segment at a transaction prepare. */ + +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr) /* in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + trx_ulogf_t* undo_header; + page_t* undo_page; + ulint offset; + + ut_ad(trx && undo && mtr); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption((byte*)undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + /*------------------------------*/ + undo->state = TRX_UNDO_PREPARED; + undo->xid = trx->xid; + /*------------------------------*/ + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state, + MLOG_2BYTES, mtr); + + offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + undo_header = undo_page + offset; + + mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS, + TRUE, MLOG_1BYTE, mtr); + + trx_undo_write_xid(undo_header, &undo->xid, mtr); + + return(undo_page); +} + +/************************************************************************** +Adds the update undo log header as the first in the history list, and +frees the memory object, or puts it to the list of cached update undo log +segments. */ + +void +trx_undo_update_cleanup( +/*====================*/ + trx_t* trx, /* in: trx owning the update undo log */ + page_t* undo_page, /* in: update undo log header page, + x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + + undo = trx->update_undo; + rseg = trx->rseg; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(rseg->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + trx_purge_add_update_undo_to_history(trx, undo_page, mtr); + + UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo); + + trx->update_undo = NULL; + + if (undo->state == TRX_UNDO_CACHED) { + + UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo); + } else { + ut_ad(undo->state == TRX_UNDO_TO_PURGE); + + trx_undo_mem_free(undo); + } +} + +/********************************************************************** +Frees or caches an insert undo log after a transaction commit or rollback. +Knowledge of inserts is not needed after a commit or rollback, therefore +the data can be discarded. */ + +void +trx_undo_insert_cleanup( +/*====================*/ + trx_t* trx) /* in: transaction handle */ +{ + trx_undo_t* undo; + trx_rseg_t* rseg; + + undo = trx->insert_undo; + ut_ad(undo); + + rseg = trx->rseg; + + mutex_enter(&(rseg->mutex)); + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, undo); + trx->insert_undo = NULL; + + if (undo->state == TRX_UNDO_CACHED) { + + UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo); + } else { + ut_ad(undo->state == TRX_UNDO_TO_FREE); + + /* Delete first the undo log segment in the file */ + + mutex_exit(&(rseg->mutex)); + + trx_undo_seg_free(undo); + + mutex_enter(&(rseg->mutex)); + + ut_ad(rseg->curr_size > undo->size); + + rseg->curr_size -= undo->size; + + trx_undo_mem_free(undo); + } + + mutex_exit(&(rseg->mutex)); +} |