summaryrefslogtreecommitdiff
path: root/storage/innobase/trx
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/trx')
-rw-r--r--storage/innobase/trx/Makefile.am25
-rw-r--r--storage/innobase/trx/makefilewin26
-rw-r--r--storage/innobase/trx/trx0purge.c1164
-rw-r--r--storage/innobase/trx/trx0rec.c1417
-rw-r--r--storage/innobase/trx/trx0roll.c1344
-rw-r--r--storage/innobase/trx/trx0rseg.c261
-rw-r--r--storage/innobase/trx/trx0sys.c964
-rw-r--r--storage/innobase/trx/trx0trx.c2025
-rw-r--r--storage/innobase/trx/trx0undo.c1906
9 files changed, 9132 insertions, 0 deletions
diff --git a/storage/innobase/trx/Makefile.am b/storage/innobase/trx/Makefile.am
new file mode 100644
index 00000000000..9e2b3c398e3
--- /dev/null
+++ b/storage/innobase/trx/Makefile.am
@@ -0,0 +1,25 @@
+# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+# & Innobase Oy
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+# Pull in ../include/Makefile.i (presumably the settings shared by the
+# InnoDB subdirectories -- confirm against that file).
+include ../include/Makefile.i
+
+# libtrx.a is built but not installed (noinst_).
+noinst_LIBRARIES = libtrx.a
+
+# The seven transaction-system translation units that make up libtrx.a.
+libtrx_a_SOURCES = trx0purge.c trx0rec.c trx0roll.c trx0rseg.c\
+ trx0sys.c trx0trx.c trx0undo.c
+
+# No extra programs are built from this directory.
+EXTRA_PROGRAMS =
diff --git a/storage/innobase/trx/makefilewin b/storage/innobase/trx/makefilewin
new file mode 100644
index 00000000000..35588779d66
--- /dev/null
+++ b/storage/innobase/trx/makefilewin
@@ -0,0 +1,26 @@
+# Windows makefile for the trx module. $(CCOM) and $(CFL) are not defined
+# here; presumably they come from ..\include\makefile.i -- confirm.
+include ..\include\makefile.i
+
+# Archive the seven object files into ..\libs\trx.lib.
+trx.lib: trx0sys.obj trx0trx.obj trx0rseg.obj trx0undo.obj trx0rec.obj trx0roll.obj trx0purge.obj
+ lib -out:..\libs\trx.lib trx0sys.obj trx0trx.obj trx0rseg.obj trx0undo.obj trx0rec.obj trx0roll.obj trx0purge.obj
+
+# One rule per object: compile the matching .c file with -I.. on the
+# include path. Only the .c file is listed as a prerequisite.
+trx0trx.obj: trx0trx.c
+ $(CCOM) $(CFL) -c -I.. trx0trx.c
+
+trx0sys.obj: trx0sys.c
+ $(CCOM) $(CFL) -c -I.. trx0sys.c
+
+trx0rseg.obj: trx0rseg.c
+ $(CCOM) $(CFL) -c -I.. trx0rseg.c
+
+trx0undo.obj: trx0undo.c
+ $(CCOM) $(CFL) -c -I.. trx0undo.c
+
+trx0rec.obj: trx0rec.c
+ $(CCOM) $(CFL) -c -I.. trx0rec.c
+
+trx0roll.obj: trx0roll.c
+ $(CCOM) $(CFL) -c -I.. trx0roll.c
+
+trx0purge.obj: trx0purge.c
+ $(CCOM) $(CFL) -c -I.. trx0purge.c
+
diff --git a/storage/innobase/trx/trx0purge.c b/storage/innobase/trx/trx0purge.c
new file mode 100644
index 00000000000..3df34111281
--- /dev/null
+++ b/storage/innobase/trx/trx0purge.c
@@ -0,0 +1,1164 @@
+/******************************************************
+Purge old versions
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+
+#ifdef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "read0read.h"
+#include "fut0fut.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "trx0rec.h"
+#include "srv0que.h"
+#include "os0thread.h"
+
+/* The global data structure coordinating a purge */
+trx_purge_t* purge_sys = NULL;
+
+/* A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+trx_undo_rec_t trx_purge_dummy_rec;
+
+/*********************************************************************
+Tells whether the update undo log of a transaction is certain to still
+exist, by testing whether the current purge view is unable to see trx_id. */
+
+ibool
+trx_purge_update_undo_must_exist(
+/*=============================*/
+				/* out: TRUE if the purge view does not see
+				trx_id, in which case the undo log is known
+				to be preserved; FALSE gives no guarantee
+				either way: the undo log may still exist */
+	dulint	trx_id)		/* in: transaction id */
+{
+	ibool	seen_by_purge_view;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	seen_by_purge_view = read_view_sees_trx_id(purge_sys->view, trx_id);
+
+	return(seen_by_purge_view ? FALSE : TRUE);
+}
+
+/*=================== PURGE RECORD ARRAY =============================*/
+
+/***********************************************************************
+Reserves the first free cell of the purge array and records in it the
+(trx number, undo number) pair of an undo log record being purged. The
+search has no upper bound of its own: it keeps advancing until it finds
+a cell that is not in use. */
+static
+trx_undo_inf_t*
+trx_purge_arr_store_info(
+/*=====================*/
+				/* out: pointer to the storage cell */
+	dulint	trx_no,		/* in: transaction number */
+	dulint	undo_no)	/* in: undo number */
+{
+	trx_undo_arr_t*	purge_arr;
+	trx_undo_inf_t*	slot;
+	ulint		pos;
+
+	purge_arr = purge_sys->arr;
+
+	/* Skip over the occupied cells */
+	pos = 0;
+	slot = trx_undo_arr_get_nth_info(purge_arr, pos);
+
+	while (slot->in_use) {
+		pos++;
+		slot = trx_undo_arr_get_nth_info(purge_arr, pos);
+	}
+
+	/* Free cell found: claim it */
+	slot->undo_no = undo_no;
+	slot->trx_no = trx_no;
+	slot->in_use = TRUE;
+
+	purge_arr->n_used++;
+
+	return(slot);
+}
+
+/***********************************************************************
+Releases a purge array cell: marks it as not in use and decrements the
+count of used cells in the purge array. */
+UNIV_INLINE
+void
+trx_purge_arr_remove_info(
+/*======================*/
+	trx_undo_inf_t*	cell)	/* in: pointer to the storage cell */
+{
+	trx_undo_arr_t*	purge_arr = purge_sys->arr;
+
+	cell->in_use = FALSE;
+
+	/* The counter must not underflow */
+	ut_ad(purge_arr->n_used > 0);
+
+	purge_arr->n_used -= 1;
+}
+
+/***********************************************************************
+Finds the largest (trx number, undo number) pair stored in a purge array.
+Pairs are ordered by transaction number first and undo number second. The
+scan stops as soon as arr->n_used cells in use have been visited. */
+static
+void
+trx_purge_arr_get_biggest(
+/*======================*/
+	trx_undo_arr_t*	arr,	/* in: purge array */
+	dulint*		trx_no,	/* out: transaction number: ut_dulint_zero
+				if array is empty */
+	dulint*		undo_no)/* out: undo number */
+{
+	trx_undo_inf_t*	slot;
+	dulint		best_trx_no;
+	dulint		best_undo_no;
+	ulint		n_in_use;
+	ulint		n_seen;
+	ulint		pos;
+	int		cmp;
+
+	best_trx_no = ut_dulint_zero;
+	best_undo_no = ut_dulint_zero;
+	n_in_use = arr->n_used;
+	n_seen = 0;
+	pos = 0;
+
+	do {
+		slot = trx_undo_arr_get_nth_info(arr, pos);
+		pos++;
+
+		if (!slot->in_use) {
+
+			continue;
+		}
+
+		n_seen++;
+
+		cmp = ut_dulint_cmp(slot->trx_no, best_trx_no);
+
+		if (cmp < 0) {
+			/* Smaller transaction number: not a candidate */
+
+			continue;
+		}
+
+		if (cmp == 0
+		    && ut_dulint_cmp(slot->undo_no, best_undo_no) < 0) {
+			/* Same transaction, smaller undo number */
+
+			continue;
+		}
+
+		best_trx_no = slot->trx_no;
+		best_undo_no = slot->undo_no;
+
+	} while (n_seen != n_in_use);
+
+	*trx_no = best_trx_no;
+	*undo_no = best_undo_no;
+}
+
+/********************************************************************
+Builds the purge 'query' graph: a QUE_FORK_PURGE fork owned by the purge
+transaction, with one query thread whose child is a row purge node. Purge
+is carried out by executing this graph. */
+static
+que_t*
+trx_purge_graph_build(void)
+/*=======================*/
+				/* out, own: the query graph */
+{
+	mem_heap_t*	graph_heap;
+	que_fork_t*	purge_fork;
+	que_thr_t*	purge_thr;
+
+	/* Every node of the graph is allocated from this heap */
+	graph_heap = mem_heap_create(512);
+
+	purge_fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, graph_heap);
+	purge_fork->trx = purge_sys->trx;
+
+	purge_thr = que_thr_create(purge_fork, graph_heap);
+	purge_thr->child = row_purge_node_create(purge_thr, graph_heap);
+
+	return(purge_fork);
+}
+
+/************************************************************************
+Creates the global purge system control structure purge_sys and initializes
+its members: the state and counters, the latch and the mutex, the memory
+heap, the purge array, the purge session and transaction, the purge query
+graph and the initial purge read view. In a UNIV_SYNC_DEBUG build the caller
+is asserted to own the kernel mutex. */
+
+void
+trx_purge_sys_create(void)
+/*======================*/
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ purge_sys = mem_alloc(sizeof(trx_purge_t));
+
+ /* Purge starts out stopped; trx_purge() sets the state to
+ TRX_PURGE_ON when it starts a batch */
+ purge_sys->state = TRX_STOP_PURGE;
+
+ purge_sys->n_pages_handled = 0;
+
+ /* No purge position is known yet */
+ purge_sys->purge_trx_no = ut_dulint_zero;
+ purge_sys->purge_undo_no = ut_dulint_zero;
+ purge_sys->next_stored = FALSE;
+
+ rw_lock_create(&(purge_sys->latch));
+ rw_lock_set_level(&(purge_sys->latch), SYNC_PURGE_LATCH);
+
+ mutex_create(&(purge_sys->mutex));
+ mutex_set_level(&(purge_sys->mutex), SYNC_PURGE_SYS);
+
+ /* The purge read view is allocated from this heap below */
+ purge_sys->heap = mem_heap_create(256);
+
+ purge_sys->arr = trx_undo_arr_create();
+
+ /* Purge runs in the transaction of a session of its own */
+ purge_sys->sess = sess_open();
+
+ purge_sys->trx = purge_sys->sess->trx;
+
+ purge_sys->trx->type = TRX_PURGE;
+
+ ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED));
+
+ /* The graph build reads purge_sys->trx, so it must come after the
+ transaction has been set above */
+ purge_sys->query = trx_purge_graph_build();
+
+ purge_sys->view = read_view_oldest_copy_or_open_new(NULL,
+ purge_sys->heap);
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/************************************************************************
+Adds the update undo log of a transaction as the first log in the history
+list of its rollback segment. If the undo log segment is not in state
+TRX_UNDO_CACHED, the segment is also removed from its rseg slot and its size
+is added to the history size stored in the rollback segment header. */
+
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+	trx_t*	trx,		/* in: transaction */
+	page_t*	undo_page,	/* in: update undo log header page,
+				x-latched */
+	mtr_t*	mtr)		/* in: mtr */
+{
+	trx_undo_t*	undo;
+	trx_rseg_t*	rseg;
+	trx_rsegf_t*	rseg_header;
+	trx_usegf_t*	seg_header;
+	trx_ulogf_t*	undo_header;
+	ulint		hist_size;
+
+	undo = trx->update_undo;
+
+	ut_ad(undo);
+
+	rseg = undo->rseg;
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rseg_header = trx_rsegf_get(rseg->space, rseg->page_no, mtr);
+
+	undo_header = undo_page + undo->hdr_offset;
+	seg_header = undo_page + TRX_UNDO_SEG_HDR;
+
+	if (undo->state != TRX_UNDO_CACHED) {
+		/* The undo log segment will not be reused */
+
+		if (undo->id >= TRX_RSEG_N_SLOTS) {
+			fprintf(stderr,
+				"InnoDB: Error: undo->id is %lu\n",
+				(ulong) undo->id);
+			ut_error;
+		}
+
+		/* Free the slot of the segment in the rseg header */
+		trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
+
+		hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+					   MLOG_4BYTES, mtr);
+		ut_ad(undo->size ==
+		      flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr));
+
+		/* The pages of the segment now count as history */
+		mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+				 hist_size + undo->size, MLOG_4BYTES, mtr);
+	}
+
+	/* Add the log as the first in the history list */
+	flst_add_first(rseg_header + TRX_RSEG_HISTORY,
+		       undo_header + TRX_UNDO_HISTORY_NODE, mtr);
+
+	/* The in-memory history length counter is changed under the
+	kernel mutex */
+	mutex_enter(&kernel_mutex);
+	trx_sys->rseg_history_len++;
+	mutex_exit(&kernel_mutex);
+
+	/* Write the trx number to the undo log header */
+	mlog_write_dulint(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
+
+	/* Write information about delete markings to the undo log header:
+	the field is only overwritten when the log has no delete marks */
+
+	if (!undo->del_marks) {
+		mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE,
+				 MLOG_2BYTES, mtr);
+	}
+
+	if (rseg->last_page_no == FIL_NULL) {
+		/* The rseg had no 'last' history log recorded: this log
+		becomes it */
+
+		rseg->last_page_no = undo->hdr_page_no;
+		rseg->last_offset = undo->hdr_offset;
+		rseg->last_trx_no = trx->no;
+		rseg->last_del_marks = undo->del_marks;
+	}
+}
+
+/**************************************************************************
+Frees an undo log segment which is in the history list. Cuts the end of the
+history list at the youngest undo log in this segment. The non-header pages
+are freed one step per mini-transaction; the header page is freed in the
+same mini-transaction in which the log is cut from the history list. */
+static
+void
+trx_purge_free_segment(
+/*===================*/
+ trx_rseg_t* rseg, /* in: rollback segment */
+ fil_addr_t hdr_addr, /* in: the file address of log_hdr */
+ ulint n_removed_logs) /* in: count of how many undo logs we
+ will cut off from the end of the
+ history list */
+{
+ page_t* undo_page;
+ trx_rsegf_t* rseg_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ ibool freed;
+ ulint seg_size;
+ ulint hist_size;
+ ibool marked = FALSE;
+ mtr_t mtr;
+
+/* fputs("Freeing an update undo log segment\n", stderr); */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+loop:
+ /* Each pass through the loop uses a fresh mtr and re-fetches the
+ rseg header and the undo log header page */
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+
+ undo_page = trx_undo_page_get(rseg->space, hdr_addr.page, &mtr);
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ log_hdr = undo_page + hdr_addr.boffset;
+
+ /* Mark the last undo log totally purged, so that if the system
+ crashes, the tail of the undo log will not get accessed again. The
+ list of pages in the undo log tail gets inconsistent during the
+ freeing of the segment, and therefore purge should not try to access
+ them again. */
+
+ /* 'marked' makes sure the write is done on the first pass only */
+ if (!marked) {
+ mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
+ MLOG_2BYTES, &mtr);
+ marked = TRUE;
+ }
+
+ freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER,
+ &mtr);
+ if (!freed) {
+ /* More non-header pages remain: commit this step and start
+ over with a new mtr */
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ goto loop;
+ }
+
+ /* The page list may now be inconsistent, but the length field
+ stored in the list base node tells us how big it was before we
+ started the freeing. */
+
+ seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr);
+
+ /* We may free the undo log segment header page; it must be freed
+ within the same mtr as the undo log header is removed from the
+ history list: otherwise, in case of a database crash, the segment
+ could become inaccessible garbage in the file space. */
+
+ flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY,
+ log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr);
+
+ /* Keep the in-memory history length in step with the list; the
+ counter is changed under the kernel mutex */
+ mutex_enter(&kernel_mutex);
+ ut_ad(trx_sys->rseg_history_len >= n_removed_logs);
+ trx_sys->rseg_history_len -= n_removed_logs;
+ mutex_exit(&kernel_mutex);
+
+ freed = FALSE;
+
+ while (!freed) {
+ /* Here we assume that a file segment with just the header
+ page can be freed in a few steps, so that the buffer pool
+ is not flooded with bufferfixed pages: see the note in
+ fsp0fsp.c. */
+
+ freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER,
+ &mtr);
+ }
+
+ /* Subtract the size of the freed segment from the history size in
+ the rseg header and from the in-memory size of the rseg */
+ hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, &mtr);
+ ut_ad(hist_size >= seg_size);
+
+ mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+ hist_size - seg_size, MLOG_4BYTES, &mtr);
+
+ ut_ad(rseg->curr_size >= seg_size);
+
+ rseg->curr_size -= seg_size;
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+}
+
+/************************************************************************
+Removes unnecessary history data from a rollback segment. The history list
+is walked from its last node towards the first; logs whose trx number is
+below limit_trx_no are counted for removal, and a whole segment is freed
+when its state is TRX_UNDO_TO_PURGE and the log has no next log in it. */
+static
+void
+trx_purge_truncate_rseg_history(
+/*============================*/
+ trx_rseg_t* rseg, /* in: rollback segment */
+ dulint limit_trx_no, /* in: remove update undo logs whose
+ trx number is < limit_trx_no */
+ dulint limit_undo_no) /* in: if transaction number is equal
+ to limit_trx_no, truncate undo records
+ with undo number < limit_undo_no */
+{
+ fil_addr_t hdr_addr;
+ fil_addr_t prev_hdr_addr;
+ trx_rsegf_t* rseg_hdr;
+ page_t* undo_page;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ int cmp;
+ ulint n_removed_logs = 0;
+ mtr_t mtr;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+
+ /* Start from the last node of the history list */
+ hdr_addr = trx_purge_get_log_from_hist(
+ flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr));
+loop:
+ /* At this point an mtr is active and rseg->mutex is held */
+ if (hdr_addr.page == FIL_NULL) {
+ /* No log at this address: nothing (more) to examine */
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ undo_page = trx_undo_page_get(rseg->space, hdr_addr.page, &mtr);
+
+ log_hdr = undo_page + hdr_addr.boffset;
+
+ cmp = ut_dulint_cmp(mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO),
+ limit_trx_no);
+ if (cmp == 0) {
+ /* The log of the limit transaction itself: truncate its
+ start with limit_undo_no as the limit */
+ trx_undo_truncate_start(rseg, rseg->space, hdr_addr.page,
+ hdr_addr.boffset, limit_undo_no);
+ }
+
+ if (cmp >= 0) {
+ /* This log must stay in the list: cut off the logs counted
+ so far from the end of the list and stop */
+ mutex_enter(&kernel_mutex);
+ ut_a(trx_sys->rseg_history_len >= n_removed_logs);
+ trx_sys->rseg_history_len -= n_removed_logs;
+ mutex_exit(&kernel_mutex);
+
+ flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
+ log_hdr + TRX_UNDO_HISTORY_NODE,
+ n_removed_logs, &mtr);
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ /* Read the address of the previous log before the mtr is committed
+ below */
+ prev_hdr_addr = trx_purge_get_log_from_hist(
+ flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE,
+ &mtr));
+ n_removed_logs++;
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE)
+ && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) {
+
+ /* We can free the whole log segment */
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ trx_purge_free_segment(rseg, hdr_addr, n_removed_logs);
+
+ /* The call above was passed the counted logs to cut from the
+ list, so the count starts over */
+ n_removed_logs = 0;
+ } else {
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+ }
+
+ /* Continue with the previous log in a new mtr */
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+
+ hdr_addr = prev_hdr_addr;
+
+ goto loop;
+}
+
+/************************************************************************
+Works out the truncate limit and removes unnecessary history data from
+every rollback segment in trx_sys->rseg_list. NOTE that when this function
+is called, the caller must not have any latches on undo log pages! */
+static
+void
+trx_purge_truncate_history(void)
+/*============================*/
+{
+	trx_rseg_t*	rseg;
+	dulint		trx_limit;
+	dulint		undo_limit;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	trx_purge_arr_get_biggest(purge_sys->arr, &trx_limit, &undo_limit);
+
+	if (0 == ut_dulint_cmp(trx_limit, ut_dulint_zero)) {
+		/* The purge array reported trx number zero: fall back to
+		the current purge position */
+
+		trx_limit = purge_sys->purge_trx_no;
+		undo_limit = purge_sys->purge_undo_no;
+	}
+
+	/* We play safe and set the truncate limit at most to the purge view
+	low_limit number, though this is not necessary */
+
+	if (ut_dulint_cmp(trx_limit, purge_sys->view->low_limit_no) >= 0) {
+		trx_limit = purge_sys->view->low_limit_no;
+		undo_limit = ut_dulint_zero;
+	}
+
+	ut_ad((ut_dulint_cmp(trx_limit,
+			     purge_sys->view->low_limit_no) <= 0));
+
+	for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+	     rseg;
+	     rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) {
+
+		trx_purge_truncate_rseg_history(rseg, trx_limit, undo_limit);
+	}
+}
+
+/************************************************************************
+Truncates the history if no cell of the purge array is in use. NOTE that
+when this function is called, the caller must not have any latches on undo
+log pages! */
+UNIV_INLINE
+ibool
+trx_purge_truncate_if_arr_empty(void)
+/*=================================*/
+				/* out: TRUE if array empty */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (purge_sys->arr->n_used != 0) {
+		/* Some records are still reserved: do nothing */
+
+		return(FALSE);
+	}
+
+	trx_purge_truncate_history();
+
+	return(TRUE);
+}
+
+/***************************************************************************
+Updates the last not yet purged history log info in rseg when we have purged
+a whole undo log. Advances also purge_sys->purge_trx_no past the purged log,
+resets purge_sys->purge_undo_no and clears purge_sys->next_stored. If the
+purged log has no previous log in the history list, rseg->last_page_no is
+set to FIL_NULL. */
+static
+void
+trx_purge_rseg_get_next_history_log(
+/*================================*/
+	trx_rseg_t*	rseg)	/* in: rollback segment */
+{
+	page_t*		undo_page;
+	trx_ulogf_t*	log_hdr;
+	fil_addr_t	prev_log_addr;
+	dulint		trx_no;
+	ibool		del_marks;
+	mtr_t		mtr;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	mutex_enter(&(rseg->mutex));
+
+	ut_a(rseg->last_page_no != FIL_NULL);
+
+	/* Move the purge position just past the log that was handled */
+	purge_sys->purge_trx_no = ut_dulint_add(rseg->last_trx_no, 1);
+	purge_sys->purge_undo_no = ut_dulint_zero;
+	purge_sys->next_stored = FALSE;
+
+	mtr_start(&mtr);
+
+	undo_page = trx_undo_page_get_s_latched(rseg->space,
+						rseg->last_page_no, &mtr);
+	log_hdr = undo_page + rseg->last_offset;
+
+	/* Increase the purge page count by one for every handled log */
+
+	purge_sys->n_pages_handled++;
+
+	prev_log_addr = trx_purge_get_log_from_hist(
+		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE,
+				   &mtr));
+	if (prev_log_addr.page == FIL_NULL) {
+		/* No logs left in the history list */
+
+		rseg->last_page_no = FIL_NULL;
+
+		mutex_exit(&(rseg->mutex));
+		mtr_commit(&mtr);
+
+		mutex_enter(&kernel_mutex);
+
+		/* Add debug code to track history list corruption reported
+		on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
+		file-based list was corrupt. The prev node pointer was
+		FIL_NULL, even though the list length was over 8 million nodes!
+		We assume that purge truncates the history list in moderate
+		size pieces, and if we here reach the head of the list, the
+		list cannot be longer than 20 000 undo logs now. */
+
+		if (trx_sys->rseg_history_len > 20000) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+" InnoDB: Warning: purge reached the head of the history list,\n"
+"InnoDB: but its length is still reported as %lu! Make a detailed bug\n"
+"InnoDB: report, and post it to bugs.mysql.com\n",
+				(ulong)trx_sys->rseg_history_len);
+		}
+
+		mutex_exit(&kernel_mutex);
+
+		return;
+	}
+
+	mutex_exit(&(rseg->mutex));
+	mtr_commit(&mtr);
+
+	/* Read the trx number and del marks from the previous log header;
+	rseg->mutex is not held while the page is latched here */
+	mtr_start(&mtr);
+
+	log_hdr = trx_undo_page_get_s_latched(rseg->space,
+					      prev_log_addr.page, &mtr)
+		+ prev_log_addr.boffset;
+
+	trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+
+	del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS);
+
+	mtr_commit(&mtr);
+
+	/* Publish the previous log as the new 'last' history log of the
+	rseg */
+	mutex_enter(&(rseg->mutex));
+
+	rseg->last_page_no = prev_log_addr.page;
+	rseg->last_offset = prev_log_addr.boffset;
+	rseg->last_trx_no = trx_no;
+	rseg->last_del_marks = del_marks;
+
+	mutex_exit(&(rseg->mutex));
+}
+
+/***************************************************************************
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. The log chosen is
+the 'last' history log with the smallest trx number among all rollback
+segments; if no rollback segment has one, purge_sys is left unchanged and
+purge_sys->next_stored stays FALSE. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+ trx_undo_rec_t* rec;
+ trx_rseg_t* rseg;
+ trx_rseg_t* min_rseg;
+ dulint min_trx_no;
+ ulint space = 0; /* remove warning (??? bug ???) */
+ ulint page_no = 0; /* remove warning (??? bug ???) */
+ ulint offset = 0; /* remove warning (??? bug ???) */
+ mtr_t mtr;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(purge_sys->next_stored == FALSE);
+
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ min_trx_no = ut_dulint_max;
+
+ min_rseg = NULL;
+
+ /* Find the rollback segment whose last history log has the smallest
+ trx number; each rseg is inspected under its own mutex */
+ while (rseg) {
+ mutex_enter(&(rseg->mutex));
+
+ if (rseg->last_page_no != FIL_NULL) {
+
+ if ((min_rseg == NULL)
+ || (ut_dulint_cmp(min_trx_no, rseg->last_trx_no)
+ > 0)) {
+
+ min_rseg = rseg;
+ min_trx_no = rseg->last_trx_no;
+ space = rseg->space;
+ ut_a(space == 0); /* We assume in purge of
+ externally stored fields
+ that space id == 0 */
+ page_no = rseg->last_page_no;
+ offset = rseg->last_offset;
+ }
+ }
+
+ mutex_exit(&(rseg->mutex));
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+ }
+
+ if (min_rseg == NULL) {
+ /* No rollback segment has a history log to purge */
+
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ /* NOTE(review): last_del_marks is read here after min_rseg->mutex
+ was released in the loop above */
+ if (!min_rseg->last_del_marks) {
+ /* No need to purge this log */
+
+ rec = &trx_purge_dummy_rec;
+ } else {
+ rec = trx_undo_get_first_rec(space, page_no, offset,
+ RW_S_LATCH, &mtr);
+ if (rec == NULL) {
+ /* Undo log empty */
+
+ rec = &trx_purge_dummy_rec;
+ }
+ }
+
+ purge_sys->next_stored = TRUE;
+ purge_sys->rseg = min_rseg;
+
+ purge_sys->hdr_page_no = page_no;
+ purge_sys->hdr_offset = offset;
+
+ purge_sys->purge_trx_no = min_trx_no;
+
+ if (rec == &trx_purge_dummy_rec) {
+ /* Offset 0 is what trx_purge_get_next_rec() tests to
+ recognize the dummy record */
+
+ purge_sys->purge_undo_no = ut_dulint_zero;
+ purge_sys->page_no = page_no;
+ purge_sys->offset = 0;
+ } else {
+ /* Store the position of the record as a page number and a
+ byte offset within the page frame */
+ purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec);
+
+ purge_sys->page_no = buf_frame_get_page_no(rec);
+ purge_sys->offset = rec - buf_frame_align(rec);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/***************************************************************************
+Gets the next record to purge and updates the info in the purge system. The
+record returned is a copy of the one at the position stored in purge_sys on
+entry; the stored position is advanced to the following record that needs a
+purge operation, or to the next undo log if this log has none left. */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+ /* out: copy of an undo log record or
+ pointer to the dummy undo log record */
+ mem_heap_t* heap) /* in: memory heap where copied */
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* rec_copy;
+ trx_undo_rec_t* rec2;
+ trx_undo_rec_t* next_rec;
+ page_t* undo_page;
+ page_t* page;
+ ulint offset;
+ ulint page_no;
+ ulint space;
+ ulint type;
+ ulint cmpl_info;
+ mtr_t mtr;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(purge_sys->next_stored);
+
+ /* Take a local copy of the current position: purge_sys is modified
+ below, but the record to return lives at this position */
+ space = purge_sys->rseg->space;
+ page_no = purge_sys->page_no;
+ offset = purge_sys->offset;
+
+ if (offset == 0) {
+ /* It is the dummy undo log record, which means that there is
+ no need to purge this undo log */
+
+ trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ return(&trx_purge_dummy_rec);
+ }
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(space, page_no, &mtr);
+ rec = undo_page + offset;
+
+ /* rec stays at the record to return; rec2 scans ahead for the next
+ record that needs a purge operation */
+ rec2 = rec;
+
+ for (;;) {
+ /* Try first to find the next record which requires a purge
+ operation from the same page of the same undo log */
+
+ next_rec = trx_undo_page_get_next_rec(rec2,
+ purge_sys->hdr_page_no,
+ purge_sys->hdr_offset);
+ if (next_rec == NULL) {
+ /* Nothing more on this page: take whatever
+ trx_undo_get_next_rec() returns, without checking
+ its type; the result may be NULL */
+ rec2 = trx_undo_get_next_rec(rec2,
+ purge_sys->hdr_page_no,
+ purge_sys->hdr_offset, &mtr);
+ break;
+ }
+
+ rec2 = next_rec;
+
+ type = trx_undo_rec_get_type(rec2);
+
+ /* Stop at a delete mark record, at a record with externally
+ stored fields, or at an update of an existing record whose
+ cmpl_info lacks the UPD_NODE_NO_ORD_CHANGE flag */
+ if (type == TRX_UNDO_DEL_MARK_REC) {
+
+ break;
+ }
+
+ cmpl_info = trx_undo_rec_get_cmpl_info(rec2);
+
+ if (trx_undo_rec_get_extern_storage(rec2)) {
+ break;
+ }
+
+ if ((type == TRX_UNDO_UPD_EXIST_REC)
+ && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ break;
+ }
+ }
+
+ if (rec2 == NULL) {
+ /* No further record in this undo log: the mtr is committed
+ before purge_sys is moved on to the next log */
+ mtr_commit(&mtr);
+
+ trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ /* Latch the page of the record to return again, using the
+ position saved in the locals at the start */
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(space, page_no, &mtr);
+
+ rec = undo_page + offset;
+ } else {
+ page = buf_frame_align(rec2);
+
+ purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2);
+ purge_sys->page_no = buf_frame_get_page_no(page);
+ purge_sys->offset = rec2 - page;
+
+ if (undo_page != page) {
+ /* We advance to a new page of the undo log: */
+ purge_sys->n_pages_handled++;
+ }
+ }
+
+ /* Copy the record to the heap while the page is still latched */
+ rec_copy = trx_undo_rec_copy(rec, heap);
+
+ mtr_commit(&mtr);
+
+ return(rec_copy);
+}
+
+/************************************************************************
+Fetches the next undo log record from the history list to purge and
+reserves a purge array cell for it. The reservation must be released with
+the corresponding release function. Whenever the function returns NULL it
+has first truncated the history if the purge array was empty. */
+
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+				/* out: copy of an undo log record or
+				pointer to the dummy undo log record
+				&trx_purge_dummy_rec, if the whole undo log
+				can be skipped in purge; NULL if none left */
+	dulint*		roll_ptr,/* out: roll pointer to undo record */
+	trx_undo_inf_t** cell,	/* out: storage cell for the record in the
+				purge array */
+	mem_heap_t*	heap)	/* in: memory heap where copied */
+{
+	trx_undo_rec_t*	fetched;
+
+	mutex_enter(&(purge_sys->mutex));
+
+	if (purge_sys->state == TRX_STOP_PURGE) {
+		/* Purge has already been stopped */
+
+		trx_purge_truncate_if_arr_empty();
+
+		mutex_exit(&(purge_sys->mutex));
+
+		return(NULL);
+	}
+
+	if (!purge_sys->next_stored) {
+		trx_purge_choose_next_log();
+	}
+
+	if (!purge_sys->next_stored) {
+		/* Still no next record stored: the history list has
+		nothing for us */
+
+		purge_sys->state = TRX_STOP_PURGE;
+
+		trx_purge_truncate_if_arr_empty();
+
+		if (srv_print_thread_releases) {
+			fprintf(stderr,
+			"Purge: No logs left in the history list; pages handled %lu\n",
+				(ulong) purge_sys->n_pages_handled);
+		}
+
+		mutex_exit(&(purge_sys->mutex));
+
+		return(NULL);
+	}
+
+	/* Stop when the page limit of the batch has been reached, or when
+	the next log is not below the low limit of the purge view */
+
+	if (purge_sys->n_pages_handled >= purge_sys->handle_limit
+	    || ut_dulint_cmp(purge_sys->purge_trx_no,
+			     purge_sys->view->low_limit_no) >= 0) {
+
+		purge_sys->state = TRX_STOP_PURGE;
+
+		trx_purge_truncate_if_arr_empty();
+
+		mutex_exit(&(purge_sys->mutex));
+
+		return(NULL);
+	}
+
+	*roll_ptr = trx_undo_build_roll_ptr(FALSE, purge_sys->rseg->id,
+					    purge_sys->page_no,
+					    purge_sys->offset);
+
+	*cell = trx_purge_arr_store_info(purge_sys->purge_trx_no,
+					 purge_sys->purge_undo_no);
+
+	ut_ad(ut_dulint_cmp(purge_sys->purge_trx_no,
+			    purge_sys->view->low_limit_no) < 0);
+
+	/* The following call will advance the stored values of purge_trx_no
+	and purge_undo_no, therefore we had to store them first */
+
+	fetched = trx_purge_get_next_rec(heap);
+
+	mutex_exit(&(purge_sys->mutex));
+
+	return(fetched);
+}
+
+/***********************************************************************
+Releases a reserved purge undo record: frees its cell in the purge array
+while holding the purge system mutex. */
+
+void
+trx_purge_rec_release(
+/*==================*/
+	trx_undo_inf_t*	cell)	/* in: storage cell */
+{
+	mutex_enter(&(purge_sys->mutex));
+
+	trx_purge_arr_remove_info(cell);
+
+	mutex_exit(&(purge_sys->mutex));
+}
+
+/***********************************************************************
+This function runs a purge batch: it replaces the purge view with a new one,
+recomputes srv_dml_needed_delay, sets the page limit of the batch, and then
+starts and runs the purge query graph in the calling thread. */
+
+ulint
+trx_purge(void)
+/*===========*/
+ /* out: number of undo log pages handled in
+ the batch */
+{
+ que_thr_t* thr;
+/* que_thr_t* thr2; */
+ ulint old_pages_handled;
+
+ mutex_enter(&(purge_sys->mutex));
+
+ if (purge_sys->trx->n_active_thrs > 0) {
+
+ mutex_exit(&(purge_sys->mutex));
+
+ /* Should not happen */
+
+ ut_error;
+
+ return(0);
+ }
+
+ /* The purge view is replaced while holding the purge latch in
+ x-mode and the kernel mutex */
+ rw_lock_x_lock(&(purge_sys->latch));
+
+ mutex_enter(&kernel_mutex);
+
+ /* Close and free the old purge view */
+
+ read_view_close(purge_sys->view);
+ purge_sys->view = NULL;
+ mem_heap_empty(purge_sys->heap);
+
+ /* Determine how much data manipulation language (DML) statements
+ need to be delayed in order to reduce the lagging of the purge
+ thread. */
+ srv_dml_needed_delay = 0; /* in microseconds; default: no delay */
+
+ /* If we cannot advance the 'purge view' because of an old
+ 'consistent read view', then the DML statements cannot be delayed.
+ Also, srv_max_purge_lag <= 0 means 'infinity'. */
+ /* NOTE(review): the test below presumably means that trx_sys->view_list
+ is empty, i.e. no read view is open -- confirm against UT_LIST */
+ if (srv_max_purge_lag > 0
+ && !UT_LIST_GET_LAST(trx_sys->view_list)) {
+ /* ratio > 1 means that the history list is longer than
+ srv_max_purge_lag */
+ float ratio = (float) trx_sys->rseg_history_len
+ / srv_max_purge_lag;
+ if (ratio > ULINT_MAX / 10000) {
+ /* Avoid overflow: maximum delay is 4295 seconds */
+ srv_dml_needed_delay = ULINT_MAX;
+ } else if (ratio > 1) {
+ /* If the history list length exceeds the
+ innodb_max_purge_lag, the
+ data manipulation statements are delayed
+ by at least 5000 microseconds. */
+ srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
+ }
+ }
+
+ /* The new view is allocated from the heap that was emptied above */
+ purge_sys->view = read_view_oldest_copy_or_open_new(NULL,
+ purge_sys->heap);
+ mutex_exit(&kernel_mutex);
+
+ rw_lock_x_unlock(&(purge_sys->latch));
+
+ purge_sys->state = TRX_PURGE_ON;
+
+ /* Handle at most 20 undo log pages in one purge batch */
+
+ purge_sys->handle_limit = purge_sys->n_pages_handled + 20;
+
+ /* Remember the counter value so that the pages handled in this batch
+ can be computed at the end */
+ old_pages_handled = purge_sys->n_pages_handled;
+
+ mutex_exit(&(purge_sys->mutex));
+
+ /* The query thread is started under the kernel mutex */
+ mutex_enter(&kernel_mutex);
+
+ thr = que_fork_start_command(purge_sys->query);
+
+ ut_ad(thr);
+
+/* thr2 = que_fork_start_command(purge_sys->query);
+
+ ut_ad(thr2); */
+
+
+ mutex_exit(&kernel_mutex);
+
+/* srv_que_task_enqueue(thr2); */
+
+ if (srv_print_thread_releases) {
+
+ fputs("Starting purge\n", stderr);
+ }
+
+ que_run_threads(thr);
+
+ /* NOTE(review): n_pages_handled is read below without holding
+ purge_sys->mutex */
+ if (srv_print_thread_releases) {
+
+ fprintf(stderr,
+ "Purge ends; pages handled %lu\n",
+ (ulong) purge_sys->n_pages_handled);
+ }
+
+ return(purge_sys->n_pages_handled - old_pages_handled);
+}
+
+/**********************************************************************
+Prints the state of the purge system to stderr: the purge view, the purge
+trx and undo numbers as high/low halves, and the stored record and undo log
+header positions. */
+
+void
+trx_purge_sys_print(void)
+/*=====================*/
+{
+	dulint	trx_no;
+	dulint	undo_no;
+
+	fputs("InnoDB: Purge system view:\n", stderr);
+	read_view_print(purge_sys->view);
+
+	trx_no = purge_sys->purge_trx_no;
+	undo_no = purge_sys->purge_undo_no;
+
+	fprintf(stderr, "InnoDB: Purge trx n:o %lu %lu, undo n_o %lu %lu\n",
+		(ulong) ut_dulint_get_high(trx_no),
+		(ulong) ut_dulint_get_low(trx_no),
+		(ulong) ut_dulint_get_high(undo_no),
+		(ulong) ut_dulint_get_low(undo_no));
+
+	fprintf(stderr,
+		"InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n"
+		"InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n",
+		(ulong) purge_sys->next_stored,
+		(ulong) purge_sys->page_no,
+		(ulong) purge_sys->offset,
+		(ulong) purge_sys->hdr_page_no,
+		(ulong) purge_sys->hdr_offset);
+}
diff --git a/storage/innobase/trx/trx0rec.c b/storage/innobase/trx/trx0rec.c
new file mode 100644
index 00000000000..fcb7582ce73
--- /dev/null
+++ b/storage/innobase/trx/trx0rec.c
@@ -0,0 +1,1417 @@
+/******************************************************
+Transaction undo log record
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "row0row.h"
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/**************************************************************************
+Writes the mtr (redo) log entry for an undo log record that has just been
+added to an undo log page. Only the body of the record is logged: the 2-byte
+pointer at its start and the 2-byte pointer at its end are left out. If the
+body fits in the space opened in the mtr log it is copied there directly,
+otherwise it is appended with mlog_catenate_string(). */
+UNIV_INLINE
+void
+trx_undof_page_add_undo_rec_log(
+/*============================*/
+	page_t*	undo_page,	/* in: undo log page */
+	ulint	old_free,	/* in: start offset of the inserted entry */
+	ulint	new_free,	/* in: end offset of the entry */
+	mtr_t*	mtr)		/* in: mtr */
+{
+	const ulint	reserve = 11 + 13 + MLOG_BUF_MARGIN;
+	byte*		body = undo_page + old_free + 2;
+	ulint		body_len = new_free - old_free - 4;
+	const byte*	buf_end;
+	byte*		log_ptr;
+
+	log_ptr = mlog_open(mtr, reserve);
+
+	if (log_ptr == NULL) {
+		/* mlog_open() gave us no buffer: nothing to write */
+
+		return;
+	}
+
+	buf_end = log_ptr + reserve;
+
+	log_ptr = mlog_write_initial_log_record_fast(undo_page,
+					MLOG_UNDO_INSERT, log_ptr, mtr);
+
+	/* The logged body is preceded by its length in 2 bytes */
+	mach_write_to_2(log_ptr, body_len);
+	log_ptr += 2;
+
+	if (log_ptr + body_len > buf_end) {
+		/* Too long for the opened space: close the record here and
+		append the body separately */
+		mlog_close(mtr, log_ptr);
+		mlog_catenate_string(mtr, body, body_len);
+
+		return;
+	}
+
+	memcpy(log_ptr, body, body_len);
+	mlog_close(mtr, log_ptr + body_len);
+}
+
+/***************************************************************
+Parses a redo log record of adding an undo log record. The log record holds
+a 2-byte length followed by that many bytes of undo record body. When a page
+is given, the undo record is rebuilt at the TRX_UNDO_PAGE_FREE offset of the
+page: the 2-byte pointers around the body are rewritten and the page free
+offset is advanced past the record. */
+
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+			/* out: end of log record or NULL */
+	byte*	ptr,	/* in: buffer */
+	byte*	end_ptr,/* in: buffer end */
+	page_t*	page)	/* in: page or NULL */
+{
+	ulint	body_len;
+	byte*	body;
+	ulint	rec_start;
+	ulint	rec_end;
+
+	if (end_ptr < ptr + 2) {
+		/* The length field is not completely in the buffer */
+
+		return(NULL);
+	}
+
+	body_len = mach_read_from_2(ptr);
+	body = ptr + 2;
+
+	if (end_ptr < body + body_len) {
+		/* The body is not completely in the buffer */
+
+		return(NULL);
+	}
+
+	if (page != NULL) {
+		rec_start = mach_read_from_2(page + TRX_UNDO_PAGE_HDR
+						+ TRX_UNDO_PAGE_FREE);
+		rec_end = rec_start + 4 + body_len;
+
+		/* Pointer at the record start: offset of the record end */
+		mach_write_to_2(page + rec_start, rec_end);
+
+		/* Pointer after the body: offset of the record start */
+		mach_write_to_2(page + rec_start + 2 + body_len, rec_start);
+
+		mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+				rec_end);
+
+		ut_memcpy(page + rec_start + 2, body, body_len);
+	}
+
+	return(body + body_len);
+}
+
+/**************************************************************************
+Calculates how many bytes an undo log record may still grow on a page,
+counted from ptr up to the FIL_PAGE_DATA_END trailer of the page, less a
+10-byte safety margin. */
+UNIV_INLINE
+ulint
+trx_undo_left(
+/*==========*/
+			/* out: bytes left */
+	page_t*	page,	/* in: undo log page */
+	byte*	ptr)	/* in: pointer to page */
+{
+	/* The 10 bytes subtracted here are a safety margin against small
+	calculation errors in the code that fills the record */
+
+	return(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END - 10 - (ptr - page));
+}
+
+/**************************************************************************
+Reports in the undo log of an insert of a clustered index record. The undo
+record is written at the TRX_UNDO_PAGE_FREE offset of the page and consists
+of: a 2-byte pointer to the next record, the type byte TRX_UNDO_INSERT_REC,
+the undo number of the transaction, the table id, the unique fields of the
+entry (each as a compressed length followed by the data, the data being
+omitted when the length is UNIV_SQL_NULL), and a 2-byte pointer back to the
+start of the record. On success the page free offset is advanced and the
+change is written to the redo log. On failure 0 is returned and the page
+free offset is left as it was, although bytes beyond it may have been
+written. */
+static
+ulint
+trx_undo_page_report_insert(
+/*========================*/
+					/* out: offset of the inserted entry
+					on the page if succeed, 0 if fail */
+	page_t*		undo_page,	/* in: undo log page */
+	trx_t*		trx,		/* in: transaction */
+	dict_index_t*	index,		/* in: clustered index */
+	dtuple_t*	clust_entry,	/* in: index entry which will be
+					inserted to the clustered index */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	ulint		first_free;
+	byte*		ptr;
+	ulint		len;
+	dfield_t*	field;
+	ulint		flen;
+	ulint		i;
+
+	ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				+ TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT);
+
+	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				+ TRX_UNDO_PAGE_FREE);
+	ptr = undo_page + first_free;
+
+	ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+	if (trx_undo_left(undo_page, ptr) < 30) {
+
+		/* NOTE: the value 30 must be big enough such that the general
+		fields written below fit on the undo log page */
+
+		return(0);
+	}
+
+	/* Reserve 2 bytes for the pointer to the next undo log record */
+	ptr += 2;
+
+	/* Store first some general parameters to the undo log */
+	mach_write_to_1(ptr, TRX_UNDO_INSERT_REC);
+	ptr++;
+
+	len = mach_dulint_write_much_compressed(ptr, trx->undo_no);
+	ptr += len;
+
+	len = mach_dulint_write_much_compressed(ptr, (index->table)->id);
+	ptr += len;
+	/*----------------------------------------*/
+	/* Store then the fields required to uniquely determine the record
+	to be inserted in the clustered index */
+
+	for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+		field = dtuple_get_nth_field(clust_entry, i);
+
+		flen = dfield_get_len(field);
+
+		/* Room for the compressed field length */
+		if (trx_undo_left(undo_page, ptr) < 5) {
+
+			return(0);
+		}
+
+		len = mach_write_compressed(ptr, flen);
+		ptr += len;
+
+		if (flen != UNIV_SQL_NULL) {
+			if (trx_undo_left(undo_page, ptr) < flen) {
+
+				return(0);
+			}
+
+			ut_memcpy(ptr, dfield_get_data(field), flen);
+			ptr += flen;
+		}
+	}
+
+	/*----------------------------------------*/
+	/* Write pointers to the previous and the next undo log records */
+
+	if (trx_undo_left(undo_page, ptr) < 2) {
+
+		return(0);
+	}
+
+	/* The last 2 bytes of the record hold the offset of its start */
+	mach_write_to_2(ptr, first_free);
+	ptr += 2;
+
+	/* The first 2 bytes of the record hold the offset of its end */
+	mach_write_to_2(undo_page + first_free, ptr - undo_page);
+
+	mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+			ptr - undo_page);
+
+	/* Write the log entry to the REDO log of this change in the UNDO
+	log */
+	trx_undof_page_add_undo_rec_log(undo_page, first_free,
+					ptr - undo_page, mtr);
+	return(first_free);
+}
+
+/**************************************************************************
+Reads the general parameters stored at the start of an undo log record: the
+type byte (which also carries the compiler info and the TRX_UNDO_UPD_EXTERN
+flag), the undo number and the table id. */
+
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+					/* out: remaining part of undo log
+					record after reading these values */
+	trx_undo_rec_t*	undo_rec,	/* in: undo log record */
+	ulint*		type,		/* out: undo record type:
+					TRX_UNDO_INSERT_REC, ... */
+	ulint*		cmpl_info,	/* out: compiler info, relevant only
+					for update type records */
+	ibool*		updated_extern,	/* out: TRUE if we updated an
+					externally stored field */
+	dulint*		undo_no,	/* out: undo log record number */
+	dulint*		table_id)	/* out: table id */
+{
+	byte*	cursor;
+	ulint	packed;
+
+	/* Skip the 2-byte pointer at the start of the record */
+	cursor = undo_rec + 2;
+
+	packed = mach_read_from_1(cursor);
+	cursor++;
+
+	*updated_extern = (packed & TRX_UNDO_UPD_EXTERN) ? TRUE : FALSE;
+
+	if (*updated_extern) {
+		/* Strip the flag so that only type and compiler info remain */
+		packed -= TRX_UNDO_UPD_EXTERN;
+	}
+
+	*type = packed & (TRX_UNDO_CMPL_INFO_MULT - 1);
+	*cmpl_info = packed / TRX_UNDO_CMPL_INFO_MULT;
+
+	*undo_no = mach_dulint_read_much_compressed(cursor);
+	cursor += mach_dulint_get_much_compressed_size(*undo_no);
+
+	*table_id = mach_dulint_read_much_compressed(cursor);
+	cursor += mach_dulint_get_much_compressed_size(*table_id);
+
+	return(cursor);
+}
+
+/**************************************************************************
+Reads a stored column value from an undo log record. The value is stored as
+a compressed length followed by the data. The length is returned exactly as
+stored, so it may be UNIV_SQL_NULL (no data follows) or may include the
+UNIV_EXTERN_STORAGE_FIELD flag (the data then occupies the stored length
+minus that flag). */
+static
+byte*
+trx_undo_rec_get_col_val(
+/*=====================*/
+			/* out: remaining part of undo log record after
+			reading these values */
+	byte*	ptr,	/* in: pointer to remaining part of undo log record */
+	byte**	field,	/* out: pointer to stored field */
+	ulint*	len)	/* out: length of the field, or UNIV_SQL_NULL */
+{
+	ulint	stored_len;
+
+	stored_len = mach_read_compressed(ptr);
+	*len = stored_len;
+
+	ptr += mach_get_compressed_size(stored_len);
+	*field = ptr;
+
+	if (stored_len == UNIV_SQL_NULL) {
+		/* An SQL NULL has no data bytes */
+
+		return(ptr);
+	}
+
+	if (stored_len >= UNIV_EXTERN_STORAGE_FIELD) {
+		stored_len -= UNIV_EXTERN_STORAGE_FIELD;
+	}
+
+	return(ptr + stored_len);
+}
+
+/***********************************************************************
+Builds a row reference from an undo log record. The reference has as many
+fields as the clustered index has unique fields, and each field points
+straight into the undo log record. */
+
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+				/* out: pointer to remaining part of undo
+				record */
+	byte*		ptr,	/* in: remaining part of a copy of an undo log
+				record, at the start of the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the row reference is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/* in: clustered index */
+	dtuple_t**	ref,	/* out, own: row reference */
+	mem_heap_t*	heap)	/* in: memory heap from which the memory
+				needed is allocated */
+{
+	dtuple_t*	tuple;
+	byte*		data;
+	ulint		data_len;
+	ulint		n_ref;
+	ulint		pos;
+
+	ut_ad(index && ptr && ref && heap);
+	ut_a(index->type & DICT_CLUSTERED);
+
+	n_ref = dict_index_get_n_unique(index);
+
+	tuple = dtuple_create(heap, n_ref);
+	*ref = tuple;
+
+	dict_index_copy_types(tuple, index, n_ref);
+
+	pos = 0;
+
+	while (pos < n_ref) {
+		ptr = trx_undo_rec_get_col_val(ptr, &data, &data_len);
+
+		/* The field only references the bytes in the undo record */
+		dfield_set_data(dtuple_get_nth_field(tuple, pos),
+				data, data_len);
+		pos++;
+	}
+
+	return(ptr);
+}
+
+/***********************************************************************
+Skips a row reference in an undo log record, that is, steps over as many
+stored column values as the clustered index has unique fields. */
+
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+				/* out: pointer to remaining part of undo
+				record */
+	byte*		ptr,	/* in: remaining part in update undo log
+				record, at the start of the row reference */
+	dict_index_t*	index)	/* in: clustered index */
+{
+	byte*	ignored_data;
+	ulint	ignored_len;
+	ulint	n_left;
+
+	ut_ad(index && ptr);
+	ut_a(index->type & DICT_CLUSTERED);
+
+	n_left = dict_index_get_n_unique(index);
+
+	while (n_left > 0) {
+		/* Only the advanced pointer is of interest here */
+		ptr = trx_undo_rec_get_col_val(ptr, &ignored_data,
+						&ignored_len);
+		n_left--;
+	}
+
+	return(ptr);
+}
+
+/**************************************************************************
+Reports in the undo log of an update or delete marking of a clustered index
+record. The undo record is written at the TRX_UNDO_PAGE_FREE offset of the
+page and consists of, in this order:
+ - a 2-byte pointer to the next record,
+ - the type byte, which also carries cmpl_info and possibly the
+   TRX_UNDO_UPD_EXTERN flag,
+ - the undo number of the transaction and the table id,
+ - the info bits, trx id and roll ptr of the unmodified record,
+ - the unique fields of the record,
+ - for an update, the number of updated fields and, for each of them, the
+   field number and the old value,
+ - for a delete marking, or an update where UPD_NODE_NO_ORD_CHANGE is not
+   set in cmpl_info, a 2-byte length followed by the field number and value
+   of every column that has ord_part > 0,
+ - a 2-byte pointer back to the start of the record.
+On success the page free offset is advanced and the change is written to the
+redo log. On failure 0 is returned and the page free offset is left as it
+was, although bytes beyond it may have been written. */
+static
+ulint
+trx_undo_page_report_modify(
+/*========================*/
+					/* out: byte offset of the inserted
+					undo log entry on the page if succeed,
+					0 if fail */
+	page_t*		undo_page,	/* in: undo log page */
+	trx_t*		trx,		/* in: transaction */
+	dict_index_t*	index,		/* in: clustered index where update or
+					delete marking is done */
+	rec_t*		rec,		/* in: clustered index record which
+					has NOT yet been modified */
+	const ulint*	offsets,	/* in: rec_get_offsets(rec, index) */
+	upd_t*		update,		/* in: update vector which tells the
+					columns to be updated; in the case of
+					a delete, this should be set to NULL */
+	ulint		cmpl_info,	/* in: compiler info on secondary
+					index updates */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	dict_table_t*	table;
+	upd_field_t*	upd_field;
+	dict_col_t*	col;
+	ulint		first_free;
+	byte*		ptr;
+	ulint		len;
+	byte*		field;
+	ulint		flen;
+	ulint		pos;
+	dulint		roll_ptr;
+	dulint		trx_id;
+	ulint		bits;
+	ulint		col_no;
+	byte*		old_ptr;
+	ulint		type_cmpl;
+	byte*		type_cmpl_ptr;
+	ulint		i;
+
+	ut_a(index->type & DICT_CLUSTERED);
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				+ TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE);
+	table = index->table;
+
+	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				+ TRX_UNDO_PAGE_FREE);
+	ptr = undo_page + first_free;
+
+	ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+	if (trx_undo_left(undo_page, ptr) < 50) {
+
+		/* NOTE: the value 50 must be big enough so that the general
+		fields written below fit on the undo log page */
+
+		return(0);
+	}
+
+	/* Reserve 2 bytes for the pointer to the next undo log record */
+	ptr += 2;
+
+	/* Store first some general parameters to the undo log */
+
+	if (update) {
+		if (rec_get_deleted_flag(rec, table->comp)) {
+			type_cmpl = TRX_UNDO_UPD_DEL_REC;
+		} else {
+			type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+		}
+	} else {
+		type_cmpl = TRX_UNDO_DEL_MARK_REC;
+	}
+
+	type_cmpl = type_cmpl | (cmpl_info * TRX_UNDO_CMPL_INFO_MULT);
+
+	mach_write_to_1(ptr, type_cmpl);
+
+	/* Remember where the type byte is: the TRX_UNDO_UPD_EXTERN flag may
+	have to be added to it below */
+	type_cmpl_ptr = ptr;
+
+	ptr++;
+	len = mach_dulint_write_much_compressed(ptr, trx->undo_no);
+	ptr += len;
+
+	len = mach_dulint_write_much_compressed(ptr, table->id);
+	ptr += len;
+
+	/*----------------------------------------*/
+	/* Store the state of the info bits */
+
+	bits = rec_get_info_bits(rec, table->comp);
+	mach_write_to_1(ptr, bits);
+	ptr += 1;
+
+	/* Store the values of the system columns */
+	field = rec_get_nth_field(rec, offsets,
+		dict_index_get_sys_col_pos(index, DATA_TRX_ID), &len);
+	ut_ad(len == DATA_TRX_ID_LEN);
+	trx_id = trx_read_trx_id(field);
+	field = rec_get_nth_field(rec, offsets,
+		dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), &len);
+	ut_ad(len == DATA_ROLL_PTR_LEN);
+	roll_ptr = trx_read_roll_ptr(field);
+
+	len = mach_dulint_write_compressed(ptr, trx_id);
+	ptr += len;
+
+	len = mach_dulint_write_compressed(ptr, roll_ptr);
+	ptr += len;
+
+	/*----------------------------------------*/
+	/* Store then the fields required to uniquely determine the
+	record which will be modified in the clustered index */
+
+	for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+		field = rec_get_nth_field(rec, offsets, i, &flen);
+
+		/* Reserve 5 bytes for the compressed field length, the same
+		amount every other mach_write_compressed() call in this
+		function reserves */
+		if (trx_undo_left(undo_page, ptr) < 5) {
+
+			return(0);
+		}
+
+		len = mach_write_compressed(ptr, flen);
+		ptr += len;
+
+		if (flen != UNIV_SQL_NULL) {
+			if (trx_undo_left(undo_page, ptr) < flen) {
+
+				return(0);
+			}
+
+			ut_memcpy(ptr, field, flen);
+			ptr += flen;
+		}
+	}
+
+	/*----------------------------------------*/
+	/* Save to the undo log the old values of the columns to be updated. */
+
+	if (update) {
+		if (trx_undo_left(undo_page, ptr) < 5) {
+
+			return(0);
+		}
+
+		len = mach_write_compressed(ptr, upd_get_n_fields(update));
+		ptr += len;
+
+		for (i = 0; i < upd_get_n_fields(update); i++) {
+
+			upd_field = upd_get_nth_field(update, i);
+			pos = upd_field->field_no;
+
+			/* Write field number to undo log */
+			if (trx_undo_left(undo_page, ptr) < 5) {
+
+				return(0);
+			}
+
+			len = mach_write_compressed(ptr, pos);
+			ptr += len;
+
+			/* Save the old value of field */
+			field = rec_get_nth_field(rec, offsets, pos, &flen);
+
+			if (trx_undo_left(undo_page, ptr) < 5) {
+
+				return(0);
+			}
+
+			if (rec_offs_nth_extern(offsets, pos)) {
+				/* If a field has external storage, we add to
+				flen the flag */
+
+				len = mach_write_compressed(ptr,
+					UNIV_EXTERN_STORAGE_FIELD + flen);
+
+				/* Notify purge that it eventually has to free
+				the old externally stored field */
+
+				trx->update_undo->del_marks = TRUE;
+
+				*type_cmpl_ptr = *type_cmpl_ptr
+						| TRX_UNDO_UPD_EXTERN;
+			} else {
+				len = mach_write_compressed(ptr, flen);
+			}
+
+			ptr += len;
+
+			if (flen != UNIV_SQL_NULL) {
+				if (trx_undo_left(undo_page, ptr) < flen) {
+
+					return(0);
+				}
+
+				ut_memcpy(ptr, field, flen);
+				ptr += flen;
+			}
+		}
+	}
+
+	/*----------------------------------------*/
+	/* In the case of a delete marking, and also in the case of an update
+	where any ordering field of any index changes, store the values of all
+	columns which occur as ordering fields in any index. This info is used
+	in the purge of old versions where we use it to build and search the
+	delete marked index records, to look if we can remove them from the
+	index tree. Note that starting from 4.0.14 also externally stored
+	fields can be ordering in some index. But we always store at least
+	384 first bytes locally to the clustered index record, which means
+	we can construct the column prefix fields in the index from the
+	stored data. */
+
+	if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+
+		trx->update_undo->del_marks = TRUE;
+
+		if (trx_undo_left(undo_page, ptr) < 5) {
+
+			return(0);
+		}
+
+		old_ptr = ptr;
+
+		/* Reserve 2 bytes to write the number of bytes the stored
+		fields take in this undo record */
+
+		ptr += 2;
+
+		for (col_no = 0; col_no < dict_table_get_n_cols(table);
+		     col_no++) {
+
+			col = dict_table_get_nth_col(table, col_no);
+
+			if (col->ord_part > 0) {
+
+				pos = dict_index_get_nth_col_pos(index,
+								col_no);
+
+				/* Write field number to undo log */
+				if (trx_undo_left(undo_page, ptr) < 5) {
+
+					return(0);
+				}
+
+				len = mach_write_compressed(ptr, pos);
+				ptr += len;
+
+				/* Save the old value of field */
+				field = rec_get_nth_field(rec, offsets, pos,
+								&flen);
+
+				if (trx_undo_left(undo_page, ptr) < 5) {
+
+					return(0);
+				}
+
+				len = mach_write_compressed(ptr, flen);
+				ptr += len;
+
+				if (flen != UNIV_SQL_NULL) {
+					if (trx_undo_left(undo_page, ptr)
+					    < flen) {
+
+						return(0);
+					}
+
+					ut_memcpy(ptr, field, flen);
+					ptr += flen;
+				}
+			}
+		}
+
+		/* The stored length counts also the 2 bytes reserved for
+		the length itself */
+		mach_write_to_2(old_ptr, ptr - old_ptr);
+	}
+
+	/*----------------------------------------*/
+	/* Write pointers to the previous and the next undo log records */
+	if (trx_undo_left(undo_page, ptr) < 2) {
+
+		return(0);
+	}
+
+	/* The last 2 bytes of the record hold the offset of its start */
+	mach_write_to_2(ptr, first_free);
+	ptr += 2;
+
+	/* The first 2 bytes of the record hold the offset of its end */
+	mach_write_to_2(undo_page + first_free, ptr - undo_page);
+
+	mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+			ptr - undo_page);
+
+	/* Write to the REDO log about this change in the UNDO log */
+
+	trx_undof_page_add_undo_rec_log(undo_page, first_free,
+					ptr - undo_page, mtr);
+	return(first_free);
+}
+
+/**************************************************************************
+Reads from an update undo log record the values describing the old version
+of the row: one byte of info bits, followed by the trx id and the roll ptr,
+both in compressed dulint format. */
+
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+				/* out: remaining part of undo log
+				record after reading these values */
+	byte*	ptr,		/* in: remaining part of undo log
+				record after reading general
+				parameters */
+	dulint*	trx_id,		/* out: trx id */
+	dulint*	roll_ptr,	/* out: roll ptr */
+	ulint*	info_bits)	/* out: info bits state */
+{
+	/* The info bits take one byte */
+	*info_bits = mach_read_from_1(ptr);
+	ptr++;
+
+	/* Each system column is followed immediately by the next one; the
+	stored size is derived from the value just read */
+
+	*trx_id = mach_dulint_read_compressed(ptr);
+	ptr += mach_dulint_get_compressed_size(*trx_id);
+
+	*roll_ptr = mach_dulint_read_compressed(ptr);
+	ptr += mach_dulint_get_compressed_size(*roll_ptr);
+
+	return(ptr);
+}
+
+/**************************************************************************
+Reads the number of updated fields, stored as a compressed integer, from an
+update undo log record. */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_n_upd_fields(
+/*=================================*/
+			/* out: remaining part of undo log record after
+			reading this value */
+	byte*	ptr,	/* in: pointer to remaining part of undo log record */
+	ulint*	n)	/* out: number of fields */
+{
+	ulint	n_upd;
+
+	n_upd = mach_read_compressed(ptr);
+	*n = n_upd;
+
+	return(ptr + mach_get_compressed_size(n_upd));
+}
+
+/**************************************************************************
+Reads a field number, stored as a compressed integer, from an update undo
+log record. */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_field_no(
+/*=============================*/
+			/* out: remaining part of undo log record after
+			reading this value */
+	byte*	ptr,	/* in: pointer to remaining part of undo log record */
+	ulint*	field_no)/* out: field number */
+{
+	ulint	no;
+
+	no = mach_read_compressed(ptr);
+	*field_no = no;
+
+	return(ptr + mach_get_compressed_size(no));
+}
+
+/***********************************************************************
+Builds an update vector based on a remaining part of an undo log record.
+The vector gets n + 2 fields, where n is the number of updated fields read
+from the record (0 for TRX_UNDO_DEL_MARK_REC): slots 0 .. n - 1 hold the
+stored column values, slot n the trx id and slot n + 1 the roll ptr. The
+column values point straight into the undo log record; only the trx id and
+roll ptr buffers are allocated from heap. If a stored field number is not
+below the number of fields in the index, an error is printed to stderr and
+NULL is returned without setting *upd. */
+
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+				/* out: remaining part of the record,
+				NULL if an error detected, which means that
+				the record is corrupted */
+	byte*		ptr,	/* in: remaining part in update undo log
+				record, after reading the row reference
+				NOTE that this copy of the undo log record must
+				be preserved as long as the update vector is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/* in: clustered index */
+	ulint		type,	/* in: TRX_UNDO_UPD_EXIST_REC,
+				TRX_UNDO_UPD_DEL_REC, or
+				TRX_UNDO_DEL_MARK_REC; in the last case,
+				only trx id and roll ptr fields are added to
+				the update vector */
+	dulint		trx_id,	/* in: transaction id from this undo record */
+	dulint		roll_ptr,/* in: roll pointer from this undo record */
+	ulint		info_bits,/* in: info bits from this undo record */
+	trx_t*		trx,	/* in: transaction */
+	mem_heap_t*	heap,	/* in: memory heap from which the memory
+				needed is allocated */
+	upd_t**		upd)	/* out, own: update vector */
+{
+	upd_field_t*	upd_field;
+	upd_t*		update;
+	ulint		n_fields;
+	byte*		buf;
+	byte*		field;
+	ulint		len;
+	ulint		field_no;
+	ulint		i;
+
+	ut_a(index->type & DICT_CLUSTERED);
+
+	if (type != TRX_UNDO_DEL_MARK_REC) {
+		ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields);
+	} else {
+		/* A delete mark record stores no updated fields */
+		n_fields = 0;
+	}
+
+	update = upd_create(n_fields + 2, heap);
+
+	update->info_bits = info_bits;
+
+	/* Store first trx id and roll ptr to update vector */
+
+	upd_field = upd_get_nth_field(update, n_fields);
+	buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+	trx_write_trx_id(buf, trx_id);
+
+	upd_field_set_field_no(upd_field,
+			dict_index_get_sys_col_pos(index, DATA_TRX_ID),
+			index, trx);
+	dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+	upd_field = upd_get_nth_field(update, n_fields + 1);
+	buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+	trx_write_roll_ptr(buf, roll_ptr);
+
+	upd_field_set_field_no(upd_field,
+			dict_index_get_sys_col_pos(index, DATA_ROLL_PTR),
+			index, trx);
+	dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+	/* Store then the updated ordinary columns to the update vector */
+
+	for (i = 0; i < n_fields; i++) {
+
+		ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+		if (field_no >= dict_index_get_n_fields(index)) {
+			fprintf(stderr,
+"InnoDB: Error: trying to access update undo rec field %lu in ", (ulong) field_no);
+			dict_index_name_print(stderr, trx, index);
+			fprintf(stderr, "\n"
+"InnoDB: but index has only %lu fields\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n"
+"InnoDB: Run also CHECK TABLE ",
+				(ulong) dict_index_get_n_fields(index));
+			ut_print_name(stderr, trx, index->table_name);
+			/* %p takes a void pointer argument */
+			fprintf(stderr, "\n"
+"InnoDB: n_fields = %lu, i = %lu, ptr %p\n",
+				(ulong) n_fields, (ulong) i, (void*) ptr);
+			return(NULL);
+		}
+
+		ptr = trx_undo_rec_get_col_val(ptr, &field, &len);
+
+		upd_field = upd_get_nth_field(update, i);
+
+		upd_field_set_field_no(upd_field, field_no, index, trx);
+
+		if (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD) {
+			/* The stored length carries the external storage
+			flag: record it in the update field and strip it
+			from the length */
+
+			upd_field->extern_storage = TRUE;
+
+			len -= UNIV_EXTERN_STORAGE_FIELD;
+		}
+
+		dfield_set_data(&(upd_field->new_val), field, len);
+	}
+
+	*upd = update;
+
+	return(ptr);
+}
+
+/***********************************************************************
+Builds a partial row from an update undo log record. The stored columns are
+preceded by a 2-byte total length, which counts the length field itself;
+each column is stored as a field number of the clustered index followed by
+the value. The row has one field per table column, and only the fields of
+the stored columns are set here. */
+
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+				/* out: pointer to remaining part of undo
+				record */
+	byte*		ptr,	/* in: remaining part in update undo log
+				record of a suitable type, at the start of
+				the stored index columns;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the partial row is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/* in: clustered index */
+	dtuple_t**	row,	/* out, own: partial row */
+	mem_heap_t*	heap)	/* in: memory heap from which the memory
+				needed is allocated */
+{
+	dtuple_t*	tuple;
+	byte*		stop_ptr;
+	byte*		data;
+	ulint		data_len;
+	ulint		field_no;
+	ulint		col_no;
+
+	ut_ad(index && ptr && row && heap);
+
+	tuple = dtuple_create(heap, dict_table_get_n_cols(index->table));
+	*row = tuple;
+
+	dict_table_copy_types(tuple, index->table);
+
+	/* The stored columns end total length bytes after the start of the
+	length field */
+	stop_ptr = ptr + mach_read_from_2(ptr);
+	ptr += 2;
+
+	while (ptr != stop_ptr) {
+
+		ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+		/* Map the clustered index position to a table column */
+		col_no = dict_index_get_nth_col_no(index, field_no);
+
+		ptr = trx_undo_rec_get_col_val(ptr, &data, &data_len);
+
+		dfield_set_data(dtuple_get_nth_field(tuple, col_no),
+				data, data_len);
+	}
+
+	return(ptr);
+}
+
+/***************************************************************************
+Erases the unused end of an undo log page: every byte from the
+TRX_UNDO_PAGE_FREE offset up to the FIL_PAGE_DATA_END trailer is set to
+0xFF, and an MLOG_UNDO_ERASE_END log record is written for the page. */
+static
+void
+trx_undo_erase_page_end(
+/*====================*/
+	page_t*	undo_page,	/* in: undo page whose end to erase */
+	mtr_t*	mtr)		/* in: mtr */
+{
+	const ulint	erase_end = UNIV_PAGE_SIZE - FIL_PAGE_DATA_END;
+	ulint		pos;
+
+	pos = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				+ TRX_UNDO_PAGE_FREE);
+
+	while (pos < erase_end) {
+		undo_page[pos] = 0xFF;
+		pos++;
+	}
+
+	mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr);
+}
+
+/***************************************************************
+Parses a redo log record of erasing of an undo page end. The log record has
+no body, so the buffer pointer is returned unchanged; if a page is given,
+the erase is applied to it. */
+
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+			/* out: end of log record or NULL */
+	byte*	ptr,	/* in: buffer */
+	byte*	end_ptr __attribute__((unused)), /* in: buffer end */
+	page_t*	page,	/* in: page or NULL */
+	mtr_t*	mtr)	/* in: mtr or NULL */
+{
+	ut_ad(ptr && end_ptr);
+
+	if (page != NULL) {
+		trx_undo_erase_page_end(page, mtr);
+	}
+
+	return(ptr);
+}
+
+/***************************************************************************
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction. The function holds trx->undo_mutex from before the undo log is
+looked up until the undo bookkeeping fields have been updated. If the record
+does not fit on the last page of the undo log, the unused end of that page is
+erased, the undo log is extended by one page while holding the rseg mutex,
+and the write is retried on the new page. */
+
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+ /* out: DB_SUCCESS or error code;
+ DB_OUT_OF_FILE_SPACE is returned if no
+ undo log is assigned or if no page
+ could be added to it */
+ ulint flags, /* in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /* in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t* clust_entry, /* in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ upd_t* update, /* in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /* in: compiler info on secondary
+ index updates */
+ rec_t* rec, /* in: in case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ dulint* roll_ptr) /* out: rollback pointer to the
+ inserted undo log record,
+ ut_dulint_zero if BTR_NO_UNDO_LOG
+ flag was specified */
+{
+ trx_t* trx;
+ trx_undo_t* undo;
+ page_t* undo_page;
+ ulint offset;
+ ulint page_no;
+ ibool is_insert;
+ trx_rseg_t* rseg;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_; /* store the element count of offsets_ in its first slot */
+
+ ut_a(index->type & DICT_CLUSTERED);
+
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+
+ *roll_ptr = ut_dulint_zero;
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(thr);
+ ut_ad((op_type != TRX_UNDO_INSERT_OP)
+ || (clust_entry && !update && !rec));
+
+ trx = thr_get_trx(thr);
+ rseg = trx->rseg;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ /* If the undo log is not assigned yet, assign one; whether this
+ succeeded is checked below by testing undo against NULL */
+
+ if (op_type == TRX_UNDO_INSERT_OP) {
+
+ if (trx->insert_undo == NULL) {
+
+ trx_undo_assign_undo(trx, TRX_UNDO_INSERT);
+ }
+
+ undo = trx->insert_undo;
+ is_insert = TRUE;
+ } else {
+ ut_ad(op_type == TRX_UNDO_MODIFY_OP);
+
+ if (trx->update_undo == NULL) {
+
+ trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+ }
+
+ undo = trx->update_undo;
+ is_insert = FALSE;
+ }
+
+ if (undo == NULL) {
+ /* Did not succeed: out of space */
+ mutex_exit(&(trx->undo_mutex));
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ page_no = undo->last_page_no;
+
+ mtr_start(&mtr);
+
+ for (;;) { /* one iteration per undo page tried; left only by break or return */
+ undo_page = buf_page_get_gen(undo->space, page_no,
+ RW_X_LATCH, undo->guess_page,
+ BUF_GET,
+ __FILE__, __LINE__,
+ &mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+ buf_page_dbg_add_level(undo_page, SYNC_TRX_UNDO_PAGE);
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (op_type == TRX_UNDO_INSERT_OP) {
+ offset = trx_undo_page_report_insert(undo_page, trx,
+ index, clust_entry,
+ &mtr);
+ } else {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ offset = trx_undo_page_report_modify(undo_page, trx,
+ index, rec, offsets, update, cmpl_info, &mtr);
+ }
+
+ if (offset == 0) {
+ /* The record did not fit on the page. We erase the
+ end segment of the undo log page and write a log
+ record of it: this is to ensure that in the debug
+ version the replicate page constructed using the log
+ records stays identical to the original page */
+
+ trx_undo_erase_page_end(undo_page, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ if (offset != 0) {
+ /* Success: the record starts at byte offset 'offset'
+ on page page_no */
+
+ break;
+ }
+
+ ut_ad(page_no == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ mtr_start(&mtr);
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ mutex_enter(&(rseg->mutex));
+
+ page_no = trx_undo_add_page(trx, undo, &mtr);
+
+ mutex_exit(&(rseg->mutex));
+
+ if (page_no == FIL_NULL) {
+ /* Did not succeed: out of space. Release the undo
+ mutex, commit the mtr started for the page addition
+ and free the offsets heap if one was created */
+
+ mutex_exit(&(trx->undo_mutex));
+ mtr_commit(&mtr);
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+ }
+
+ undo->empty = FALSE;
+ undo->top_page_no = page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no;
+ undo->guess_page = undo_page; /* handed to buf_page_get_gen() as the guess on the next call */
+
+ UT_DULINT_INC(trx->undo_no); /* top_undo_no above was set from the value before this increment */
+
+ mutex_exit(&(trx->undo_mutex));
+
+ *roll_ptr = trx_undo_build_roll_ptr(is_insert, rseg->id, page_no,
+ offset);
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(DB_SUCCESS);
+}
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/**********************************************************************
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists. The roll pointer is decoded into a rollback
+segment id, a page number and an offset; the undo page is s-latched within a
+mini-transaction of its own while the record is copied. */
+
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+					/* out, own: copy of the record */
+	dulint		roll_ptr,	/* in: roll pointer to record */
+	mem_heap_t*	heap)		/* in: memory heap where copied */
+{
+	trx_undo_rec_t*	rec_copy;
+	page_t*		latched_page;
+	trx_rseg_t*	rseg;
+	ulint		rseg_id;
+	ulint		page_no;
+	ulint		offset;
+	ibool		is_insert;
+	mtr_t		mtr;
+
+	trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
+					&offset);
+	rseg = trx_rseg_get_on_id(rseg_id);
+
+	mtr_start(&mtr);
+
+	latched_page = trx_undo_page_get_s_latched(rseg->space, page_no,
+							&mtr);
+	rec_copy = trx_undo_rec_copy(latched_page + offset, heap);
+
+	mtr_commit(&mtr);
+
+	return(rec_copy);
+}
+
+/**********************************************************************
+Copies an undo record to heap, unless
+trx_purge_update_undo_must_exist(trx_id) reports that the undo log need not
+exist any more. */
+
+ulint
+trx_undo_get_undo_rec(
+/*==================*/
+					/* out: DB_SUCCESS, or
+					DB_MISSING_HISTORY if the undo log
+					has been truncated and we cannot
+					fetch the old version; NOTE: the
+					caller must have latches on the
+					clustered index page and purge_view */
+	dulint		roll_ptr,	/* in: roll pointer to record */
+	dulint		trx_id,		/* in: id of the trx that generated
+					the roll pointer: it points to an
+					undo log of this transaction */
+	trx_undo_rec_t** undo_rec,	/* out, own: copy of the record */
+	mem_heap_t*	heap)		/* in: memory heap where copied */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (trx_purge_update_undo_must_exist(trx_id)) {
+
+		*undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+
+		return(DB_SUCCESS);
+	}
+
+	/* It may be that the necessary undo log has already been deleted;
+	*undo_rec is left untouched in that case */
+
+	return(DB_MISSING_HISTORY);
+}
+
+/***********************************************************************
+Build a previous version of a clustered index record. This function checks
+that the caller has a latch on the index page of the clustered index record
+and an s-latch on the purge_view. This guarantees that the stack of versions
+is locked. The previous version is built by fetching a copy of the undo
+record that the roll pointer of rec points to, parsing the update vector
+stored in it, and applying that update to a copy of rec allocated from
+heap. */
+
+ulint
+trx_undo_prev_version_build(
+/*========================*/
+ /* out: DB_SUCCESS, or DB_MISSING_HISTORY if
+ the previous version is not >= purge_view,
+ which means that it may have been removed,
+ DB_ERROR if corrupted record */
+ rec_t* index_rec,/* in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr __attribute__((unused)),
+ /* in: mtr which contains the latch to
+ index_rec page and purge_view */
+ rec_t* rec, /* in: version of a clustered index record */
+ dict_index_t* index, /* in: clustered index */
+ ulint* offsets,/* in: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /* in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers)/* out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted */
+{
+ trx_undo_rec_t* undo_rec;
+ dtuple_t* entry;
+ dulint rec_trx_id;
+ ulint type;
+ dulint undo_no;
+ dulint table_id;
+ dulint trx_id;
+ dulint roll_ptr;
+ dulint old_roll_ptr;
+ upd_t* update;
+ byte* ptr;
+ ulint info_bits;
+ ulint cmpl_info;
+ ibool dummy_extern;
+ byte* buf;
+ ulint err;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mtr_memo_contains(index_mtr, buf_block_align(index_rec),
+ MTR_MEMO_PAGE_S_FIX) ||
+ mtr_memo_contains(index_mtr, buf_block_align(index_rec),
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* Only a clustered index is accepted here; anything else is
+ reported to stderr and rejected with DB_ERROR */
+
+ if (!(index->type & DICT_CLUSTERED)) {
+ fprintf(stderr, "InnoDB: Error: trying to access"
+ " update undo rec for non-clustered index %s\n"
+ "InnoDB: Submit a detailed bug report to"
+ " http://bugs.mysql.com\n"
+ "InnoDB: index record ", index->name);
+ rec_print(stderr, index_rec, index);
+ fputs("\n"
+ "InnoDB: record version ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ return(DB_ERROR);
+ }
+
+ /* Remember the roll pointer stored in rec: roll_ptr is overwritten
+ below with the value parsed from the undo record, and old_roll_ptr
+ is needed for the diagnostics */
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+ old_roll_ptr = roll_ptr;
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ /* The record rec is the first inserted version */
+
+ return(DB_SUCCESS);
+ }
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ /* Fetch a heap copy of the undo record; any error code, such as
+ DB_MISSING_HISTORY, is passed on to the caller with *old_vers
+ left NULL */
+
+ err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* Parse the undo record in order: the general parameters, the
+ system columns, the row reference (skipped), and finally the
+ update vector */
+
+ ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits, NULL, heap, &update);
+
+ /* A table id mismatch is handled like a parse failure: ptr is set
+ to NULL so that the diagnostics branch below is taken */
+
+ if (ut_dulint_cmp(table_id, index->table->id) != 0) {
+ ptr = NULL;
+
+ fprintf(stderr,
+"InnoDB: Error: trying to access update undo rec for table %s\n"
+"InnoDB: but the table id in the undo record is wrong\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n"
+"InnoDB: Run also CHECK TABLE %s\n",
+ index->table_name, index->table_name);
+ }
+
+ if (ptr == NULL) {
+ /* The record was corrupted, return an error; these printfs
+ should catch an elusive bug in row_vers_old_has_index_entry */
+
+ fprintf(stderr,
+ "InnoDB: table %s, index %s, n_uniq %lu\n"
+ "InnoDB: undo rec address %p, type %lu cmpl_info %lu\n"
+ "InnoDB: undo rec table id %lu %lu, index table id %lu %lu\n"
+ "InnoDB: dump of 150 bytes in undo rec: ",
+ index->table_name, index->name,
+ (ulong) dict_index_get_n_unique(index),
+ undo_rec, (ulong) type, (ulong) cmpl_info,
+ (ulong) ut_dulint_get_high(table_id),
+ (ulong) ut_dulint_get_low(table_id),
+ (ulong) ut_dulint_get_high(index->table->id),
+ (ulong) ut_dulint_get_low(index->table->id));
+ ut_print_buf(stderr, undo_rec, 150);
+ fputs("\n"
+ "InnoDB: index record ", stderr);
+ rec_print(stderr, index_rec, index);
+ fputs("\n"
+ "InnoDB: record version ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ fprintf(stderr, "\n"
+ "InnoDB: Record trx id %lu %lu, update rec trx id %lu %lu\n"
+ "InnoDB: Roll ptr in rec %lu %lu, in update rec %lu %lu\n",
+ (ulong) ut_dulint_get_high(rec_trx_id),
+ (ulong) ut_dulint_get_low(rec_trx_id),
+ (ulong) ut_dulint_get_high(trx_id),
+ (ulong) ut_dulint_get_low(trx_id),
+ (ulong) ut_dulint_get_high(old_roll_ptr),
+ (ulong) ut_dulint_get_low(old_roll_ptr),
+ (ulong) ut_dulint_get_high(roll_ptr),
+ (ulong) ut_dulint_get_low(roll_ptr));
+
+ trx_purge_sys_print();
+ return(DB_ERROR);
+ }
+
+ /* Two ways to build the old version: if the update changes a field
+ size or external storage, the record is rebuilt through an index
+ entry; otherwise rec is copied and updated in place */
+
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ ulint* ext_vect;
+ ulint n_ext_vect;
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info to ext_vect: */
+
+ /* ext_vect is a temporary mem_alloc buffer, freed below; the
+ old version itself is allocated from heap */
+
+ ext_vect = mem_alloc(sizeof(ulint)
+ * rec_offs_n_fields(offsets));
+ n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets,
+ update);
+ entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec,
+ heap);
+ row_upd_index_replace_new_col_vals(entry, index, update, heap);
+
+ buf = mem_heap_alloc(heap,
+ rec_get_converted_size(index, entry));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index, entry);
+
+ /* Now set the extern bits in the old version of the record */
+ rec_set_field_extern_bits(*old_vers, index,
+ ext_vect, n_ext_vect, NULL);
+ mem_free(ext_vect);
+ } else {
+ /* NOTE(review): rec_offs_make_valid() is called on the
+ caller's offsets array for the copy, so after this call
+ offsets presumably describes *old_vers rather than rec --
+ confirm against callers */
+
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, offsets);
+ row_upd_rec_in_place(*old_vers, offsets, update);
+ }
+
+ return(DB_SUCCESS);
+}
diff --git a/storage/innobase/trx/trx0roll.c b/storage/innobase/trx/trx0roll.c
new file mode 100644
index 00000000000..69f7a99187f
--- /dev/null
+++ b/storage/innobase/trx/trx0roll.c
@@ -0,0 +1,1344 @@
+/******************************************************
+Transaction rollback
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#ifdef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "usr0sess.h"
+#include "srv0que.h"
+#include "srv0start.h"
+#include "row0undo.h"
+#include "row0mysql.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+
+/* This many pages must be undone before a truncate is tried within rollback */
+#define TRX_ROLL_TRUNC_THRESHOLD 1
+
+/* In crash recovery, the current trx to be rolled back */
+trx_t* trx_roll_crash_recv_trx = NULL;
+
+/* In crash recovery we set this to the undo n:o of the current trx to be
+rolled back. Then we can print how many % the rollback has progressed. */
+ib_longlong trx_roll_max_undo_no;
+
+/* Auxiliary variable which tells the previous progress % we printed */
+ulint trx_roll_progress_printed_pct;
+
+/***********************************************************************
+Rolls back a transaction used in MySQL, either completely or back to a
+savepoint. The function builds a rollback query graph, runs it in the
+calling thread and then polls until trx->que_state is TRX_QUE_RUNNING
+again. */
+
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+				/* out: error code or DB_SUCCESS */
+	trx_t*		trx,	/* in: transaction handle */
+	ibool		partial,/* in: TRUE if partial rollback requested */
+	trx_savept_t*	savept)	/* in: pointer to savepoint undo number, if
+				partial rollback requested; not dereferenced
+				when partial is FALSE */
+{
+#ifndef UNIV_HOTBACKUP
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	roll_node_t*	roll_node;
+
+	/* Tell Innobase server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	trx_start_if_not_started(trx);
+
+	/* The rollback node and the query graph live in a private heap
+	that is freed before returning */
+
+	heap = mem_heap_create(512);
+
+	roll_node = roll_node_create(heap);
+
+	roll_node->partial = partial;
+
+	if (partial) {
+		roll_node->savept = *savept;
+	}
+
+	trx->error_state = DB_SUCCESS;
+
+	thr = pars_complete_graph_for_exec(roll_node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);
+
+	/* Poll under the kernel mutex until the transaction is back in
+	the TRX_QUE_RUNNING state; the mutex is released while sleeping */
+
+	mutex_enter(&kernel_mutex);
+
+	while (trx->que_state != TRX_QUE_RUNNING) {
+
+		mutex_exit(&kernel_mutex);
+
+		os_thread_sleep(100000);
+
+		mutex_enter(&kernel_mutex);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	mem_heap_free(heap);
+
+	ut_a(trx->error_state == DB_SUCCESS);
+
+	/* Tell Innobase server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	return((int) trx->error_state);
+#else /* UNIV_HOTBACKUP */
+	/* This function depends on MySQL code that is not included in
+	InnoDB Hot Backup builds.  Besides, this function should never
+	be called in InnoDB Hot Backup. */
+	ut_error;
+
+	/* Not expected to be reached; it gives this non-void function a
+	return statement on every path in Hot Backup builds */
+
+	return((int) DB_ERROR);
+#endif /* UNIV_HOTBACKUP */
+}
+
+/***********************************************************************
+Performs a total rollback of a MySQL transaction. Does nothing if the
+transaction has not been started. */
+
+int
+trx_rollback_for_mysql(
+/*===================*/
+			/* out: error code or DB_SUCCESS */
+	trx_t*	trx)	/* in: transaction handle */
+{
+	int	err = DB_SUCCESS;
+
+	if (trx->conc_state != TRX_NOT_STARTED) {
+
+		/* op_info is set only for the duration of the rollback */
+
+		trx->op_info = "rollback";
+
+		err = trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+		trx->op_info = "";
+	}
+
+	return(err);
+}
+
+/***********************************************************************
+Rolls back the latest SQL statement of a MySQL transaction, that is,
+everything done after trx->last_sql_stat_start. Does nothing if the
+transaction has not been started. */
+
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+			/* out: error code or DB_SUCCESS */
+	trx_t*	trx)	/* in: transaction handle */
+{
+	int	err = DB_SUCCESS;
+
+	if (trx->conc_state != TRX_NOT_STARTED) {
+
+		trx->op_info = "rollback of SQL statement";
+
+		err = trx_general_rollback_for_mysql(
+			trx, TRUE, &(trx->last_sql_stat_start));
+
+		/* Marking the statement end again should not be needed,
+		but we play safe */
+
+		trx_mark_sql_stat_end(trx);
+
+		trx->op_info = "";
+	}
+
+	return(err);
+}
+
+/***********************************************************************
+Removes named savepoints from the savepoint list of a transaction and
+frees both the savepoint structs and their names. */
+
+void
+trx_roll_savepoints_free(
+/*=====================*/
+	trx_t*			trx,	/* in: transaction handle */
+	trx_named_savept_t*	savep)	/* in: free all savepoints > this one;
+					if this is NULL, free all savepoints
+					of trx */
+{
+	trx_named_savept_t*	cur;
+	trx_named_savept_t*	following;
+
+	/* Start from the list head, or from the successor of savep */
+
+	cur = (savep == NULL)
+		? UT_LIST_GET_FIRST(trx->trx_savepoints)
+		: UT_LIST_GET_NEXT(trx_savepoints, savep);
+
+	for (; cur != NULL; cur = following) {
+
+		/* Read the successor before cur is unlinked and freed */
+
+		following = UT_LIST_GET_NEXT(trx_savepoints, cur);
+
+		UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, cur);
+		mem_free(cur->name);
+		mem_free(cur);
+	}
+}
+
+/***********************************************************************
+Rolls a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted. */
+
+ulint
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+						/* out: if no savepoint
+						of the name found then
+						DB_NO_SAVEPOINT, DB_ERROR if
+						the trx is not started,
+						otherwise the result of the
+						rollback */
+	trx_t*		trx,			/* in: transaction handle */
+	const char*	savepoint_name,		/* in: savepoint name */
+	ib_longlong*	mysql_binlog_cache_pos)	/* out: the MySQL binlog cache
+						position corresponding to this
+						savepoint; MySQL needs this
+						information to remove the
+						binlog entries of the queries
+						executed after the savepoint */
+{
+	trx_named_savept_t*	savep;
+	ulint			err;
+
+	/* Look up the savepoint by name */
+
+	for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	     savep != NULL;
+	     savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+
+		if (ut_strcmp(savep->name, savepoint_name) == 0) {
+
+			break;
+		}
+	}
+
+	if (savep == NULL) {
+
+		return(DB_NO_SAVEPOINT);
+	}
+
+	if (trx->conc_state == TRX_NOT_STARTED) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: transaction has a savepoint ", stderr);
+		ut_print_name(stderr, trx, savep->name);
+		fputs(" though it is not started\n", stderr);
+
+		return(DB_ERROR);
+	}
+
+	/* Every savepoint strictly later than this one can be freed; the
+	found savepoint itself stays in the list */
+
+	trx_roll_savepoints_free(trx, savep);
+
+	*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+	trx->op_info = "rollback to a savepoint";
+
+	err = trx_general_rollback_for_mysql(trx, TRUE, &(savep->savept));
+
+	/* Store the current undo_no of the transaction so that we know
+	where to roll back if we have to roll back the next SQL statement */
+
+	trx_mark_sql_stat_end(trx);
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/***********************************************************************
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one, which is appended to the end of
+the savepoint list. Savepoints are deleted in a transaction commit or
+rollback. */
+
+ulint
+trx_savepoint_for_mysql(
+/*====================*/
+						/* out: always DB_SUCCESS */
+	trx_t*		trx,			/* in: transaction handle */
+	const char*	savepoint_name,		/* in: savepoint name */
+	ib_longlong	binlog_cache_pos)	/* in: MySQL binlog cache
+						position corresponding to this
+						connection at the time of the
+						savepoint */
+{
+	trx_named_savept_t*	old_savep;
+	trx_named_savept_t*	new_savep;
+
+	ut_a(trx);
+	ut_a(savepoint_name);
+
+	trx_start_if_not_started(trx);
+
+	/* Drop a possible existing savepoint of the same name */
+
+	for (old_savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	     old_savep != NULL;
+	     old_savep = UT_LIST_GET_NEXT(trx_savepoints, old_savep)) {
+
+		if (ut_strcmp(old_savep->name, savepoint_name) == 0) {
+
+			UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints,
+				       old_savep);
+
+			mem_free(old_savep->name);
+			mem_free(old_savep);
+
+			/* Leave the loop at once: old_savep is freed */
+
+			break;
+		}
+	}
+
+	/* Create a new savepoint and add it as the last in the list */
+
+	new_savep = mem_alloc(sizeof(trx_named_savept_t));
+
+	new_savep->name = mem_strdup(savepoint_name);
+
+	new_savep->savept = trx_savept_take(trx);
+
+	new_savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+	UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, new_savep);
+
+	return(DB_SUCCESS);
+}
+
+/***********************************************************************
+Releases a named savepoint: the savepoint itself and all savepoints which
+were set after it are removed from the transaction and freed. */
+
+ulint
+trx_release_savepoint_for_mysql(
+/*============================*/
+						/* out: if no savepoint
+						of the name found then
+						DB_NO_SAVEPOINT,
+						otherwise DB_SUCCESS */
+	trx_t*		trx,			/* in: transaction handle */
+	const char*	savepoint_name)		/* in: savepoint name */
+{
+	trx_named_savept_t*	savep;
+
+	/* Look up the savepoint by name */
+
+	for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	     savep != NULL;
+	     savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+
+		if (ut_strcmp(savep->name, savepoint_name) == 0) {
+
+			break;
+		}
+	}
+
+	if (savep == NULL) {
+
+		return(DB_NO_SAVEPOINT);
+	}
+
+	/* First free all savepoints strictly later than this one ... */
+
+	trx_roll_savepoints_free(trx, savep);
+
+	/* ... and then this savepoint too */
+
+	UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
+
+	mem_free(savep->name);
+	mem_free(savep);
+
+	return(DB_SUCCESS);
+}
+
+/***********************************************************************
+Returns a transaction savepoint taken at this point in time: the savepoint
+records the current undo number of the transaction. */
+
+trx_savept_t
+trx_savept_take(
+/*============*/
+			/* out: savepoint */
+	trx_t*	trx)	/* in: transaction */
+{
+	trx_savept_t	snapshot;
+
+	snapshot.least_undo_no = trx->undo_no;
+
+	return(snapshot);
+}
+
+/***********************************************************************
+Rollback or clean up transactions which have no user session. If the
+transaction already was committed, then we clean up a possible insert
+undo log. If the transaction was not yet committed, then we roll it back.
+Transactions in the TRX_PREPARED state are only attached to the dummy
+session and are otherwise left alone. The function handles one transaction
+per pass through the 'loop' label and exits the thread when no candidate
+transaction remains.
+Note: this is done in a background thread. */
+
+#ifndef __WIN__
+void*
+#else
+ulint
+#endif
+trx_rollback_or_clean_all_without_sess(
+/*===================================*/
+ /* out: a dummy parameter */
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ trx_t* trx;
+ dict_table_t* table;
+ ib_longlong rows_to_undo;
+ const char* unit = "";
+ int err;
+
+ mutex_enter(&kernel_mutex);
+
+ /* Open a dummy session */
+
+ if (!trx_dummy_sess) {
+ trx_dummy_sess = sess_open();
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ /* NOTE(review): the transaction list is peeked at here without
+ holding kernel_mutex -- presumably safe at this point of startup;
+ confirm */
+
+ if (UT_LIST_GET_FIRST(trx_sys->trx_list)) {
+
+ fprintf(stderr,
+"InnoDB: Starting in background the rollback of uncommitted transactions\n");
+ } else {
+ goto leave_function;
+ }
+loop:
+ /* A fresh heap is created for each pass and freed on every path
+ that leaves the pass */
+
+ heap = mem_heap_create(512);
+
+ mutex_enter(&kernel_mutex);
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ /* Find the first transaction that has no session and is neither
+ not-started nor prepared; prepared transactions get the dummy
+ session on the way and are skipped */
+
+ while (trx) {
+ if ((trx->sess || (trx->conc_state == TRX_NOT_STARTED))) {
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ } else if (trx->conc_state == TRX_PREPARED) {
+
+ trx->sess = trx_dummy_sess;
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ } else {
+ break;
+ }
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx == NULL) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Rollback of non-prepared transactions completed\n");
+
+ mem_heap_free(heap);
+
+ goto leave_function;
+ }
+
+ /* Attaching the dummy session makes the search above skip this
+ transaction on later passes */
+
+ trx->sess = trx_dummy_sess;
+
+ if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
+ fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n",
+ (ulong) ut_dulint_get_high(trx->id),
+ (ulong) ut_dulint_get_low(trx->id));
+
+ trx_cleanup_at_db_startup(trx);
+
+ mem_heap_free(heap);
+
+ goto loop;
+ }
+
+ /* Build by hand a query graph: fork -> thread -> rollback node */
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ mutex_enter(&kernel_mutex);
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ /* Set up the globals that trx_roll_pop_top_rec_of_trx() uses for
+ printing the rollback progress of this transaction */
+
+ trx_roll_crash_recv_trx = trx;
+ trx_roll_max_undo_no = ut_conv_dulint_to_longlong(trx->undo_no);
+ trx_roll_progress_printed_pct = 0;
+ rows_to_undo = trx_roll_max_undo_no;
+
+ /* Print very big row counts in millions */
+
+ if (rows_to_undo > 1000000000) {
+ rows_to_undo = rows_to_undo / 1000000;
+ unit = "M";
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Rolling back trx with id %lu %lu, %lu%s rows to undo\n",
+ (ulong) ut_dulint_get_high(trx->id),
+ (ulong) ut_dulint_get_low(trx->id),
+ (ulong) rows_to_undo, unit);
+ mutex_exit(&kernel_mutex);
+
+ trx->mysql_thread_id = os_thread_get_curr_id();
+
+ trx->mysql_process_no = os_proc_get_number();
+
+ /* For a dictionary operation the data dictionary is kept locked
+ from here until the possible table drop below has been done */
+
+ if (trx->dict_operation) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ que_run_threads(thr);
+
+ /* Poll under kernel_mutex until the transaction is back in the
+ TRX_QUE_RUNNING state; the mutex is released while sleeping */
+
+ mutex_enter(&kernel_mutex);
+
+ while (trx->que_state != TRX_QUE_RUNNING) {
+
+ mutex_exit(&kernel_mutex);
+
+ fprintf(stderr,
+ "InnoDB: Waiting for rollback of trx id %lu to end\n",
+ (ulong) ut_dulint_get_low(trx->id));
+ os_thread_sleep(100000);
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx->dict_operation) {
+ /* If the transaction was for a dictionary operation, we
+ drop the relevant table, if it still exists */
+
+ fprintf(stderr,
+"InnoDB: Dropping table with id %lu %lu in recovery if it exists\n",
+ (ulong) ut_dulint_get_high(trx->table_id),
+ (ulong) ut_dulint_get_low(trx->table_id));
+
+ table = dict_table_get_on_id_low(trx->table_id, trx);
+
+ if (table) {
+ fputs("InnoDB: Table found: dropping table ", stderr);
+ ut_print_name(stderr, trx, table->name);
+ fputs(" in recovery\n", stderr);
+
+ err = row_drop_table_for_mysql(table->name, trx, TRUE);
+
+ ut_a(err == (int) DB_SUCCESS);
+ }
+ }
+
+ if (trx->dict_operation) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ fprintf(stderr, "\nInnoDB: Rolling back of trx id %lu %lu completed\n",
+ (ulong) ut_dulint_get_high(trx->id),
+ (ulong) ut_dulint_get_low(trx->id));
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+
+ goto loop;
+
+leave_function:
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ /* The following is dummy code to keep the compiler happy: */
+
+#ifndef __WIN__
+ return(NULL);
+#else
+ return(0);
+#endif
+}
+
+/***********************************************************************
+Creates an undo number array with UNIV_MAX_PARALLELISM cells, all of them
+initially unused. The array and its cells are allocated from a heap of
+their own, which trx_undo_arr_free() frees. */
+
+trx_undo_arr_t*
+trx_undo_arr_create(void)
+/*=====================*/
+				/* out, own: undo number array */
+{
+	mem_heap_t*	arr_heap;
+	trx_undo_arr_t*	arr;
+	ulint		slot;
+
+	arr_heap = mem_heap_create(1024);
+
+	arr = mem_heap_alloc(arr_heap, sizeof(trx_undo_arr_t));
+
+	arr->heap = arr_heap;
+	arr->n_used = 0;
+	arr->n_cells = UNIV_MAX_PARALLELISM;
+	arr->infos = mem_heap_alloc(arr_heap, sizeof(trx_undo_inf_t)
+				    * UNIV_MAX_PARALLELISM);
+
+	/* Mark every cell free */
+
+	for (slot = 0; slot < UNIV_MAX_PARALLELISM; slot++) {
+
+		(trx_undo_arr_get_nth_info(arr, slot))->in_use = FALSE;
+	}
+
+	return(arr);
+}
+
+/***********************************************************************
+Frees an undo number array by freeing the heap it was allocated from.
+In debug builds it is asserted that no cell is in use. */
+
+void
+trx_undo_arr_free(
+/*==============*/
+	trx_undo_arr_t*	arr)	/* in, own: undo number array */
+{
+	mem_heap_t*	arr_heap;
+
+	ut_ad(arr->n_used == 0);
+
+	/* arr itself lives in this heap, so it is gone after the call */
+
+	arr_heap = arr->heap;
+
+	mem_heap_free(arr_heap);
+}
+
+/***********************************************************************
+Stores info of an undo log record to the array if it is not stored yet.
+The array is scanned from the start: the undo number is tentatively stored
+in the first free cell met, and the store is cancelled if the same undo
+number is then found in a cell that is already in use. */
+static
+ibool
+trx_undo_arr_store_info(
+/*====================*/
+ /* out: FALSE if the record already existed in the
+ array, TRUE if it was stored */
+ trx_t* trx, /* in: transaction */
+ dulint undo_no)/* in: undo number */
+{
+ trx_undo_inf_t* cell;
+ trx_undo_inf_t* stored_here;
+ trx_undo_arr_t* arr;
+ ulint n_used;
+ ulint n;
+ ulint i;
+
+ /* n counts the in-use cells seen so far; n_used is the number of
+ in-use cells at entry, before a possible tentative store */
+
+ n = 0;
+ arr = trx->undo_no_arr;
+ n_used = arr->n_used;
+ stored_here = NULL;
+
+ /* NOTE(review): the loop has no upper bound on i; it ends only
+ through one of the returns below. Presumably the array always has
+ a free cell when this is called -- confirm against callers */
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (!cell->in_use) {
+ if (!stored_here) {
+ /* Not in use, we may store here */
+ cell->undo_no = undo_no;
+ cell->in_use = TRUE;
+
+ arr->n_used++;
+
+ stored_here = cell;
+ }
+ } else {
+ n++;
+
+ if (0 == ut_dulint_cmp(cell->undo_no, undo_no)) {
+
+ /* Already present: cancel the tentative
+ store, if one was made */
+
+ if (stored_here) {
+ stored_here->in_use = FALSE;
+ ut_ad(arr->n_used > 0);
+ arr->n_used--;
+ }
+
+ ut_ad(arr->n_used == n_used);
+
+ return(FALSE);
+ }
+ }
+
+ /* All cells that were in use at entry have been compared
+ and the undo number has a cell: the store is final */
+
+ if (n == n_used && stored_here) {
+
+ ut_ad(arr->n_used == 1 + n_used);
+
+ return(TRUE);
+ }
+ }
+}
+
+/***********************************************************************
+Removes an undo number from the array: the first in-use cell that holds
+undo_no is marked free and the count of used cells is decremented. The
+scan is limited to the cells of the array; if undo_no is not found, a
+debug assertion fails and the array is left unchanged. */
+static
+void
+trx_undo_arr_remove_info(
+/*=====================*/
+	trx_undo_arr_t*	arr,	/* in: undo number array */
+	dulint		undo_no)/* in: undo number */
+{
+	trx_undo_inf_t*	cell;
+	ulint		i;
+
+	for (i = 0; i < arr->n_cells; i++) {
+		cell = trx_undo_arr_get_nth_info(arr, i);
+
+		if (cell->in_use
+		    && 0 == ut_dulint_cmp(cell->undo_no, undo_no)) {
+
+			cell->in_use = FALSE;
+
+			ut_ad(arr->n_used > 0);
+
+			arr->n_used--;
+
+			return;
+		}
+	}
+
+	/* The undo number was not in the array: the caller released a
+	record that was never reserved */
+
+	ut_ad(0);
+}
+
+/***********************************************************************
+Gets the biggest undo number in an array. The cells are scanned from the
+start until as many in-use cells have been seen as arr->n_used tells. */
+static
+dulint
+trx_undo_arr_get_biggest(
+/*=====================*/
+				/* out: biggest value, ut_dulint_zero if
+				the array is empty */
+	trx_undo_arr_t*	arr)	/* in: undo number array */
+{
+	trx_undo_inf_t*	cell;
+	dulint		max_undo_no;
+	ulint		n_total;
+	ulint		n_seen;
+	ulint		pos;
+
+	max_undo_no = ut_dulint_zero;
+	n_total = arr->n_used;
+	n_seen = 0;
+	pos = 0;
+
+	/* At least one cell is always inspected, also when n_total is 0 */
+
+	do {
+		cell = trx_undo_arr_get_nth_info(arr, pos);
+		pos++;
+
+		if (cell->in_use) {
+			n_seen++;
+
+			if (ut_dulint_cmp(cell->undo_no, max_undo_no) > 0) {
+
+				max_undo_no = cell->undo_no;
+			}
+		}
+	} while (n_seen != n_total);
+
+	return(max_undo_no);
+}
+
+/***************************************************************************
+Tries to truncate the undo logs of a transaction. The truncation limit is
+trx->undo_no, or one past the biggest undo number that is currently
+reserved in the undo number array, whichever is bigger. Also resets
+trx->pages_undone. */
+
+void
+trx_roll_try_truncate(
+/*==================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	dulint	trunc_limit;
+	dulint	max_reserved;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+	ut_ad(mutex_own(&((trx->rseg)->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	trx->pages_undone = 0;
+
+	trunc_limit = trx->undo_no;
+
+	if ((trx->undo_no_arr)->n_used > 0) {
+		/* Records that are still being processed must not be
+		truncated away */
+
+		max_reserved = trx_undo_arr_get_biggest(trx->undo_no_arr);
+
+		if (ut_dulint_cmp(max_reserved, trunc_limit) >= 0) {
+
+			trunc_limit = ut_dulint_add(max_reserved, 1);
+		}
+	}
+
+	if (trx->insert_undo != NULL) {
+		trx_undo_truncate_end(trx, trx->insert_undo, trunc_limit);
+	}
+
+	if (trx->update_undo != NULL) {
+		trx_undo_truncate_end(trx, trx->update_undo, trunc_limit);
+	}
+}
+
+/***************************************************************************
+Pops the topmost undo log record in a single undo log and updates the info
+about the topmost record in the undo log memory struct: either the log is
+marked empty, or its top is moved to the previous record. If the previous
+record is on a different page, trx->pages_undone is incremented. */
+static
+trx_undo_rec_t*
+trx_roll_pop_top_rec(
+/*=================*/
+				/* out: undo log record, the page s-latched */
+	trx_t*		trx,	/* in: transaction */
+	trx_undo_t*	undo,	/* in: undo log */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	page_t*		top_page;
+	page_t*		prev_page;
+	trx_undo_rec_t*	top_rec;
+	trx_undo_rec_t*	prev_rec;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	top_page = trx_undo_page_get_s_latched(undo->space,
+					       undo->top_page_no, mtr);
+
+	/* The record to return: its address is computed before the top
+	info in undo is changed below */
+
+	top_rec = top_page + undo->top_offset;
+
+	prev_rec = trx_undo_get_prev_rec(top_rec, undo->hdr_page_no,
+					 undo->hdr_offset, mtr);
+
+	if (prev_rec == NULL) {
+
+		undo->empty = TRUE;
+
+		return(top_rec);
+	}
+
+	prev_page = buf_frame_align(prev_rec);
+
+	if (prev_page != top_page) {
+
+		trx->pages_undone++;
+	}
+
+	undo->top_page_no = buf_frame_get_page_no(prev_page);
+	undo->top_offset = prev_rec - prev_page;
+	undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec);
+
+	return(top_rec);
+}
+
+/************************************************************************
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record to the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release. */
+
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ /* out: undo log record copied to heap, NULL
+ if none left, or if the undo number of the
+ top record would be less than the limit */
+ trx_t* trx, /* in: transaction */
+ dulint limit, /* in: least undo number we need */
+ dulint* roll_ptr,/* out: roll pointer to undo record */
+ mem_heap_t* heap) /* in: memory heap where copied */
+{
+ trx_undo_t* undo;
+ trx_undo_t* ins_undo;
+ trx_undo_t* upd_undo;
+ trx_undo_rec_t* undo_rec;
+ trx_undo_rec_t* undo_rec_copy;
+ dulint undo_no;
+ ibool is_insert;
+ trx_rseg_t* rseg;
+ ulint progress_pct;
+ mtr_t mtr;
+
+ rseg = trx->rseg;
+try_again:
+ /* The function restarts from here if the popped record turns out
+ to be reserved already; undo_mutex is taken anew on each attempt */
+
+ mutex_enter(&(trx->undo_mutex));
+
+ /* Truncate the undo logs once enough pages have been undone since
+ the previous truncate */
+
+ if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) {
+ mutex_enter(&(rseg->mutex));
+
+ trx_roll_try_truncate(trx);
+
+ mutex_exit(&(rseg->mutex));
+ }
+
+ ins_undo = trx->insert_undo;
+ upd_undo = trx->update_undo;
+
+ /* Choose the undo log to pop from: the only non-empty one, or the
+ one whose top record has the bigger undo number */
+
+ if (!ins_undo || ins_undo->empty) {
+ undo = upd_undo;
+ } else if (!upd_undo || upd_undo->empty) {
+ undo = ins_undo;
+ } else if (ut_dulint_cmp(upd_undo->top_undo_no,
+ ins_undo->top_undo_no) > 0) {
+ undo = upd_undo;
+ } else {
+ undo = ins_undo;
+ }
+
+ /* Nothing to pop: there is no undo log, it is empty, or its top
+ record is below the limit */
+
+ if (!undo || undo->empty
+ || (ut_dulint_cmp(limit, undo->top_undo_no) > 0)) {
+
+ if ((trx->undo_no_arr)->n_used == 0) {
+ /* Rollback is ending */
+
+ mutex_enter(&(rseg->mutex));
+
+ trx_roll_try_truncate(trx);
+
+ mutex_exit(&(rseg->mutex));
+ }
+
+ mutex_exit(&(trx->undo_mutex));
+
+ return(NULL);
+ }
+
+ if (undo == ins_undo) {
+ is_insert = TRUE;
+ } else {
+ is_insert = FALSE;
+ }
+
+ /* The roll pointer is built from the top page number and offset
+ before the pop below moves them to the previous record */
+
+ *roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id,
+ undo->top_page_no, undo->top_offset);
+ mtr_start(&mtr);
+
+ undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr);
+
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ ut_ad(ut_dulint_cmp(ut_dulint_add(undo_no, 1), trx->undo_no) == 0);
+
+ /* We print rollback progress info if we are in a crash recovery
+ and the transaction has at least 1000 row operations to undo. */
+
+ if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) {
+
+ progress_pct = 100 - (ulint)
+ ((ut_conv_dulint_to_longlong(undo_no) * 100)
+ / trx_roll_max_undo_no);
+ if (progress_pct != trx_roll_progress_printed_pct) {
+ if (trx_roll_progress_printed_pct == 0) {
+ fprintf(stderr,
+"\nInnoDB: Progress in percents: %lu", (ulong) progress_pct);
+ } else {
+ fprintf(stderr,
+ " %lu", (ulong) progress_pct);
+ }
+ fflush(stderr);
+ trx_roll_progress_printed_pct = progress_pct;
+ }
+ }
+
+ /* The undo number of the transaction moves down to the popped
+ record */
+
+ trx->undo_no = undo_no;
+
+ if (!trx_undo_arr_store_info(trx, undo_no)) {
+ /* A query thread is already processing this undo log record */
+
+ mutex_exit(&(trx->undo_mutex));
+
+ mtr_commit(&mtr);
+
+ goto try_again;
+ }
+
+ /* The record is copied to heap before the mtr is committed; the
+ copy is what the caller gets */
+
+ undo_rec_copy = trx_undo_rec_copy(undo_rec, heap);
+
+ mutex_exit(&(trx->undo_mutex));
+
+ mtr_commit(&mtr);
+
+ return(undo_rec_copy);
+}
+
+/************************************************************************
+Reserves an undo log record for a query thread to undo, by storing its
+undo number in the undo number array of the transaction under
+trx->undo_mutex. This should be called if the query thread gets the undo
+log record not using the pop function above. */
+
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+			/* out: TRUE if succeeded, FALSE if the undo
+			number already was in the array */
+	trx_t*	trx,	/* in: transaction */
+	dulint	undo_no)/* in: undo number of the record */
+{
+	ibool	reserved;
+
+	mutex_enter(&(trx->undo_mutex));
+	reserved = trx_undo_arr_store_info(trx, undo_no);
+	mutex_exit(&(trx->undo_mutex));
+
+	return(reserved);
+}
+
+/***********************************************************************
+Releases a reserved undo record, by removing its undo number from the
+undo number array of the transaction under trx->undo_mutex. */
+
+void
+trx_undo_rec_release(
+/*=================*/
+	trx_t*	trx,	/* in: transaction */
+	dulint	undo_no)/* in: undo number */
+{
+	mutex_enter(&(trx->undo_mutex));
+
+	trx_undo_arr_remove_info(trx->undo_no_arr, undo_no);
+
+	mutex_exit(&(trx->undo_mutex));
+}
+
+/*************************************************************************
+Starts a rollback operation: sets the rollback limit of the transaction
+according to the signal type, builds a rollback query graph, and either
+hands the graph's query thread to the caller or enqueues it as a task. */
+
+void
+trx_rollback(
+/*=========*/
+	trx_t*		trx,	/* in: transaction */
+	trx_sig_t*	sig,	/* in: signal starting the rollback */
+	que_thr_t**	next_thr)/* in/out: next query thread to run;
+				if the value which is passed in is
+				a pointer to a NULL pointer, then the
+				calling function can start running
+				a new query thread; if the passed value is
+				NULL, the parameter is ignored */
+{
+	que_t*		graph;
+	que_thr_t*	roll_thr;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0));
+
+	/* Initialize the rollback limit in the transaction: undo records
+	are popped down to this undo number */
+
+	switch (sig->type) {
+	case TRX_SIG_TOTAL_ROLLBACK:
+		trx->roll_limit = ut_dulint_zero;
+		break;
+
+	case TRX_SIG_ROLLBACK_TO_SAVEPT:
+		trx->roll_limit = (sig->savept).least_undo_no;
+		break;
+
+	case TRX_SIG_ERROR_OCCURRED:
+		trx->roll_limit = trx->last_sql_stat_start.least_undo_no;
+		break;
+
+	default:
+		ut_error;
+	}
+
+	ut_a(ut_dulint_cmp(trx->roll_limit, trx->undo_no) <= 0);
+
+	trx->pages_undone = 0;
+
+	/* The undo number array is created when it is first needed */
+
+	if (trx->undo_no_arr == NULL) {
+		trx->undo_no_arr = trx_undo_arr_create();
+	}
+
+	/* Build a 'query' graph which will perform the undo operations */
+
+	graph = trx_roll_graph_build(trx);
+
+	trx->graph = graph;
+	trx->que_state = TRX_QUE_ROLLING_BACK;
+
+	roll_thr = que_fork_start_command(graph);
+
+	ut_ad(roll_thr);
+
+	/* Only one query thread is started for the graph */
+
+	if (next_thr == NULL || *next_thr != NULL) {
+		srv_que_task_enqueue_low(roll_thr);
+	} else {
+		*next_thr = roll_thr;
+	}
+}
+
+/********************************************************************
+Builds an undo 'query' graph for a transaction: a rollback fork with a
+single query thread whose child is a row undo node. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph. */
+
+que_t*
+trx_roll_graph_build(
+/*=================*/
+			/* out, own: the query graph */
+	trx_t*	trx)	/* in: trx handle */
+{
+	mem_heap_t*	graph_heap;
+	que_fork_t*	graph;
+	que_thr_t*	undo_thr;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* All nodes of the graph are allocated from the same heap */
+
+	graph_heap = mem_heap_create(512);
+
+	graph = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, graph_heap);
+	graph->trx = trx;
+
+	undo_thr = que_thr_create(graph, graph_heap);
+
+	undo_thr->child = row_undo_node_create(trx, undo_thr, graph_heap);
+
+	return(graph);
+}
+
+/*************************************************************************
+Completes error processing once the required partial rollback is done:
+every TRX_SIG_ERROR_OCCURRED signal is removed from the signal queue of
+the transaction and the transaction is put back to the running state. */
+static
+void
+trx_finish_error_processing(
+/*========================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	trx_sig_t*	cur;
+	trx_sig_t*	following;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	for (cur = UT_LIST_GET_FIRST(trx->signals);
+	     cur != NULL;
+	     cur = following) {
+
+		/* Fetch the successor first: cur may be removed below */
+		following = UT_LIST_GET_NEXT(signals, cur);
+
+		if (cur->type == TRX_SIG_ERROR_OCCURRED) {
+
+			trx_sig_remove(trx, cur);
+		}
+	}
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/*************************************************************************
+Completes a partial rollback: replies to the first signal in the signal
+queue of the transaction, removes that signal, and puts the transaction
+back to the running state. */
+static
+void
+trx_finish_partial_rollback_off_kernel(
+/*===================================*/
+	trx_t*		trx,	/* in: transaction */
+	que_thr_t**	next_thr)/* in/out: next query thread to run;
+				if the value which is passed in is a pointer
+				to a NULL pointer, then the calling function
+				can start running a new query thread; if this
+				parameter is NULL, it is ignored */
+{
+	trx_sig_t*	head_sig;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	head_sig = UT_LIST_GET_FIRST(trx->signals);
+
+	/* The reply must be sent before the signal is taken off the queue */
+
+	trx_sig_reply(head_sig, next_thr);
+	trx_sig_remove(trx, head_sig);
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/********************************************************************
+Completes a rollback. The undo graph is freed first. What happens next
+depends on the type of the first signal in the signal queue: a rollback to
+a savepoint or an error rollback is finished by its own helper, while a
+total rollback commits the transaction and replies to and removes every
+TRX_SIG_TOTAL_ROLLBACK signal. */
+
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+	que_t*		graph,	/* in: undo graph which can now be freed */
+	trx_t*		trx,	/* in: transaction */
+	que_thr_t**	next_thr)/* in/out: next query thread to run;
+				if the value which is passed in is
+				a pointer to a NULL pointer, then the
+				calling function can start running
+				a new query thread; if this parameter is
+				NULL, it is ignored */
+{
+	trx_sig_t*	cur;
+	trx_sig_t*	following;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
+
+	/* The undo graph has done its job: release its memory */
+	que_graph_free(graph);
+
+	cur = UT_LIST_GET_FIRST(trx->signals);
+
+	if (cur->type == TRX_SIG_ROLLBACK_TO_SAVEPT) {
+
+		trx_finish_partial_rollback_off_kernel(trx, next_thr);
+
+		return;
+	}
+
+	if (cur->type == TRX_SIG_ERROR_OCCURRED) {
+
+		trx_finish_error_processing(trx);
+
+		return;
+	}
+
+	/* From here on we are finishing a total rollback */
+
+	if (lock_print_waits) {
+		fprintf(stderr, "Trx %lu rollback finished\n",
+			(ulong) ut_dulint_get_low(trx->id));
+	}
+
+	trx_commit_off_kernel(trx);
+
+	trx->que_state = TRX_QUE_RUNNING;
+
+	/* Reply to and remove each TRX_SIG_TOTAL_ROLLBACK signal; signals
+	of other types stay in the queue */
+
+	for (; cur != NULL; cur = following) {
+
+		/* Fetch the successor first: cur may be removed below */
+		following = UT_LIST_GET_NEXT(signals, cur);
+
+		if (cur->type == TRX_SIG_TOTAL_ROLLBACK) {
+
+			trx_sig_reply(cur, next_thr);
+
+			trx_sig_remove(trx, cur);
+		}
+	}
+}
+
+/*************************************************************************
+Allocates a rollback command node from a memory heap. The new node is of
+type QUE_NODE_ROLLBACK, is in the ROLL_NODE_SEND state, and is not marked
+as a partial rollback. */
+
+roll_node_t*
+roll_node_create(
+/*=============*/
+				/* out, own: rollback node struct */
+	mem_heap_t*	heap)	/* in: mem heap where created */
+{
+	roll_node_t*	new_node;
+
+	new_node = mem_heap_alloc(heap, sizeof(*new_node));
+
+	new_node->common.type = QUE_NODE_ROLLBACK;
+	new_node->partial = FALSE;
+	new_node->state = ROLL_NODE_SEND;
+
+	return(new_node);
+}
+
+/***************************************************************
+Executes one step of a rollback command node in a query graph. In the
+send state a rollback signal is sent to the transaction of the thread and
+the thread is left waiting for the reply; in the wait state control is
+passed back to the parent node. */
+
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+				/* out: query thread to run next, or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	roll_node_t*	node;
+	ibool		sent;
+	ulint		sig_type;
+	trx_savept_t*	target;
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+	/* Arriving from the parent means a new execution of the node */
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+		node->state = ROLL_NODE_SEND;
+	}
+
+	if (node->state != ROLL_NODE_SEND) {
+		/* The signal was sent on an earlier step: move on to the
+		parent node */
+
+		ut_ad(node->state == ROLL_NODE_WAIT);
+
+		thr->run_node = que_node_get_parent(node);
+
+		return(thr);
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	node->state = ROLL_NODE_WAIT;
+
+	/* A partial rollback goes to the savepoint stored in the node; a
+	total rollback carries no savepoint */
+
+	sig_type = node->partial
+		? TRX_SIG_ROLLBACK_TO_SAVEPT : TRX_SIG_TOTAL_ROLLBACK;
+	target = node->partial ? &(node->savept) : NULL;
+
+	sent = trx_sig_send(thr_get_trx(thr), sig_type, TRX_SIG_SELF,
+						thr, target, NULL);
+
+	thr->state = QUE_THR_SIG_REPLY_WAIT;
+
+	mutex_exit(&kernel_mutex);
+
+	if (!sent) {
+		/* Error in delivering the rollback signal */
+		que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+	}
+
+	return(NULL);
+}
diff --git a/storage/innobase/trx/trx0rseg.c b/storage/innobase/trx/trx0rseg.c
new file mode 100644
index 00000000000..a01d4bb835d
--- /dev/null
+++ b/storage/innobase/trx/trx0rseg.c
@@ -0,0 +1,261 @@
+/******************************************************
+Rollback segment
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+
+/**********************************************************************
+Walks the rollback segment list of the trx system and returns the segment
+with the given id. The absence of the id is only caught by a debug
+assertion. */
+
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+			/* out: rollback segment */
+	ulint	id)	/* in: rollback segment id */
+{
+	trx_rseg_t*	rseg;
+
+	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	for (;;) {
+		ut_ad(rseg);
+
+		if (rseg->id == id) {
+
+			return(rseg);
+		}
+
+		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+	}
+}
+
+/********************************************************************
+Creates the header of a new rollback segment: reserves a free slot in the
+trx system header, creates a file segment for the rollback segment,
+initializes the max size, the history list and the undo log slots in the
+segment header, and records the space id and the page number in the
+reserved slot. */
+
+ulint
+trx_rseg_header_create(
+/*===================*/
+				/* out: page number of the created segment,
+				FIL_NULL if fail */
+	ulint	space,		/* in: space id */
+	ulint	max_size,	/* in: max size in pages */
+	ulint*	slot_no,	/* out: rseg id == slot number in trx sys */
+	mtr_t*	mtr)		/* in: mtr */
+{
+	trx_sysf_t*	sys_header;
+	trx_rsegf_t*	seg_header;
+	page_t*		seg_page;
+	ulint		seg_page_no;
+	ulint		undo_slot;
+
+	ut_ad(mtr);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space),
+							MTR_MEMO_X_LOCK));
+	sys_header = trx_sysf_get(mtr);
+
+	*slot_no = trx_sysf_rseg_find_free(mtr);
+
+	if (*slot_no == ULINT_UNDEFINED) {
+		/* Every slot in the trx system header is in use */
+
+		return(FIL_NULL);
+	}
+
+	/* The rollback segment lives in a file segment of its own */
+	seg_page = fseg_create(space, 0, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+
+	if (seg_page == NULL) {
+		/* No space left */
+
+		return(FIL_NULL);
+	}
+
+#ifdef UNIV_SYNC_DEBUG
+	buf_page_dbg_add_level(seg_page, SYNC_RSEG_HEADER_NEW);
+#endif /* UNIV_SYNC_DEBUG */
+
+	seg_page_no = buf_frame_get_page_no(seg_page);
+
+	seg_header = trx_rsegf_get_new(space, seg_page_no, mtr);
+
+	/* Max size field */
+	mlog_write_ulint(seg_header + TRX_RSEG_MAX_SIZE, max_size,
+							MLOG_4BYTES, mtr);
+
+	/* Empty history list */
+	mlog_write_ulint(seg_header + TRX_RSEG_HISTORY_SIZE, 0,
+							MLOG_4BYTES, mtr);
+	flst_init(seg_header + TRX_RSEG_HISTORY, mtr);
+
+	/* All undo log slots start out unused */
+	undo_slot = 0;
+
+	while (undo_slot < TRX_RSEG_N_SLOTS) {
+
+		trx_rsegf_set_nth_undo(seg_header, undo_slot, FIL_NULL, mtr);
+
+		undo_slot++;
+	}
+
+	/* Publish the new segment in the slot reserved above */
+
+	trx_sysf_rseg_set_space(sys_header, *slot_no, space, mtr);
+	trx_sysf_rseg_set_page_no(sys_header, *slot_no, seg_page_no, mtr);
+
+	return(seg_page_no);
+}
+
+/***************************************************************************
+Builds the in-memory object of a rollback segment from its file header.
+The object is appended to the rseg list of the trx system object and
+stored in the rseg array under its id. If the history list of the segment
+is not empty, the position, trx number and delete mark flag of the last
+log in the history list are cached in the object. */
+static
+trx_rseg_t*
+trx_rseg_mem_create(
+/*================*/
+				/* out, own: rollback segment object */
+	ulint	id,		/* in: rollback segment id */
+	ulint	space,		/* in: space where the segment placed */
+	ulint	page_no,	/* in: page number of the segment header */
+	mtr_t*	mtr)		/* in: mtr */
+{
+	trx_rseg_t*	rseg;
+	trx_rsegf_t*	header;
+	trx_ulogf_t*	last_log;
+	fil_addr_t	last_addr;
+	ulint		undo_pages;
+	ulint		hist_len;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rseg = mem_alloc(sizeof(trx_rseg_t));
+
+	rseg->id = id;
+	rseg->space = space;
+	rseg->page_no = page_no;
+
+	mutex_create(&(rseg->mutex));
+	mutex_set_level(&(rseg->mutex), SYNC_RSEG);
+
+	/* Register the object both in the list and in the array */
+
+	UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg);
+
+	trx_sys_set_nth_rseg(trx_sys, id, rseg);
+
+	header = trx_rsegf_get_new(space, page_no, mtr);
+
+	rseg->max_size = mtr_read_ulint(header + TRX_RSEG_MAX_SIZE,
+							MLOG_4BYTES, mtr);
+
+	/* The undo log lists are set up from the rseg header; the call
+	returns the sum of the undo log sizes */
+
+	undo_pages = trx_undo_lists_init(rseg);
+
+	/* Current size: history size + 1 + the undo logs */
+
+	rseg->curr_size = mtr_read_ulint(header + TRX_RSEG_HISTORY_SIZE,
+							MLOG_4BYTES, mtr)
+			+ 1 + undo_pages;
+
+	hist_len = flst_get_len(header + TRX_RSEG_HISTORY, mtr);
+
+	if (hist_len == 0) {
+		/* Empty history list: there is no last log */
+
+		rseg->last_page_no = FIL_NULL;
+
+		return(rseg);
+	}
+
+	trx_sys->rseg_history_len += hist_len;
+
+	last_addr = trx_purge_get_log_from_hist(
+			flst_get_last(header + TRX_RSEG_HISTORY, mtr));
+
+	rseg->last_page_no = last_addr.page;
+	rseg->last_offset = last_addr.boffset;
+
+	last_log = trx_undo_page_get(rseg->space, last_addr.page, mtr)
+			+ last_addr.boffset;
+
+	rseg->last_trx_no = mtr_read_dulint(last_log + TRX_UNDO_TRX_NO, mtr);
+	rseg->last_del_marks = mtr_read_ulint(last_log + TRX_UNDO_DEL_MARKS,
+							MLOG_2BYTES, mtr);
+	return(rseg);
+}
+
+/*************************************************************************
+At a database startup initializes the rseg list and array in trx_sys and
+creates a memory object for every rollback segment slot that is in use in
+the trx system header. Unused slots get a NULL entry in the array. */
+
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+	trx_sysf_t*	sys_header,	/* in: trx system header */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	ulint	slot;
+	ulint	hdr_page_no;
+
+	UT_LIST_INIT(trx_sys->rseg_list);
+
+	trx_sys->rseg_history_len = 0;
+
+	for (slot = 0; slot < TRX_SYS_N_RSEGS; slot++) {
+
+		hdr_page_no = trx_sysf_rseg_get_page_no(sys_header, slot,
+									mtr);
+		if (hdr_page_no != FIL_NULL) {
+			/* The slot is in use: build the memory object */
+
+			trx_rseg_mem_create(slot,
+				trx_sysf_rseg_get_space(sys_header, slot,
+									mtr),
+				hdr_page_no, mtr);
+			continue;
+		}
+
+		trx_sys_set_nth_rseg(trx_sys, slot, NULL);
+	}
+}
+
+/********************************************************************
+Creates a new rollback segment to the database: x-locks the tablespace
+latch in the mtr, and then, holding the kernel mutex, creates the segment
+header and the memory object for the segment. */
+
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+				/* out: the created segment object, NULL if
+				fail */
+	ulint	space,		/* in: space id */
+	ulint	max_size,	/* in: max size in pages */
+	ulint*	id,		/* out: rseg id */
+	mtr_t*	mtr)		/* in: mtr */
+{
+	trx_rseg_t*	new_rseg = NULL;
+	ulint		hdr_page_no;
+
+	mtr_x_lock(fil_space_get_latch(space), mtr);
+	mutex_enter(&kernel_mutex);
+
+	hdr_page_no = trx_rseg_header_create(space, max_size, id, mtr);
+
+	/* Without a header there is nothing to build the object from */
+
+	if (hdr_page_no != FIL_NULL) {
+
+		new_rseg = trx_rseg_mem_create(*id, space, hdr_page_no, mtr);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	return(new_rseg);
+}
diff --git a/storage/innobase/trx/trx0sys.c b/storage/innobase/trx/trx0sys.c
new file mode 100644
index 00000000000..68fe6d5079a
--- /dev/null
+++ b/storage/innobase/trx/trx0sys.c
@@ -0,0 +1,964 @@
+/******************************************************
+Transaction system
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+
+#ifdef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mtr0mtr.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "os0file.h"
+
+/* The transaction system; NULL until it has been created */
+trx_sys_t* trx_sys = NULL;
+trx_doublewrite_t* trx_doublewrite = NULL; /* doublewrite buffer memory
+struct; allocated in trx_doublewrite_init() and reset to NULL in
+trx_doublewrite_free() */
+
+/* The following is set to TRUE when we are upgrading from the old data file
+format to the >= 4.1.x data file format, which supports multiple
+tablespaces */
+
+ibool trx_doublewrite_must_reset_space_ids = FALSE;
+
+/* The following is TRUE when we are using the database in the new format,
+i.e., we have successfully upgraded, or have created a new database
+installation */
+
+ibool trx_sys_multiple_tablespace_format = FALSE;
+
+/* In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. We have successfully got the updates to InnoDB
+up to this position. If .._pos is -1, it means no crash recovery was needed,
+or there was no master log position info inside InnoDB. */
+
+char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+ib_longlong trx_sys_mysql_master_log_pos = -1;
+
+/* If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. If .._pos is -1, it means there was no binlog position info inside
+InnoDB. */
+
+char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+ib_longlong trx_sys_mysql_bin_log_pos = -1;
+
+
+/********************************************************************
+Tells whether a page number falls inside either of the two blocks of the
+doublewrite buffer. Always FALSE when no doublewrite buffer struct
+exists. */
+
+ibool
+trx_doublewrite_page_inside(
+/*========================*/
+				/* out: TRUE if the location is inside
+				the two blocks of the doublewrite buffer */
+	ulint	page_no)	/* in: page number */
+{
+	ulint	first1;
+	ulint	first2;
+
+	if (trx_doublewrite == NULL) {
+
+		return(FALSE);
+	}
+
+	first1 = trx_doublewrite->block1;
+	first2 = trx_doublewrite->block2;
+
+	/* Each block covers TRX_SYS_DOUBLEWRITE_BLOCK_SIZE consecutive
+	pages starting from its first page */
+
+	if ((page_no >= first1
+	     && page_no < first1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+	    || (page_no >= first2
+		&& page_no < first2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)) {
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************
+Creates the doublewrite buffer memory struct at a database start and
+initializes it from the doublewrite header: reads the first page numbers
+of the two blocks and allocates the page-aligned write buffer and the
+buffer block array. Also turns off flushing after each data file write. */
+static
+void
+trx_doublewrite_init(
+/*=================*/
+	byte*	doublewrite)	/* in: pointer to the doublewrite buf
+				header on trx sys page */
+{
+	/* Room for both blocks, plus one extra page so that the start of
+	the buffer can be aligned to a page boundary */
+	ulint	unaligned_size = (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+				* UNIV_PAGE_SIZE;
+
+	trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t));
+
+	/* With the doublewrite buffer in use there is no need to call
+	fsync() after every write to a data file */
+
+	os_do_not_call_flush_at_each_write = TRUE;
+
+	mutex_create(&(trx_doublewrite->mutex));
+	mutex_set_level(&(trx_doublewrite->mutex), SYNC_DOUBLEWRITE);
+
+	trx_doublewrite->first_free = 0;
+
+	/* First page numbers of the two blocks, as stored in the header */
+
+	trx_doublewrite->block1 = mach_read_from_4(
+			doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+	trx_doublewrite->block2 = mach_read_from_4(
+			doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+
+	trx_doublewrite->write_buf_unaligned = ut_malloc(unaligned_size);
+
+	trx_doublewrite->write_buf = ut_align(
+			trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE);
+
+	/* One pointer slot per page of the two blocks */
+
+	trx_doublewrite->buf_block_arr = mem_alloc(
+		2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*));
+}
+
+/********************************************************************
+Releases the doublewrite buffer memory struct: its mutex, the write
+buffer, the buffer block array and the struct itself. The global pointer
+is reset to NULL. */
+static
+void
+trx_doublewrite_free(void)
+/*======================*/
+{
+	mutex_free(&(trx_doublewrite->mutex));
+
+	/* The members must go before the struct which owns them */
+
+	ut_free(trx_doublewrite->write_buf_unaligned);
+	mem_free(trx_doublewrite->buf_block_arr);
+
+	mem_free(trx_doublewrite);
+
+	trx_doublewrite = NULL;
+}
+
+/********************************************************************
+Records in the trx sys header that the upgrade to the >= 4.1.x multiple
+tablespace format has been done, makes a checkpoint, and sets the flag
+trx_sys_multiple_tablespace_format. */
+
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void)
+/*===============================================*/
+{
+	page_t*	sys_page;
+	mtr_t	mtr;
+
+	/* The space id fields in the doublewrite buffer have been reset
+	in the upgrade to 4.1.x; store a mark about the finished upgrade to
+	the doublewrite header on the trx sys page */
+
+	mtr_start(&mtr);
+
+	sys_page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH,
+									&mtr);
+#ifdef UNIV_SYNC_DEBUG
+	buf_page_dbg_add_level(sys_page, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+
+	mlog_write_ulint(sys_page + TRX_SYS_DOUBLEWRITE
+				+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+			TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+			MLOG_4BYTES, &mtr);
+	mtr_commit(&mtr);
+
+	/* Get the modified pages to disk and make a checkpoint */
+	log_make_checkpoint_at(ut_dulint_max, TRUE);
+
+	trx_sys_multiple_tablespace_format = TRUE;
+}
+
+/********************************************************************
+Creates the doublewrite buffer to a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page.
+
+If the doublewrite memory struct already exists, the function does nothing.
+If the magic number in the doublewrite header shows that the buffer has
+already been created in the data files, only the memory struct is
+initialized. Otherwise the function creates a file segment, allocates the
+pages of the two doublewrite blocks, stores their first page numbers and the
+magic numbers in the header, makes a checkpoint, and starts again from the
+beginning to initialize the memory struct. The server exits if the buffer
+pool or the tablespace is too small for the doublewrite buffer. */
+
+void
+trx_sys_create_doublewrite_buf(void)
+/*================================*/
+{
+	page_t*	page;
+	page_t*	page2;
+	page_t*	new_page;
+	byte*	doublewrite;
+	byte*	fseg_header;
+	ulint	page_no;
+	ulint	prev_page_no;
+	ulint	i;
+	mtr_t	mtr;
+
+	if (trx_doublewrite) {
+		/* Already inited */
+
+		return;
+	}
+
+start_again:
+	mtr_start(&mtr);
+
+	page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+#ifdef UNIV_SYNC_DEBUG
+	buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+
+	doublewrite = page + TRX_SYS_DOUBLEWRITE;
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+		/* The doublewrite buffer has already been created:
+		just read in some numbers */
+
+		trx_doublewrite_init(doublewrite);
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	fprintf(stderr,
+		"InnoDB: Doublewrite buffer not found: creating new\n");
+
+	if (buf_pool_get_curr_size() <
+			(2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+				+ FSP_EXTENT_SIZE / 2 + 100)
+				* UNIV_PAGE_SIZE) {
+		fprintf(stderr,
+			"InnoDB: Cannot create doublewrite buffer: you must\n"
+			"InnoDB: increase your buffer pool size.\n"
+			"InnoDB: Cannot continue operation.\n");
+
+		exit(1);
+	}
+
+	page2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+		TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+	/* Check for failure before the returned page is used for anything,
+	including the debug latch level declaration below */
+
+	if (page2 == NULL) {
+		fprintf(stderr,
+			"InnoDB: Cannot create doublewrite buffer: you must\n"
+			"InnoDB: increase your tablespace size.\n"
+			"InnoDB: Cannot continue operation.\n");
+
+		/* We exit without committing the mtr to prevent
+		its modifications to the database getting to disk */
+
+		exit(1);
+	}
+
+	/* fseg_create acquires a second latch on the page,
+	therefore we must declare it: */
+
+#ifdef UNIV_SYNC_DEBUG
+	buf_page_dbg_add_level(page2, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+
+	fseg_header = page + TRX_SYS_DOUBLEWRITE
+			+ TRX_SYS_DOUBLEWRITE_FSEG;
+	prev_page_no = 0;
+
+	/* The first FSP_EXTENT_SIZE / 2 allocated pages are not part of
+	the blocks: block 1 starts from the page allocated at
+	i == FSP_EXTENT_SIZE / 2 */
+
+	for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+			+ FSP_EXTENT_SIZE / 2; i++) {
+		page_no = fseg_alloc_free_page(fseg_header,
+						prev_page_no + 1,
+						FSP_UP, &mtr);
+		if (page_no == FIL_NULL) {
+			fprintf(stderr,
+			"InnoDB: Cannot create doublewrite buffer: you must\n"
+			"InnoDB: increase your tablespace size.\n"
+			"InnoDB: Cannot continue operation.\n");
+
+			exit(1);
+		}
+
+		/* We read the allocated pages to the buffer pool;
+		when they are written to disk in a flush, the space
+		id and page number fields are also written to the
+		pages. When we at database startup read pages
+		from the doublewrite buffer, we know that if the
+		space id and page number in them are the same as
+		the page position in the tablespace, then the page
+		has not been written to in doublewrite. */
+
+		new_page = buf_page_get(TRX_SYS_SPACE, page_no,
+						RW_X_LATCH, &mtr);
+#ifdef UNIV_SYNC_DEBUG
+		buf_page_dbg_add_level(new_page, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+
+		/* Make a dummy change to the page to ensure it will
+		be written to disk in a flush */
+
+		mlog_write_ulint(new_page + FIL_PAGE_DATA,
+				TRX_SYS_DOUBLEWRITE_MAGIC_N,
+				MLOG_4BYTES, &mtr);
+
+		if (i == FSP_EXTENT_SIZE / 2) {
+			/* First page of block 1: store its number to
+			the header and to the repeated copy */
+
+			mlog_write_ulint(doublewrite
+					+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+					page_no, MLOG_4BYTES, &mtr);
+			mlog_write_ulint(doublewrite
+					+ TRX_SYS_DOUBLEWRITE_REPEAT
+					+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+					page_no, MLOG_4BYTES, &mtr);
+		} else if (i == FSP_EXTENT_SIZE / 2
+				+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+			/* First page of block 2 */
+
+			mlog_write_ulint(doublewrite
+					+ TRX_SYS_DOUBLEWRITE_BLOCK2,
+					page_no, MLOG_4BYTES, &mtr);
+			mlog_write_ulint(doublewrite
+					+ TRX_SYS_DOUBLEWRITE_REPEAT
+					+ TRX_SYS_DOUBLEWRITE_BLOCK2,
+					page_no, MLOG_4BYTES, &mtr);
+		} else if (i > FSP_EXTENT_SIZE / 2) {
+			/* Inside a block the pages must be consecutive */
+
+			ut_a(page_no == prev_page_no + 1);
+		}
+
+		prev_page_no = page_no;
+	}
+
+	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+			TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr);
+	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+			+ TRX_SYS_DOUBLEWRITE_REPEAT,
+			TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr);
+
+	mlog_write_ulint(doublewrite
+			+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+			TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+			MLOG_4BYTES, &mtr);
+	mtr_commit(&mtr);
+
+	/* Flush the modified pages to disk and make a checkpoint */
+	log_make_checkpoint_at(ut_dulint_max, TRUE);
+
+	fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
+
+	trx_sys_multiple_tablespace_format = TRUE;
+
+	/* The magic number is now in place: the next round initializes
+	the memory struct and returns */
+
+	goto start_again;
+}
+
+/********************************************************************
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files.
+
+If the trx sys header does not contain the doublewrite magic number, the
+function only frees its read buffer and returns. At the end the file spaces
+are flushed, and the doublewrite memory structure is freed again if
+srv_use_doublewrite_buf is not set. */
+
+void
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+ ibool restore_corrupt_pages) /* in: if FALSE, no page in the data files
+ is restored from the doublewrite buffer */
+{
+ byte* buf;
+ byte* read_buf;
+ byte* unaligned_read_buf;
+ ulint block1;
+ ulint block2;
+ ulint source_page_no;
+ byte* page;
+ byte* doublewrite;
+ ulint space_id;
+ ulint page_no;
+ ulint i;
+
+ /* We do the file i/o past the buffer pool: allocate two pages so that
+ a page-aligned buffer of one page fits inside the allocation */
+
+ unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+ read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
+
+ /* Read the trx sys header to check if we are using the doublewrite
+ buffer */
+
+ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, TRX_SYS_PAGE_NO, 0,
+ UNIV_PAGE_SIZE, read_buf, NULL);
+ doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has been created */
+
+ trx_doublewrite_init(doublewrite);
+
+ block1 = trx_doublewrite->block1;
+ block2 = trx_doublewrite->block2;
+
+ buf = trx_doublewrite->write_buf;
+ } else {
+ goto leave_func;
+ }
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+ != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+ /* We are upgrading from a version < 4.1.x to a version where
+ multiple tablespaces are supported. We must reset the space id
+ field in the pages in the doublewrite buffer because starting
+ from this version the space id is stored to
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+ trx_doublewrite_must_reset_space_ids = TRUE;
+
+ fprintf(stderr,
+"InnoDB: Resetting space id's in the doublewrite buffer\n");
+ } else {
+ trx_sys_multiple_tablespace_format = TRUE;
+ }
+
+ /* Read the pages from the doublewrite buffer to memory: block 1 goes
+ to the start of buf and block 2 right after it */
+
+ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block1, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf, NULL);
+ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block2, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ NULL);
+ /* Check if any of these pages is half-written in data files, in the
+ intended position */
+
+ page = buf;
+
+ for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+ page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ if (trx_doublewrite_must_reset_space_ids) {
+
+ space_id = 0;
+ mach_write_to_4(page
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
+ /* We do not need to calculate new checksums for the
+ pages because the field .._SPACE_ID does not affect
+ them. Write the page back to where we read it from. */
+
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ source_page_no = block1 + i;
+ } else {
+ source_page_no = block2
+ + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ fil_io(OS_FILE_WRITE, TRUE, 0, source_page_no, 0,
+ UNIV_PAGE_SIZE, page, NULL);
+ /* printf("Resetting space id in page %lu\n",
+ source_page_no); */
+ } else {
+ space_id = mach_read_from_4(
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+
+ if (!restore_corrupt_pages) {
+ /* The database was shut down gracefully: no need to
+ restore pages */
+
+ } else if (!fil_tablespace_exists_in_mem(space_id)) {
+ /* Maybe we have dropped the single-table tablespace
+ and this page once belonged to it: do nothing */
+
+ } else if (!fil_check_adress_in_tablespace(space_id,
+ page_no)) {
+ fprintf(stderr,
+"InnoDB: Warning: a page in the doublewrite buffer is not within space\n"
+"InnoDB: bounds; space id %lu page number %lu, page %lu in doublewrite buf.\n",
+ (ulong) space_id, (ulong) page_no, (ulong) i);
+
+ } else if (space_id == TRX_SYS_SPACE
+ && ( (page_no >= block1
+ && page_no
+ < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (page_no >= block2
+ && page_no
+ < block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+
+ /* It is an unwritten doublewrite buffer page:
+ do nothing */
+ } else {
+ /* Read in the actual page from the data files; this
+ reuses read_buf, which until now held the trx sys
+ header page */
+
+ fil_io(OS_FILE_READ, TRUE, space_id, page_no, 0,
+ UNIV_PAGE_SIZE, read_buf, NULL);
+ /* Check if the page is corrupt */
+
+ if (buf_page_is_corrupted(read_buf)) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: database page corruption or a failed\n"
+ "InnoDB: file read of page %lu.\n", (ulong) page_no);
+ fprintf(stderr,
+ "InnoDB: Trying to recover it from the doublewrite buffer.\n");
+
+ if (buf_page_is_corrupted(page)) {
+ fprintf(stderr,
+ "InnoDB: Dump of the page:\n");
+ buf_page_print(read_buf);
+ fprintf(stderr,
+ "InnoDB: Dump of corresponding page in doublewrite buffer:\n");
+ buf_page_print(page);
+
+ fprintf(stderr,
+ "InnoDB: Also the page in the doublewrite buffer is corrupt.\n"
+ "InnoDB: Cannot continue operation.\n"
+ "InnoDB: You can try to recover the database with the my.cnf\n"
+ "InnoDB: option:\n"
+ "InnoDB: set-variable=innodb_force_recovery=6\n");
+ exit(1);
+ }
+
+ /* Write the good page from the
+ doublewrite buffer to the intended
+ position */
+
+ fil_io(OS_FILE_WRITE, TRUE, space_id,
+ page_no, 0,
+ UNIV_PAGE_SIZE, page, NULL);
+ fprintf(stderr,
+ "InnoDB: Recovered the page from the doublewrite buffer.\n");
+ }
+ }
+
+ page += UNIV_PAGE_SIZE;
+ }
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ if (!srv_use_doublewrite_buf)
+ trx_doublewrite_free();
+
+leave_func:
+ ut_free(unaligned_read_buf);
+}
+
+/********************************************************************
+Searches the trx list of the trx system for a transaction object, comparing
+the pointers. */
+
+ibool
+trx_in_trx_list(
+/*============*/
+			/* out: TRUE if is in */
+	trx_t*	in_trx)	/* in: trx */
+{
+	trx_t*	cur;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(kernel_mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	for (cur = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	     cur != NULL;
+	     cur = UT_LIST_GET_NEXT(trx_list, cur)) {
+
+		if (cur == in_trx) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************
+Stores the current value of trx_sys->max_trx_id to the trx id store field
+of the file based trx system header, in a mini-transaction of its own. */
+
+void
+trx_sys_flush_max_trx_id(void)
+/*==========================*/
+{
+	mtr_t		mtr;
+	trx_sysf_t*	header;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	mtr_start(&mtr);
+
+	header = trx_sysf_get(&mtr);
+
+	mlog_write_dulint(header + TRX_SYS_TRX_ID_STORE, trx_sys->max_trx_id,
+									&mtr);
+	mtr_commit(&mtr);
+}
+
+/*********************************************************************
+Writes a MySQL log file name and position to a log info field in the trx
+sys header. The field is either the one for the binlog position of the
+transaction being committed or, in a replication slave, the one for the
+master binlog position up to which replication has proceeded. The magic
+number and the file name are written only if they differ from what is
+already stored. A file name which does not fit in the field is silently
+ignored. */
+
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+	const char*	file_name,/* in: MySQL log file name */
+	ib_longlong	offset,	/* in: position in that log file */
+	ulint		field,	/* in: offset of the MySQL log info field in
+				the trx sys header */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	trx_sysf_t*	log_info;
+
+	if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) {
+
+		/* The name and its terminating zero do not fit in the
+		space reserved for the name */
+
+		return;
+	}
+
+	log_info = trx_sysf_get(mtr) + field;
+
+	/* Make the field valid if it does not yet carry the magic number */
+
+	if (mach_read_from_4(log_info + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		mlog_write_ulint(log_info + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
+				TRX_SYS_MYSQL_LOG_MAGIC_N,
+				MLOG_4BYTES, mtr);
+	}
+
+	/* The name, with its terminating zero, is rewritten only when it
+	has changed */
+
+	if (0 != strcmp((char*) (log_info + TRX_SYS_MYSQL_LOG_NAME),
+			file_name)) {
+
+		mlog_write_string(log_info + TRX_SYS_MYSQL_LOG_NAME,
+				(byte*) file_name,
+				1 + ut_strlen(file_name), mtr);
+	}
+
+	/* The high 32 bits need a write only if the stored or the new
+	value is nonzero */
+
+	if (mach_read_from_4(log_info + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
+	    || (offset >> 32) > 0) {
+
+		mlog_write_ulint(log_info + TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
+				(ulint)(offset >> 32),
+				MLOG_4BYTES, mtr);
+	}
+
+	mlog_write_ulint(log_info + TRX_SYS_MYSQL_LOG_OFFSET_LOW,
+			(ulint)(offset & 0xFFFFFFFFUL),
+			MLOG_4BYTES, mtr);
+}
+
+/*********************************************************************
+Prints to stderr the MySQL binlog position and file name found in a copy
+of the trx system header page. Nothing is printed unless the magic number
+of the log info field is valid. */
+
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+	byte*	page)	/* in: buffer containing the trx system header page,
+			i.e., page number TRX_SYS_PAGE_NO in the tablespace */
+{
+	trx_sysf_t*	log_info;
+
+	log_info = page + TRX_SYS + TRX_SYS_MYSQL_LOG_INFO;
+
+	if (mach_read_from_4(log_info + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		return;
+	}
+
+	fprintf(stderr,
+	"ibbackup: Last MySQL binlog file position %lu %lu, file name %s\n",
+		(ulong) mach_read_from_4(log_info
+					+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+		(ulong) mach_read_from_4(log_info
+					+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+		log_info + TRX_SYS_MYSQL_LOG_NAME);
+}
+
+/*********************************************************************
+Reads the MySQL binlog offset info from the trx system header, if the magic
+number shows it valid, stores it to the global variables
+trx_sys_mysql_bin_log_pos and trx_sys_mysql_bin_log_name, and prints the
+info to stderr. If the magic number is not valid, the function does
+nothing. */
+
+void
+trx_sys_print_mysql_binlog_offset(void)
+/*===================================*/
+{
+	trx_sysf_t*	sys_header;
+	mtr_t		mtr;
+	ulint		trx_sys_mysql_bin_log_pos_high;
+	ulint		trx_sys_mysql_bin_log_pos_low;
+
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+				+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	/* The position is stored as two 32-bit halves */
+
+	trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
+			sys_header + TRX_SYS_MYSQL_LOG_INFO
+			+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
+	trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
+			sys_header + TRX_SYS_MYSQL_LOG_INFO
+			+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+
+	trx_sys_mysql_bin_log_pos =
+		(((ib_longlong)trx_sys_mysql_bin_log_pos_high) << 32)
+		+ (ib_longlong)trx_sys_mysql_bin_log_pos_low;
+
+	ut_memcpy(trx_sys_mysql_bin_log_name,
+		sys_header + TRX_SYS_MYSQL_LOG_INFO
+		+ TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+	/* %lu expects an unsigned long: cast the ulint values explicitly,
+	like the other print functions of this file do */
+
+	fprintf(stderr,
+	"InnoDB: Last MySQL binlog file position %lu %lu, file name %s\n",
+		(ulong) trx_sys_mysql_bin_log_pos_high,
+		(ulong) trx_sys_mysql_bin_log_pos_low,
+		trx_sys_mysql_bin_log_name);
+
+	mtr_commit(&mtr);
+}
+
+/*********************************************************************
+Prints to stderr the MySQL master log offset info in the trx system header
+if the magic number shows it valid, and copies the same info to the global
+variables trx_sys_mysql_master_log_name and trx_sys_mysql_master_log_pos. */
+
+void
+trx_sys_print_mysql_master_log_pos(void)
+/*====================================*/
+{
+	trx_sysf_t*	master_info;
+	ulint		pos_high;
+	ulint		pos_low;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	master_info = trx_sysf_get(&mtr) + TRX_SYS_MYSQL_MASTER_LOG_INFO;
+
+	if (mach_read_from_4(master_info + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		pos_high = mach_read_from_4(master_info
+					+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
+		pos_low = mach_read_from_4(master_info
+					+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+
+		fprintf(stderr,
+"InnoDB: In a MySQL replication slave the last master binlog file\n"
+"InnoDB: position %lu %lu, file name %s\n",
+			(ulong) pos_high,
+			(ulong) pos_low,
+			master_info + TRX_SYS_MYSQL_LOG_NAME);
+
+		/* Publish the master log position in global variables which
+		ha_innobase.cc can use to initialize glob_mi to right values */
+
+		ut_memcpy(trx_sys_mysql_master_log_name,
+			master_info + TRX_SYS_MYSQL_LOG_NAME,
+			TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+		trx_sys_mysql_master_log_pos =
+			(((ib_longlong) pos_high) << 32)
+			+ (ib_longlong) pos_low;
+	}
+
+	mtr_commit(&mtr);
+}
+
+/********************************************************************
+Scans the rollback segment slots of the trx system header, in increasing
+slot order, for one whose page number is FIL_NULL, i.e., a free slot. */
+
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+			/* out: index of the first free slot, or
+			ULINT_UNDEFINED if none of the TRX_SYS_N_RSEGS
+			slots is free */
+	mtr_t*	mtr)	/* in: mtr */
+{
+	trx_sysf_t*	header;
+	ulint		slot;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(kernel_mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	header = trx_sysf_get(mtr);
+
+	for (slot = 0; slot < TRX_SYS_N_RSEGS; slot++) {
+
+		if (trx_sysf_rseg_get_page_no(header, slot, mtr)
+		    == FIL_NULL) {
+
+			return(slot);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/*********************************************************************
+Creates the file page for the transaction system: allocates the page in a
+new file segment, initializes the stored trx id and the rollback segment
+slots, and creates the first rollback segment. This function is called only
+at the database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+	mtr_t*	mtr)	/* in: mtr */
+{
+	trx_sysf_t*	header;
+	page_t*		sys_page;
+	ulint		rseg_page_no;
+	ulint		rseg_slot;
+	ulint		n;
+
+	ut_ad(mtr);
+
+	/* Latching order rules require that the file space x-latch is
+	reserved first, and the kernel mutex is entered only after it */
+
+	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE), mtr);
+	mutex_enter(&kernel_mutex);
+
+	/* The trx sys file block goes into a newly allocated file segment;
+	it must land on page TRX_SYS_PAGE_NO */
+
+	sys_page = fseg_create(TRX_SYS_SPACE, 0,
+				TRX_SYS + TRX_SYS_FSEG_HEADER, mtr);
+
+	ut_a(buf_frame_get_page_no(sys_page) == TRX_SYS_PAGE_NO);
+
+#ifdef UNIV_SYNC_DEBUG
+	buf_page_dbg_add_level(sys_page, SYNC_TRX_SYS_HEADER);
+#endif /* UNIV_SYNC_DEBUG */
+
+	header = trx_sysf_get(mtr);
+
+	/* Transaction ids are counted from number 1 up */
+
+	mlog_write_dulint(header + TRX_SYS_TRX_ID_STORE,
+				ut_dulint_create(0, 1), mtr);
+
+	/* Mark every rollback segment slot unused */
+
+	for (n = 0; n < TRX_SYS_N_RSEGS; n++) {
+
+		trx_sysf_rseg_set_page_no(header, n, FIL_NULL, mtr);
+	}
+
+	/* The first rollback segment is created in the SYSTEM tablespace
+	and must get the slot TRX_SYS_SYSTEM_RSEG_ID */
+
+	rseg_page_no = trx_rseg_header_create(TRX_SYS_SPACE, ULINT_MAX,
+						&rseg_slot, mtr);
+
+	ut_a(rseg_slot == TRX_SYS_SYSTEM_RSEG_ID);
+	ut_a(rseg_page_no != FIL_NULL);
+
+	mutex_exit(&kernel_mutex);
+}
+
+/*********************************************************************
+Creates and initializes the central memory structures for the transaction
+system: allocates trx_sys, builds the rollback segment and transaction
+lists, sets the trx id counter and creates the purge system. If there are
+transactions to roll back or clean up, a summary is printed to stderr.
+This is called when the database is started. */
+
+void
+trx_sys_init_at_db_start(void)
+/*==========================*/
+{
+	trx_sysf_t*	sys_header;
+	dulint		stored_id;
+	ib_longlong	n_rows = 0;
+	const char*	suffix = "";
+	trx_t*		trx;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	ut_ad(trx_sys == NULL);
+
+	mutex_enter(&kernel_mutex);
+
+	trx_sys = mem_alloc(sizeof(trx_sys_t));
+
+	sys_header = trx_sysf_get(&mtr);
+
+	trx_rseg_list_and_array_init(sys_header, &mtr);
+
+	trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	/* VERY important: the stored id is aligned up to a multiple of
+	TRX_SYS_TRX_ID_WRITE_MARGIN before the margin is added twice, so
+	after the database is started max_trx_id is divisible by the margin.
+	Then the 'if' in trx_sys_get_new_trx_id will evaluate to TRUE when
+	that function is called for the first time, and the trx id value
+	will be written to the disk-based header. Thus trx id values will
+	not overlap when the database is repeatedly started! */
+
+	stored_id = mtr_read_dulint(sys_header + TRX_SYS_TRX_ID_STORE, &mtr);
+
+	trx_sys->max_trx_id = ut_dulint_add(
+			ut_dulint_align_up(stored_id,
+					TRX_SYS_TRX_ID_WRITE_MARGIN),
+			2 * TRX_SYS_TRX_ID_WRITE_MARGIN);
+
+	UT_LIST_INIT(trx_sys->mysql_trx_list);
+	trx_lists_init_at_db_start();
+
+	if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
+
+		/* Sum up the undo numbers of all transactions which are
+		not in the prepared state */
+
+		for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+		     trx != NULL;
+		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+			if (trx->conc_state == TRX_PREPARED) {
+
+				continue;
+			}
+
+			n_rows += ut_conv_dulint_to_longlong(trx->undo_no);
+		}
+
+		/* Very big counts are reported in millions */
+
+		if (n_rows > 1000000000) {
+			suffix = "M";
+			n_rows = n_rows / 1000000;
+		}
+
+		fprintf(stderr,
+"InnoDB: %lu transaction(s) which must be rolled back or cleaned up\n"
+"InnoDB: in total %lu%s row operations to undo\n",
+			(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
+			(ulong) n_rows, suffix);
+
+		fprintf(stderr, "InnoDB: Trx id counter is %lu %lu\n",
+			(ulong) ut_dulint_get_high(trx_sys->max_trx_id),
+			(ulong) ut_dulint_get_low(trx_sys->max_trx_id));
+	}
+
+	UT_LIST_INIT(trx_sys->view_list);
+
+	trx_purge_sys_create();
+
+	mutex_exit(&kernel_mutex);
+
+	mtr_commit(&mtr);
+}
+
+/*********************************************************************
+Creates and initializes the transaction system at the database creation:
+first the trx system file page is created in a mini-transaction of its
+own, then the memory structures are initialized like at a normal start. */
+
+void
+trx_sys_create(void)
+/*================*/
+{
+	mtr_t	create_mtr;
+
+	/* The file page creation is committed before the in-memory
+	initialization is started */
+
+	mtr_start(&create_mtr);
+	trx_sysf_create(&create_mtr);
+	mtr_commit(&create_mtr);
+
+	trx_sys_init_at_db_start();
+}
diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c
new file mode 100644
index 00000000000..cdda1dd4dee
--- /dev/null
+++ b/storage/innobase/trx/trx0trx.c
@@ -0,0 +1,2025 @@
+/******************************************************
+The transaction
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#include "trx0undo.h"
+#include "trx0rseg.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "read0read.h"
+#include "srv0srv.h"
+#include "thr0loc.h"
+#include "btr0sea.h"
+#include "os0proc.h"
+#include "trx0xa.h"
+
+/* Copy of the prototype for innobase_mysql_print_thd: this
+copy MUST be equal to the one in mysql/sql/ha_innodb.cc !
+NOTE(review): the function is not defined in this file; presumably f is
+the output stream and thd the MySQL thread handle - confirm against
+ha_innodb.cc */
+
+void innobase_mysql_print_thd(
+ FILE* f,
+ void* thd);
+
+/* Dummy session used currently in MySQL interface; it is opened with
+sess_open() by trx_allocate_for_mysql() or trx_allocate_for_background()
+when either of them first finds this pointer NULL */
+sess_t* trx_dummy_sess = NULL;
+
+/* Number of transactions currently allocated for MySQL: protected by
+the kernel mutex; incremented in trx_allocate_for_mysql() and
+decremented in trx_free_for_mysql() */
+ulint trx_n_mysql_transactions = 0;
+
+/*****************************************************************
+Starts the transaction if it is not yet started. This is a plain function
+which only forwards the call to trx_start_if_not_started(). */
+
+void
+trx_start_if_not_started_noninline(
+/*===============================*/
+ trx_t* trx) /* in: transaction */
+{
+ trx_start_if_not_started(trx);
+}
+
+/********************************************************************
+Retrieves the error_info field from a trx. The field is returned as
+such; nothing is copied or modified. */
+
+void*
+trx_get_error_info(
+/*===============*/
+ /* out: the error info */
+ trx_t* trx) /* in: trx object */
+{
+ return(trx->error_info);
+}
+
+/********************************************************************
+Allocates a transaction object and sets its fields to their initial
+values: the transaction is of type TRX_USER and in the state
+TRX_NOT_STARTED. The caller must hold the kernel mutex. */
+
+trx_t*
+trx_create(
+/*=======*/
+			/* out, own: the transaction */
+	sess_t*	sess)	/* in: session or NULL */
+{
+	trx_t*	trx;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	trx = mem_alloc(sizeof(trx_t));
+
+	/* Identity and state */
+	trx->magic_n = TRX_MAGIC_N;
+	trx->op_info = "";
+	trx->type = TRX_USER;
+	trx->conc_state = TRX_NOT_STARTED;
+	trx->start_time = time(NULL);
+	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+	trx->id = ut_dulint_zero;
+	trx->no = ut_dulint_max;
+
+	/* Behavior flags */
+	trx->support_xa = TRUE;
+	trx->check_foreigns = TRUE;
+	trx->check_unique_secondary = TRUE;
+	trx->flush_log_later = FALSE;
+	trx->must_flush_log_later = FALSE;
+	trx->dict_operation = FALSE;
+
+	/* MySQL thread and table bookkeeping */
+	trx->mysql_thd = NULL;
+	trx->mysql_query_str = NULL;
+	trx->n_mysql_tables_in_use = 0;
+	trx->mysql_n_tables_locked = 0;
+
+	/* Binlog and replication positions. Note that the master log file
+	name is an empty string, not NULL */
+	trx->mysql_log_file_name = NULL;
+	trx->mysql_log_offset = 0;
+	trx->mysql_master_log_file_name = "";
+	trx->mysql_master_log_pos = 0;
+	trx->repl_wait_binlog_name = NULL;
+	trx->repl_wait_binlog_pos = 0;
+
+	/* Undo logging */
+	mutex_create(&(trx->undo_mutex));
+	mutex_set_level(&(trx->undo_mutex), SYNC_TRX_UNDO);
+
+	trx->rseg = NULL;
+	trx->undo_no = ut_dulint_zero;
+	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+	trx->insert_undo = NULL;
+	trx->update_undo = NULL;
+	trx->undo_no_arr = NULL;
+
+	/* Session, query graph and signal handling */
+	trx->error_state = DB_SUCCESS;
+	trx->sess = sess;
+	trx->que_state = TRX_QUE_RUNNING;
+	trx->n_active_thrs = 0;
+	trx->handling_signals = FALSE;
+	trx->graph = NULL;
+
+	UT_LIST_INIT(trx->signals);
+	UT_LIST_INIT(trx->reply_signals);
+
+	/* Locks and lock waits */
+	trx->wait_lock = NULL;
+	trx->was_chosen_as_deadlock_victim = FALSE;
+	UT_LIST_INIT(trx->wait_thrs);
+
+	trx->lock_heap = mem_heap_create_in_buffer(256);
+	UT_LIST_INIT(trx->trx_locks);
+
+	UT_LIST_INIT(trx->trx_savepoints);
+
+	trx->dict_operation_lock_mode = 0;
+	trx->auto_inc_lock = NULL;
+	trx->n_lock_table_exp = 0;
+	trx->n_lock_table_transactional = 0;
+
+	/* Adaptive hash search latch and InnoDB entry tickets */
+	trx->has_search_latch = FALSE;
+	trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+	trx->declared_to_be_inside_innodb = FALSE;
+	trx->n_tickets_to_enter_innodb = 0;
+
+	/* Consistent read view */
+	trx->read_view_heap = mem_heap_create(256);
+	trx->read_view = NULL;
+
+	/* Set X/Open XA transaction identification to NULL: the XID is
+	zero-filled and its formatID is set to -1 */
+	memset(&trx->xid, 0, sizeof(trx->xid));
+	trx->xid.formatID = -1;
+
+	return(trx);
+}
+
+/************************************************************************
+Creates a transaction object for MySQL: the object is attached to the
+dummy session, counted in trx_n_mysql_transactions, added to the list of
+MySQL transactions, and stamped with the id of the calling thread and
+the process number. */
+
+trx_t*
+trx_allocate_for_mysql(void)
+/*========================*/
+				/* out, own: transaction object */
+{
+	trx_t*	new_trx;
+
+	mutex_enter(&kernel_mutex);
+
+	/* The dummy session is opened when it is first needed */
+
+	if (trx_dummy_sess == NULL) {
+		trx_dummy_sess = sess_open();
+	}
+
+	new_trx = trx_create(trx_dummy_sess);
+
+	trx_n_mysql_transactions++;
+
+	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, new_trx);
+
+	mutex_exit(&kernel_mutex);
+
+	/* The remaining fields are filled in after the kernel mutex has
+	been released */
+
+	new_trx->mysql_thread_id = os_thread_get_curr_id();
+	new_trx->mysql_process_no = os_proc_get_number();
+
+	return(new_trx);
+}
+
+/************************************************************************
+Creates a transaction object for background operations by the master
+thread. The object is attached to the dummy session; it is not added to
+the list of MySQL transactions. */
+
+trx_t*
+trx_allocate_for_background(void)
+/*=============================*/
+				/* out, own: transaction object */
+{
+	trx_t*	bg_trx;
+
+	mutex_enter(&kernel_mutex);
+
+	/* The dummy session is opened when it is first needed */
+
+	if (trx_dummy_sess == NULL) {
+		trx_dummy_sess = sess_open();
+	}
+
+	bg_trx = trx_create(trx_dummy_sess);
+
+	mutex_exit(&kernel_mutex);
+
+	return(bg_trx);
+}
+
+/************************************************************************
+Releases the s-latch on btr_search_latch if the flag has_search_latch
+shows that trx has reserved it, and clears the flag. */
+
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	if (!trx->has_search_latch) {
+
+		/* Nothing reserved, nothing to release */
+
+		return;
+	}
+
+	rw_lock_s_unlock(&btr_search_latch);
+
+	trx->has_search_latch = FALSE;
+}
+
+/************************************************************************
+Frees a transaction object. The transaction must be in the state
+TRX_NOT_STARTED and must not own undo logs, signals, lock waits, locks
+or a read view: these are checked with assertions below. In a
+UNIV_SYNC_DEBUG build it is also asserted that the caller holds the
+kernel mutex. */
+
+void
+trx_free(
+/*=====*/
+ trx_t* trx) /* in, own: trx object */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* The two conditions below are only reported to stderr; no
+ assertion is made on them here */
+
+ if (trx->declared_to_be_inside_innodb) {
+ ut_print_timestamp(stderr);
+ fputs(
+" InnoDB: Error: Freeing a trx which is declared to be processing\n"
+"InnoDB: inside InnoDB.\n", stderr);
+ trx_print(stderr, trx);
+ putc('\n', stderr);
+ }
+
+ if (trx->n_mysql_tables_in_use != 0
+ || trx->mysql_n_tables_locked != 0) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: MySQL is freeing a thd\n"
+"InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
+"InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
+ (ulong)trx->n_mysql_tables_in_use,
+ (ulong)trx->mysql_n_tables_locked);
+
+ trx_print(stderr, trx);
+
+ /* Dump also the raw bytes of the trx object */
+ ut_print_buf(stderr, (byte*)trx, sizeof(trx_t));
+ }
+
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ /* Overwrite the magic number; presumably so that a later use of
+ the freed object fails the check above - TODO confirm */
+ trx->magic_n = 11112222;
+
+ ut_a(trx->conc_state == TRX_NOT_STARTED);
+
+ mutex_free(&(trx->undo_mutex));
+
+ ut_a(trx->insert_undo == NULL);
+ ut_a(trx->update_undo == NULL);
+
+ if (trx->undo_no_arr) {
+ trx_undo_arr_free(trx->undo_no_arr);
+ }
+
+ if (trx->repl_wait_binlog_name != NULL) {
+
+ mem_free(trx->repl_wait_binlog_name);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+ ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
+
+ ut_a(trx->wait_lock == NULL);
+ ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+ ut_a(!trx->has_search_latch);
+ ut_a(!trx->auto_inc_lock);
+ ut_a(!trx->n_lock_table_exp);
+ ut_a(!trx->n_lock_table_transactional);
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ /* Free the two heaps which trx_create() allocated */
+
+ if (trx->lock_heap) {
+ mem_heap_free(trx->lock_heap);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
+
+ if (trx->read_view_heap) {
+ mem_heap_free(trx->read_view_heap);
+ }
+
+ ut_a(trx->read_view == NULL);
+
+ mem_free(trx);
+}
+
+/************************************************************************
+Frees a transaction object for MySQL: the counterpart of
+trx_allocate_for_mysql(). The object is removed from the list of MySQL
+transactions and freed, and trx_n_mysql_transactions is decremented. */
+
+void
+trx_free_for_mysql(
+/*===============*/
+ trx_t* trx) /* in, own: trx object */
+{
+ /* This is done before the kernel mutex is reserved */
+ thr_local_free(trx->mysql_thread_id);
+
+ mutex_enter(&kernel_mutex);
+
+ UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+ trx_free(trx);
+
+ /* The counter is protected by the kernel mutex held here */
+ ut_a(trx_n_mysql_transactions > 0);
+
+ trx_n_mysql_transactions--;
+
+ mutex_exit(&kernel_mutex);
+}
+
+/************************************************************************
+Frees a transaction object of a background operation of the master thread.
+The kernel mutex is reserved for the duration of the trx_free() call. */
+
+void
+trx_free_for_background(
+/*====================*/
+ trx_t* trx) /* in, own: trx object */
+{
+ mutex_enter(&kernel_mutex);
+
+ trx_free(trx);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/********************************************************************
+Inserts the trx handle in the trx system trx list in the right position.
+The list is sorted on the trx id so that the biggest id is at the list
+start: the handle is put in front of the first list member whose id is
+not bigger than its own, or last if there is no such member. This function
+is used at the database startup to insert incomplete transactions to the
+list. */
+static
+void
+trx_list_insert_ordered(
+/*====================*/
+	trx_t*	trx)	/* in: trx handle */
+{
+	trx_t*	succ;
+	trx_t*	pred;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Look for the member which will follow trx in the list */
+
+	for (succ = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	     succ != NULL;
+	     succ = UT_LIST_GET_NEXT(trx_list, succ)) {
+
+		if (ut_dulint_cmp(trx->id, succ->id) >= 0) {
+
+			/* Two members with an equal id are not expected */
+			ut_ad(ut_dulint_cmp(trx->id, succ->id) == 1);
+
+			break;
+		}
+	}
+
+	if (succ == NULL) {
+		/* Every member has a bigger id, or the list is empty */
+
+		UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
+
+		return;
+	}
+
+	pred = UT_LIST_GET_PREV(trx_list, succ);
+
+	if (pred == NULL) {
+		UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
+	} else {
+		UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
+						pred, trx);
+	}
+}
+
+/********************************************************************
+Sets the state and the trx number of a transaction object which is being
+recreated at the database start, according to the state of an undo log
+of the transaction. For a transaction found in the XA prepared state a
+message is printed to stderr. */
+static
+void
+trx_lists_init_set_state_from_undo(
+/*===============================*/
+	trx_t*		trx,	/* in/out: transaction whose id field has
+				already been set */
+	trx_undo_t*	undo)	/* in: an undo log of the transaction */
+{
+	if (undo->state == TRX_UNDO_ACTIVE) {
+
+		trx->conc_state = TRX_ACTIVE;
+
+		/* A running transaction always has the number
+		field inited to ut_dulint_max */
+
+		trx->no = ut_dulint_max;
+
+		return;
+	}
+
+	if (undo->state == TRX_UNDO_PREPARED) {
+
+		/* Prepared transactions are left in the prepared state
+		waiting for a commit or abort decision from MySQL, unless
+		a forced recovery has been requested */
+
+		/* The id halves are of type ulint: cast them to match
+		the %lu conversions */
+
+		fprintf(stderr,
+"InnoDB: Transaction %lu %lu was in the XA prepared state.\n",
+			(ulong) ut_dulint_get_high(trx->id),
+			(ulong) ut_dulint_get_low(trx->id));
+
+		if (srv_force_recovery == 0) {
+
+			trx->conc_state = TRX_PREPARED;
+		} else {
+			fprintf(stderr,
+"InnoDB: Since innodb_force_recovery > 0, we will rollback it anyway.\n");
+
+			trx->conc_state = TRX_ACTIVE;
+		}
+	} else {
+		trx->conc_state = TRX_COMMITTED_IN_MEMORY;
+	}
+
+	/* We give a dummy value for the trx no; this should have no
+	relevance since purge is not interested in committed transaction
+	numbers, unless they are in the history list, in which case it
+	looks the number from the disk based undo log structure */
+
+	trx->no = trx->id;
+}
+
+/********************************************************************
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+
+void
+trx_lists_init_at_db_start(void)
+/*============================*/
+{
+	trx_rseg_t*	rseg;
+	trx_undo_t*	undo;
+	trx_t*		trx;
+
+	UT_LIST_INIT(trx_sys->trx_list);
+
+	/* Look from the rollback segments if there exist undo logs for
+	transactions */
+
+	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	while (rseg != NULL) {
+
+		/* Every insert undo log gets a trx object of its own */
+
+		undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
+
+		while (undo != NULL) {
+
+			trx = trx_create(NULL);
+
+			trx->id = undo->trx_id;
+			trx->xid = undo->xid;
+			trx->insert_undo = undo;
+			trx->rseg = rseg;
+
+			trx_lists_init_set_state_from_undo(trx, undo);
+
+			if (undo->dict_operation) {
+				trx->dict_operation = undo->dict_operation;
+				trx->table_id = undo->table_id;
+			}
+
+			if (!undo->empty) {
+				trx->undo_no = ut_dulint_add(
+						undo->top_undo_no, 1);
+			}
+
+			trx_list_insert_ordered(trx);
+
+			undo = UT_LIST_GET_NEXT(undo_list, undo);
+		}
+
+		/* An update undo log is attached to the trx object with
+		the same id if the loop above created one; otherwise a
+		new trx object is created */
+
+		undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
+
+		while (undo != NULL) {
+			trx = trx_get_on_id(undo->trx_id);
+
+			if (NULL == trx) {
+				trx = trx_create(NULL);
+
+				trx->id = undo->trx_id;
+				trx->xid = undo->xid;
+
+				trx_lists_init_set_state_from_undo(trx,
+								undo);
+
+				trx->rseg = rseg;
+				trx_list_insert_ordered(trx);
+
+				if (undo->dict_operation) {
+					trx->dict_operation =
+						undo->dict_operation;
+					trx->table_id = undo->table_id;
+				}
+			}
+
+			trx->update_undo = undo;
+
+			/* The undo number of the transaction is one more
+			than the biggest top undo number of its undo logs */
+
+			if ((!undo->empty)
+			    && (ut_dulint_cmp(undo->top_undo_no,
+						trx->undo_no) >= 0)) {
+
+				trx->undo_no = ut_dulint_add(
+						undo->top_undo_no, 1);
+			}
+
+			undo = UT_LIST_GET_NEXT(undo_list, undo);
+		}
+
+		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+	}
+}
+
+/**********************************************************************
+Assigns a rollback segment to a transaction in a round-robin fashion,
+starting from the segment after trx_sys->latest_rseg and wrapping around
+at the end of the list. Skips the SYSTEM rollback segment if another is
+available. The chosen segment is remembered in trx_sys->latest_rseg. */
+UNIV_INLINE
+ulint
+trx_assign_rseg(void)
+/*=================*/
+			/* out: assigned rollback segment id */
+{
+	trx_rseg_t*	candidate = trx_sys->latest_rseg;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	do {
+		/* Step to the next rseg, wrapping to the list start */
+
+		candidate = UT_LIST_GET_NEXT(rseg_list, candidate);
+
+		if (candidate == NULL) {
+			candidate = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+		}
+
+		/* Step again if this is the SYSTEM rollback segment
+		and there exist others */
+
+	} while ((candidate->id == TRX_SYS_SYSTEM_RSEG_ID)
+		 && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1));
+
+	trx_sys->latest_rseg = candidate;
+
+	return(candidate->id);
+}
+
+/********************************************************************
+Starts a new transaction: gives it a new id and a rollback segment, puts
+it to the state TRX_ACTIVE and adds it first to the trx list of trx_sys.
+A transaction of type TRX_PURGE is only marked active with a zero id; it
+gets no rollback segment and is not added to the list. */
+
+ibool
+trx_start_low(
+/*==========*/
+			/* out: TRUE */
+	trx_t*	trx,	/* in: transaction */
+	ulint	rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED
+			is passed, the system chooses the rollback segment
+			automatically in a round-robin fashion */
+{
+	trx_rseg_t*	chosen_rseg;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(trx->rseg == NULL);
+
+	if (trx->type != TRX_PURGE) {
+
+		ut_ad(trx->conc_state != TRX_ACTIVE);
+
+		if (rseg_id == ULINT_UNDEFINED) {
+
+			rseg_id = trx_assign_rseg();
+		}
+
+		chosen_rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
+
+		trx->id = trx_sys_get_new_trx_id();
+
+		/* The initial value for trx->no: ut_dulint_max is used in
+		read_view_open_now: */
+
+		trx->no = ut_dulint_max;
+
+		trx->rseg = chosen_rseg;
+
+		trx->conc_state = TRX_ACTIVE;
+		trx->start_time = time(NULL);
+
+		UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
+
+		return(TRUE);
+	}
+
+	/* The purge transaction */
+
+	trx->id = ut_dulint_zero;
+	trx->conc_state = TRX_ACTIVE;
+	trx->start_time = time(NULL);
+
+	return(TRUE);
+}
+
+/********************************************************************
+Starts a new transaction: reserves the kernel mutex and calls
+trx_start_low(). */
+
+ibool
+trx_start(
+/*======*/
+			/* out: TRUE */
+	trx_t*	trx,	/* in: transaction */
+	ulint	rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED
+			is passed, the system chooses the rollback segment
+			automatically in a round-robin fashion */
+{
+	ibool	started;
+
+	mutex_enter(&kernel_mutex);
+	started = trx_start_low(trx, rseg_id);
+	mutex_exit(&kernel_mutex);
+
+	return(started);
+}
+
+/********************************************************************
+Commits a transaction. The kernel mutex is temporarily released while
+the undo log states are changed in a mini-transaction and while the log
+is written or flushed; both happen only if the transaction has an insert
+or an update undo log. At the end the transaction is put to the state
+TRX_NOT_STARTED and removed from the trx list of trx_sys. */
+
+void
+trx_commit_off_kernel(
+/*==================*/
+ trx_t* trx) /* in: transaction */
+{
+ page_t* update_hdr_page;
+ dulint lsn; /* assigned only in the undo log branch below,
+ and read only when must_flush_log is TRUE */
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+ ibool must_flush_log = FALSE;
+ mtr_t mtr;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ trx->must_flush_log_later = FALSE;
+
+ rseg = trx->rseg;
+
+ if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ must_flush_log = TRUE;
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to some other state: these modifications to the file data
+ structure define the transaction as committed in the file
+ based world, at the serialization point of the log sequence
+ number lsn obtained below. */
+
+ mutex_enter(&(rseg->mutex));
+
+ if (trx->insert_undo != NULL) {
+ trx_undo_set_state_at_finish(trx, trx->insert_undo,
+ &mtr);
+ }
+
+ undo = trx->update_undo;
+
+ if (undo) {
+ /* The trx number is assigned under the kernel mutex,
+ while the rseg mutex is still being held */
+ mutex_enter(&kernel_mutex);
+ trx->no = trx_sys_get_new_trx_no();
+
+ mutex_exit(&kernel_mutex);
+
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction commit for this transaction. */
+
+ update_hdr_page = trx_undo_set_state_at_finish(trx,
+ undo, &mtr);
+
+ /* We have to do the cleanup for the update log while
+ holding the rseg mutex because update log headers
+ have to be put to the history list in the order of
+ the trx number. */
+
+ trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
+ }
+
+ mutex_exit(&(rseg->mutex));
+
+ /* Update the latest MySQL binlog name and offset info
+ in trx sys header if MySQL binlogging is on or the database
+ server is a MySQL replication slave */
+
+ if (trx->mysql_log_file_name) {
+ trx_sys_update_mysql_binlog_offset(
+ trx->mysql_log_file_name,
+ trx->mysql_log_offset,
+ TRX_SYS_MYSQL_LOG_INFO, &mtr);
+ trx->mysql_log_file_name = NULL;
+ }
+
+ if (trx->mysql_master_log_file_name[0] != '\0') {
+ /* This database server is a MySQL replication slave */
+ trx_sys_update_mysql_binlog_offset(
+ trx->mysql_master_log_file_name,
+ trx->mysql_master_log_pos,
+ TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
+ }
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this
+ log sequence number. The transaction becomes 'durable' when
+ we write the log to disk, but in the logical sense the commit
+ in the file-based data structures (undo logs etc.) happens
+ here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come
+ in exactly the same order as commit lsn's, if the transactions
+ have different rollback segments. To get exactly the same
+ order we should hold the kernel mutex up to this point,
+ adding to the contention of the kernel mutex. However, if
+ a transaction T2 is able to see modifications made by
+ a transaction T1, T2 will always get a bigger transaction
+ number and a bigger commit lsn than T1. */
+
+ /*--------------*/
+ mtr_commit(&mtr);
+ /*--------------*/
+ lsn = mtr.end_lsn;
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ ut_ad(trx->conc_state == TRX_ACTIVE
+ || trx->conc_state == TRX_PREPARED);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* The following assignment makes the transaction committed in memory
+ and makes its changes to data visible to other transactions.
+ NOTE that there is a small discrepancy from the strict formal
+ visibility rules here: a human user of the database can see
+ modifications made by another transaction T even before the necessary
+ log segment has been flushed to the disk. If the database happens to
+ crash before the flush, the user has seen modifications from T which
+ will never be a committed transaction. However, any transaction T2
+ which sees the modifications of the committing transaction T, and
+ which also itself makes modifications to the database, will get an lsn
+ larger than the committing transaction T. In the case where the log
+ flush fails, and T never gets committed, also T2 will never get
+ committed. */
+
+ /*--------------------------------------*/
+ trx->conc_state = TRX_COMMITTED_IN_MEMORY;
+ /*--------------------------------------*/
+
+ lock_release_off_kernel(trx);
+
+ if (trx->read_view) {
+ read_view_close(trx->read_view);
+
+ mem_heap_empty(trx->read_view_heap);
+ trx->read_view = NULL;
+ }
+
+ if (must_flush_log) {
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx->insert_undo != NULL) {
+
+ trx_undo_insert_cleanup(trx);
+ }
+
+ /* NOTE that we could possibly make a group commit more
+ efficient here: call os_thread_yield here to allow also other
+ trxs to come to commit! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if
+ the OS does not crash. We may also flush the log files to
+ disk, making the transaction durable also at an OS crash or a
+ power outage.
+
+ The idea in InnoDB's group commit is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which commits the whole
+ group. Note that this group commit will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ If we are calling trx_commit() under MySQL's binlog mutex, we
+ will delay possible log write and flush to a separate function
+ trx_commit_complete_for_mysql(), which is only called when the
+ thread has released the binlog mutex. This is to make the
+ group commit algorithm to work. Otherwise, the MySQL binlog
+ mutex would serialize all commits and prevent a group of
+ transactions from gathering. */
+
+ if (trx->flush_log_later) {
+ /* Do nothing yet */
+ trx->must_flush_log_later = TRUE;
+ } else if (srv_flush_log_at_trx_commit == 0) {
+ /* Do nothing */
+ } else if (srv_flush_log_at_trx_commit == 1) {
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+ FALSE);
+ } else {
+ /* Write the log to the log files AND flush
+ them to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ }
+ } else if (srv_flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ ut_error;
+ }
+
+ /* The commit lsn is saved in the trx object also when
+ the log write was skipped or delayed above */
+ trx->commit_lsn = lsn;
+
+ /*-------------------------------------*/
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ /* Free savepoints */
+ trx_roll_savepoints_free(trx, NULL);
+
+ trx->conc_state = TRX_NOT_STARTED;
+ trx->rseg = NULL;
+ trx->undo_no = ut_dulint_zero;
+ trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+
+ ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
+
+ UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+}
+
+/********************************************************************
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. The insert undo log, if any, is
+cleaned up, the transaction is put to the state TRX_NOT_STARTED and it
+is removed from the trx list of trx_sys. */
+
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+ trx_t* trx) /* in: transaction */
+{
+ if (trx->insert_undo != NULL) {
+
+ trx_undo_insert_cleanup(trx);
+ }
+
+ /* Reset the same fields which trx_commit_off_kernel() resets at
+ the end of a commit */
+
+ trx->conc_state = TRX_NOT_STARTED;
+ trx->rseg = NULL;
+ trx->undo_no = ut_dulint_zero;
+ trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+
+ UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+}
+
+/************************************************************************
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a new started transaction. */
+
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+			/* out: consistent read view */
+	trx_t*	trx)	/* in: active transaction */
+{
+	ut_ad(trx->conc_state == TRX_ACTIVE);
+
+	if (trx->read_view == NULL) {
+
+		/* No view yet: open one under the kernel mutex. The
+		field is tested again after the mutex has been reserved. */
+
+		mutex_enter(&kernel_mutex);
+
+		if (trx->read_view == NULL) {
+			trx->read_view = read_view_open_now(trx,
+						trx->read_view_heap);
+		}
+
+		mutex_exit(&kernel_mutex);
+	}
+
+	return(trx->read_view);
+}
+
+/********************************************************************
+Commits a transaction and replies to all the TRX_SIG_COMMIT signals in
+its signal queue. NOTE that the kernel mutex is temporarily released. */
+static
+void
+trx_handle_commit_sig_off_kernel(
+/*=============================*/
+	trx_t*		trx,		/* in: transaction */
+	que_thr_t**	next_thr)	/* in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread */
+{
+	trx_sig_t*	sig;
+	trx_sig_t*	following;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	trx->que_state = TRX_QUE_COMMITTING;
+
+	trx_commit_off_kernel(trx);
+
+	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+	/* Walk the signal queue: every TRX_SIG_COMMIT signal is replied to
+	and removed from the queue. The successor is fetched before the
+	signal is possibly removed. */
+
+	for (sig = UT_LIST_GET_FIRST(trx->signals);
+	     sig != NULL;
+	     sig = following) {
+
+		following = UT_LIST_GET_NEXT(signals, sig);
+
+		if (sig->type != TRX_SIG_COMMIT) {
+
+			continue;
+		}
+
+		trx_sig_reply(sig, next_thr);
+		trx_sig_remove(trx, sig);
+	}
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***************************************************************
+Ends the lock wait of a transaction which is in the TRX_QUE_LOCK_WAIT
+state: every query thread in the wait_thrs list is passed to
+que_thr_end_wait_no_next_thr() and taken off the list, and the transaction
+is put to the TRX_QUE_RUNNING state. */
+
+void
+trx_end_lock_wait(
+/*==============*/
+	trx_t*	trx)	/* in: transaction */
+{
+	que_thr_t*	waiter;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
+
+	/* Consume the list from its head until it is empty */
+
+	while ((waiter = UT_LIST_GET_FIRST(trx->wait_thrs)) != NULL) {
+
+		que_thr_end_wait_no_next_thr(waiter);
+
+		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, waiter);
+	}
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***************************************************************
+Sets the state of every query thread in the lock wait list of a transaction
+to QUE_THR_SUSPENDED, empties the list, and puts the transaction to the
+TRX_QUE_RUNNING state. */
+static
+void
+trx_lock_wait_to_suspended(
+/*=======================*/
+	trx_t*	trx)	/* in: transaction in the TRX_QUE_LOCK_WAIT state */
+{
+	que_thr_t*	waiter;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
+
+	/* Consume the list from its head until it is empty */
+
+	while ((waiter = UT_LIST_GET_FIRST(trx->wait_thrs)) != NULL) {
+
+		waiter->state = QUE_THR_SUSPENDED;
+
+		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, waiter);
+	}
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***************************************************************
+Empties the reply_signals list of a transaction: the query thread which
+waits for the reply of each signal in the list is put to the
+QUE_THR_SUSPENDED state and the signal no longer refers to it. */
+static
+void
+trx_sig_reply_wait_to_suspended(
+/*============================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	trx_sig_t*	pending;
+	que_thr_t*	waiter;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Consume the list from its head until it is empty */
+
+	while ((pending = UT_LIST_GET_FIRST(trx->reply_signals)) != NULL) {
+
+		waiter = pending->receiver;
+
+		ut_ad(waiter->state == QUE_THR_SIG_REPLY_WAIT);
+
+		waiter->state = QUE_THR_SUSPENDED;
+
+		/* Nobody waits for the reply of this signal any more */
+		pending->receiver = NULL;
+
+		UT_LIST_REMOVE(reply_signals, trx->reply_signals, pending);
+	}
+}
+
+/*********************************************************************
+Checks the compatibility of a new signal with the other signals in the
+queue. An empty queue accepts any signal. A signal which the transaction
+sends to itself is accepted only if it is of type TRX_SIG_ERROR_OCCURRED or
+TRX_SIG_BREAK_EXECUTION. From another session a commit is rejected if a
+total rollback is queued, a total rollback is rejected if a commit is
+queued, and a break execution is always accepted; any other type from
+another session is an error. */
+static
+ibool
+trx_sig_is_compatible(
+/*==================*/
+			/* out: TRUE if the signal can be queued */
+	trx_t*	trx,	/* in: trx handle */
+	ulint	type,	/* in: signal type */
+	ulint	sender)	/* in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
+{
+	trx_sig_t*	queued;
+	ulint		conflicting;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (UT_LIST_GET_LEN(trx->signals) == 0) {
+
+		return(TRUE);
+	}
+
+	if (sender == TRX_SIG_SELF) {
+		if (type == TRX_SIG_ERROR_OCCURRED
+		    || type == TRX_SIG_BREAK_EXECUTION) {
+
+			return(TRUE);
+		}
+
+		return(FALSE);
+	}
+
+	ut_ad(sender == TRX_SIG_OTHER_SESS);
+
+	/* Determine the signal type which must not be present in the queue
+	for the new signal to be accepted */
+
+	if (type == TRX_SIG_COMMIT) {
+
+		conflicting = TRX_SIG_TOTAL_ROLLBACK;
+
+	} else if (type == TRX_SIG_TOTAL_ROLLBACK) {
+
+		conflicting = TRX_SIG_COMMIT;
+
+	} else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+		return(TRUE);
+	} else {
+		ut_error;
+
+		return(FALSE);
+	}
+
+	for (queued = UT_LIST_GET_FIRST(trx->signals);
+	     queued != NULL;
+	     queued = UT_LIST_GET_NEXT(signals, queued)) {
+
+		if (queued->type == conflicting) {
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************
+Sends a signal to a trx object. The signal is appended to the signal queue
+of trx; if it becomes the first signal in the queue, handling of it is
+started at once with trx_sig_start_handle(). The caller must own the
+kernel mutex (asserted under UNIV_SYNC_DEBUG). */
+
+ibool
+trx_sig_send(
+/*=========*/
+ /* out: TRUE if the signal was
+ successfully delivered */
+ trx_t* trx, /* in: trx handle */
+ ulint type, /* in: signal type */
+ ulint sender, /* in: TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ que_thr_t* receiver_thr, /* in: query thread which wants the
+ reply, or NULL; if type is
+ TRX_SIG_END_WAIT, this must be NULL */
+ trx_savept_t* savept, /* in: possible rollback savepoint, or
+ NULL */
+ que_thr_t** next_thr) /* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+{
+ trx_sig_t* sig;
+ trx_t* receiver_trx;
+
+ ut_ad(trx);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!trx_sig_is_compatible(trx, type, sender)) {
+ /* The signal is not compatible with the other signals in
+ the queue: this is treated as a fatal error by ut_error;
+ the return statement after it is only a fallback */
+
+ ut_error;
+
+ return(FALSE);
+ }
+
+ /* Queue the signal object */
+
+ if (UT_LIST_GET_LEN(trx->signals) == 0) {
+
+ /* The signal list is empty: the 'sig' slot must be unused
+ (we improve performance a bit by avoiding mem_alloc) */
+ sig = &(trx->sig);
+ } else {
+ /* It might be that the 'sig' slot is unused also in this
+ case, but we choose the easy way of using mem_alloc */
+
+ sig = mem_alloc(sizeof(trx_sig_t));
+ }
+
+ UT_LIST_ADD_LAST(signals, trx->signals, sig);
+
+ sig->type = type;
+ sig->state = TRX_SIG_WAITING;
+ sig->sender = sender;
+ sig->receiver = receiver_thr;
+
+ /* sig->savept is assigned only when a savepoint was passed in;
+ otherwise the field is left as it was */
+
+ if (savept) {
+ sig->savept = *savept;
+ }
+
+ if (receiver_thr) {
+ /* The signal is also linked to the reply_signals list of the
+ transaction which owns the thread waiting for the reply */
+
+ receiver_trx = thr_get_trx(receiver_thr);
+
+ UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
+ sig);
+ }
+
+ if (trx->sess->state == SESS_ERROR) {
+
+ trx_sig_reply_wait_to_suspended(trx);
+ }
+
+ if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
+
+ /* NOTE(review): the comment which used to stand here described
+ a call that adds a TRX_SIG_ERROR_OCCURRED signal to the end of
+ the queue, if the session is not yet in the error state. No
+ such call is present: reaching this branch triggers ut_error,
+ so signals from other sessions and break execution signals
+ are not supported by this function as written. */
+
+ ut_error;
+ }
+
+ /* If there were no other signals ahead in the queue, try to start
+ handling of the signal */
+
+ if (UT_LIST_GET_FIRST(trx->signals) == sig) {
+
+ trx_sig_start_handle(trx, next_thr);
+ }
+
+ return(TRUE);
+}
+
+/********************************************************************
+Ends signal handling: clears trx->handling_signals and restores trx->graph
+from trx->graph_before_signal_handling. If the restored graph is not NULL
+and the session is in the error state, control is given to the error
+handling routine of the graph (currently just returns the control to the
+graph root which then will send an error message to the client). */
+
+void
+trx_end_signal_handling(
+/*====================*/
+	trx_t*	trx)	/* in: trx */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(trx->handling_signals == TRUE);
+
+	trx->handling_signals = FALSE;
+
+	trx->graph = trx->graph_before_signal_handling;
+
+	if (trx->graph == NULL) {
+		/* There is no graph to which an error could be reported */
+
+		return;
+	}
+
+	if (trx->sess->state == SESS_ERROR) {
+
+		que_fork_error_handle(trx, trx->graph);
+	}
+}
+
+/********************************************************************
+Starts handling of a trx signal. The function takes the first signal in
+the queue of trx and dispatches on its type; it loops as long as signals
+can be processed immediately. It returns without processing a signal if
+the transaction still has active query threads, and it returns right after
+starting a rollback. When signal handling is in progress and the queue
+has become empty, trx_end_signal_handling() is called. The caller must own
+the kernel mutex (asserted under UNIV_SYNC_DEBUG). */
+
+void
+trx_sig_start_handle(
+/*=================*/
+ trx_t* trx, /* in: trx handle */
+ que_thr_t** next_thr) /* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+{
+ trx_sig_t* sig;
+ ulint type;
+loop:
+ /* We loop in this function body as long as there are queued signals
+ we can process immediately */
+
+ ut_ad(trx);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* This is the exit of the loop once the queue has been drained */
+
+ if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
+
+ trx_end_signal_handling(trx);
+
+ return;
+ }
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ trx_start_low(trx, ULINT_UNDEFINED);
+ }
+
+ /* If the trx is in a lock wait state, moves the waiting query threads
+ to the suspended state */
+
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+ trx_lock_wait_to_suspended(trx);
+ }
+
+ /* If the session is in the error state and this trx has threads
+ waiting for reply from signals, moves these threads to the suspended
+ state, canceling wait reservations; note that if the transaction has
+ sent a commit or rollback signal to itself, and its session is not in
+ the error state, then nothing is done here. */
+
+ if (trx->sess->state == SESS_ERROR) {
+ trx_sig_reply_wait_to_suspended(trx);
+ }
+
+ /* If there are no running query threads, we can start processing of a
+ signal, otherwise we have to wait until all query threads of this
+ transaction are aware of the arrival of the signal. */
+
+ if (trx->n_active_thrs > 0) {
+
+ return;
+ }
+
+ /* On the first pass remember the current graph, so that
+ trx_end_signal_handling() can restore it */
+
+ if (trx->handling_signals == FALSE) {
+ trx->graph_before_signal_handling = trx->graph;
+
+ trx->handling_signals = TRUE;
+ }
+
+ /* NOTE(review): the head of the queue is dereferenced without a NULL
+ check; this assumes that the function is never entered with an empty
+ queue while trx->handling_signals is FALSE - confirm against the
+ callers */
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+ type = sig->type;
+
+ if (type == TRX_SIG_COMMIT) {
+
+ trx_handle_commit_sig_off_kernel(trx, next_thr);
+
+ } else if ((type == TRX_SIG_TOTAL_ROLLBACK)
+ || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
+
+ trx_rollback(trx, sig, next_thr);
+
+ /* No further signals can be handled until the rollback
+ completes, therefore we return */
+
+ return;
+
+ } else if (type == TRX_SIG_ERROR_OCCURRED) {
+
+ trx_rollback(trx, sig, next_thr);
+
+ /* No further signals can be handled until the rollback
+ completes, therefore we return */
+
+ return;
+
+ } else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+ /* The signal needs no processing of its own: it is just
+ answered and taken off the queue */
+
+ trx_sig_reply(sig, next_thr);
+ trx_sig_remove(trx, sig);
+ } else {
+ ut_error;
+ }
+
+ goto loop;
+}
+
+/********************************************************************
+Sends the reply message when a signal in the queue of the trx has been
+handled. If a query thread waits for the reply, the signal is removed from
+the reply_signals list of the transaction of that thread, the wait of the
+thread is ended, and the signal no longer refers to the thread. If nobody
+waits for the reply, nothing is done. */
+
+void
+trx_sig_reply(
+/*==========*/
+	trx_sig_t*	sig,		/* in: signal */
+	que_thr_t**	next_thr)	/* in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread */
+{
+	que_thr_t*	waiter;
+	trx_t*		receiver_trx;
+
+	ut_ad(sig);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	waiter = sig->receiver;
+
+	if (waiter == NULL) {
+		/* No thread is waiting for a reply to this signal */
+
+		return;
+	}
+
+	ut_ad(waiter->state == QUE_THR_SIG_REPLY_WAIT);
+
+	receiver_trx = thr_get_trx(waiter);
+
+	UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals, sig);
+	ut_ad(receiver_trx->sess->state != SESS_ERROR);
+
+	que_thr_end_wait(waiter, next_thr);
+
+	sig->receiver = NULL;
+}
+
+/********************************************************************
+Removes a signal object from the trx signal queue. The memory of the
+signal is freed, unless the signal is the 'sig' slot embedded in the trx
+object itself. The signal must not have a receiver waiting for a reply. */
+
+void
+trx_sig_remove(
+/*===========*/
+	trx_t*		trx,	/* in: trx handle */
+	trx_sig_t*	sig)	/* in, own: signal */
+{
+	ut_ad(trx && sig);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(sig->receiver == NULL);
+
+	UT_LIST_REMOVE(signals, trx->signals, sig);
+
+	/* Reset the field to catch possible bugs */
+	sig->type = 0;
+
+	if (sig == &(trx->sig)) {
+		/* The embedded slot is part of the trx object */
+
+		return;
+	}
+
+	mem_free(sig);
+}
+
+/*************************************************************************
+Creates a commit command node struct. The node is allocated from the given
+memory heap; its type is set to QUE_NODE_COMMIT and its state to
+COMMIT_NODE_SEND. No other field of the node is assigned here. */
+
+commit_node_t*
+commit_node_create(
+/*===============*/
+ /* out, own: commit node struct */
+ mem_heap_t* heap) /* in: mem heap where created */
+{
+ commit_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(commit_node_t));
+ node->common.type = QUE_NODE_COMMIT;
+ node->state = COMMIT_NODE_SEND;
+
+ return(node);
+}
+
+/***************************************************************
+Performs an execution step for a commit type node in a query graph. The
+node has two states. In COMMIT_NODE_SEND the commit signal is sent to the
+transaction of the thread and the thread is put to wait for the reply. In
+COMMIT_NODE_WAIT the node is reset and control is passed to the parent
+node. */
+
+que_thr_t*
+trx_commit_step(
+/*============*/
+				/* out: query thread to run next, or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	commit_node_t*	node;
+	que_thr_t*	next_thr;
+	ibool		success;
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+		/* Control came from the parent node: start from the send
+		state */
+		node->state = COMMIT_NODE_SEND;
+	}
+
+	if (node->state != COMMIT_NODE_SEND) {
+		/* The node was waiting for the reply: reset it and return
+		control to the parent */
+
+		ut_ad(node->state == COMMIT_NODE_WAIT);
+
+		node->state = COMMIT_NODE_SEND;
+
+		thr->run_node = que_node_get_parent(node);
+
+		return(thr);
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	node->state = COMMIT_NODE_WAIT;
+
+	next_thr = NULL;
+
+	thr->state = QUE_THR_SIG_REPLY_WAIT;
+
+	/* Send the commit signal to the transaction */
+
+	success = trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT,
+					TRX_SIG_SELF, thr, NULL, &next_thr);
+
+	mutex_exit(&kernel_mutex);
+
+	if (!success) {
+		/* Error in delivering the commit signal */
+		que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+	}
+
+	return(next_thr);
+}
+
+/**************************************************************************
+Does the transaction commit for MySQL. The transaction is started if it
+has not been started, and it is then committed with trx_commit_off_kernel()
+while the kernel mutex is held. trx->op_info is set to "committing" for the
+duration of the call. As written, the function always returns 0. */
+
+ulint
+trx_commit_for_mysql(
+/*=================*/
+ /* out: 0 or error number */
+ trx_t* trx) /* in: trx handle */
+{
+ /* Because we do not do the commit by sending an Innobase
+ sig to the transaction, we must here make sure that trx has been
+ started. */
+
+ ut_a(trx);
+
+ trx->op_info = "committing";
+
+ trx_start_if_not_started(trx);
+
+ mutex_enter(&kernel_mutex);
+
+ trx_commit_off_kernel(trx);
+
+ mutex_exit(&kernel_mutex);
+
+ trx->op_info = "";
+
+ return(0);
+}
+
+/**************************************************************************
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. The log is written only if
+trx->must_flush_log_later is set; whether it is also flushed to disk
+depends on srv_flush_log_at_trx_commit and srv_unix_file_flush_method.
+trx->must_flush_log_later is cleared before returning. As written, the
+function always returns 0. */
+
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+			/* out: 0 or error number */
+	trx_t*	trx)	/* in: trx handle */
+{
+	dulint	lsn;
+
+	ut_a(trx);
+
+	/* The commit lsn is read only after the handle has been checked
+	above, so that a NULL handle hits the assertion and not a NULL
+	pointer dereference */
+
+	lsn = trx->commit_lsn;
+
+	trx->op_info = "flushing log";
+
+	if (!trx->must_flush_log_later) {
+		/* Do nothing */
+	} else if (srv_flush_log_at_trx_commit == 0) {
+		/* Do nothing */
+	} else if (srv_flush_log_at_trx_commit == 1) {
+		if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+			/* Write the log but do not flush it to disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+		} else {
+			/* Write the log to the log files AND flush them to
+			disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+		}
+	} else if (srv_flush_log_at_trx_commit == 2) {
+
+		/* Write the log but do not flush it to disk */
+
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+	} else {
+		ut_error;
+	}
+
+	trx->must_flush_log_later = FALSE;
+
+	trx->op_info = "";
+
+	return(0);
+}
+
+/**************************************************************************
+Marks the latest SQL statement ended: the current undo number of the
+transaction is stored to trx->last_sql_stat_start.least_undo_no. If the
+transaction has not been started, its undo number is first reset to zero. */
+
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx) /* in: trx handle */
+{
+ ut_a(trx);
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+ trx->undo_no = ut_dulint_zero;
+ }
+
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+}
+
+/**************************************************************************
+Prints info about a transaction to the given output stream. The caller must
+own the kernel mutex and must have called
+innobase_mysql_prepare_print_arbitrary_thd(), unless he knows that MySQL or
+InnoDB cannot meanwhile change the info printed here.
+
+The first output line contains the transaction id, the state of the
+transaction, the thread identification and the operation info. Optional
+lines about the MySQL tables in use and the table locks follow, then a line
+with the query state, the lock structs, the adaptive hash latch and the
+number of undo log entries, and finally the info of the MySQL thread if the
+transaction has one. */
+
+void
+trx_print(
+/*======*/
+	FILE*	f,	/* in: output stream */
+	trx_t*	trx)	/* in: transaction */
+{
+	ibool	newline;
+
+	fprintf(f, "TRANSACTION %lu %lu",
+		(ulong) ut_dulint_get_high(trx->id),
+		(ulong) ut_dulint_get_low(trx->id));
+
+	switch (trx->conc_state) {
+	case TRX_NOT_STARTED:
+		fputs(", not started", f);
+		break;
+	case TRX_ACTIVE:
+		fprintf(f, ", ACTIVE %lu sec",
+			(ulong)difftime(time(NULL), trx->start_time));
+		break;
+	case TRX_PREPARED:
+		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+			(ulong)difftime(time(NULL), trx->start_time));
+		break;
+	case TRX_COMMITTED_IN_MEMORY:
+		fputs(", COMMITTED IN MEMORY", f);
+		break;
+	default:
+		fprintf(f, " state %lu", (ulong) trx->conc_state);
+	}
+
+#ifdef UNIV_LINUX
+	/* The value is cast like all the other values printed with %lu in
+	this function, so that the argument always matches the format */
+	fprintf(f, ", process no %lu", (ulong) trx->mysql_process_no);
+#endif
+	fprintf(f, ", OS thread id %lu",
+		(ulong) os_thread_pf(trx->mysql_thread_id));
+
+	if (*trx->op_info) {
+		putc(' ', f);
+		fputs(trx->op_info, f);
+	}
+
+	if (trx->type != TRX_USER) {
+		fputs(" purge trx", f);
+	}
+
+	if (trx->declared_to_be_inside_innodb) {
+		fprintf(f, ", thread declared inside InnoDB %lu",
+			(ulong) trx->n_tickets_to_enter_innodb);
+	}
+
+	putc('\n', f);
+
+	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+		fprintf(f, "mysql tables in use %lu, locked %lu\n",
+			(ulong) trx->n_mysql_tables_in_use,
+			(ulong) trx->mysql_n_tables_locked);
+	}
+
+	if (trx->n_lock_table_transactional > 0 || trx->n_lock_table_exp > 0) {
+		fprintf(f, "mysql explicit table locks %lu, transactional table locks %lu\n",
+			(ulong) trx->n_lock_table_exp,
+			(ulong) trx->n_lock_table_transactional);
+	}
+
+	/* 'newline' tells if anything has been printed on the last line, so
+	that the line has to be terminated at the end */
+
+	newline = TRUE;
+
+	switch (trx->que_state) {
+	case TRX_QUE_RUNNING:
+		newline = FALSE; break;
+	case TRX_QUE_LOCK_WAIT:
+		fputs("LOCK WAIT ", f); break;
+	case TRX_QUE_ROLLING_BACK:
+		fputs("ROLLING BACK ", f); break;
+	case TRX_QUE_COMMITTING:
+		fputs("COMMITTING ", f); break;
+	default:
+		fprintf(f, "que state %lu ", (ulong) trx->que_state);
+	}
+
+	if (0 < UT_LIST_GET_LEN(trx->trx_locks) ||
+	    mem_heap_get_size(trx->lock_heap) > 400) {
+		newline = TRUE;
+
+		fprintf(f, "%lu lock struct(s), heap size %lu",
+			(ulong) UT_LIST_GET_LEN(trx->trx_locks),
+			(ulong) mem_heap_get_size(trx->lock_heap));
+	}
+
+	if (trx->has_search_latch) {
+		newline = TRUE;
+		fputs(", holds adaptive hash latch", f);
+	}
+
+	if (ut_dulint_cmp(trx->undo_no, ut_dulint_zero) != 0) {
+		newline = TRUE;
+		/* Only the low word of the undo number is printed */
+		fprintf(f, ", undo log entries %lu",
+			(ulong) ut_dulint_get_low(trx->undo_no));
+	}
+
+	if (newline) {
+		putc('\n', f);
+	}
+
+	if (trx->mysql_thd != NULL) {
+		innobase_mysql_print_thd(f, trx->mysql_thd);
+	}
+}
+
+/********************************************************************
+Prepares a transaction. If the transaction has an insert or an update undo
+log, the state of the undo log segments is changed in a mini-transaction
+and, depending on srv_flush_log_at_trx_commit, the log is written up to the
+end lsn of that mini-transaction. In all cases trx->conc_state is set to
+TRX_PREPARED. The caller must own the kernel mutex (asserted under
+UNIV_SYNC_DEBUG); the mutex is released and reacquired inside the function
+when the transaction has undo logs, and it is held again on return. */
+
+void
+trx_prepare_off_kernel(
+/*===================*/
+ trx_t* trx) /* in: transaction */
+{
+ page_t* update_hdr_page;
+ trx_rseg_t* rseg;
+ ibool must_flush_log = FALSE;
+ dulint lsn;
+ mtr_t mtr;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rseg = trx->rseg;
+
+ if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ must_flush_log = TRUE;
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to TRX_UNDO_PREPARED: these modifications to the file data
+ structure define the transaction as prepared in the
+ file-based world, at the serialization point of lsn. */
+
+ mutex_enter(&(rseg->mutex));
+
+ if (trx->insert_undo != NULL) {
+
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction prepare for this transaction. */
+
+ trx_undo_set_state_at_prepare(trx, trx->insert_undo,
+ &mtr);
+ }
+
+ if (trx->update_undo) {
+ /* NOTE(review): update_hdr_page is assigned here but
+ it is not read anywhere in this function */
+ update_hdr_page = trx_undo_set_state_at_prepare(trx,
+ trx->update_undo, &mtr);
+ }
+
+ mutex_exit(&(rseg->mutex));
+
+ /*--------------*/
+ mtr_commit(&mtr); /* This mtr commit makes the
+ transaction prepared in the file-based
+ world */
+ /*--------------*/
+ lsn = mtr.end_lsn;
+
+ mutex_enter(&kernel_mutex);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /*--------------------------------------*/
+ trx->conc_state = TRX_PREPARED;
+ /*--------------------------------------*/
+
+ /* lsn has been assigned only in the branch above which also sets
+ must_flush_log, and it is read only when must_flush_log is TRUE */
+
+ if (must_flush_log) {
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the prepared state of the
+ transaction durable if the OS does not crash. We may also
+ flush the log files to disk, making the prepared state of the
+ transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group prepare is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which prepares the whole
+ group. Note that this group prepare will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ TODO: find out if MySQL holds some mutex when calling this.
+ That would spoil our group prepare algorithm. */
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_flush_log_at_trx_commit == 0) {
+ /* Do nothing */
+ } else if (srv_flush_log_at_trx_commit == 1) {
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+ FALSE);
+ } else {
+ /* Write the log to the log files AND flush
+ them to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ }
+ } else if (srv_flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ ut_error;
+ }
+
+ mutex_enter(&kernel_mutex);
+ }
+}
+
+/**************************************************************************
+Does the transaction prepare for MySQL. The transaction is started if it
+has not been started, and it is then prepared with trx_prepare_off_kernel()
+while the kernel mutex is held. trx->op_info is set to "preparing" for the
+duration of the call. As written, the function always returns 0. */
+
+ulint
+trx_prepare_for_mysql(
+/*==================*/
+ /* out: 0 or error number */
+ trx_t* trx) /* in: trx handle */
+{
+ /* Because we do not do the prepare by sending an Innobase
+ sig to the transaction, we must here make sure that trx has been
+ started. */
+
+ ut_a(trx);
+
+ trx->op_info = "preparing";
+
+ trx_start_if_not_started(trx);
+
+ mutex_enter(&kernel_mutex);
+
+ trx_prepare_off_kernel(trx);
+
+ mutex_exit(&kernel_mutex);
+
+ trx->op_info = "";
+
+ return(0);
+}
+
+/**************************************************************************
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery. The transaction list of the
+system is scanned under the kernel mutex, and the XID of each transaction
+in the TRX_PREPARED state is copied to xid_list until the list is full or
+all transactions have been looked at. A line is printed to stderr for each
+transaction found. At most len entries are ever stored; in particular,
+nothing is stored if len is 0. */
+
+int
+trx_recover_for_mysql(
+/*==================*/
+				/* out: number of prepared transactions
+				stored in xid_list */
+	XID*	xid_list,	/* in/out: prepared transactions */
+	ulint	len)		/* in: number of slots in xid_list */
+{
+	trx_t*	trx;
+	int	count = 0;
+
+	ut_ad(xid_list);
+	ut_ad(len);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: Starting recovery for XA transactions...\n");
+
+	/* We should set those transactions which are in the prepared state
+	to the xid_list */
+
+	mutex_enter(&kernel_mutex);
+
+	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+	/* The free space in xid_list is checked before a slot is written,
+	so that the array cannot be overrun even if the debug assertion on
+	len above is compiled out */
+
+	while (trx != NULL && (ulint) count < len) {
+		if (trx->conc_state == TRX_PREPARED) {
+			xid_list[count] = trx->xid;
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+" InnoDB: Transaction %lu %lu in prepared state after recovery\n",
+				(ulong) ut_dulint_get_high(trx->id),
+				(ulong) ut_dulint_get_low(trx->id));
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+" InnoDB: Transaction contains changes to %lu rows\n",
+			(ulong)ut_conv_dulint_to_longlong(trx->undo_no));
+
+			count++;
+		}
+
+		trx = UT_LIST_GET_NEXT(trx_list, trx);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+" InnoDB: %d transactions in prepared state after recovery\n",
+		count);
+
+	return (count);
+}
+
+/***********************************************************************
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state. The transaction list of the system is
+scanned under the kernel mutex for the first transaction which is in the
+TRX_PREPARED state and whose XID equals the given one. Transactions in
+other states are skipped, so that a transaction which carries the same XID
+but is not prepared cannot hide a prepared one later in the list. */
+
+trx_t*
+trx_get_trx_by_xid(
+/*===============*/
+			/* out: trx or NULL */
+	XID*	xid)	/* in: X/Open XA transaction identification */
+{
+	trx_t*	trx;
+
+	if (xid == NULL) {
+
+		return (NULL);
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+	while (trx) {
+		/* Compare two X/Open XA transaction id's: their
+		length should be the same and binary comparison
+		of gtrid_length+bqual_length bytes should be
+		the same. The state is tested here, while the kernel
+		mutex is still held. */
+
+		if (trx->conc_state == TRX_PREPARED &&
+		    xid->gtrid_length == trx->xid.gtrid_length &&
+		    xid->bqual_length == trx->xid.bqual_length &&
+		    memcmp(xid->data, trx->xid.data,
+				xid->gtrid_length +
+				xid->bqual_length) == 0) {
+			break;
+		}
+
+		trx = UT_LIST_GET_NEXT(trx_list, trx);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	/* trx is either a prepared transaction with a matching XID, or
+	NULL if the scan reached the end of the list */
+
+	return(trx);
+}
diff --git a/storage/innobase/trx/trx0undo.c b/storage/innobase/trx/trx0undo.c
new file mode 100644
index 00000000000..bb314dd35e9
--- /dev/null
+++ b/storage/innobase/trx/trx0undo.c
@@ -0,0 +1,1906 @@
+/******************************************************
+Transaction undo log
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+#ifdef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "trx0rec.h"
+#include "trx0purge.h"
+#include "trx0xa.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may have to purge
+also old versions which might be needed by some consistent read. How do
+we trigger the start of a purge? When a transaction writes to an undo log,
+it may notice that the space is running out. When a read view is closed,
+it may make some history superfluous. The server can have a utility which
+periodically checks if it can purge some history.
+ In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete mark
+undo record also the columns necessary to construct the secondary index
+entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+The contention of the kernel mutex should be minimized. When a transaction
+does its first insert or modify in an index, an undo log is assigned for it.
+Then we must have an x-latch to the rollback segment header.
+ When the transaction does more modifications or rolls back, the undo log is
+protected with undo_mutex in the transaction.
+ When the transaction commits, its insert undo log is either reset and
+cached for a fast reuse, or freed. In these cases we must have an x-latch on
+the rollback segment page. The update undo log is put to the history list. If
+it is not suitable for reuse, its slot in the rollback segment is reset. In
+both cases, an x-latch must be acquired on the rollback segment.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+
+/************************************************************************
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*================*/
+ page_t* undo_page, /* in: undo log segment page */
+ ulint type, /* in: undo log segment type */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************************
+Creates and initializes an undo log memory object. */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ /* out, own: the undo log memory object */
+ trx_rseg_t* rseg, /* in: rollback segment memory object */
+ ulint id, /* in: slot index within rseg */
+ ulint type, /* in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ dulint trx_id, /* in: id of the trx for which the undo log
+ is created */
+ XID* xid, /* in: X/Open XA transaction identification*/
+ ulint page_no,/* in: undo log header page number */
+ ulint offset);/* in: undo log header byte offset on page */
+/*******************************************************************
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function! */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+ /* out: undo log header byte offset on page */
+ page_t* undo_page, /* in: insert undo log segment header page,
+ x-latched */
+ dulint trx_id, /* in: transaction id */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+ page_t* undo_page, /* in: header page of an undo log of size 1 */
+ mtr_t* mtr); /* in: mtr */
+
+
+/***************************************************************************
+Fetches the last record of an undo log from the page which precedes, in the
+page list of the undo segment, the page where rec resides. */
+static
+trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(
+/*=================================*/
+ /* out: undo log record, the page s-latched,
+ NULL if none */
+ trx_undo_rec_t* rec, /* in: undo record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset, /* in: undo log header offset on page */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* cur_page = buf_frame_align(rec);
+ page_t* earlier_page;
+ ulint earlier_page_no;
+
+ /* Follow the backward link of the list node stored on this page */
+ earlier_page_no = flst_get_prev_addr(cur_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE, mtr)
+ .page;
+
+ if (earlier_page_no != FIL_NULL) {
+ earlier_page = trx_undo_page_get_s_latched(
+ buf_frame_get_space_id(cur_page),
+ earlier_page_no, mtr);
+
+ return(trx_undo_page_get_last_rec(earlier_page, page_no,
+ offset));
+ }
+
+ /* rec was on the first page of the list */
+
+ return(NULL);
+}
+
+/***************************************************************************
+Returns the undo log record preceding rec: first the same page is searched,
+and if it holds no earlier record of this log, the previous undo log page. */
+
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+ /* out: undo log record, the page s-latched,
+ NULL if none */
+ trx_undo_rec_t* rec, /* in: undo record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset, /* in: undo log header offset on page */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_undo_rec_t* found;
+
+ found = trx_undo_page_get_prev_rec(rec, page_no, offset);
+
+ if (found == NULL) {
+ /* Nothing earlier on this page: continue the search on
+ the previous undo log page */
+
+ found = trx_undo_get_prev_rec_from_prev_page(rec, page_no,
+ offset, mtr);
+ }
+
+ return(found);
+}
+
+/***************************************************************************
+Fetches the first record of an undo log from the page which follows
+undo_page in the page list of the undo segment. */
+static
+trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(
+/*=================================*/
+ /* out: undo log record, the page latched, NULL if
+ none */
+ page_t* undo_page, /* in: undo log page */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset, /* in: undo log header offset on page */
+ ulint mode, /* in: latch mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint space_id;
+ ulint following_page_no;
+ page_t* following_page;
+
+ /* On the header page a nonzero TRX_UNDO_NEXT_LOG field in the log
+ header means that nothing is fetched from a following page */
+
+ if (page_no == buf_frame_get_page_no(undo_page)
+ && 0 != mach_read_from_2(undo_page + offset
+ + TRX_UNDO_NEXT_LOG)) {
+
+ return(NULL);
+ }
+
+ space_id = buf_frame_get_space_id(undo_page);
+
+ following_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE, mtr)
+ .page;
+ if (following_page_no == FIL_NULL) {
+ /* undo_page is the last page in the list */
+
+ return(NULL);
+ }
+
+ if (mode != RW_S_LATCH) {
+ ut_ad(mode == RW_X_LATCH);
+ following_page = trx_undo_page_get(space_id,
+ following_page_no, mtr);
+ } else {
+ following_page = trx_undo_page_get_s_latched(space_id,
+ following_page_no, mtr);
+ }
+
+ return(trx_undo_page_get_first_rec(following_page, page_no, offset));
+}
+
+/***************************************************************************
+Returns the undo log record following rec: first the same page is searched,
+and if it holds no later record of this log, the next undo log page, which
+is then s-latched. */
+
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+ /* out: undo log record, the page s-latched,
+ NULL if none */
+ trx_undo_rec_t* rec, /* in: undo record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset, /* in: undo log header offset on page */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_undo_rec_t* found;
+
+ found = trx_undo_page_get_next_rec(rec, page_no, offset);
+
+ if (found == NULL) {
+ /* Nothing later on this page: look on the next page */
+
+ found = trx_undo_get_next_rec_from_next_page(
+ buf_frame_align(rec),
+ page_no, offset,
+ RW_S_LATCH, mtr);
+ }
+
+ return(found);
+}
+
+/***************************************************************************
+Returns the first record of an undo log. The header page is latched in the
+requested mode; if it holds no record of this log, the search continues on
+the next page of the undo segment. */
+
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+ /* out: undo log record, the page latched, NULL if
+ none */
+ ulint space, /* in: undo log header space */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset, /* in: undo log header offset on page */
+ ulint mode, /* in: latching mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* hdr_page;
+ trx_undo_rec_t* first;
+
+ /* Any mode other than RW_S_LATCH takes the trx_undo_page_get()
+ path */
+
+ if (mode != RW_S_LATCH) {
+ hdr_page = trx_undo_page_get(space, page_no, mtr);
+ } else {
+ hdr_page = trx_undo_page_get_s_latched(space, page_no, mtr);
+ }
+
+ first = trx_undo_page_get_first_rec(hdr_page, page_no, offset);
+
+ if (first == NULL) {
+ first = trx_undo_get_next_rec_from_next_page(hdr_page,
+ page_no, offset,
+ mode, mtr);
+ }
+
+ return(first);
+}
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/**************************************************************************
+Writes the mtr log entry of an undo log page initialization: the initial
+MLOG_UNDO_INIT record for the page, followed by the undo log type in
+compressed form. trx_undo_parse_page_init() reads the type back with
+mach_parse_compressed(). */
+UNIV_INLINE
+void
+trx_undo_page_init_log(
+/*====================*/
+ page_t* undo_page, /* in: undo log page */
+ ulint type, /* in: undo log type */
+ mtr_t* mtr) /* in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr);
+
+ mlog_catenate_ulint_compressed(mtr, type); /* the record body */
+}
+
+/***************************************************************
+Parses the redo log entry of an undo log page initialization. The body of
+the record is the undo log type in compressed form. If a page is given, the
+initialization is applied to it. */
+
+byte*
+trx_undo_parse_page_init(
+/*======================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr) /* in: mtr or NULL */
+{
+ ulint undo_type;
+ byte* rec_end;
+
+ rec_end = mach_parse_compressed(ptr, end_ptr, &undo_type);
+
+ /* The page is touched only if the type could be parsed */
+
+ if (rec_end != NULL && page != NULL) {
+ trx_undo_page_init(page, undo_type, mtr);
+ }
+
+ return(rec_end);
+}
+
+/************************************************************************
+Initializes the fields in an undo log segment page: stores the undo log
+type, sets both the start and the free offset to the first byte after the
+undo page header, marks the page as FIL_PAGE_UNDO_LOG and writes the
+MLOG_UNDO_INIT log record. */
+static
+void
+trx_undo_page_init(
+/*================*/
+ page_t* undo_page, /* in: undo log segment page */
+ ulint type, /* in: undo log segment type */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_upagef_t* upage_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ ulint first_free;
+
+ /* An initialized page has no records: start == free */
+ first_free = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+
+ mach_write_to_2(upage_hdr + TRX_UNDO_PAGE_TYPE, type);
+
+ mach_write_to_2(upage_hdr + TRX_UNDO_PAGE_START, first_free);
+ mach_write_to_2(upage_hdr + TRX_UNDO_PAGE_FREE, first_free);
+
+ fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG);
+
+ trx_undo_page_init_log(undo_page, type, mtr);
+}
+
+/*******************************************************************
+Creates a new undo log segment in file. A free slot is looked up in the
+rollback segment header, free extents are reserved in the tablespace, a
+new file segment is created, its header page is initialized as an undo log
+page with an empty log and a page list containing only the header page,
+and finally the page number is stored in the slot. */
+static
+page_t*
+trx_undo_seg_create(
+/*================*/
+ /* out: segment header page x-latched, NULL
+ if no space left */
+ trx_rseg_t* rseg __attribute__((unused)),/* in: rollback segment */
+ trx_rsegf_t* rseg_hdr,/* in: rollback segment header, page
+ x-latched */
+ ulint type, /* in: type of the segment: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint* id, /* out: slot index within rseg header */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint slot_no;
+ ulint space;
+ page_t* undo_page;
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ ulint n_reserved;
+ ibool success;
+
+ ut_ad(mtr && id && rseg_hdr);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+/* fputs(type == TRX_UNDO_INSERT
+ ? "Creating insert undo log segment\n"
+ : "Creating update undo log segment\n", stderr); */
+ slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr);
+
+ if (slot_no == ULINT_UNDEFINED) {
+ /* The message starts with a space to separate it from the
+ timestamp printed just before it */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: cannot find a free slot for an undo log. Do you have too\n"
+"InnoDB: many active transactions running concurrently?\n");
+
+ return(NULL);
+ }
+
+ space = buf_frame_get_space_id(rseg_hdr);
+
+ success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
+ if (!success) {
+
+ return(NULL);
+ }
+
+ /* Allocate a new file segment for the undo log */
+ undo_page = fseg_create_general(space, 0,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, TRUE, mtr);
+
+ /* The reservation is released whether or not the creation
+ succeeded */
+ fil_space_release_free_extents(space, n_reserved);
+
+ if (undo_page == NULL) {
+ /* No space left */
+
+ return(NULL);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ buf_page_dbg_add_level(undo_page, SYNC_TRX_UNDO_PAGE);
+#endif /* UNIV_SYNC_DEBUG */
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ trx_undo_page_init(undo_page, type, mtr);
+
+ /* On the segment header page the free space starts only after
+ the segment header */
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE,
+ MLOG_2BYTES, mtr);
+
+ /* Offset 0 in TRX_UNDO_LAST_LOG: no undo log header yet */
+ mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr);
+
+ flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr);
+
+ flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST,
+ page_hdr + TRX_UNDO_PAGE_NODE, mtr);
+
+ trx_rsegf_set_nth_undo(rseg_hdr, slot_no,
+ buf_frame_get_page_no(undo_page), mtr);
+ *id = slot_no;
+
+ return(undo_page);
+}
+
+/**************************************************************************
+Writes the mtr log entry of an undo log header initialization: the initial
+MLOG_UNDO_HDR_CREATE record for the page, followed by the transaction id
+in compressed form. trx_undo_parse_page_header() reads the id back with
+mach_dulint_parse_compressed(). */
+UNIV_INLINE
+void
+trx_undo_header_create_log(
+/*=======================*/
+ page_t* undo_page, /* in: undo log header page */
+ dulint trx_id, /* in: transaction id */
+ mtr_t* mtr) /* in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr);
+
+ mlog_catenate_dulint_compressed(mtr, trx_id); /* the record body */
+}
+
+/*******************************************************************
+Creates a new undo log header in file. NOTE that this function has its own
+log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of
+this function! The header is placed at the current TRX_UNDO_PAGE_FREE
+offset of the page and is linked after the previous last log header of the
+segment, if there is one. Only TRX_UNDO_LOG_OLD_HDR_SIZE bytes are reserved
+here, although the assertion requires that TRX_UNDO_LOG_XA_HDR_SIZE bytes
+would fit. The field writes use mach_write_to_*(), not mlog_write_*(): the
+single MLOG_UNDO_HDR_CREATE record written at the end covers them. */
+static
+ulint
+trx_undo_header_create(
+/*===================*/
+ /* out: header byte offset on page */
+ page_t* undo_page, /* in: undo log segment header page,
+ x-latched; it is assumed that there are
+ TRX_UNDO_LOG_XA_HDR_SIZE bytes free space
+ on it */
+ dulint trx_id, /* in: transaction id */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_ulogf_t* prev_log_hdr;
+ ulint prev_log;
+ ulint free;
+ ulint new_free;
+
+ ut_ad(mtr && undo_page);
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); /* offset of the new header */
+
+ log_hdr = undo_page + free;
+
+ new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; /* first byte after the old-style header */
+
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+ prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); /* 0 if there is no earlier header */
+
+ if (prev_log != 0) {
+ prev_log_hdr = undo_page + prev_log;
+
+ mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free); /* link the previous header to the new one */
+ }
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free);
+
+ log_hdr = undo_page + free; /* same value as assigned above */
+
+ mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE);
+
+ mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+ mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+
+ mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+ mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+ mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0); /* the new header is the last one */
+ mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log);
+
+ /* Write the log record about the header creation */
+ trx_undo_header_create_log(undo_page, trx_id, mtr);
+
+ return(free);
+}
+
+/************************************************************************
+Write X/Open XA Transaction Identification (XID) to undo log header. The
+format id and the two length fields are written as 4-byte values, and the
+data buffer is always written in its full length of XIDDATASIZE bytes,
+whatever the values of gtrid_length and bqual_length are. Every write is
+logged to the mtr. */
+static
+void
+trx_undo_write_xid(
+/*===============*/
+ trx_ulogf_t* log_hdr,/* in: undo log header */
+ XID* xid, /* in: X/Open XA Transaction Identification */
+ mtr_t* mtr) /* in: mtr */
+{
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT, xid->formatID,
+ MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN, xid->gtrid_length,
+ MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN, xid->bqual_length,
+ MLOG_4BYTES, mtr);
+
+ mlog_write_string(log_hdr + TRX_UNDO_XA_XID, xid->data,
+ XIDDATASIZE, mtr); /* the whole buffer, not only the used bytes */
+}
+
+/************************************************************************
+Read X/Open XA Transaction Identification (XID) from undo log header. The
+format id and the two length fields are read as 4-byte values, and all
+XIDDATASIZE bytes of the data buffer are copied. */
+static
+void
+trx_undo_read_xid(
+/*==============*/
+ trx_ulogf_t* log_hdr,/* in: undo log header */
+ XID* xid) /* out: X/Open XA Transaction Identification */
+{
+ ulint pos = 0;
+
+ xid->formatID = mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+
+ xid->gtrid_length = mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
+
+ xid->bqual_length = mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN);
+
+ /* Copy the data buffer one byte at a time */
+
+ while (pos < XIDDATASIZE) {
+ xid->data[pos] = (char)mach_read_from_1(log_hdr +
+ TRX_UNDO_XA_XID + pos);
+ pos++;
+ }
+}
+
+/*******************************************************************
+Adds space for the XA XID after an undo log old-style header. The header
+must be the last thing on the page: the page free offset has to point right
+after the old-style header. The start and free offsets of the page and the
+log start offset of the header are moved forward by the difference between
+the XA header size and the old header size. */
+static
+void
+trx_undo_header_add_space_for_xid(
+/*==============================*/
+ page_t* undo_page,/* in: undo log segment header page */
+ trx_ulogf_t* log_hdr,/* in: undo log header */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_upagef_t* upage_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ ulint old_hdr_end;
+ ulint xa_hdr_end;
+
+ old_hdr_end = mach_read_from_2(upage_hdr + TRX_UNDO_PAGE_FREE);
+
+ /* The free offset must be the end offset of the old style undo
+ log header */
+
+ ut_a(old_hdr_end == (ulint)(log_hdr - undo_page)
+ + TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+ xa_hdr_end = old_hdr_end + (TRX_UNDO_LOG_XA_HDR_SIZE
+ - TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+ /* Reserve the room for a XID by moving all three offsets */
+
+ mlog_write_ulint(upage_hdr + TRX_UNDO_PAGE_START, xa_hdr_end,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(upage_hdr + TRX_UNDO_PAGE_FREE, xa_hdr_end,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, xa_hdr_end,
+ MLOG_2BYTES, mtr);
+}
+
+/**************************************************************************
+Writes the mtr log entry of an undo log header reuse: the initial
+MLOG_UNDO_HDR_REUSE record for the page, followed by the transaction id in
+compressed form. trx_undo_parse_page_header() reads the id back with
+mach_dulint_parse_compressed(). */
+UNIV_INLINE
+void
+trx_undo_insert_header_reuse_log(
+/*=============================*/
+ page_t* undo_page, /* in: undo log header page */
+ dulint trx_id, /* in: transaction id */
+ mtr_t* mtr) /* in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr);
+
+ mlog_catenate_dulint_compressed(mtr, trx_id); /* the record body */
+}
+
+/***************************************************************
+Parses the redo log entry of an undo log page header create or reuse. The
+body of both record types is the transaction id in compressed form. If a
+page is given, the operation selected by type is applied to it. */
+
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ /* out: end of log record or NULL */
+ ulint type, /* in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr) /* in: mtr or NULL */
+{
+ dulint trx_id;
+
+ ptr = mach_dulint_parse_compressed(ptr, end_ptr, &trx_id);
+
+ if (ptr == NULL || page == NULL) {
+ /* Either the record could not be parsed, in which case NULL
+ is returned, or there is no page to apply it to */
+
+ return(ptr);
+ }
+
+ if (type != MLOG_UNDO_HDR_CREATE) {
+ ut_ad(type == MLOG_UNDO_HDR_REUSE);
+ trx_undo_insert_header_reuse(page, trx_id, mtr);
+ } else {
+ trx_undo_header_create(page, trx_id, mtr);
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function! The header is always placed right after the
+undo segment header, and the start and free offsets of the page are reset
+to the end of the old-style log header. The page must be of the type
+TRX_UNDO_INSERT. The field writes use mach_write_to_*(), not mlog_write_*():
+the single MLOG_UNDO_HDR_REUSE record written at the end covers them. */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+ /* out: undo log header byte offset on page */
+ page_t* undo_page, /* in: insert undo log segment header page,
+ x-latched */
+ dulint trx_id, /* in: transaction id */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* log_hdr;
+ ulint free;
+ ulint new_free;
+
+ ut_ad(mtr && undo_page);
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE; /* fixed offset: first byte after the segment header */
+
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+ log_hdr = undo_page + free;
+
+ new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; /* first byte after the old-style header */
+
+ /* Insert undo data is not needed after commit: we may free all
+ the space on the page */
+
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_INSERT);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+ log_hdr = undo_page + free; /* same value as assigned above */
+
+ mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+ mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+
+ mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+ mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+ /* Write the log record MLOG_UNDO_HDR_REUSE */
+ trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr);
+
+ return(free);
+}
+
+/**************************************************************************
+Writes the redo log entry of an update undo log header discard. The entry
+consists of the initial MLOG_UNDO_HDR_DISCARD record for the page only;
+nothing is catenated after it. */
+UNIV_INLINE
+void
+trx_undo_discard_latest_log(
+/*========================*/
+ page_t* undo_page, /* in: undo log header page */
+ mtr_t* mtr) /* in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr);
+}
+
+/***************************************************************
+Parses the redo log entry of an undo log page header discard. Nothing is
+read from the buffer, and ptr is returned as it was passed in. If a page is
+given, the discard is applied to it. */
+
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr __attribute__((unused)), /* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr) /* in: mtr or NULL */
+{
+ ut_ad(end_ptr);
+
+ if (page != NULL) {
+ /* Apply the discard to the page */
+ trx_undo_discard_latest_update_undo(page, mtr);
+ }
+
+ return(ptr);
+}
+
+/**************************************************************************
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. The log header
+pointed to by TRX_UNDO_LAST_LOG is dropped: the free offset of the page is
+set back to the offset of that header, the previous log header, if there is
+one, becomes the last one, and the segment state is set to TRX_UNDO_CACHED.
+The field writes use mach_write_to_*(), not mlog_write_*(): the single
+MLOG_UNDO_HDR_DISCARD record written at the end covers them. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+ page_t* undo_page, /* in: header page of an undo log of size 1 */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_upagef_t* page_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_ulogf_t* prev_log_hdr;
+ ulint free;
+ ulint prev_hdr_offset;
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); /* offset of the header to discard */
+ log_hdr = undo_page + free;
+
+ prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG); /* 0 if there is no earlier header */
+
+ if (prev_hdr_offset != 0) {
+ prev_log_hdr = undo_page + prev_hdr_offset;
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+ mach_read_from_2(prev_log_hdr + TRX_UNDO_LOG_START));
+ mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0); /* the previous header is the last one again */
+ }
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free); /* the space of the discarded log becomes free */
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED);
+ mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset);
+
+ trx_undo_discard_latest_log(undo_page, mtr);
+}
+
+/************************************************************************
+Tries to add a page to the undo log segment where the undo log is placed.
+FIL_NULL is returned without allocating anything if the rollback segment
+has reached its maximum size, if free extents cannot be reserved, or if the
+file segment cannot supply a page. On success the new page is initialized,
+appended to the page list of the undo segment, and undo->last_page_no,
+undo->size and rseg->curr_size are updated. */
+
+ulint
+trx_undo_add_page(
+/*==============*/
+ /* out: page number if success, else
+ FIL_NULL */
+ trx_t* trx, /* in: transaction */
+ trx_undo_t* undo, /* in: undo log memory object */
+ mtr_t* mtr) /* in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ page_t* header_page;
+ page_t* new_page;
+ trx_rseg_t* rseg;
+ ulint page_no;
+ ulint n_reserved;
+ ibool success;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(!mutex_own(&kernel_mutex));
+ ut_ad(mutex_own(&(trx->rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rseg = trx->rseg;
+
+ if (rseg->curr_size == rseg->max_size) { /* the rollback segment may not grow */
+
+ return(FIL_NULL);
+ }
+
+ header_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr);
+
+ success = fsp_reserve_free_extents(&n_reserved, undo->space, 1,
+ FSP_UNDO, mtr);
+ if (!success) {
+
+ return(FIL_NULL);
+ }
+
+ page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR
+ + TRX_UNDO_FSEG_HEADER,
+ undo->top_page_no + 1, FSP_UP,
+ TRUE, mtr);
+
+ fil_space_release_free_extents(undo->space, n_reserved); /* released also when the allocation failed */
+
+ if (page_no == FIL_NULL) {
+
+ /* No space left */
+
+ return(FIL_NULL);
+ }
+
+ undo->last_page_no = page_no;
+
+ new_page = trx_undo_page_get(undo->space, page_no, mtr);
+
+ trx_undo_page_init(new_page, undo->type, mtr);
+
+ flst_add_last(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ undo->size++;
+ rseg->curr_size++;
+
+ return(page_no);
+}
+
+/************************************************************************
+Frees an undo log page that is not the header page. The page is removed
+from the page list of the undo segment and freed from the file segment,
+and rseg->curr_size is decremented. If in_history is TRUE, the
+TRX_RSEG_HISTORY_SIZE field in the rollback segment header is decremented
+too. The return value is the page number of the last page in the list
+after the removal. */
+static
+ulint
+trx_undo_free_page(
+/*===============*/
+ /* out: last page number in remaining log */
+ trx_rseg_t* rseg, /* in: rollback segment */
+ ibool in_history, /* in: TRUE if the undo log is in the history
+ list */
+ ulint space, /* in: space */
+ ulint hdr_page_no, /* in: header page number */
+ ulint page_no, /* in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /* in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ page_t* header_page;
+ page_t* undo_page;
+ fil_addr_t last_addr;
+ trx_rsegf_t* rseg_header;
+ ulint hist_size;
+
+ ut_a(hdr_page_no != page_no);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!mutex_own(&kernel_mutex));
+ ut_ad(mutex_own(&(rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+ undo_page = trx_undo_page_get(space, page_no, mtr);
+
+ header_page = trx_undo_page_get(space, hdr_page_no, mtr);
+
+ flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+ space, page_no, mtr);
+
+ last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR
+ + TRX_UNDO_PAGE_LIST, mtr); /* read after the removal */
+ rseg->curr_size--;
+
+ if (in_history) {
+ rseg_header = trx_rsegf_get(space, rseg->page_no, mtr);
+
+ hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, mtr);
+ ut_ad(hist_size > 0);
+ mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ hist_size - 1, MLOG_4BYTES, mtr);
+ }
+
+ return(last_addr.page);
+}
+
+/************************************************************************
+Frees an undo log page when there is also the memory object for the undo
+log. The page is freed as a page of a log which is not in the history list,
+and the last page number and the size stored in the memory object are
+brought up to date. */
+static
+void
+trx_undo_free_page_in_rollback(
+/*===========================*/
+ trx_t* trx __attribute__((unused)), /* in: transaction */
+ trx_undo_t* undo, /* in: undo log memory copy */
+ ulint page_no,/* in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /* in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ ut_ad(undo->hdr_page_no != page_no);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* The file operation reports which page is now the last one */
+
+ undo->last_page_no = trx_undo_free_page(undo->rseg, FALSE,
+ undo->space,
+ undo->hdr_page_no,
+ page_no, mtr);
+ undo->size--;
+}
+
+/************************************************************************
+Empties an undo log header page of undo records for that undo log. Other
+undo logs may still have records on that page, if it is an update undo log.
+The log start offset in the log header is moved to the end offset that
+trx_undo_page_get_end() reports for this log on the page. */
+static
+void
+trx_undo_empty_header_page(
+/*=======================*/
+ ulint space, /* in: space */
+ ulint hdr_page_no, /* in: header page number */
+ ulint hdr_offset, /* in: header offset */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* hdr_page;
+ ulint recs_end;
+
+ hdr_page = trx_undo_page_get(space, hdr_page_no, mtr);
+
+ recs_end = trx_undo_page_get_end(hdr_page, hdr_page_no, hdr_offset);
+
+ /* With start == end no record of the log remains on the page */
+
+ mlog_write_ulint(hdr_page + hdr_offset + TRX_UNDO_LOG_START,
+ recs_end, MLOG_2BYTES, mtr);
+}
+
+/***************************************************************************
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. The last page of the log is scanned
+backwards, one mini-transaction per page. A page other than the header page
+is freed when all its records of this log have undo numbers >= limit. The
+scan stops at the first record whose undo number is below limit, or when
+the header page has no more records; the free offset of that page is then
+moved back to the earliest record found to be truncated, if there was
+one. */
+
+void
+trx_undo_truncate_end(
+/*==================*/
+ trx_t* trx, /* in: transaction whose undo log it is */
+ trx_undo_t* undo, /* in: undo log */
+ dulint limit) /* in: all undo records with undo number
+ >= this value should be truncated */
+{
+ page_t* undo_page;
+ ulint last_page_no;
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* trunc_here;
+ trx_rseg_t* rseg;
+ mtr_t mtr;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(mutex_own(&(trx->rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rseg = trx->rseg; /* NOTE(review): not read again in this function */
+
+ for (;;) {
+ mtr_start(&mtr);
+
+ trunc_here = NULL; /* reset for every page */
+
+ last_page_no = undo->last_page_no;
+
+ undo_page = trx_undo_page_get(undo->space, last_page_no, &mtr);
+
+ rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no,
+ undo->hdr_offset);
+ for (;;) {
+ if (rec == NULL) {
+ if (last_page_no == undo->hdr_page_no) { /* the header page is never freed */
+
+ goto function_exit;
+ }
+
+ trx_undo_free_page_in_rollback(trx, undo,
+ last_page_no, &mtr);
+ break; /* continue with the new last page */
+ }
+
+ if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit)
+ >= 0) {
+ /* Truncate at least this record off, maybe
+ more */
+ trunc_here = rec;
+ } else {
+ goto function_exit;
+ }
+
+ rec = trx_undo_page_get_prev_rec(rec,
+ undo->hdr_page_no,
+ undo->hdr_offset);
+ }
+
+ mtr_commit(&mtr);
+ }
+
+function_exit:
+ if (trunc_here) { /* cut the page at the earliest record to be truncated */
+ mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE,
+ trunc_here - undo_page, MLOG_2BYTES, &mtr);
+ }
+
+ mtr_commit(&mtr); /* commits the mtr started in the last loop round */
+}
+
+/***************************************************************************
+Truncates an undo log from the start. This function is used during a purge
+operation. On every round the page holding the first record of the log is
+examined in a mini-transaction of its own: if the last record of the log on
+that page is below the limit, the page is freed, or emptied if it is the
+header page, and the next round is started. */
+
+void
+trx_undo_truncate_start(
+/*====================*/
+ trx_rseg_t* rseg, /* in: rollback segment */
+ ulint space, /* in: space id of the log */
+ ulint hdr_page_no, /* in: header page number */
+ ulint hdr_offset, /* in: header offset on the page */
+ dulint limit) /* in: all undo pages with undo numbers <
+ this value should be truncated; NOTE that
+ the function only frees whole pages; the
+ header page is not freed, but emptied, if
+ all the records there are < limit */
+{
+ page_t* first_page;
+ trx_undo_rec_t* first_rec;
+ trx_undo_rec_t* page_last_rec;
+ ulint first_page_no;
+ mtr_t mtr;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (0 == ut_dulint_cmp(limit, ut_dulint_zero)) {
+ /* A zero limit: there is nothing to truncate */
+
+ return;
+ }
+
+ for (;;) {
+ mtr_start(&mtr);
+
+ first_rec = trx_undo_get_first_rec(space, hdr_page_no,
+ hdr_offset,
+ RW_X_LATCH, &mtr);
+ if (first_rec == NULL) {
+ /* Already empty */
+
+ break;
+ }
+
+ first_page = buf_frame_align(first_rec);
+
+ page_last_rec = trx_undo_page_get_last_rec(first_page,
+ hdr_page_no,
+ hdr_offset);
+ if (ut_dulint_cmp(trx_undo_rec_get_undo_no(page_last_rec),
+ limit) >= 0) {
+ /* The page holds a record which must be kept */
+
+ break;
+ }
+
+ first_page_no = buf_frame_get_page_no(first_page);
+
+ if (first_page_no != hdr_page_no) {
+ trx_undo_free_page(rseg, TRUE, space, hdr_page_no,
+ first_page_no, &mtr);
+ } else {
+ trx_undo_empty_header_page(space, hdr_page_no,
+ hdr_offset, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ /* Both ways out of the loop leave the mtr open */
+
+ mtr_commit(&mtr);
+}
+
+/**************************************************************************
+Frees an undo log segment which is not in the history list. The file
+segment is released by repeated calls to fseg_free_step(), each made in a
+mini-transaction of its own with the rollback segment mutex held. In the
+step that reports completion, the undo slot of the log in the rollback
+segment header is set to FIL_NULL. */
+static
+void
+trx_undo_seg_free(
+/*==============*/
+	trx_undo_t*	undo)	/* in: undo log */
+{
+	trx_rseg_t*	rseg = undo->rseg;
+	ibool		done;
+	mtr_t		mtr;
+
+	do {
+		trx_usegf_t*	seg_header;
+		fseg_header_t*	fseg_hdr;
+
+		mtr_start(&mtr);
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(!mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+		mutex_enter(&(rseg->mutex));
+
+		seg_header = trx_undo_page_get(undo->space,
+					undo->hdr_page_no, &mtr)
+			+ TRX_UNDO_SEG_HDR;
+
+		fseg_hdr = seg_header + TRX_UNDO_FSEG_HEADER;
+
+		done = fseg_free_step(fseg_hdr, &mtr);
+
+		if (done) {
+			trx_rsegf_t*	rsegf;
+
+			/* Clear the slot of this log in the rollback
+			segment header */
+			rsegf = trx_rsegf_get(rseg->space, rseg->page_no,
+					&mtr);
+			trx_rsegf_set_nth_undo(rsegf, undo->id, FIL_NULL,
+					&mtr);
+		}
+
+		mutex_exit(&(rseg->mutex));
+		mtr_commit(&mtr);
+	} while (!done);
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/************************************************************************
+Creates and initializes an undo log memory object according to the values
+in the header in file, when the database is started. The memory object is
+inserted in the appropriate list of rseg. The log type, segment state,
+offset of the last log header, trx id, XID, dictionary operation flag,
+table id and page list length are read from the undo segment header page.
+Unless the state is TRX_UNDO_TO_FREE, the last page of the page list is
+also looked at to find the topmost undo record. */
+static
+trx_undo_t*
+trx_undo_mem_create_at_db_start(
+/*============================*/
+ /* out, own: the undo log memory object */
+ trx_rseg_t* rseg, /* in: rollback segment memory object */
+ ulint id, /* in: slot index within rseg */
+ ulint page_no,/* in: undo log segment page number */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* undo_page;
+ trx_upagef_t* page_header;
+ trx_usegf_t* seg_header;
+ trx_ulogf_t* undo_header;
+ trx_undo_t* undo;
+ ulint type;
+ ulint state;
+ dulint trx_id;
+ ulint offset;
+ fil_addr_t last_addr;
+ page_t* last_page;
+ trx_undo_rec_t* rec;
+ XID xid;
+ ibool xid_exists = FALSE;
+
+ /* The slot index must be within the slot array of the rollback
+ segment */
+ if (id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(rseg->space, page_no, mtr);
+
+ page_header = undo_page + TRX_UNDO_PAGE_HDR;
+
+ /* Log type stored in the page header; compared below against
+ TRX_UNDO_INSERT and TRX_UNDO_UPDATE */
+ type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES,
+ mtr);
+ seg_header = undo_page + TRX_UNDO_SEG_HDR;
+
+ state = mach_read_from_2(seg_header + TRX_UNDO_STATE);
+
+ /* Byte offset, on this page, of the last undo log header */
+ offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG);
+
+ undo_header = undo_page + offset;
+
+ trx_id = mtr_read_dulint(undo_header + TRX_UNDO_TRX_ID, mtr);
+
+ xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+ MLOG_1BYTE, mtr);
+
+ /* Read X/Open XA transaction identification if it exists; otherwise
+ xid stays zero-filled with formatID set to -1. */
+
+ memset(&xid, 0, sizeof(xid));
+ xid.formatID = -1;
+
+ if (xid_exists == TRUE) {
+ trx_undo_read_xid(undo_header, &xid);
+ }
+
+ /* The rollback segment mutex is held over the call to
+ trx_undo_mem_create() */
+ mutex_enter(&(rseg->mutex));
+
+ undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid,
+ page_no, offset);
+ mutex_exit(&(rseg->mutex));
+
+ /* Overwrite the fields of the new object with the values stored
+ in the file */
+ undo->dict_operation = mtr_read_ulint(
+ undo_header + TRX_UNDO_DICT_TRANS,
+ MLOG_1BYTE, mtr);
+
+ undo->table_id = mtr_read_dulint(undo_header + TRX_UNDO_TABLE_ID, mtr);
+ undo->state = state;
+ undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+ /* If the log segment is being freed, the page list is inconsistent!
+ The last page is then not looked up, and last_page_no, top_page_no
+ and empty keep the values given to them by trx_undo_mem_create() */
+ if (state == TRX_UNDO_TO_FREE) {
+
+ goto add_to_list;
+ }
+
+ last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+ undo->last_page_no = last_addr.page;
+ undo->top_page_no = last_addr.page;
+
+ last_page = trx_undo_page_get(rseg->space, undo->last_page_no, mtr);
+
+ rec = trx_undo_page_get_last_rec(last_page, page_no, offset);
+
+ if (rec == NULL) {
+ /* No record on the last page: top_offset and top_undo_no
+ are not assigned on this path */
+ undo->empty = TRUE;
+ } else {
+ undo->empty = FALSE;
+ undo->top_offset = rec - last_page;
+ undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+ }
+add_to_list:
+ /* Choose the list by log type, and by whether the segment is in the
+ TRX_UNDO_CACHED state; every other state goes to the non-cached
+ list */
+ if (type == TRX_UNDO_INSERT) {
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list,
+ undo);
+ } else {
+ UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached,
+ undo);
+ }
+ } else {
+ ut_ad(type == TRX_UNDO_UPDATE);
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list,
+ undo);
+ } else {
+ UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached,
+ undo);
+ }
+ }
+
+ return(undo);
+}
+
+/************************************************************************
+Initializes the undo log lists for a rollback segment memory copy. This
+function is only called when the database is started or a new rollback
+segment is created. The four undo lists of rseg are initialized, and then
+every slot of the rollback segment header is visited: for each slot in use
+an undo log memory object is built with trx_undo_mem_create_at_db_start(). */
+
+ulint
+trx_undo_lists_init(
+/*================*/
+				/* out: the combined size of undo log
+				segments in pages */
+	trx_rseg_t*	rseg)	/* in: rollback segment memory object */
+{
+	trx_rsegf_t*	rseg_header;
+	ulint		total_size;
+	ulint		slot;
+	mtr_t		mtr;
+
+	UT_LIST_INIT(rseg->update_undo_list);
+	UT_LIST_INIT(rseg->update_undo_cached);
+	UT_LIST_INIT(rseg->insert_undo_list);
+	UT_LIST_INIT(rseg->insert_undo_cached);
+
+	total_size = 0;
+
+	mtr_start(&mtr);
+
+	rseg_header = trx_rsegf_get_new(rseg->space, rseg->page_no, &mtr);
+
+	for (slot = 0; slot < TRX_RSEG_N_SLOTS; slot++) {
+		trx_undo_t*	undo;
+		ulint		page_no;
+
+		page_no = trx_rsegf_get_nth_undo(rseg_header, slot, &mtr);
+
+		/* An unused slot is skipped. So is every slot when forced
+		recovery is at SRV_FORCE_NO_UNDO_LOG_SCAN or above: undo
+		logs are rapidly changing data, the probability that they
+		are in an inconsistent state is high, and operations which
+		look at database pages are therefore avoided */
+
+		if (page_no == FIL_NULL
+		    || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+			continue;
+		}
+
+		undo = trx_undo_mem_create_at_db_start(rseg, slot, page_no,
+						&mtr);
+		total_size += undo->size;
+
+		/* The mini-transaction is committed and a new one started
+		after every undo log; the rollback segment header is then
+		fetched again within the new one */
+		mtr_commit(&mtr);
+
+		mtr_start(&mtr);
+
+		rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
+					&mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	return(total_size);
+}
+
+/************************************************************************
+Creates and initializes an undo log memory object. The object starts in
+state TRX_UNDO_ACTIVE, marked empty, with a size of one page, and with the
+header page recorded also as the last page and the top page of the log. */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+				/* out, own: the undo log memory object */
+	trx_rseg_t*	rseg,	/* in: rollback segment memory object */
+	ulint		id,	/* in: slot index within rseg */
+	ulint		type,	/* in: type of the log: TRX_UNDO_INSERT or
+				TRX_UNDO_UPDATE */
+	dulint		trx_id,	/* in: id of the trx for which the undo log
+				is created */
+	XID*		xid,	/* in: X/Open transaction identification */
+	ulint		page_no,/* in: undo log header page number */
+	ulint		offset)	/* in: undo log header byte offset on page */
+{
+	trx_undo_t*	new_undo;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!(id < TRX_RSEG_N_SLOTS)) {
+		fprintf(stderr,
+			"InnoDB: Error: undo->id is %lu\n", (ulong) id);
+		ut_error;
+	}
+
+	new_undo = mem_alloc(sizeof(trx_undo_t));
+
+	/* Identity of the log and of the transaction using it */
+	new_undo->id = id;
+	new_undo->type = type;
+	new_undo->trx_id = trx_id;
+	new_undo->xid = *xid;
+	new_undo->rseg = rseg;
+
+	/* Initial status */
+	new_undo->state = TRX_UNDO_ACTIVE;
+	new_undo->del_marks = FALSE;
+	new_undo->dict_operation = FALSE;
+	new_undo->empty = TRUE;
+
+	/* Position in the file: a log of one page, whose header page is
+	also its last page and its top page */
+	new_undo->space = rseg->space;
+	new_undo->hdr_page_no = page_no;
+	new_undo->hdr_offset = offset;
+	new_undo->last_page_no = page_no;
+	new_undo->top_page_no = page_no;
+	new_undo->size = 1;
+
+	new_undo->guess_page = NULL;
+
+	return(new_undo);
+}
+
+/************************************************************************
+Initializes a cached undo log object for new use. The object is put back
+to state TRX_UNDO_ACTIVE and marked empty, and it gets the trx id, the XID
+and the log header offset of its new user; the other fields are not
+touched. */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+	trx_undo_t*	undo,	/* in: undo log to init */
+	dulint		trx_id,	/* in: id of the trx for which the undo log
+				is created */
+	XID*		xid,	/* in: X/Open XA transaction identification*/
+	ulint		offset)	/* in: undo log header byte offset on page */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&((undo->rseg)->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!(undo->id < TRX_RSEG_N_SLOTS)) {
+		/* The slot id stored in the object is out of range */
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+
+		mem_analyze_corruption((byte*)undo);
+		ut_error;
+	}
+
+	/* New owner */
+	undo->trx_id = trx_id;
+	undo->xid = *xid;
+	undo->hdr_offset = offset;
+
+	/* Fresh status */
+	undo->state = TRX_UNDO_ACTIVE;
+	undo->empty = TRUE;
+	undo->del_marks = FALSE;
+	undo->dict_operation = FALSE;
+}
+
+/************************************************************************
+Frees an undo log memory copy. The slot id stored in the object is checked
+first: an id outside the slot range is reported on stderr and ut_error is
+invoked. */
+static
+void
+trx_undo_mem_free(
+/*==============*/
+	trx_undo_t*	undo)	/* in: the undo object to be freed */
+{
+	if (!(undo->id < TRX_RSEG_N_SLOTS)) {
+		fprintf(stderr,
+			"InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+		ut_error;
+	}
+
+	mem_free(undo);
+}
+
+/**************************************************************************
+Creates a new undo log. A segment header page is requested from
+trx_undo_seg_create(), an undo log header for trx_id is written on it, and
+the matching memory object is built. rseg->curr_size is incremented before
+the segment is requested and decremented again if the request fails. */
+static
+trx_undo_t*
+trx_undo_create(
+/*============*/
+				/* out: undo log object, NULL if did not
+				succeed: out of space */
+	trx_t*		trx,	/* in: transaction */
+	trx_rseg_t*	rseg,	/* in: rollback segment memory copy */
+	ulint		type,	/* in: type of the log: TRX_UNDO_INSERT or
+				TRX_UNDO_UPDATE */
+	dulint		trx_id,	/* in: id of the trx for which the undo log
+				is created */
+	XID*		xid,	/* in: X/Open transaction identification*/
+	mtr_t*		mtr)	/* in: mtr */
+{
+	trx_rsegf_t*	rsegf;
+	page_t*		hdr_page;
+	ulint		hdr_page_no;
+	ulint		hdr_offset;
+	ulint		slot_no;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (rseg->curr_size == rseg->max_size) {
+		/* The rollback segment is at its maximum size */
+
+		return(NULL);
+	}
+
+	rseg->curr_size++;
+
+	rsegf = trx_rsegf_get(rseg->space, rseg->page_no, mtr);
+
+	hdr_page = trx_undo_seg_create(rseg, rsegf, type, &slot_no, mtr);
+
+	if (hdr_page == NULL) {
+		/* Did not succeed: take the increment back */
+
+		rseg->curr_size--;
+
+		return(NULL);
+	}
+
+	hdr_page_no = buf_frame_get_page_no(hdr_page);
+
+	hdr_offset = trx_undo_header_create(hdr_page, trx_id, mtr);
+
+	if (trx->support_xa) {
+		trx_undo_header_add_space_for_xid(hdr_page,
+					hdr_page + hdr_offset, mtr);
+	}
+
+	return(trx_undo_mem_create(rseg, slot_no, type, trx_id, xid,
+				hdr_page_no, hdr_offset));
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/************************************************************************
+Reuses a cached undo log. The first object of the cached list matching
+type is removed from that list, a log header for trx_id is set up on its
+header page, and the memory object is initialized for the new
+transaction. */
+static
+trx_undo_t*
+trx_undo_reuse_cached(
+/*==================*/
+				/* out: the undo log memory object, NULL if
+				none cached */
+	trx_t*		trx,	/* in: transaction */
+	trx_rseg_t*	rseg,	/* in: rollback segment memory object */
+	ulint		type,	/* in: type of the log: TRX_UNDO_INSERT or
+				TRX_UNDO_UPDATE */
+	dulint		trx_id,	/* in: id of the trx for which the undo log
+				is used */
+	XID*		xid,	/* in: X/Open XA transaction identification */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	trx_undo_t*	cached;
+	page_t*		hdr_page;
+	ulint		hdr_offset;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Take the first cached log of the wanted type, if there is one */
+	if (type == TRX_UNDO_INSERT) {
+
+		cached = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+
+		if (cached != NULL) {
+			UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached,
+				cached);
+		}
+	} else {
+		ut_ad(type == TRX_UNDO_UPDATE);
+
+		cached = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+
+		if (cached != NULL) {
+			UT_LIST_REMOVE(undo_list, rseg->update_undo_cached,
+				cached);
+		}
+	}
+
+	if (cached == NULL) {
+
+		return(NULL);
+	}
+
+	ut_ad(cached->size == 1);
+
+	if (!(cached->id < TRX_RSEG_N_SLOTS)) {
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) cached->id);
+		mem_analyze_corruption((byte*)cached);
+		ut_error;
+	}
+
+	hdr_page = trx_undo_page_get(cached->space, cached->hdr_page_no,
+				mtr);
+
+	if (type == TRX_UNDO_INSERT) {
+		hdr_offset = trx_undo_insert_header_reuse(hdr_page, trx_id,
+							mtr);
+	} else {
+		/* The page itself must be marked as an update undo page */
+		ut_a(mach_read_from_2(hdr_page + TRX_UNDO_PAGE_HDR
+					+ TRX_UNDO_PAGE_TYPE)
+			== TRX_UNDO_UPDATE);
+
+		hdr_offset = trx_undo_header_create(hdr_page, trx_id, mtr);
+	}
+
+	/* For both log types, space for the XID is added right after the
+	log header has been set up */
+	if (trx->support_xa) {
+		trx_undo_header_add_space_for_xid(hdr_page,
+					hdr_page + hdr_offset, mtr);
+	}
+
+	trx_undo_mem_init_for_reuse(cached, trx_id, xid, hdr_offset);
+
+	return(cached);
+}
+
+/**************************************************************************
+Marks an undo log header as a header of a data dictionary operation
+transaction. The dict_operation value and the table id of the transaction
+are written to the undo log header in the file and copied to the undo log
+memory object. */
+static
+void
+trx_undo_mark_as_dict_operation(
+/*============================*/
+	trx_t*		trx,	/* in: dict op transaction */
+	trx_undo_t*	undo,	/* in: assigned undo log */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	trx_ulogf_t*	log_hdr;
+
+	ut_a(trx->dict_operation);
+
+	/* The log header lies at undo->hdr_offset on the header page */
+	log_hdr = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr)
+		+ undo->hdr_offset;
+
+	mlog_write_ulint(log_hdr + TRX_UNDO_DICT_TRANS,
+			trx->dict_operation, MLOG_1BYTE, mtr);
+
+	mlog_write_dulint(log_hdr + TRX_UNDO_TABLE_ID, trx->table_id, mtr);
+
+	/* Keep the memory object in step with the file */
+	undo->dict_operation = trx->dict_operation;
+	undo->table_id = trx->table_id;
+}
+
+/**************************************************************************
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused; a cached log is tried first. The log is put first in the
+insert or update undo list of the rollback segment and stored in
+trx->insert_undo or trx->update_undo. The work is done in a single
+mini-transaction with the rollback segment mutex held. */
+
+trx_undo_t*
+trx_undo_assign_undo(
+/*=================*/
+			/* out: the undo log, NULL if did not succeed: out of
+			space */
+	trx_t*	trx,	/* in: transaction */
+	ulint	type)	/* in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+{
+	trx_rseg_t*	rseg;
+	trx_undo_t*	undo;
+	mtr_t		mtr;
+
+	ut_ad(trx);
+	ut_ad(trx->rseg);
+
+	rseg = trx->rseg;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	mtr_start(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+	mutex_enter(&(rseg->mutex));
+
+	undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid,
+				&mtr);
+	if (undo == NULL) {
+		/* Nothing cached: try to create a new log */
+		undo = trx_undo_create(trx, rseg, type, trx->id, &trx->xid,
+				&mtr);
+	}
+
+	if (undo != NULL) {
+		if (type != TRX_UNDO_INSERT) {
+			UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list,
+					undo);
+			ut_ad(trx->update_undo == NULL);
+			trx->update_undo = undo;
+		} else {
+			UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list,
+					undo);
+			ut_ad(trx->insert_undo == NULL);
+			trx->insert_undo = undo;
+		}
+
+		if (trx->dict_operation) {
+			trx_undo_mark_as_dict_operation(trx, undo, &mtr);
+		}
+	}
+
+	/* Common exit: undo is NULL here if neither way succeeded */
+	mutex_exit(&(rseg->mutex));
+	mtr_commit(&mtr);
+
+	return(undo);
+}
+
+/**********************************************************************
+Sets the state of the undo log segment at a transaction finish. The state
+becomes TRX_UNDO_CACHED if the log is one page long and the
+TRX_UNDO_PAGE_FREE value of that page is below TRX_UNDO_PAGE_REUSE_LIMIT;
+otherwise it becomes TRX_UNDO_TO_FREE for an insert undo log and
+TRX_UNDO_TO_PURGE for any other log. The state is stored both in the memory
+object and in the segment header. */
+
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+				/* out: undo log segment header page,
+				x-latched */
+	trx_t*		trx __attribute__((unused)), /* in: transaction */
+	trx_undo_t*	undo,	/* in: undo log memory copy */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	page_t*	hdr_page;
+	ulint	new_state;
+
+	ut_ad(trx && undo && mtr);
+
+	if (!(undo->id < TRX_RSEG_N_SLOTS)) {
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+		mem_analyze_corruption((byte*)undo);
+		ut_error;
+	}
+
+	hdr_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr);
+
+	if (undo->size == 1
+	    && mach_read_from_2(hdr_page + TRX_UNDO_PAGE_HDR
+				+ TRX_UNDO_PAGE_FREE)
+	    < TRX_UNDO_PAGE_REUSE_LIMIT) {
+
+		new_state = TRX_UNDO_CACHED;
+	} else {
+		new_state = (undo->type == TRX_UNDO_INSERT)
+			? TRX_UNDO_TO_FREE
+			: TRX_UNDO_TO_PURGE;
+	}
+
+	undo->state = new_state;
+
+	mlog_write_ulint(hdr_page + TRX_UNDO_SEG_HDR + TRX_UNDO_STATE,
+			new_state, MLOG_2BYTES, mtr);
+
+	return(hdr_page);
+}
+
+/**********************************************************************
+Sets the state of the undo log segment at a transaction prepare. The
+memory object gets state TRX_UNDO_PREPARED and a copy of the XID of the
+transaction. In the file, the state is written to the segment header, and
+the TRX_UNDO_XID_EXISTS flag and the XID are written to the log header
+found at the TRX_UNDO_LAST_LOG offset of the segment header page. */
+
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+				/* out: undo log segment header page,
+				x-latched */
+	trx_t*		trx,	/* in: transaction */
+	trx_undo_t*	undo,	/* in: undo log memory copy */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	trx_usegf_t*	seg_hdr;
+	trx_ulogf_t*	undo_header;
+	page_t*		undo_page;
+	ulint		offset;
+
+	ut_ad(trx && undo && mtr);
+
+	if (undo->id >= TRX_RSEG_N_SLOTS) {
+		/* The slot id stored in the object is out of range */
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+		mem_analyze_corruption((byte*)undo);
+		ut_error;
+	}
+
+	undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr);
+
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+	/* Update the memory object first ... */
+	undo->state = TRX_UNDO_PREPARED;
+	undo->xid = trx->xid;
+
+	/* ... and then the segment header in the file */
+	mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state,
+			MLOG_2BYTES, mtr);
+
+	/* The XID goes to the last undo log header of the segment */
+	offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+	undo_header = undo_page + offset;
+
+	mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+			TRUE, MLOG_1BYTE, mtr);
+
+	trx_undo_write_xid(undo_header, &undo->xid, mtr);
+
+	return(undo_page);
+}
+
+/**************************************************************************
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. The log is also detached from the transaction: it is removed from
+the update undo list of the rollback segment and trx->update_undo is set to
+NULL. */
+
+void
+trx_undo_update_cleanup(
+/*====================*/
+	trx_t*	trx,		/* in: trx owning the update undo log */
+	page_t*	undo_page,	/* in: update undo log header page,
+				x-latched */
+	mtr_t*	mtr)		/* in: mtr */
+{
+	trx_rseg_t*	rseg = trx->rseg;
+	trx_undo_t*	update_undo = trx->update_undo;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(rseg->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+	trx_purge_add_update_undo_to_history(trx, undo_page, mtr);
+
+	UT_LIST_REMOVE(undo_list, rseg->update_undo_list, update_undo);
+
+	trx->update_undo = NULL;
+
+	if (update_undo->state == TRX_UNDO_CACHED) {
+		/* Keep the memory object for reuse */
+		UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached,
+				update_undo);
+
+		return;
+	}
+
+	ut_ad(update_undo->state == TRX_UNDO_TO_PURGE);
+
+	trx_undo_mem_free(update_undo);
+}
+
+/**********************************************************************
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. A log in state TRX_UNDO_CACHED is moved to the
+cached insert undo list; otherwise its file segment is freed, the size of
+the rollback segment is reduced by the size of the log, and the memory
+object is freed. */
+
+void
+trx_undo_insert_cleanup(
+/*====================*/
+	trx_t*	trx)	/* in: transaction handle */
+{
+	trx_undo_t*	insert_undo = trx->insert_undo;
+	trx_rseg_t*	rseg;
+
+	ut_ad(insert_undo);
+
+	rseg = trx->rseg;
+
+	mutex_enter(&(rseg->mutex));
+
+	UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, insert_undo);
+	trx->insert_undo = NULL;
+
+	if (insert_undo->state != TRX_UNDO_CACHED) {
+		ut_ad(insert_undo->state == TRX_UNDO_TO_FREE);
+
+		/* Delete first the undo log segment in the file. The
+		rollback segment mutex is released for the call, since
+		trx_undo_seg_free() acquires that mutex itself */
+
+		mutex_exit(&(rseg->mutex));
+
+		trx_undo_seg_free(insert_undo);
+
+		mutex_enter(&(rseg->mutex));
+
+		ut_ad(rseg->curr_size > insert_undo->size);
+
+		rseg->curr_size -= insert_undo->size;
+
+		trx_undo_mem_free(insert_undo);
+	} else {
+		UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached,
+				insert_undo);
+	}
+
+	mutex_exit(&(rseg->mutex));
+}