From 4da7f485b75867dbc2ed02054459d1e8050144f3 Mon Sep 17 00:00:00 2001 From: "heikki@hundin.mysql.fi" <> Date: Sun, 15 Jun 2003 01:04:28 +0300 Subject: Many files: Merge InnoDB-4.0.14: SAVEPOINT now implemented; InnoDB now accepts also column prefix keys; crashing bug in ON UPDATE CASCADE fixed; page checksum formula fixed --- innobase/trx/trx0rec.c | 24 +++--- innobase/trx/trx0roll.c | 222 ++++++++++++++++++++++++++++++++++++++++-------- innobase/trx/trx0sys.c | 2 +- innobase/trx/trx0trx.c | 5 ++ 4 files changed, 205 insertions(+), 48 deletions(-) (limited to 'innobase/trx') diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c index 05e179e06a5..9453189d598 100644 --- a/innobase/trx/trx0rec.c +++ b/innobase/trx/trx0rec.c @@ -272,8 +272,8 @@ trx_undo_page_report_insert( mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, ptr - undo_page); - /* Write the log entry to the REDO log of this change in the UNDO log */ - + /* Write the log entry to the REDO log of this change in the UNDO + log */ trx_undof_page_add_undo_rec_log(undo_page, first_free, ptr - undo_page, mtr); return(first_free); @@ -492,7 +492,8 @@ trx_undo_page_report_modify( /* Reserve 2 bytes for the pointer to the next undo log record */ ptr += 2; - /* Store first some general parameters to the undo log */ + /* Store first some general parameters to the undo log */ + if (update) { if (rec_get_deleted_flag(rec)) { type_cmpl = TRX_UNDO_UPD_DEL_REC; @@ -526,8 +527,7 @@ trx_undo_page_report_modify( /* Store the values of the system columns */ trx_id = dict_index_rec_get_sys_col(index, DATA_TRX_ID, rec); - roll_ptr = dict_index_rec_get_sys_col(index, DATA_ROLL_PTR, rec); - + roll_ptr = dict_index_rec_get_sys_col(index, DATA_ROLL_PTR, rec); len = mach_dulint_write_compressed(ptr, trx_id); ptr += len; @@ -632,7 +632,11 @@ trx_undo_page_report_modify( columns which occur as ordering fields in any index. This info is used in the purge of old versions where we use it to build and search the delete marked index records, to look if we can remove them from the - index tree. */ + index tree. Note that starting from 4.0.14 also externally stored + fields can be ordering in some index. But we always store at least + 384 first bytes locally to the clustered index record, which means + we can construct the column prefix fields in the index from the + stored data. */ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { @@ -1408,11 +1412,11 @@ trx_undo_prev_version_build( return(DB_ERROR); } - if (row_upd_changes_field_size(rec, index, update)) { - - entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); + if (row_upd_changes_field_size_or_external(rec, index, update)) { - row_upd_clust_index_replace_new_col_vals(entry, update); + entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, + heap); + row_upd_index_replace_new_col_vals(entry, index, update, heap); buf = mem_heap_alloc(heap, rec_get_converted_size(entry)); diff --git a/innobase/trx/trx0roll.c b/innobase/trx/trx0roll.c index a9f8c5ad22c..7d1b341221c 100644 --- a/innobase/trx/trx0roll.c +++ b/innobase/trx/trx0roll.c @@ -52,6 +52,11 @@ trx_general_rollback_for_mysql( que_thr_t* thr; roll_node_t* roll_node; + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + trx_start_if_not_started(trx); heap = mem_heap_create(512); @@ -89,6 +94,11 @@ trx_general_rollback_for_mysql( ut_a(trx->error_state == DB_SUCCESS); + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + return((int) trx->error_state); } @@ -110,20 +120,8 @@ trx_rollback_for_mysql( trx->op_info = (char *) "rollback"; - /* Tell Innobase server that there might be work for - utility threads: */ - - srv_active_wake_master_thread(); - err = trx_general_rollback_for_mysql(trx, FALSE, NULL); - trx_mark_sql_stat_end(trx); - - /* Tell Innobase server that there might be work for - utility threads: */ - - srv_active_wake_master_thread(); - trx->op_info = (char *) ""; return(err); @@ -147,25 +145,191 @@ trx_rollback_last_sql_stat_for_mysql( trx->op_info = (char *) "rollback of SQL statement"; - /* Tell Innobase server that there might be work for - utility threads: */ - - srv_active_wake_master_thread(); - err = trx_general_rollback_for_mysql(trx, TRUE, &(trx->last_sql_stat_start)); + /* The following call should not be needed, but we play safe: */ trx_mark_sql_stat_end(trx); - /* Tell Innobase server that there might be work for - utility threads: */ + trx->op_info = (char *) ""; + + return(err); +} - srv_active_wake_master_thread(); +/*********************************************************************** +Frees savepoint structs. */ - trx->op_info = (char *) ""; +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /* in: transaction handle */ + trx_named_savept_t* savep) /* in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ +{ + trx_named_savept_t* next_savep; + + if (savep == NULL) { + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + } else { + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + while (savep != NULL) { + next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + mem_free(savep->name); + mem_free(savep); + + savep = next_savep; + } +} + +/*********************************************************************** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. */ + +ulint +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + char* savepoint_name, /* in: savepoint name */ + ib_longlong* mysql_binlog_cache_pos) /* out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + trx_named_savept_t* savep; + ulint err; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep == NULL) { + + return(DB_NO_SAVEPOINT); + } + + if (trx->conc_state == TRX_NOT_STARTED) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: transaction has a savepoint %s though it is not started\n", + savep->name); + return(DB_ERROR); + } + + /* We can now free all savepoints strictly later than this one */ + + trx_roll_savepoints_free(trx, savep); + + *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; + + trx->op_info = (char *) "rollback to a savepoint"; + + err = trx_general_rollback_for_mysql(trx, TRUE, &(savep->savept)); + + /* Store the current undo_no of the transaction so that we know where + to roll back if we have to roll back the next SQL statement: */ + + trx_mark_sql_stat_end(trx); + + trx->op_info = (char *) ""; + return(err); } +/*********************************************************************** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. */ + +ulint +trx_savepoint_for_mysql( +/*====================*/ + /* out: always DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + char* savepoint_name, /* in: savepoint name */ + ib_longlong binlog_cache_pos) /* in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ +{ + trx_named_savept_t* savep; + + ut_a(trx); + ut_a(savepoint_name); + + trx_start_if_not_started(trx); + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep) { + /* There is a savepoint with the same name: free that */ + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + + mem_free(savep->name); + mem_free(savep); + } + + /* Create a new savepoint and add it as the last in the list */ + + savep = mem_alloc(sizeof(trx_named_savept_t)); + + savep->name = mem_alloc(1 + ut_strlen(savepoint_name)); + ut_memcpy(savep->name, savepoint_name, 1 + ut_strlen(savepoint_name)); + + savep->savept = trx_savept_take(trx); + + savep->mysql_binlog_cache_pos = binlog_cache_pos; + + UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep); + + return(DB_SUCCESS); +} + +/*********************************************************************** +Returns a transaction savepoint taken at this point in time. */ + +trx_savept_t +trx_savept_take( +/*============*/ + /* out: savepoint */ + trx_t* trx) /* in: transaction */ +{ + trx_savept_t savept; + + savept.least_undo_no = trx->undo_no; + + return(savept); +} + /*********************************************************************** Rollback or clean up transactions which have no user session. If the transaction already was committed, then we clean up a possible insert @@ -325,22 +489,6 @@ loop: goto loop; } - -/*********************************************************************** -Returns a transaction savepoint taken at this point in time. */ - -trx_savept_t -trx_savept_take( -/*============*/ - /* out: savepoint */ - trx_t* trx) /* in: transaction */ -{ - trx_savept_t savept; - - savept.least_undo_no = trx->undo_no; - - return(savept); -} /*********************************************************************** Creates an undo number array. */ diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index 51aad60d3e2..f1b03fff3bd 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -321,8 +321,8 @@ trx_sys_doublewrite_restore_corrupt_pages(void) for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { - space_id = mach_read_from_4(page + FIL_PAGE_SPACE); page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + space_id = 0; if (!fil_check_adress_in_tablespace(space_id, page_no)) { fprintf(stderr, diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index d73d6327d76..9233e861784 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -135,6 +135,8 @@ trx_create( trx->lock_heap = mem_heap_create_in_buffer(256); UT_LIST_INIT(trx->trx_locks); + UT_LIST_INIT(trx->trx_savepoints); + trx->dict_operation_lock_mode = 0; trx->has_search_latch = FALSE; trx->search_latch_timeout = BTR_SEA_TIMEOUT; @@ -807,6 +809,9 @@ trx_commit_off_kernel( mutex_enter(&kernel_mutex); } + /* Free savepoints */ + trx_roll_savepoints_free(trx, NULL); + trx->conc_state = TRX_NOT_STARTED; trx->rseg = NULL; trx->undo_no = ut_dulint_zero; -- cgit v1.2.1 From b2769e279239cca52c3f264724286b052357ed14 Mon Sep 17 00:00:00 2001 From: "heikki@hundin.mysql.fi" <> Date: Thu, 3 Jul 2003 17:36:33 +0300 Subject: trx0trx.c: Fix bug: group commit still did not work when we had MySQL binlogging on --- innobase/trx/trx0trx.c | 98 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 31 deletions(-) (limited to 'innobase/trx') diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 9233e861784..095a9df9082 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -778,29 +778,53 @@ trx_commit_off_kernel( efficient here: call os_thread_yield here to allow also other trxs to come to commit! */ - /* We now flush the log, as the transaction made changes to - the database, making the transaction committed on disk. It is - enough that any one of the log groups gets written to disk. */ - /*-------------------------------------*/ - /* Most MySQL users run with srv_flush_.. set to 0: */ - - if (srv_flush_log_at_trx_commit != 0) { - if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC - && srv_flush_log_at_trx_commit != 2 - && !trx->flush_log_later) { - - /* Write the log to the log files AND flush - them to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); - } else { - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } - } + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if + the OS does not crash. We may also flush the log files to + disk, making the transaction durable also at an OS crash or a + power outage. + + The idea in InnoDB's group commit is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which commits the whole + group. Note that this group commit will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + If we are calling trx_commit() under MySQL's binlog mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the binlog mutex. This is to make the + group commit algorithm to work. Otherwise, the MySQL binlog + mutex would serialize all commits and prevent a group of + transactions from gathering. */ + + if (trx->flush_log_later) { + /* Do nothing yet */ + } else if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_a(0); + } trx->commit_lsn = lsn; @@ -1497,21 +1521,33 @@ trx_commit_complete_for_mysql( /* out: 0 or error number */ trx_t* trx) /* in: trx handle */ { - ut_a(trx); + dulint lsn = trx->commit_lsn; - if (srv_flush_log_at_trx_commit == 1 - && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { - - trx->op_info = (char *) "flushing log"; + ut_a(trx); - /* Flush the log files to disk */ + if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ - log_write_up_to(trx->commit_lsn, LOG_WAIT_ONE_GROUP, TRUE); + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + /* Write the log to the log files AND flush them to + disk */ - trx->op_info = (char *) ""; - } + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { - return(0); + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_a(0); + } + + return(0); } /************************************************************************** -- cgit v1.2.1 From 2a1f87ff726219123d54dfb856956bff6ea2cbf7 Mon Sep 17 00:00:00 2001 From: "heikki@hundin.mysql.fi" <> Date: Thu, 3 Jul 2003 21:25:55 +0300 Subject: Many files: Remove potential starvation of a full log buffer flush: only flush up to the lsn which was the largest at the time when we requested the full log buffer flush os0sync.h, os0sync.c: Fix a bug in os_event on Unix: even though we signaled the event, some threads could continue waiting if the event became nonsignaled quickly again; this made group commit less efficient than it should be --- innobase/trx/trx0trx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'innobase/trx') diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 095a9df9082..0dead50264b 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -1524,6 +1524,8 @@ trx_commit_complete_for_mysql( dulint lsn = trx->commit_lsn; ut_a(trx); + + trx->op_info = (char*)"flushing log"; if (srv_flush_log_at_trx_commit == 0) { /* Do nothing */ @@ -1547,6 +1549,8 @@ trx_commit_complete_for_mysql( ut_a(0); } + trx->op_info = (char*)""; + return(0); } -- cgit v1.2.1 From e387dac04206abfd4164be6c5569d152d2da1a9f Mon Sep 17 00:00:00 2001 From: "heikki@hundin.mysql.fi" <> Date: Sun, 6 Jul 2003 16:17:02 +0300 Subject: srv0srv.c: In background loop run purge to completion before doing other background operations: it does not make sense to flush buffer pool pages if they are soon modified again by purge trx0purge.c: Increase purge_sys->n_pages_handled for every undo log we purge, even if that log would be only a hundred bytes: that way we get the purge batches of 20 pages to set a fresh purge view (limit) more often, and we can reduce the number of old row versions purge has to look at when it decides if it can remove some delete-marked index record --- innobase/trx/trx0purge.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'innobase/trx') diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c index d58240d3c11..fa9c287b0ad 100644 --- a/innobase/trx/trx0purge.c +++ b/innobase/trx/trx0purge.c @@ -593,7 +593,7 @@ trx_purge_rseg_get_next_history_log( mutex_enter(&(rseg->mutex)); - ut_ad(rseg->last_page_no != FIL_NULL); + ut_a(rseg->last_page_no != FIL_NULL); purge_sys->purge_trx_no = ut_dulint_add(rseg->last_trx_no, 1); purge_sys->purge_undo_no = ut_dulint_zero; @@ -606,16 +606,9 @@ trx_purge_rseg_get_next_history_log( log_hdr = undo_page + rseg->last_offset; seg_hdr = undo_page + TRX_UNDO_SEG_HDR; - if ((mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0) - && (mach_read_from_2(seg_hdr + TRX_UNDO_STATE) - == TRX_UNDO_TO_PURGE)) { - - /* This is the last log header on this page and the log - segment cannot be reused: we may increment the number of - pages handled */ + /* Increase the purge page count by one for every handled log */ - purge_sys->n_pages_handled++; - } + purge_sys->n_pages_handled++; prev_log_addr = trx_purge_get_log_from_hist( flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, -- cgit v1.2.1 From 99d224420d5d63045c577279c18b031d18666ace Mon Sep 17 00:00:00 2001 From: "heikki@hundin.mysql.fi" <> Date: Tue, 8 Jul 2003 14:16:27 +0300 Subject: trx0trx.c: A cosmetic change: set trx id to zero at creation so that SHOW INNODB STATUS does not print a random value for the id of a transaction object for which the transaction has never been started yet (for example, running SHOW INNODB STATUS as the first command from a mysql prompt printed a random id for the trx object associated with the session itself running the SHOW INNODB STATUS command) --- innobase/trx/trx0trx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'innobase/trx') diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 0dead50264b..e8d6c093680 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -86,6 +86,10 @@ trx_create( trx->start_time = time(NULL); trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->id = ut_dulint_zero; + trx->no = ut_dulint_max; + trx->check_foreigns = TRUE; trx->check_unique_secondary = TRUE; -- cgit v1.2.1 From c1b62170a7afd62fc20b9d22da5592868abf255f Mon Sep 17 00:00:00 2001 From: "heikki@hundin.mysql.fi" <> Date: Fri, 25 Jul 2003 22:26:39 +0300 Subject: srv0srv.c, os0file.c, log0recv.h, log0log.h, fil0fil.h, fsp0fsp.c, fil0fil.c: Merge trx0trx.c: Print more info about a trx in SHOW INNODB status; try to find the bug reported by Plaxo buf0buf.c: Check that page log sequence numbers are not in the future log0recv.c, log0log.c: Fixed a bug: if you used big BLOBs, and your log files were relatively small, InnoDB could in a big BLOB operation temporarily write over the log produced AFTER the latest checkpoint. If InnoDB would crash at that moment, then the crash recovery would fail, because InnoDB would not be able to scan the log even up to the latest checkpoint. Starting from this version, InnoDB tries to ensure the latest checkpoint is young enough. If that is not possible, InnoDB prints a warning to the .err log --- innobase/trx/trx0trx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'innobase/trx') diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index e8d6c093680..47c39290b15 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -1624,6 +1624,13 @@ trx_print( } buf += sprintf(buf, "\n"); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + + buf += sprintf(buf, "mysql tables in use %lu, locked %lu\n", + trx->n_mysql_tables_in_use, + trx->mysql_n_tables_locked); + } start_of_line = buf; -- cgit v1.2.1