diff options
author | Bjorn Munch <bjorn.munch@oracle.com> | 2011-08-29 22:47:08 +0200 |
---|---|---|
committer | Bjorn Munch <bjorn.munch@oracle.com> | 2011-08-29 22:47:08 +0200 |
commit | 98136ecf1802258dbe3ce2959ebd3a50615e59fd (patch) | |
tree | 1a6068bf140dadd80392e7f44ea1729856ae53e3 | |
parent | a6212ddac6522a1cfcf16fb1e3af69939db5709c (diff) | |
parent | f610c5658748ae97a5e2c1e1afbd229f2121a082 (diff) | |
download | mariadb-git-98136ecf1802258dbe3ce2959ebd3a50615e59fd.tar.gz |
merge from 5.1 main
50 files changed, 1484 insertions, 632 deletions
diff --git a/extra/innochecksum.c b/extra/innochecksum.c index 7ad900d16d3..b55b510b888 100644 --- a/extra/innochecksum.c +++ b/extra/innochecksum.c @@ -25,12 +25,7 @@ Published with a permission. */ -/* needed to have access to 64 bit file functions */ -#define _LARGEFILE_SOURCE -#define _LARGEFILE64_SOURCE - -#define _XOPEN_SOURCE 500 /* needed to include getopt.h on some platforms. */ - +#include <my_global.h> #include <stdio.h> #include <stdlib.h> #include <time.h> @@ -53,7 +48,6 @@ /* another argument to specify page ranges... seek to right spot and go from there */ typedef unsigned long int ulint; -typedef unsigned char uchar; /* innodb function in name; modified slightly to not have the ASM version (lots of #ifs that didn't apply) */ ulint mach_read_from_4(uchar *b) diff --git a/include/decimal.h b/include/decimal.h index 530ed9e1757..c377bd4a400 100644 --- a/include/decimal.h +++ b/include/decimal.h @@ -21,6 +21,15 @@ typedef enum decimal_round_mode; typedef int32 decimal_digit_t; +/** + intg is the number of *decimal* digits (NOT number of decimal_digit_t's !) + before the point + frac is the number of decimal digits after the point + len is the length of buf (length of allocated space) in decimal_digit_t's, + not in bytes + sign false means positive, true means negative + buf is an array of decimal_digit_t's + */ typedef struct st_decimal_t { int intg, frac, len; my_bool sign; diff --git a/mysql-test/r/merge.result b/mysql-test/r/merge.result index 3af152672ab..a4f1c79dff4 100644 --- a/mysql-test/r/merge.result +++ b/mysql-test/r/merge.result @@ -2341,4 +2341,33 @@ REPAIR TABLE m1; Table Op Msg_type Msg_text test.m1 repair note The storage engine for the table doesn't support repair DROP TABLE m1, t1; +# +# BUG#11763712 - 56458: KILLING A FLUSH TABLE FOR A MERGE/CHILD +# CRASHES SERVER +# +CREATE TABLE t1(a INT); +CREATE TABLE t2(a INT); +CREATE TABLE t3(a INT, b INT); +CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2); +# Test reopen merge parent failure +LOCK TABLES m1 READ; +# Remove 'm1' table using file operations. +FLUSH TABLES; +ERROR 42S02: Table 'test.m1' doesn't exist +UNLOCK TABLES; +CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2); +# Test reopen merge child failure +LOCK TABLES m1 READ; +# Remove 't1' table using file operations. +FLUSH TABLES; +ERROR 42S02: Table 'test.t1' doesn't exist +UNLOCK TABLES; +CREATE TABLE t1(a INT); +# Test reattach merge failure +LOCK TABLES m1 READ; +# Replace 't1' with 't3' table using file operations. +FLUSH TABLES; +ERROR HY000: Can't reopen table: 'm1' +UNLOCK TABLES; +DROP TABLE t1, t2, t3, m1; End of 5.1 tests diff --git a/mysql-test/r/type_newdecimal.result b/mysql-test/r/type_newdecimal.result index c301a7dd629..0c6c1333e9b 100644 --- a/mysql-test/r/type_newdecimal.result +++ b/mysql-test/r/type_newdecimal.result @@ -1927,3 +1927,14 @@ f1 0.000000000000000000000000 DROP TABLE IF EXISTS t1; End of 5.1 tests +# +# BUG#12911710 - VALGRIND FAILURE IN +# ROW-DEBUG:PERFSCHEMA.SOCKET_SUMMARY_BY_INSTANCE_FUNC +# +CREATE TABLE t1(d1 DECIMAL(60,0) NOT NULL, +d2 DECIMAL(60,0) NOT NULL); +INSERT INTO t1 (d1, d2) VALUES(0.0, 0.0); +SELECT d1 * d2 FROM t1; +d1 * d2 +0 +DROP TABLE t1; diff --git a/mysql-test/suite/innodb_plugin/r/innodb-index.result b/mysql-test/suite/innodb_plugin/r/innodb-index.result index b24f282dfc4..5be1460d2b7 100644 --- a/mysql-test/suite/innodb_plugin/r/innodb-index.result +++ b/mysql-test/suite/innodb_plugin/r/innodb-index.result @@ -1024,6 +1024,15 @@ INSERT INTO t1 VALUES(9,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r); UPDATE t1 SET a=1000; DELETE FROM t1; DROP TABLE t1; +CREATE TABLE bug12547647( +a INT NOT NULL, b BLOB NOT NULL, c TEXT, +PRIMARY KEY (b(10), a), INDEX (c(10)) +) ENGINE=InnoDB ROW_FORMAT=DYNAMIC; +INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731)); +COMMIT; +UPDATE bug12547647 SET c = REPEAT('b',16928); +ERROR 42000: Row size too large. The maximum row size for the used table type, not counting BLOBs, is 8126. You have to change some columns to TEXT or BLOBs +DROP TABLE bug12547647; set global innodb_file_per_table=0; set global innodb_file_format=Antelope; set global innodb_file_format_check=Antelope; diff --git a/mysql-test/suite/innodb_plugin/t/innodb-index.test b/mysql-test/suite/innodb_plugin/t/innodb-index.test index 52f94990b15..b4e2aae09e9 100644 --- a/mysql-test/suite/innodb_plugin/t/innodb-index.test +++ b/mysql-test/suite/innodb_plugin/t/innodb-index.test @@ -480,6 +480,19 @@ DELETE FROM t1; -- sleep 10 DROP TABLE t1; +# Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE +CREATE TABLE bug12547647( +a INT NOT NULL, b BLOB NOT NULL, c TEXT, +PRIMARY KEY (b(10), a), INDEX (c(10)) +) ENGINE=InnoDB ROW_FORMAT=DYNAMIC; + +INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731)); +COMMIT; +# The following used to cause infinite undo log allocation. +--error ER_TOO_BIG_ROWSIZE +UPDATE bug12547647 SET c = REPEAT('b',16928); +DROP TABLE bug12547647; + eval set global innodb_file_per_table=$per_table; eval set global innodb_file_format=$format; eval set global innodb_file_format_check=$format; diff --git a/mysql-test/t/merge.test b/mysql-test/t/merge.test index f290803bbd2..a6affbb0540 100644 --- a/mysql-test/t/merge.test +++ b/mysql-test/t/merge.test @@ -1783,4 +1783,49 @@ REPAIR TABLE m1; # DROP TABLE m1, t1; + +--echo # +--echo # BUG#11763712 - 56458: KILLING A FLUSH TABLE FOR A MERGE/CHILD +--echo # CRASHES SERVER +--echo # +CREATE TABLE t1(a INT); +CREATE TABLE t2(a INT); +CREATE TABLE t3(a INT, b INT); +CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2); + +--echo # Test reopen merge parent failure +LOCK TABLES m1 READ; +--echo # Remove 'm1' table using file operations. +remove_file $MYSQLD_DATADIR/test/m1.MRG; +remove_file $MYSQLD_DATADIR/test/m1.frm; +--error ER_NO_SUCH_TABLE +FLUSH TABLES; +UNLOCK TABLES; +CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2); + +--echo # Test reopen merge child failure +LOCK TABLES m1 READ; +--echo # Remove 't1' table using file operations. +remove_file $MYSQLD_DATADIR/test/t1.frm; +remove_file $MYSQLD_DATADIR/test/t1.MYI; +remove_file $MYSQLD_DATADIR/test/t1.MYD; +--error ER_NO_SUCH_TABLE +FLUSH TABLES; +UNLOCK TABLES; +CREATE TABLE t1(a INT); + +--echo # Test reattach merge failure +LOCK TABLES m1 READ; +--echo # Replace 't1' with 't3' table using file operations. +remove_file $MYSQLD_DATADIR/test/t1.frm; +remove_file $MYSQLD_DATADIR/test/t1.MYI; +remove_file $MYSQLD_DATADIR/test/t1.MYD; +copy_file $MYSQLD_DATADIR/test/t3.frm $MYSQLD_DATADIR/test/t1.frm; +copy_file $MYSQLD_DATADIR/test/t3.MYI $MYSQLD_DATADIR/test/t1.MYI; +copy_file $MYSQLD_DATADIR/test/t3.MYD $MYSQLD_DATADIR/test/t1.MYD; +--error ER_CANT_REOPEN_TABLE +FLUSH TABLES; +UNLOCK TABLES; +DROP TABLE t1, t2, t3, m1; + --echo End of 5.1 tests diff --git a/mysql-test/t/type_newdecimal.test b/mysql-test/t/type_newdecimal.test index 31a8808da55..567d6c0b6a1 100644 --- a/mysql-test/t/type_newdecimal.test +++ b/mysql-test/t/type_newdecimal.test @@ -1526,3 +1526,17 @@ DROP TABLE IF EXISTS t1; --echo End of 5.1 tests + +--echo # +--echo # BUG#12911710 - VALGRIND FAILURE IN +--echo # ROW-DEBUG:PERFSCHEMA.SOCKET_SUMMARY_BY_INSTANCE_FUNC +--echo # + +CREATE TABLE t1(d1 DECIMAL(60,0) NOT NULL, + d2 DECIMAL(60,0) NOT NULL); + +INSERT INTO t1 (d1, d2) VALUES(0.0, 0.0); +SELECT d1 * d2 FROM t1; + +DROP TABLE t1; + diff --git a/mysql-test/valgrind.supp b/mysql-test/valgrind.supp index 8720cd511b9..3751a339a1a 100644 --- a/mysql-test/valgrind.supp +++ b/mysql-test/valgrind.supp @@ -791,3 +791,37 @@ fun:fil_delete_tablespace fun:row_drop_table_for_mysql } + +{ + Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / one + Memcheck:Param + write(buf) + obj:*/libpthread*.so + fun:my_write + fun:my_b_flush_io_cache + fun:_my_b_write + fun:_ZL10write_keysP13st_sort_paramPPhjP11st_io_cacheS4_ + fun:_ZL13find_all_keysP13st_sort_paramP10SQL_SELECTPPhP11st_io_cacheS6_S6_ + fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy +} + +{ + Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / two + Memcheck:Param + write(buf) + obj:*/libpthread*.so + fun:my_write + fun:my_b_flush_io_cache + fun:_Z15merge_many_buffP13st_sort_paramPhP10st_buffpekPjP11st_io_cache + fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy +} + +{ + Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / three + Memcheck:Param + write(buf) + obj:*/libpthread*.so + fun:my_write + fun:my_b_flush_io_cache + fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy +} diff --git a/sql/filesort.cc b/sql/filesort.cc index 99e5156427a..0ff354b334c 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -959,21 +959,10 @@ static void make_sortkey(register SORTPARAM *param, if (addonf->null_bit && field->is_null()) { nulls[addonf->null_offset]|= addonf->null_bit; -#ifdef HAVE_purify - bzero(to, addonf->length); -#endif } else { -#ifdef HAVE_purify - uchar *end= field->pack(to, field->ptr); - uint length= (uint) ((to + addonf->length) - end); - DBUG_ASSERT((int) length >= 0); - if (length) - bzero(end, length); -#else (void) field->pack(to, field->ptr); -#endif } to+= addonf->length; } diff --git a/sql/my_decimal.h b/sql/my_decimal.h index c7a99e10233..21f485560da 100644 --- a/sql/my_decimal.h +++ b/sql/my_decimal.h @@ -101,12 +101,8 @@ public: { len= DECIMAL_BUFF_LENGTH; buf= buffer; -#if !defined (HAVE_purify) && !defined(DBUG_OFF) - /* Set buffer to 'random' value to find wrong buffer usage */ - for (uint i= 0; i < DECIMAL_BUFF_LENGTH; i++) - buffer[i]= i; -#endif } + my_decimal() { init(); diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h index 811ad8eb864..ae8639c69ac 100644 --- a/sql/rpl_rli.h +++ b/sql/rpl_rli.h @@ -221,7 +221,13 @@ public: #endif /* if not set, the value of other members of the structure are undefined */ - bool inited; + /* + inited changes its value within LOCK_active_mi-guarded critical + sections at times of start_slave_threads() (0->1) and end_slave() (1->0). + Readers may not acquire the mutex while they realize potential concurrency + issue. + */ + volatile bool inited; volatile bool abort_slave; volatile uint slave_running; diff --git a/sql/slave.cc b/sql/slave.cc index 6c375238ce4..02d8cc2c199 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -598,11 +598,15 @@ int start_slave_thread(pthread_handler h_func, pthread_mutex_t *start_lock, DBUG_PRINT("sleep",("Waiting for slave thread to start")); const char* old_msg = thd->enter_cond(start_cond,cond_lock, "Waiting for slave thread to start"); - pthread_cond_wait(start_cond,cond_lock); + pthread_cond_wait(start_cond, cond_lock); thd->exit_cond(old_msg); pthread_mutex_lock(cond_lock); // re-acquire it as exit_cond() released if (thd->killed) + { + if (start_lock) + pthread_mutex_unlock(start_lock); DBUG_RETURN(thd->killed_errno()); + } } } if (start_lock) @@ -2531,6 +2535,7 @@ pthread_handler_t handle_slave_io(void *arg) thd= new THD; // note that contructor of THD uses DBUG_ ! THD_CHECK_SENTRY(thd); + DBUG_ASSERT(mi->io_thd == 0); mi->io_thd = thd; pthread_detach_this_thread(); @@ -4489,9 +4494,6 @@ int rotate_relay_log(Master_info* mi) Relay_log_info* rli= &mi->rli; int error= 0; - /* We don't lock rli->run_lock. This would lead to deadlocks. */ - pthread_mutex_lock(&mi->run_lock); - /* We need to test inited because otherwise, new_file() will attempt to lock LOCK_log, which may not be inited (if we're not a slave). @@ -4521,7 +4523,6 @@ int rotate_relay_log(Master_info* mi) */ rli->relay_log.harvest_bytes_written(&rli->log_space_total); end: - pthread_mutex_unlock(&mi->run_lock); DBUG_RETURN(error); } diff --git a/sql/sql_base.cc b/sql/sql_base.cc index ab1ba156905..ace78947054 100644 --- a/sql/sql_base.cc +++ b/sql/sql_base.cc @@ -96,6 +96,13 @@ static TABLE_SHARE *oldest_unused_share, end_of_unused_share; static pthread_mutex_t LOCK_table_share; static bool table_def_inited= 0; +/** + Dummy TABLE instance which is used in reopen_tables() and reattach_merge() + functions to mark MERGE tables and their children with which there is some + kind of problem and which therefore we need to close. +*/ +static TABLE bad_merge_marker; + static int open_unireg_entry(THD *thd, TABLE *entry, TABLE_LIST *table_list, const char *alias, char *cache_key, uint cache_key_length, @@ -3215,46 +3222,65 @@ void close_data_files_and_morph_locks(THD *thd, const char *db, /** + @brief Mark merge parent and children with bad_merge_marker + + @param[in,out] parent the TABLE object of the parent +*/ + +static void mark_merge_parent_and_children_as_bad(TABLE *parent) +{ + TABLE_LIST *child_l; + DBUG_ENTER("mark_merge_parent_and_children_as_bad"); + parent->parent= &bad_merge_marker; + for (child_l= parent->child_l; ; child_l= child_l->next_global) + { + child_l->table->parent= &bad_merge_marker; + child_l->table= NULL; + if (&child_l->next_global == parent->child_last_l) + break; + } + DBUG_VOID_RETURN; +} + + +/** Reattach MERGE children after reopen. @param[in] thd thread context - @param[in,out] err_tables_p pointer to pointer of tables in error + + @note If reattach failed for certain MERGE table, the table (and all + it's children) are marked with bad_merge_marker. @return status - @retval FALSE OK, err_tables_p unchanged - @retval TRUE Error, err_tables_p contains table(s) + @retval FALSE OK + @retval TRUE Error */ -static bool reattach_merge(THD *thd, TABLE **err_tables_p) +static bool reattach_merge(THD *thd) { TABLE *table; - TABLE *next; - TABLE **prv_p= &thd->open_tables; bool error= FALSE; DBUG_ENTER("reattach_merge"); - for (table= thd->open_tables; table; table= next) + for (table= thd->open_tables; table; table= table->next) { - next= table->next; - DBUG_PRINT("tcache", ("check table: '%s'.'%s' 0x%lx next: 0x%lx", + DBUG_PRINT("tcache", ("check table: '%s'.'%s' 0x%lx", table->s->db.str, table->s->table_name.str, - (long) table, (long) next)); - /* Reattach children for MERGE tables with "closed data files" only. */ - if (table->child_l && !table->children_attached) + (long) table)); + /* + Reattach children only for MERGE tables that had children or parent + with "closed data files" and were reopen. For extra safety skip MERGE + tables which we failed to reopen (should not happen with current code). + */ + if (table->child_l && table->parent != &bad_merge_marker && + !table->children_attached) { DBUG_PRINT("tcache", ("MERGE parent, attach children")); - if(table->file->extra(HA_EXTRA_ATTACH_CHILDREN)) + if (table->file->extra(HA_EXTRA_ATTACH_CHILDREN)) { my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias); error= TRUE; - /* Remove table from open_tables. */ - *prv_p= next; - if (next) - prv_p= &next->next; - /* Stack table on error list. */ - table->next= *err_tables_p; - *err_tables_p= table; - continue; + mark_merge_parent_and_children_as_bad(table); } else { @@ -3264,7 +3290,6 @@ static bool reattach_merge(THD *thd, TABLE **err_tables_p) table->s->table_name.str, (long) table)); } } - prv_p= &table->next; } DBUG_RETURN(error); } @@ -3294,7 +3319,6 @@ bool reopen_tables(THD *thd, bool get_locks, bool mark_share_as_old) { TABLE *table,*next,**prev; TABLE **tables,**tables_ptr; // For locks - TABLE *err_tables= NULL; bool error=0, not_used; bool merge_table_found= FALSE; const uint flags= MYSQL_LOCK_NOTIFY_IF_NEED_REOPEN | @@ -3328,29 +3352,69 @@ bool reopen_tables(THD *thd, bool get_locks, bool mark_share_as_old) for (table=thd->open_tables; table ; table=next) { uint db_stat=table->db_stat; + TABLE *parent= table->child_l ? table : table->parent; next=table->next; DBUG_PRINT("tcache", ("open table: '%s'.'%s' 0x%lx " "parent: 0x%lx db_stat: %u", table->s->db.str, table->s->table_name.str, (long) table, (long) table->parent, db_stat)); - if (table->child_l && !db_stat) + /* + If we need to reopen child or parent table in a MERGE table, then + children in this MERGE table has to be already detached at this + point. + */ + DBUG_ASSERT(db_stat || !parent || !parent->children_attached); + /* + Thanks to the above assumption the below condition will guarantee that + merge_table_found is TRUE when we need to reopen child or parent table. + Note that it works even in situation when it is only a child and not a + parent that needs reopen (this can happen when get_locks == FALSE). + */ + if (table->child_l && !table->children_attached) merge_table_found= TRUE; - if (!tables || (!db_stat && reopen_table(table))) + + if (!tables) { - my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias); /* - If we could not allocate 'tables', we may close open tables - here. If a MERGE table is affected, detach the children first. - It is not necessary to clear the child or parent table reference - of this table because the TABLE is freed. But we need to clear - the child or parent references of the other belonging tables so - that they cannot be moved into the unused_tables chain with - these pointers set. + If we could not allocate 'tables' we close ALL open tables here. + Before closing MERGE child or parent we need to detach children + and/or clear references in/to them. */ - if (table->child_l || table->parent) + if (parent) detach_merge_children(table, TRUE); - VOID(hash_delete(&open_cache,(uchar*) table)); - error=1; + } + else if (table->parent == &bad_merge_marker) + { + /* + This is either a child or a parent of a MERGE table for which + we already decided that we are unable to reopen it. Close it. + + Reset parent reference, it may be used while freeing the table. + */ + table->parent= NULL; + } + else if (!db_stat && reopen_table(table)) + { + /* + If we fail to reopen a child or a parent in a MERGE table and the + MERGE table is affected for the first time, mark all relevant tables + invalid. Otherwise handle it as usual. + + All in all we must end up with: + - child tables are detached from parent. This was done earlier, + but child<->parent references were kept valid for reopen. + - parent is not in the to-be-locked tables + - all child tables and parent are not in the THD::open_tables. + - all child tables and parent are not in the open_cache. + + Please note that below we do additional pass through THD::open_tables + list to achieve the last three points. + */ + if (parent) + { + mark_merge_parent_and_children_as_bad(parent); + table->parent= NULL; + } } else { @@ -3366,21 +3430,56 @@ bool reopen_tables(THD *thd, bool get_locks, bool mark_share_as_old) table->s->version=0; table->open_placeholder= 0; } + continue; } + my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias); + VOID(hash_delete(&open_cache, (uchar *) table)); + error= 1; } *prev=0; /* When all tables are open again, we can re-attach MERGE children to - their parents. All TABLE objects are still present. + their parents. + + If there was an error while reopening a child or a parent of a MERGE + table, or while reattaching child tables to their parents, some tables + may have been kept open but marked for close with bad_merge_marker. + Close these tables now. */ - DBUG_PRINT("tcache", ("re-attaching MERGE tables: %d", merge_table_found)); - if (!error && merge_table_found && reattach_merge(thd, &err_tables)) + if (tables && merge_table_found && (error|= reattach_merge(thd))) { - while (err_tables) + prev= &thd->open_tables; + for (table= thd->open_tables; table; table= next) { - VOID(hash_delete(&open_cache, (uchar*) err_tables)); - err_tables= err_tables->next; + next= table->next; + if (table->parent == &bad_merge_marker) + { + /* Remove merge parent from to-be-locked tables array. */ + if (get_locks && table->child_l) + { + TABLE **t; + for (t= tables; t < tables_ptr; t++) + { + if (*t == table) + { + tables_ptr--; + memmove(t, t + 1, (tables_ptr - t) * sizeof(TABLE *)); + break; + } + } + } + /* Reset parent reference, it may be used while freeing the table. */ + table->parent= NULL; + /* Free table. */ + VOID(hash_delete(&open_cache, (uchar *) table)); + } + else + { + *prev= table; + prev= &table->next; + } } + *prev= 0; } DBUG_PRINT("tcache", ("open tables to lock: %u", (uint) (tables_ptr - tables))); diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c index 790582815a3..ad99913cf3b 100644 --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -300,29 +300,30 @@ btr_page_alloc_for_ibuf( /****************************************************************** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! */ - -page_t* -btr_page_alloc( -/*===========*/ - /* out: new allocated page, x-latched; - NULL if out of space */ +static +ulint +btr_page_alloc_low( +/*===============*/ + /* out: allocated page number, + FIL_NULL if out of space */ dict_index_t* index, /* in: index */ ulint hint_page_no, /* in: hint of a good page */ byte file_direction, /* in: direction where a possible page split is made */ ulint level, /* in: level where the page is placed in the tree */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr, /* in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /* in/out: mini-transaction + in which the page should be + initialized (may be the same + as mtr), or NULL if it should + not be initialized (the page + at hint was previously freed + in mtr) */ { fseg_header_t* seg_header; page_t* root; - page_t* new_page; - ulint new_page_no; - - if (index->type & DICT_IBUF) { - - return(btr_page_alloc_for_ibuf(index, mtr)); - } root = btr_root_get(index, mtr); @@ -336,19 +337,61 @@ btr_page_alloc( reservation for free extents, and thus we know that a page can be allocated: */ - new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, - file_direction, TRUE, mtr); + return(fseg_alloc_free_page_general(seg_header, hint_page_no, + file_direction, TRUE, + mtr, init_mtr)); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! */ + +page_t* +btr_page_alloc( +/*===========*/ + /* out: new allocated block, x-latched; + NULL if out of space */ + dict_index_t* index, /* in: index */ + ulint hint_page_no, /* in: hint of a good page */ + byte file_direction, /* in: direction where a possible + page split is made */ + ulint level, /* in: level where the page is placed + in the tree */ + mtr_t* mtr, /* in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /* in/out: mini-transaction + for x-latching and initializing + the page */ +{ + page_t* new_page; + ulint new_page_no; + + if (index->type & DICT_IBUF) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + new_page_no = btr_page_alloc_low( + index, hint_page_no, file_direction, level, mtr, init_mtr); + if (new_page_no == FIL_NULL) { return(NULL); } new_page = buf_page_get(dict_index_get_space(index), new_page_no, - RW_X_LATCH, mtr); + RW_X_LATCH, init_mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW); #endif /* UNIV_SYNC_DEBUG */ + if (mtr->freed_clust_leaf) { + mtr_memo_release(mtr, new_page, MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(!mtr_memo_contains(mtr, buf_block_align(new_page), + MTR_MEMO_FREE_CLUST_LEAF)); + } + + ut_ad(btr_freed_leaves_validate(mtr)); return(new_page); } @@ -464,6 +507,16 @@ btr_page_free_low( page_no = buf_frame_get_page_no(page); fseg_free_page(seg_header, space, page_no, mtr); + + /* The page was marked free in the allocation bitmap, but it + should remain buffer-fixed until mtr_commit(mtr) or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), + MTR_MEMO_PAGE_X_FIX)); + /* TODO: Discard any operations on the page from the redo log + and remove the block from the flush list and the buffer pool. + This would free up buffer pool earlier and reduce writes to + both the tablespace and the redo log. */ } /****************************************************************** @@ -479,13 +532,144 @@ btr_page_free( { ulint level; + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); level = btr_page_get_level(page, mtr); btr_page_free_low(index, page, level, mtr); + + /* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */ + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), + MTR_MEMO_PAGE_X_FIX)); + + if (level == 0 && (index->type & DICT_CLUSTERED)) { + /* We may have to call btr_mark_freed_leaves() to + temporarily mark the block nonfree for invoking + btr_store_big_rec_extern_fields() after an + update. Remember that the block was freed. */ + mtr->freed_clust_leaf = TRUE; + mtr_memo_push(mtr, buf_block_align(page), + MTR_MEMO_FREE_CLUST_LEAF); + } + + ut_ad(btr_freed_leaves_validate(mtr)); } +/**************************************************************//** +Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. +For invoking btr_store_big_rec_extern_fields() after an update, +we must temporarily mark freed clustered index pages allocated, so +that off-page columns will not be allocated from them. Between the +btr_store_big_rec_extern_fields() and mtr_commit() we have to +mark the pages free again, so that no pages will be leaked. */ + +void +btr_mark_freed_leaves( +/*==================*/ + dict_index_t* index, /* in/out: clustered index */ + mtr_t* mtr, /* in/out: mini-transaction */ + ibool nonfree)/* in: TRUE=mark nonfree, FALSE=mark freed */ +{ + /* This is loosely based on mtr_memo_release(). */ + + ulint offset; + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + if (!mtr->freed_clust_leaf) { + return; + } + + offset = dyn_array_get_data_size(&mtr->memo); + + while (offset > 0) { + mtr_memo_slot_t* slot; + buf_block_t* block; + + offset -= sizeof *slot; + + slot = dyn_array_get_element(&mtr->memo, offset); + + if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { + continue; + } + + /* Because btr_page_alloc() does invoke + mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all + blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the + memo must still be clustered index leaf tree pages. */ + block = slot->object; + ut_a(buf_block_get_space(block) + == dict_index_get_space(index)); + ut_a(fil_page_get_type(buf_block_get_frame(block)) + == FIL_PAGE_INDEX); + ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0); + + if (nonfree) { + /* Allocate the same page again. */ + ulint page_no; + page_no = btr_page_alloc_low( + index, buf_block_get_page_no(block), + FSP_NO_DIR, 0, mtr, NULL); + ut_a(page_no == buf_block_get_page_no(block)); + } else { + /* Assert that the page is allocated and free it. */ + btr_page_free_low(index, buf_block_get_frame(block), + 0, mtr); + } + } + + ut_ad(btr_freed_leaves_validate(mtr)); +} + +#ifdef UNIV_DEBUG +/**************************************************************//** +Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. +See btr_mark_freed_leaves(). */ + +ibool +btr_freed_leaves_validate( +/*======================*/ + /* out: TRUE if valid */ + mtr_t* mtr) /* in: mini-transaction */ +{ + ulint offset; + + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + offset = dyn_array_get_data_size(&mtr->memo); + + while (offset > 0) { + mtr_memo_slot_t* slot; + buf_block_t* block; + + offset -= sizeof *slot; + + slot = dyn_array_get_element(&mtr->memo, offset); + + if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { + continue; + } + + ut_a(mtr->freed_clust_leaf); + /* Because btr_page_alloc() does invoke + mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all + blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the + memo must still be clustered index leaf tree pages. */ + block = slot->object; + ut_a(fil_page_get_type(buf_block_get_frame(block)) + == FIL_PAGE_INDEX); + ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + /****************************************************************** Sets the child node file address in a node pointer. */ UNIV_INLINE @@ -1015,7 +1199,7 @@ btr_root_raise_and_insert( a node pointer to the new page, and then splitting the new page. */ new_page = btr_page_alloc(index, 0, FSP_NO_DIR, - btr_page_get_level(root, mtr), mtr); + btr_page_get_level(root, mtr), mtr, mtr); btr_page_create(new_page, index, mtr); @@ -1636,7 +1820,7 @@ func_start: /* 2. Allocate a new page to the index */ new_page = btr_page_alloc(cursor->index, hint_page_no, direction, - btr_page_get_level(page, mtr), mtr); + btr_page_get_level(page, mtr), mtr, mtr); btr_page_create(new_page, cursor->index, mtr); /* 3. Calculate the first record on the upper half-page, and the diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index 9ce09929f9a..a1dda8edf69 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -2051,43 +2051,6 @@ return_after_reservations: return(err); } -/***************************************************************** -Commits and restarts a mini-transaction so that it will retain an -x-lock on index->lock and the cursor page. */ - -void -btr_cur_mtr_commit_and_start( -/*=========================*/ - btr_cur_t* cursor, /* in: cursor */ - mtr_t* mtr) /* in/out: mini-transaction */ -{ - buf_block_t* block; - - block = buf_block_align(btr_cur_get_rec(cursor)); - - ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), - MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - /* Keep the locks across the mtr_commit(mtr). */ - rw_lock_x_lock(dict_index_get_lock(cursor->index)); - rw_lock_x_lock(&block->lock); - mutex_enter(&block->mutex); -#ifdef UNIV_SYNC_DEBUG - buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__); -#else - buf_block_buf_fix_inc(block); -#endif - mutex_exit(&block->mutex); - /* Write out the redo log. */ - mtr_commit(mtr); - mtr_start(mtr); - /* Reassociate the locks with the mini-transaction. - They will be released on mtr_commit(mtr). */ - mtr_memo_push(mtr, dict_index_get_lock(cursor->index), - MTR_MEMO_X_LOCK); - mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); -} - /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /******************************************************************** @@ -3494,6 +3457,11 @@ btr_store_big_rec_extern_fields( this function returns */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ + mtr_t* alloc_mtr, /* in/out: in an insert, NULL; + in an update, local_mtr for + allocating BLOB pages and + updating BLOB pointers; alloc_mtr + must not have freed any leaf pages */ mtr_t* local_mtr __attribute__((unused))) /* in: mtr containing the latch to rec and to the tree */ @@ -3514,6 +3482,8 @@ btr_store_big_rec_extern_fields( ulint i; mtr_t mtr; + ut_ad(local_mtr); + ut_ad(!alloc_mtr || alloc_mtr == local_mtr); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); @@ -3523,6 +3493,25 @@ btr_store_big_rec_extern_fields( space_id = buf_frame_get_space_id(rec); + if (alloc_mtr) { + /* Because alloc_mtr will be committed after + mtr, it is possible that the tablespace has been + extended when the B-tree record was updated or + inserted, or it will be extended while allocating + pages for big_rec. + + TODO: In mtr (not alloc_mtr), write a redo log record + about extending the tablespace to its current size, + and remember the current size. Whenever the tablespace + grows as pages are allocated, write further redo log + records to mtr. (Currently tablespace extension is not + covered by the redo log. If it were, the record would + only be written to alloc_mtr, which is committed after + mtr.) */ + } else { + alloc_mtr = &mtr; + } + /* We have to create a file segment to the tablespace for each field and put the pointer to the field in rec */ @@ -3549,7 +3538,7 @@ btr_store_big_rec_extern_fields( } page = btr_page_alloc(index, hint_page_no, - FSP_NO_DIR, 0, &mtr); + FSP_NO_DIR, 0, alloc_mtr, &mtr); if (page == NULL) { mtr_commit(&mtr); @@ -3603,37 +3592,42 @@ btr_store_big_rec_extern_fields( extern_len -= store_len; + if (alloc_mtr == &mtr) { #ifdef UNIV_SYNC_DEBUG - rec_page = + rec_page = #endif /* UNIV_SYNC_DEBUG */ - buf_page_get(space_id, - buf_frame_get_page_no(data), - RW_X_LATCH, &mtr); + buf_page_get( + space_id, + buf_frame_get_page_no(data), + RW_X_LATCH, &mtr); #ifdef UNIV_SYNC_DEBUG - buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK); + buf_page_dbg_add_level( + rec_page, SYNC_NO_ORDER_CHECK); #endif /* UNIV_SYNC_DEBUG */ + } + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4, big_rec_vec->fields[i].len - extern_len, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); if (prev_page_no == FIL_NULL) { mlog_write_ulint(data + local_len + BTR_EXTERN_SPACE_ID, space_id, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_PAGE_NO, page_no, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_OFFSET, FIL_PAGE_DATA, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); /* Set the bit denoting that this field in rec is stored externally */ @@ -3641,7 +3635,7 @@ btr_store_big_rec_extern_fields( rec_set_nth_field_extern_bit( rec, index, big_rec_vec->fields[i].field_no, - TRUE, &mtr); + TRUE, alloc_mtr); } prev_page_no = page_no; diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c index 08e033e7a63..78b39812cff 100644 --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -1009,29 +1009,6 @@ buf_page_peek_block( } /************************************************************************ -Resets the check_index_page_at_flush field of a page if found in the buffer -pool. */ - -void -buf_reset_check_index_page_at_flush( -/*================================*/ - ulint space, /* in: space id */ - ulint offset) /* in: page number */ -{ - buf_block_t* block; - - mutex_enter_fast(&(buf_pool->mutex)); - - block = buf_page_hash_get(space, offset); - - if (block) { - block->check_index_page_at_flush = FALSE; - } - - mutex_exit(&(buf_pool->mutex)); -} - -/************************************************************************ Returns the current state of is_hashed of a page. FALSE if the page is not in the pool. NOTE that this operation does not fix the page in the pool if it is found there. */ diff --git a/storage/innobase/fsp/fsp0fsp.c b/storage/innobase/fsp/fsp0fsp.c index d228e683957..d5be8fca38f 100644 --- a/storage/innobase/fsp/fsp0fsp.c +++ b/storage/innobase/fsp/fsp0fsp.c @@ -293,15 +293,19 @@ fseg_alloc_free_page_low( /* out: the allocated page number, FIL_NULL if no page could be allocated */ ulint space, /* in: space */ - fseg_inode_t* seg_inode, /* in: segment inode */ + fseg_inode_t* seg_inode, /* in/out: segment inode */ ulint hint, /* in: hint of which page would be desirable */ byte direction, /* in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr); /* in: mtr handle */ - + mtr_t* mtr, /* in/out: mini-transaction */ + mtr_t* init_mtr);/* in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr), or NULL if it + should not be initialized (the page at hint + was previously freed in mtr) */ /************************************************************************** Reads the file space size stored in the header page. */ @@ -1371,6 +1375,43 @@ fsp_alloc_free_extent( return(descr); } +/**********************************************************************//** +Allocates a single free page from a space. */ +static __attribute__((nonnull)) +void +fsp_alloc_from_free_frag( +/*=====================*/ + fsp_header_t* header, /* in/out: tablespace header */ + xdes_t* descr, /* in/out: extent descriptor */ + ulint bit, /* in: slot to allocate in the extent */ + mtr_t* mtr) /* in/out: mini-transaction */ +{ + ulint frag_n_used; + + ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr)); + xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr); + + /* Update the FRAG_N_USED field */ + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + frag_n_used++; + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, + mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FULL_FRAG, mtr); + + flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, + mtr); + } +} + /************************************************************************** Allocates a single free page from a space. The page is marked as used. */ static @@ -1381,19 +1422,22 @@ fsp_alloc_free_page( be allocated */ ulint space, /* in: space id */ ulint hint, /* in: hint of which page would be desirable */ - mtr_t* mtr) /* in: mtr handle */ + mtr_t* mtr, /* in/out: mini-transaction */ + mtr_t* init_mtr)/* in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr) */ { fsp_header_t* header; fil_addr_t first; xdes_t* descr; page_t* page; ulint free; - ulint frag_n_used; ulint page_no; ulint space_size; ibool success; ut_ad(mtr); + ut_ad(init_mtr); header = fsp_get_space_header(space, mtr); @@ -1441,6 +1485,7 @@ fsp_alloc_free_page( if (free == ULINT_UNDEFINED) { ut_print_buf(stderr, ((byte*)descr) - 500, 1000); + putc('\n', stderr); ut_error; } @@ -1472,40 +1517,21 @@ fsp_alloc_free_page( } } - xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); - - /* Update the FRAG_N_USED field */ - frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, - mtr); - frag_n_used++; - mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, - mtr); - if (xdes_is_full(descr, mtr)) { - /* The fragment is full: move it to another list */ - flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, - mtr); - xdes_set_state(descr, XDES_FULL_FRAG, mtr); - - flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, - mtr); - mlog_write_ulint(header + FSP_FRAG_N_USED, - frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, - mtr); - } + fsp_alloc_from_free_frag(header, descr, free, mtr); /* Initialize the allocated page to the buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read. */ - buf_page_create(space, page_no, mtr); + buf_page_create(space, page_no, init_mtr); - page = buf_page_get(space, page_no, RW_X_LATCH, mtr); + page = buf_page_get(space, page_no, RW_X_LATCH, init_mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_FSP_PAGE); #endif /* UNIV_SYNC_DEBUG */ /* Prior contents of the page should be ignored */ - fsp_init_file_page(page, mtr); + fsp_init_file_page(page, init_mtr); return(page_no); } @@ -1724,7 +1750,7 @@ fsp_alloc_seg_inode_page( space = buf_frame_get_space_id(space_header); - page_no = fsp_alloc_free_page(space, 0, mtr); + page_no = fsp_alloc_free_page(space, 0, mtr, mtr); if (page_no == FIL_NULL) { @@ -2094,7 +2120,8 @@ fseg_create_general( } if (page == 0) { - page = fseg_alloc_free_page_low(space, inode, 0, FSP_UP, mtr); + page = fseg_alloc_free_page_low(space, + inode, 0, FSP_UP, mtr, mtr); if (page == FIL_NULL) { @@ -2331,14 +2358,19 @@ fseg_alloc_free_page_low( /* out: the allocated page number, FIL_NULL if no page could be allocated */ ulint space, /* in: space */ - fseg_inode_t* seg_inode, /* in: segment inode */ + fseg_inode_t* seg_inode, /* in/out: segment inode */ ulint hint, /* in: hint of which page would be desirable */ byte direction, /* in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr) /* in: mtr handle */ + mtr_t* mtr, /* in/out: mini-transaction */ + mtr_t* init_mtr)/* in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr), or NULL if it + should not be initialized (the page at hint + was previously freed in mtr) */ { fsp_header_t* space_header; ulint space_size; @@ -2350,7 +2382,6 @@ fseg_alloc_free_page_low( if could not be allocated */ xdes_t* ret_descr; /* the extent of the allocated page */ page_t* page; - ibool frag_page_allocated = FALSE; ibool success; ulint n; @@ -2371,6 +2402,8 @@ fseg_alloc_free_page_low( if (descr == NULL) { /* Hint outside space or too high above free limit: reset hint */ + ut_a(init_mtr); + /* The file space header page is always allocated. */ hint = 0; descr = xdes_get_descriptor(space, hint, mtr); } @@ -2382,15 +2415,20 @@ fseg_alloc_free_page_low( mtr), seg_id)) && (xdes_get_bit(descr, XDES_FREE_BIT, hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { - +take_hinted_page: /* 1. We can take the hinted page =================================*/ ret_descr = descr; ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; /*-----------------------------------------------------------*/ - } else if ((xdes_get_state(descr, mtr) == XDES_FREE) - && ((reserved - used) < reserved / FSEG_FILLFACTOR) - && (used >= FSEG_FRAG_LIMIT)) { + } else if (xdes_get_state(descr, mtr) == XDES_FREE + && (!init_mtr + || ((reserved - used < reserved / FSEG_FILLFACTOR) + && used >= FSEG_FRAG_LIMIT))) { /* 2. We allocate the free extent from space and can take ========================================================= @@ -2408,8 +2446,20 @@ fseg_alloc_free_page_low( /* Try to fill the segment free list */ fseg_fill_free_list(seg_inode, space, hint + FSP_EXTENT_SIZE, mtr); - ret_page = hint; + goto take_hinted_page; /*-----------------------------------------------------------*/ + } else if (!init_mtr) { + ut_a(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + fsp_alloc_from_free_frag(space_header, descr, + hint % FSP_EXTENT_SIZE, mtr); + ret_page = hint; + ret_descr = NULL; + + /* Put the page in the fragment page array of the segment */ + n = fseg_find_free_frag_page_slot(seg_inode, mtr); + ut_a(n != FIL_NULL); + fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); + goto got_hinted_page; } else if ((direction != FSP_NO_DIR) && ((reserved - used) < reserved / FSEG_FILLFACTOR) && (used >= FSEG_FRAG_LIMIT) @@ -2467,11 +2517,9 @@ fseg_alloc_free_page_low( } else if (used < FSEG_FRAG_LIMIT) { /* 6. We allocate an individual page from the space ===================================================*/ - ret_page = fsp_alloc_free_page(space, hint, mtr); + ret_page = fsp_alloc_free_page(space, hint, mtr, init_mtr); ret_descr = NULL; - frag_page_allocated = TRUE; - if (ret_page != FIL_NULL) { /* Put the page in the fragment page array of the segment */ @@ -2481,6 +2529,10 @@ fseg_alloc_free_page_low( fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(ret_page); /*-----------------------------------------------------------*/ } else { /* 7. We allocate a new extent and take its first page @@ -2527,22 +2579,31 @@ fseg_alloc_free_page_low( } } - if (!frag_page_allocated) { +got_hinted_page: + { /* Initialize the allocated page to buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read */ + mtr_t* block_mtr = init_mtr ? init_mtr : mtr; - page = buf_page_create(space, ret_page, mtr); + page = buf_page_create(space, ret_page, block_mtr); - ut_a(page == buf_page_get(space, ret_page, RW_X_LATCH, mtr)); + ut_a(page == buf_page_get(space, ret_page, RW_X_LATCH, + block_mtr)); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_FSP_PAGE); #endif /* UNIV_SYNC_DEBUG */ - /* The prior contents of the page should be ignored */ - fsp_init_file_page(page, mtr); + if (init_mtr) { + /* The prior contents of the page should be ignored */ + fsp_init_file_page(page, init_mtr); + } + } + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { /* At this point we know the extent and the page offset. The extent is still in the appropriate list (FSEG_NOT_FULL or FSEG_FREE), and the page is not yet marked as used. */ @@ -2554,8 +2615,6 @@ fseg_alloc_free_page_low( fseg_mark_page_used(seg_inode, space, ret_page, mtr); } - buf_reset_check_index_page_at_flush(space, ret_page); - return(ret_page); } @@ -2569,7 +2628,7 @@ fseg_alloc_free_page_general( /*=========================*/ /* out: allocated page offset, FIL_NULL if no page could be allocated */ - fseg_header_t* seg_header,/* in: segment header */ + fseg_header_t* seg_header,/* in/out: segment header */ ulint hint, /* in: hint of which page would be desirable */ byte direction,/* in: if the new page is needed because of an index page split, and records are @@ -2581,7 +2640,11 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr) /* in: mtr handle */ + mtr_t* mtr, /* in/out: mini-transaction handle */ + mtr_t* init_mtr)/* in/out: mtr or another mini-transaction + in which the page should be initialized, + or NULL if this is a "fake allocation" of + a page that was previously freed in mtr */ { fseg_inode_t* inode; ulint space; @@ -2619,7 +2682,8 @@ fseg_alloc_free_page_general( } page_no = fseg_alloc_free_page_low(buf_frame_get_space_id(inode), - inode, hint, direction, mtr); + inode, hint, direction, + mtr, init_mtr); if (!has_done_reservation) { fil_space_release_free_extents(space, n_reserved); } @@ -2647,7 +2711,7 @@ fseg_alloc_free_page( mtr_t* mtr) /* in: mtr handle */ { return(fseg_alloc_free_page_general(seg_header, hint, direction, - FALSE, mtr)); + FALSE, mtr, mtr)); } /************************************************************************** diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 269fa355558..3988019589d 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -379,7 +379,11 @@ btr_page_alloc( page split is made */ ulint level, /* in: level where the page is placed in the tree */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr, /* in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr); /* in/out: mini-transaction + for x-latching and initializing + the page */ /****************************************************************** Frees a file page used in an index tree. NOTE: cannot free field external storage pages because the page must contain info on its level. */ @@ -402,6 +406,31 @@ btr_page_free_low( page_t* page, /* in: page to be freed, x-latched */ ulint level, /* in: page level */ mtr_t* mtr); /* in: mtr */ +/**************************************************************//** +Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. +For invoking btr_store_big_rec_extern_fields() after an update, +we must temporarily mark freed clustered index pages allocated, so +that off-page columns will not be allocated from them. Between the +btr_store_big_rec_extern_fields() and mtr_commit() we have to +mark the pages free again, so that no pages will be leaked. */ + +void +btr_mark_freed_leaves( +/*==================*/ + dict_index_t* index, /* in/out: clustered index */ + mtr_t* mtr, /* in/out: mini-transaction */ + ibool nonfree);/* in: TRUE=mark nonfree, FALSE=mark freed */ +#ifdef UNIV_DEBUG +/**************************************************************//** +Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. +See btr_mark_freed_leaves(). */ + +ibool +btr_freed_leaves_validate( +/*======================*/ + /* out: TRUE if valid */ + mtr_t* mtr); /* in: mini-transaction */ +#endif /* UNIV_DEBUG */ #ifdef UNIV_BTR_PRINT /***************************************************************** Prints size info of a B-tree. */ diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index c068d8d3318..c2bf84ef9cb 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -252,15 +252,6 @@ btr_cur_pessimistic_update( updates */ que_thr_t* thr, /* in: query thread */ mtr_t* mtr); /* in: mtr */ -/***************************************************************** -Commits and restarts a mini-transaction so that it will retain an -x-lock on index->lock and the cursor page. */ - -void -btr_cur_mtr_commit_and_start( -/*=========================*/ - btr_cur_t* cursor, /* in: cursor */ - mtr_t* mtr); /* in/out: mini-transaction */ /*************************************************************** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -471,6 +462,11 @@ btr_store_big_rec_extern_fields( this function returns */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ + mtr_t* alloc_mtr, /* in/out: in an insert, NULL; + in an update, local_mtr for + allocating BLOB pages and + updating BLOB pointers; alloc_mtr + must not have freed any leaf pages */ mtr_t* local_mtr); /* in: mtr containing the latch to rec and to the tree */ /*********************************************************************** diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 7479ce9cbf0..87b2f6172de 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -294,15 +294,6 @@ buf_page_peek_block( ulint space, /* in: space id */ ulint offset);/* in: page number */ /************************************************************************ -Resets the check_index_page_at_flush field of a page if found in the buffer -pool. */ - -void -buf_reset_check_index_page_at_flush( -/*================================*/ - ulint space, /* in: space id */ - ulint offset);/* in: page number */ -/************************************************************************ Sets file_page_was_freed TRUE if the page is found in the buffer pool. This function should be called when we free a file page and want the debug version to check that it is not accessed any more unless diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 17bfbeec2c1..4c58d6075e6 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -167,7 +167,7 @@ fseg_alloc_free_page_general( /*=========================*/ /* out: allocated page offset, FIL_NULL if no page could be allocated */ - fseg_header_t* seg_header,/* in: segment header */ + fseg_header_t* seg_header,/* in/out: segment header */ ulint hint, /* in: hint of which page would be desirable */ byte direction,/* in: if the new page is needed because of an index page split, and records are @@ -179,7 +179,11 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr); /* in: mtr handle */ + mtr_t* mtr, /* in/out: mini-transaction */ + mtr_t* init_mtr);/* in/out: mtr or another mini-transaction + in which the page should be initialized, + or NULL if this is a "fake allocation" of + a page that was previously freed in mtr */ /************************************************************************** Reserves free pages from a tablespace. All mini-transactions which may use several pages from the tablespace should call this function beforehand diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index a6e2976830b..a0a51dbbd17 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -36,6 +36,8 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ #define MTR_MEMO_MODIFY 54 #define MTR_MEMO_S_LOCK 55 #define MTR_MEMO_X_LOCK 56 +/* The mini-transaction freed a clustered index leaf page. */ +#define MTR_MEMO_FREE_CLUST_LEAF 57 /* Log item types: we have made them to be of the type 'byte' for the compiler to warn if val and type parameters are switched @@ -325,9 +327,12 @@ struct mtr_struct{ ulint state; /* MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */ dyn_array_t memo; /* memo stack for locks etc. */ dyn_array_t log; /* mini-transaction log */ - ibool modifications; + unsigned modifications:1; /* TRUE if the mtr made modifications to buffer pool pages */ + unsigned freed_clust_leaf:1; + /* TRUE if MTR_MEMO_FREE_CLUST_LEAF + was logged in the mini-transaction */ ulint n_log_recs; /* count of how many page initial log records have been written to the mtr log */ diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index 81eec3bfc92..6b4cacf0766 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -26,6 +26,7 @@ mtr_start( mtr->log_mode = MTR_LOG_ALL; mtr->modifications = FALSE; + mtr->freed_clust_leaf = FALSE; mtr->n_log_recs = 0; #ifdef UNIV_DEBUG @@ -50,7 +51,8 @@ mtr_memo_push( ut_ad(object); ut_ad(type >= MTR_MEMO_PAGE_S_FIX); - ut_ad(type <= MTR_MEMO_X_LOCK); + ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf); ut_ad(mtr); ut_ad(mtr->magic_n == MTR_MAGIC_N); diff --git a/storage/innobase/mtr/mtr0mtr.c b/storage/innobase/mtr/mtr0mtr.c index 365fa15878a..a11e20ca661 100644 --- a/storage/innobase/mtr/mtr0mtr.c +++ b/storage/innobase/mtr/mtr0mtr.c @@ -53,17 +53,13 @@ mtr_memo_slot_release( buf_page_release((buf_block_t*)object, type, mtr); } else if (type == MTR_MEMO_S_LOCK) { rw_lock_s_unlock((rw_lock_t*)object); -#ifdef UNIV_DEBUG - } else if (type == MTR_MEMO_X_LOCK) { - rw_lock_x_unlock((rw_lock_t*)object); - } else { - ut_ad(type == MTR_MEMO_MODIFY); + } else if (type != MTR_MEMO_X_LOCK) { + ut_ad(type == MTR_MEMO_MODIFY + || type == MTR_MEMO_FREE_CLUST_LEAF); ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX)); -#else } else { rw_lock_x_unlock((rw_lock_t*)object); -#endif } } diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c index 7ff443a11ad..6366beb6b47 100644 --- a/storage/innobase/row/row0ins.c +++ b/storage/innobase/row/row0ins.c @@ -2089,15 +2089,20 @@ row_ins_index_entry_low( if (big_rec) { ut_a(err == DB_SUCCESS); /* Write out the externally stored - columns while still x-latching - index->lock and block->lock. We have - to mtr_commit(mtr) first, so that the - redo log will be written in the - correct order. Otherwise, we would run - into trouble on crash recovery if mtr - freed B-tree pages on which some of - the big_rec fields will be written. */ - btr_cur_mtr_commit_and_start(&cursor, &mtr); + columns, but allocate the pages and + write the pointers using the + mini-transaction of the record update. + If any pages were freed in the update, + temporarily mark them allocated so + that off-page columns will not + overwrite them. We must do this, + because we will write the redo log for + the BLOB writes before writing the + redo log for the record update. Thus, + redo log application at crash recovery + will see BLOBs being written to free pages. */ + + btr_mark_freed_leaves(index, &mtr, TRUE); rec = btr_cur_get_rec(&cursor); offsets = rec_get_offsets(rec, index, offsets, @@ -2105,7 +2110,8 @@ row_ins_index_entry_low( &heap); err = btr_store_big_rec_extern_fields( - index, rec, offsets, big_rec, &mtr); + index, rec, offsets, big_rec, + &mtr, &mtr); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if @@ -2118,6 +2124,9 @@ row_ins_index_entry_low( undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); + /* Free the pages again + in order to avoid a leak. */ + btr_mark_freed_leaves(index, &mtr, FALSE); goto stored_big_rec; } } else { @@ -2165,7 +2174,8 @@ function_exit: ULINT_UNDEFINED, &heap); err = btr_store_big_rec_extern_fields(index, rec, - offsets, big_rec, &mtr); + offsets, big_rec, + NULL, &mtr); stored_big_rec: if (modify) { dtuple_big_rec_free(big_rec); diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c index 171039e34ac..ccb3c1f7781 100644 --- a/storage/innobase/row/row0row.c +++ b/storage/innobase/row/row0row.c @@ -212,23 +212,27 @@ row_build( } #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - /* This condition can occur during crash recovery before - trx_rollback_or_clean_all_without_sess() has completed - execution. - - This condition is possible if the server crashed - during an insert or update before - btr_store_big_rec_extern_fields() did mtr_commit() all - BLOB pointers to the clustered index record. - - If the record contains a null BLOB pointer, look up the - transaction that holds the implicit lock on this record, and - assert that it is active. (In this version of InnoDB, we - cannot assert that it was recovered, because there is no - trx->is_recovered field.) */ - - ut_a(!rec_offs_any_null_extern(rec, offsets) - || trx_assert_active(row_get_rec_trx_id(rec, index, offsets))); + if (rec_offs_any_null_extern(rec, offsets)) { + /* This condition can occur during crash recovery + before trx_rollback_or_clean_all_without_sess() has + completed execution. + + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. + + If the record contains a null BLOB pointer, look up the + transaction that holds the implicit lock on this record, and + assert that it is active. (In this version of InnoDB, we + cannot assert that it was recovered, because there is no + trx->is_recovered field.) */ + + ut_a(trx_assert_active( + row_get_rec_trx_id(rec, index, offsets))); + ut_a(trx_undo_roll_ptr_is_insert( + row_get_rec_roll_ptr(rec, index, offsets))); + } #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c index 694b00ea265..58739edfd98 100644 --- a/storage/innobase/row/row0upd.c +++ b/storage/innobase/row/row0upd.c @@ -1591,21 +1591,22 @@ row_upd_clust_rec( *offsets_ = (sizeof offsets_) / sizeof *offsets_; ut_a(err == DB_SUCCESS); - /* Write out the externally stored columns while still - x-latching index->lock and block->lock. We have to - mtr_commit(mtr) first, so that the redo log will be - written in the correct order. Otherwise, we would run - into trouble on crash recovery if mtr freed B-tree - pages on which some of the big_rec fields will be - written. */ - btr_cur_mtr_commit_and_start(btr_cur, mtr); - + /* Write out the externally stored columns, but + allocate the pages and write the pointers using the + mini-transaction of the record update. If any pages + were freed in the update, temporarily mark them + allocated so that off-page columns will not overwrite + them. We must do this, because we write the redo log + for the BLOB writes before writing the redo log for + the record update. */ + + btr_mark_freed_leaves(index, mtr, TRUE); rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - big_rec, mtr); + big_rec, mtr, mtr); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -1618,6 +1619,8 @@ row_upd_clust_rec( to the undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); + /* Free the pages again in order to avoid a leak. */ + btr_mark_freed_leaves(index, mtr, FALSE); } mtr_commit(mtr); diff --git a/storage/innobase/trx/trx0undo.c b/storage/innobase/trx/trx0undo.c index 329565943c8..ce09862f317 100644 --- a/storage/innobase/trx/trx0undo.c +++ b/storage/innobase/trx/trx0undo.c @@ -864,7 +864,7 @@ trx_undo_add_page( page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, undo->top_page_no + 1, FSP_UP, - TRUE, mtr); + TRUE, mtr, mtr); fil_space_release_free_extents(undo->space, n_reserved); diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 0b90b5729d5..96b6a47085a 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,17 @@ +2011-08-29 The InnoDB Team + + * btr/btr0btr.c, btr/btr0cur.c, fsp/fsp0fsp.c, + include/btr0btr.h, include/btr0cur.h, include/fsp0fsp.h, + include/mtr0mtr.h, include/mtr0mtr.ic, mtr/mtr0mtr.c, + row/row0ins.c, row/row0row.c, row/row0upd.c, trx/trx0undo.c: + Fix Bug#12704861 Corruption after a crash during BLOB update + and other regressions from the fix of Bug#12612184 + +2011-08-23 The InnoDB Team + + * include/trx0undo.h, trx/trx0rec.c, trx/trx0undo.c: + Fix Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE + 2011-08-15 The InnoDB Team * btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, btr/btr0sea.c, diff --git a/storage/innodb_plugin/btr/btr0btr.c b/storage/innodb_plugin/btr/btr0btr.c index 5e6724bbd54..71e1599d19e 100644 --- a/storage/innodb_plugin/btr/btr0btr.c +++ b/storage/innodb_plugin/btr/btr0btr.c @@ -906,28 +906,29 @@ btr_page_alloc_for_ibuf( /**************************************************************//** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! -@return new allocated block, x-latched; NULL if out of space */ -UNIV_INTERN -buf_block_t* -btr_page_alloc( -/*===========*/ +@return allocated page number, FIL_NULL if out of space */ +static __attribute__((nonnull(1,5), warn_unused_result)) +ulint +btr_page_alloc_low( +/*===============*/ dict_index_t* index, /*!< in: index */ ulint hint_page_no, /*!< in: hint of a good page */ byte file_direction, /*!< in: direction where a possible page split is made */ ulint level, /*!< in: level where the page is placed in the tree */ - mtr_t* mtr) /*!< in: mtr */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + in which the page should be + initialized (may be the same + as mtr), or NULL if it should + not be initialized (the page + at hint was previously freed + in mtr) */ { fseg_header_t* seg_header; page_t* root; - buf_block_t* new_block; - ulint new_page_no; - - if (dict_index_is_ibuf(index)) { - - return(btr_page_alloc_for_ibuf(index, mtr)); - } root = btr_root_get(index, mtr); @@ -941,8 +942,42 @@ btr_page_alloc( reservation for free extents, and thus we know that a page can be allocated: */ - new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, - file_direction, TRUE, mtr); + return(fseg_alloc_free_page_general( + seg_header, hint_page_no, file_direction, + TRUE, mtr, init_mtr)); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@return new allocated block, x-latched; NULL if out of space */ +UNIV_INTERN +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + ulint hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ +{ + buf_block_t* new_block; + ulint new_page_no; + + if (dict_index_is_ibuf(index)) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + new_page_no = btr_page_alloc_low( + index, hint_page_no, file_direction, level, mtr, init_mtr); + if (new_page_no == FIL_NULL) { return(NULL); @@ -950,9 +985,16 @@ btr_page_alloc( new_block = buf_page_get(dict_index_get_space(index), dict_table_zip_size(index->table), - new_page_no, RW_X_LATCH, mtr); + new_page_no, RW_X_LATCH, init_mtr); buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + if (mtr->freed_clust_leaf) { + mtr_memo_release(mtr, new_block, MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(!mtr_memo_contains(mtr, new_block, + MTR_MEMO_FREE_CLUST_LEAF)); + } + + ut_ad(btr_freed_leaves_validate(mtr)); return(new_block); } @@ -1065,6 +1107,15 @@ btr_page_free_low( fseg_free_page(seg_header, buf_block_get_space(block), buf_block_get_page_no(block), mtr); + + /* The page was marked free in the allocation bitmap, but it + should remain buffer-fixed until mtr_commit(mtr) or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* TODO: Discard any operations on the page from the redo log + and remove the block from the flush list and the buffer pool. + This would free up buffer pool earlier and reduce writes to + both the tablespace and the redo log. */ } /**************************************************************//** @@ -1078,14 +1129,141 @@ btr_page_free( buf_block_t* block, /*!< in: block to be freed, x-latched */ mtr_t* mtr) /*!< in: mtr */ { - ulint level; - - level = btr_page_get_level(buf_block_get_frame(block), mtr); + const page_t* page = buf_block_get_frame(block); + ulint level = btr_page_get_level(page, mtr); + ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX); btr_page_free_low(index, block, level, mtr); + + /* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */ + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + if (level == 0 && dict_index_is_clust(index)) { + /* We may have to call btr_mark_freed_leaves() to + temporarily mark the block nonfree for invoking + btr_store_big_rec_extern_fields_func() after an + update. Remember that the block was freed. */ + mtr->freed_clust_leaf = TRUE; + mtr_memo_push(mtr, block, MTR_MEMO_FREE_CLUST_LEAF); + } + + ut_ad(btr_freed_leaves_validate(mtr)); } /**************************************************************//** +Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. +For invoking btr_store_big_rec_extern_fields() after an update, +we must temporarily mark freed clustered index pages allocated, so +that off-page columns will not be allocated from them. Between the +btr_store_big_rec_extern_fields() and mtr_commit() we have to +mark the pages free again, so that no pages will be leaked. */ +UNIV_INTERN +void +btr_mark_freed_leaves( +/*==================*/ + dict_index_t* index, /*!< in/out: clustered index */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ibool nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */ +{ + /* This is loosely based on mtr_memo_release(). */ + + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + if (!mtr->freed_clust_leaf) { + return; + } + + offset = dyn_array_get_data_size(&mtr->memo); + + while (offset > 0) { + mtr_memo_slot_t* slot; + buf_block_t* block; + + offset -= sizeof *slot; + + slot = dyn_array_get_element(&mtr->memo, offset); + + if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { + continue; + } + + /* Because btr_page_alloc() does invoke + mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all + blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the + memo must still be clustered index leaf tree pages. */ + block = slot->object; + ut_a(buf_block_get_space(block) + == dict_index_get_space(index)); + ut_a(fil_page_get_type(buf_block_get_frame(block)) + == FIL_PAGE_INDEX); + ut_a(page_is_leaf(buf_block_get_frame(block))); + + if (nonfree) { + /* Allocate the same page again. */ + ulint page_no; + page_no = btr_page_alloc_low( + index, buf_block_get_page_no(block), + FSP_NO_DIR, 0, mtr, NULL); + ut_a(page_no == buf_block_get_page_no(block)); + } else { + /* Assert that the page is allocated and free it. */ + btr_page_free_low(index, block, 0, mtr); + } + } + + ut_ad(btr_freed_leaves_validate(mtr)); +} + +#ifdef UNIV_DEBUG +/**************************************************************//** +Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. +@see btr_mark_freed_leaves() +@return TRUE */ +UNIV_INTERN +ibool +btr_freed_leaves_validate( +/*======================*/ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ulint offset; + + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + offset = dyn_array_get_data_size(&mtr->memo); + + while (offset > 0) { + const mtr_memo_slot_t* slot; + const buf_block_t* block; + + offset -= sizeof *slot; + + slot = dyn_array_get_element(&mtr->memo, offset); + + if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { + continue; + } + + ut_a(mtr->freed_clust_leaf); + /* Because btr_page_alloc() does invoke + mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all + blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the + memo must still be clustered index leaf tree pages. */ + block = slot->object; + ut_a(fil_page_get_type(buf_block_get_frame(block)) + == FIL_PAGE_INDEX); + ut_a(page_is_leaf(buf_block_get_frame(block))); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**************************************************************//** Sets the child node file address in a node pointer. */ UNIV_INLINE void @@ -1806,7 +1984,7 @@ btr_root_raise_and_insert( level = btr_page_get_level(root, mtr); - new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr); + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr); new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); ut_a(!new_page_zip == !root_page_zip); @@ -2542,7 +2720,7 @@ func_start: /* 2. Allocate a new page to the index */ new_block = btr_page_alloc(cursor->index, hint_page_no, direction, - btr_page_get_level(page, mtr), mtr); + btr_page_get_level(page, mtr), mtr, mtr); new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index 5cefa51bcd5..f1c2c2ddd5e 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -2414,39 +2414,6 @@ return_after_reservations: return(err); } -/**************************************************************//** -Commits and restarts a mini-transaction so that it will retain an -x-lock on index->lock and the cursor page. */ -UNIV_INTERN -void -btr_cur_mtr_commit_and_start( -/*=========================*/ - btr_cur_t* cursor, /*!< in: cursor */ - mtr_t* mtr) /*!< in/out: mini-transaction */ -{ - buf_block_t* block; - - block = btr_cur_get_block(cursor); - - ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), - MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - /* Keep the locks across the mtr_commit(mtr). */ - rw_lock_x_lock(dict_index_get_lock(cursor->index)); - rw_lock_x_lock(&block->lock); - mutex_enter(&block->mutex); - buf_block_buf_fix_inc(block, __FILE__, __LINE__); - mutex_exit(&block->mutex); - /* Write out the redo log. */ - mtr_commit(mtr); - mtr_start(mtr); - /* Reassociate the locks with the mini-transaction. - They will be released on mtr_commit(mtr). */ - mtr_memo_push(mtr, dict_index_get_lock(cursor->index), - MTR_MEMO_X_LOCK); - mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); -} - /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /****************************************************************//** @@ -3901,6 +3868,9 @@ btr_store_big_rec_extern_fields_func( the "external storage" flags in offsets will not correspond to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + #ifdef UNIV_DEBUG mtr_t* local_mtr, /*!< in: mtr containing the latch to rec and to the tree */ @@ -3909,9 +3879,11 @@ btr_store_big_rec_extern_fields_func( ibool update_in_place,/*! in: TRUE if the record is updated in place (not delete+insert) */ #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - const big_rec_t*big_rec_vec) /*!< in: vector containing fields - to be stored externally */ - + mtr_t* alloc_mtr) /*!< in/out: in an insert, NULL; + in an update, local_mtr for + allocating BLOB pages and + updating BLOB pointers; alloc_mtr + must not have freed any leaf pages */ { ulint rec_page_no; byte* field_ref; @@ -3930,6 +3902,9 @@ btr_store_big_rec_extern_fields_func( ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(rec_offs_any_extern(offsets)); + ut_ad(local_mtr); + ut_ad(!alloc_mtr || alloc_mtr == local_mtr); + ut_ad(!update_in_place || alloc_mtr); ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX)); @@ -3945,6 +3920,25 @@ btr_store_big_rec_extern_fields_func( rec_page_no = buf_block_get_page_no(rec_block); ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); + if (alloc_mtr) { + /* Because alloc_mtr will be committed after + mtr, it is possible that the tablespace has been + extended when the B-tree record was updated or + inserted, or it will be extended while allocating + pages for big_rec. + + TODO: In mtr (not alloc_mtr), write a redo log record + about extending the tablespace to its current size, + and remember the current size. Whenever the tablespace + grows as pages are allocated, write further redo log + records to mtr. (Currently tablespace extension is not + covered by the redo log. If it were, the record would + only be written to alloc_mtr, which is committed after + mtr.) */ + } else { + alloc_mtr = &mtr; + } + if (UNIV_LIKELY_NULL(page_zip)) { int err; @@ -4021,7 +4015,7 @@ btr_store_big_rec_extern_fields_func( } block = btr_page_alloc(index, hint_page_no, - FSP_NO_DIR, 0, &mtr); + FSP_NO_DIR, 0, alloc_mtr, &mtr); if (UNIV_UNLIKELY(block == NULL)) { mtr_commit(&mtr); @@ -4148,11 +4142,15 @@ btr_store_big_rec_extern_fields_func( goto next_zip_page; } - rec_block = buf_page_get(space_id, zip_size, - rec_page_no, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level(rec_block, - SYNC_NO_ORDER_CHECK); + if (alloc_mtr == &mtr) { + rec_block = buf_page_get( + space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level( + rec_block, + SYNC_NO_ORDER_CHECK); + } if (err == Z_STREAM_END) { mach_write_to_4(field_ref @@ -4186,7 +4184,8 @@ btr_store_big_rec_extern_fields_func( page_zip_write_blob_ptr( page_zip, rec, index, offsets, - big_rec_vec->fields[i].field_no, &mtr); + big_rec_vec->fields[i].field_no, + alloc_mtr); next_zip_page: prev_page_no = page_no; @@ -4231,19 +4230,23 @@ next_zip_page: extern_len -= store_len; - rec_block = buf_page_get(space_id, zip_size, - rec_page_no, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level(rec_block, - SYNC_NO_ORDER_CHECK); + if (alloc_mtr == &mtr) { + rec_block = buf_page_get( + space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level( + rec_block, + SYNC_NO_ORDER_CHECK); + } mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4, big_rec_vec->fields[i].len - extern_len, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); if (prev_page_no == FIL_NULL) { btr_blob_dbg_add_blob( @@ -4253,18 +4256,19 @@ next_zip_page: mlog_write_ulint(field_ref + BTR_EXTERN_SPACE_ID, - space_id, - MLOG_4BYTES, &mtr); + space_id, MLOG_4BYTES, + alloc_mtr); mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO, - page_no, - MLOG_4BYTES, &mtr); + page_no, MLOG_4BYTES, + alloc_mtr); mlog_write_ulint(field_ref + BTR_EXTERN_OFFSET, FIL_PAGE_DATA, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, + alloc_mtr); } prev_page_no = page_no; diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index cd1461d22b7..47300627acc 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -1175,29 +1175,6 @@ buf_page_set_accessed_make_young( } /********************************************************************//** -Resets the check_index_page_at_flush field of a page if found in the buffer -pool. */ -UNIV_INTERN -void -buf_reset_check_index_page_at_flush( -/*================================*/ - ulint space, /*!< in: space id */ - ulint offset) /*!< in: page number */ -{ - buf_block_t* block; - - buf_pool_mutex_enter(); - - block = (buf_block_t*) buf_page_hash_get(space, offset); - - if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) { - block->check_index_page_at_flush = FALSE; - } - - buf_pool_mutex_exit(); -} - -/********************************************************************//** Returns the current state of is_hashed of a page. FALSE if the page is not in the pool. NOTE that this operation does not fix the page in the pool if it is found there. diff --git a/storage/innodb_plugin/fsp/fsp0fsp.c b/storage/innodb_plugin/fsp/fsp0fsp.c index d091a14c474..19846b63d5b 100644 --- a/storage/innodb_plugin/fsp/fsp0fsp.c +++ b/storage/innodb_plugin/fsp/fsp0fsp.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -312,8 +312,9 @@ fsp_fill_free_list( descriptor page and ibuf bitmap page; then we do not allocate more extents */ ulint space, /*!< in: space */ - fsp_header_t* header, /*!< in: space header */ - mtr_t* mtr); /*!< in: mtr */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /**********************************************************************//** Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space @@ -326,14 +327,20 @@ fseg_alloc_free_page_low( ulint space, /*!< in: space */ ulint zip_size,/*!< in: compressed page size in bytes or 0 for uncompressed pages */ - fseg_inode_t* seg_inode, /*!< in: segment inode */ + fseg_inode_t* seg_inode, /*!< in/out: segment inode */ ulint hint, /*!< in: hint of which page would be desirable */ byte direction, /*!< in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr); /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr), or NULL if it + should not be initialized (the page at hint + was previously freed in mtr) */ + __attribute__((warn_unused_result, nonnull(3,6))); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** @@ -701,17 +708,18 @@ list, if not free limit == space size. This adding is necessary to make the descriptor defined, as they are uninitialized above the free limit. @return pointer to the extent descriptor, NULL if the page does not exist in the space or if the offset exceeds the free limit */ -UNIV_INLINE +UNIV_INLINE __attribute__((nonnull, warn_unused_result)) xdes_t* xdes_get_descriptor_with_space_hdr( /*===============================*/ - fsp_header_t* sp_header,/*!< in/out: space header, x-latched */ - ulint space, /*!< in: space id */ - ulint offset, /*!< in: page offset; - if equal to the free limit, - we try to add new extents to - the space free list */ - mtr_t* mtr) /*!< in: mtr handle */ + fsp_header_t* sp_header, /*!< in/out: space header, x-latched + in mtr */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page offset; if equal + to the free limit, we try to + add new extents to the space + free list */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint limit; ulint size; @@ -719,11 +727,9 @@ xdes_get_descriptor_with_space_hdr( ulint descr_page_no; page_t* descr_page; - ut_ad(mtr); ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL), MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_S_FIX) - || mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX)); ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET); /* Read free limit and space size */ limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT); @@ -773,7 +779,7 @@ is necessary to make the descriptor defined, as they are uninitialized above the free limit. @return pointer to the extent descriptor, NULL if the page does not exist in the space or if the offset exceeds the free limit */ -static +static __attribute__((nonnull, warn_unused_result)) xdes_t* xdes_get_descriptor( /*================*/ @@ -782,7 +788,7 @@ xdes_get_descriptor( or 0 for uncompressed pages */ ulint offset, /*!< in: page offset; if equal to the free limit, we try to add new extents to the space free list */ - mtr_t* mtr) /*!< in: mtr handle */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { buf_block_t* block; fsp_header_t* sp_header; @@ -1160,14 +1166,14 @@ fsp_header_get_tablespace_size(void) Tries to extend a single-table tablespace so that a page would fit in the data file. @return TRUE if success */ -static +static __attribute__((nonnull, warn_unused_result)) ibool fsp_try_extend_data_file_with_pages( /*================================*/ ulint space, /*!< in: space */ ulint page_no, /*!< in: page number */ - fsp_header_t* header, /*!< in: space header */ - mtr_t* mtr) /*!< in: mtr */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ibool success; ulint actual_size; @@ -1192,7 +1198,7 @@ fsp_try_extend_data_file_with_pages( /***********************************************************************//** Tries to extend the last data file of a tablespace if it is auto-extending. @return FALSE if not auto-extending */ -static +static __attribute__((nonnull)) ibool fsp_try_extend_data_file( /*=====================*/ @@ -1202,8 +1208,8 @@ fsp_try_extend_data_file( the actual file size rounded down to megabyte */ ulint space, /*!< in: space */ - fsp_header_t* header, /*!< in: space header */ - mtr_t* mtr) /*!< in: mtr */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint size; ulint zip_size; @@ -1339,7 +1345,7 @@ fsp_fill_free_list( then we do not allocate more extents */ ulint space, /*!< in: space */ fsp_header_t* header, /*!< in/out: space header */ - mtr_t* mtr) /*!< in: mtr */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint limit; ulint size; @@ -1538,9 +1544,46 @@ fsp_alloc_free_extent( } /**********************************************************************//** +Allocates a single free page from a space. */ +static __attribute__((nonnull)) +void +fsp_alloc_from_free_frag( +/*=====================*/ + fsp_header_t* header, /*!< in/out: tablespace header */ + xdes_t* descr, /*!< in/out: extent descriptor */ + ulint bit, /*!< in: slot to allocate in the extent */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint frag_n_used; + + ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr)); + xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr); + + /* Update the FRAG_N_USED field */ + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + frag_n_used++; + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, + mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FULL_FRAG, mtr); + + flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, + mtr); + } +} + +/**********************************************************************//** Allocates a single free page from a space. The page is marked as used. @return the page offset, FIL_NULL if no page could be allocated */ -static +static __attribute__((nonnull, warn_unused_result)) ulint fsp_alloc_free_page( /*================*/ @@ -1548,19 +1591,22 @@ fsp_alloc_free_page( ulint zip_size,/*!< in: compressed page size in bytes or 0 for uncompressed pages */ ulint hint, /*!< in: hint of which page would be desirable */ - mtr_t* mtr) /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr) */ { fsp_header_t* header; fil_addr_t first; xdes_t* descr; buf_block_t* block; ulint free; - ulint frag_n_used; ulint page_no; ulint space_size; ibool success; ut_ad(mtr); + ut_ad(init_mtr); header = fsp_get_space_header(space, zip_size, mtr); @@ -1642,38 +1688,19 @@ fsp_alloc_free_page( } } - xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); - - /* Update the FRAG_N_USED field */ - frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, - mtr); - frag_n_used++; - mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, - mtr); - if (xdes_is_full(descr, mtr)) { - /* The fragment is full: move it to another list */ - flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, - mtr); - xdes_set_state(descr, XDES_FULL_FRAG, mtr); - - flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, - mtr); - mlog_write_ulint(header + FSP_FRAG_N_USED, - frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, - mtr); - } + fsp_alloc_from_free_frag(header, descr, free, mtr); /* Initialize the allocated page to the buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read. */ - buf_page_create(space, page_no, zip_size, mtr); + buf_page_create(space, page_no, zip_size, init_mtr); - block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, init_mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); /* Prior contents of the page should be ignored */ - fsp_init_file_page(block, mtr); + fsp_init_file_page(block, init_mtr); return(page_no); } @@ -1909,7 +1936,7 @@ fsp_alloc_seg_inode_page( zip_size = dict_table_flags_to_zip_size( mach_read_from_4(FSP_SPACE_FLAGS + space_header)); - page_no = fsp_alloc_free_page(space, zip_size, 0, mtr); + page_no = fsp_alloc_free_page(space, zip_size, 0, mtr, mtr); if (page_no == FIL_NULL) { @@ -2323,7 +2350,7 @@ fseg_create_general( if (page == 0) { page = fseg_alloc_free_page_low(space, zip_size, - inode, 0, FSP_UP, mtr); + inode, 0, FSP_UP, mtr, mtr); if (page == FIL_NULL) { @@ -2572,14 +2599,19 @@ fseg_alloc_free_page_low( ulint space, /*!< in: space */ ulint zip_size,/*!< in: compressed page size in bytes or 0 for uncompressed pages */ - fseg_inode_t* seg_inode, /*!< in: segment inode */ + fseg_inode_t* seg_inode, /*!< in/out: segment inode */ ulint hint, /*!< in: hint of which page would be desirable */ byte direction, /*!< in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr) /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr), or NULL if it + should not be initialized (the page at hint + was previously freed in mtr) */ { fsp_header_t* space_header; ulint space_size; @@ -2590,7 +2622,6 @@ fseg_alloc_free_page_low( ulint ret_page; /*!< the allocated page offset, FIL_NULL if could not be allocated */ xdes_t* ret_descr; /*!< the extent of the allocated page */ - ibool frag_page_allocated = FALSE; ibool success; ulint n; @@ -2612,6 +2643,8 @@ fseg_alloc_free_page_low( if (descr == NULL) { /* Hint outside space or too high above free limit: reset hint */ + ut_a(init_mtr); + /* The file space header page is always allocated. */ hint = 0; descr = xdes_get_descriptor(space, zip_size, hint, mtr); } @@ -2623,15 +2656,20 @@ fseg_alloc_free_page_low( mtr), seg_id)) && (xdes_get_bit(descr, XDES_FREE_BIT, hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { - +take_hinted_page: /* 1. We can take the hinted page =================================*/ ret_descr = descr; ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; /*-----------------------------------------------------------*/ - } else if ((xdes_get_state(descr, mtr) == XDES_FREE) - && ((reserved - used) < reserved / FSEG_FILLFACTOR) - && (used >= FSEG_FRAG_LIMIT)) { + } else if (xdes_get_state(descr, mtr) == XDES_FREE + && (!init_mtr + || ((reserved - used < reserved / FSEG_FILLFACTOR) + && used >= FSEG_FRAG_LIMIT))) { /* 2. We allocate the free extent from space and can take ========================================================= @@ -2649,8 +2687,20 @@ fseg_alloc_free_page_low( /* Try to fill the segment free list */ fseg_fill_free_list(seg_inode, space, zip_size, hint + FSP_EXTENT_SIZE, mtr); - ret_page = hint; + goto take_hinted_page; /*-----------------------------------------------------------*/ + } else if (!init_mtr) { + ut_a(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + fsp_alloc_from_free_frag(space_header, descr, + hint % FSP_EXTENT_SIZE, mtr); + ret_page = hint; + ret_descr = NULL; + + /* Put the page in the fragment page array of the segment */ + n = fseg_find_free_frag_page_slot(seg_inode, mtr); + ut_a(n != FIL_NULL); + fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); + goto got_hinted_page; } else if ((direction != FSP_NO_DIR) && ((reserved - used) < reserved / FSEG_FILLFACTOR) && (used >= FSEG_FRAG_LIMIT) @@ -2710,11 +2760,10 @@ fseg_alloc_free_page_low( } else if (used < FSEG_FRAG_LIMIT) { /* 6. We allocate an individual page from the space ===================================================*/ - ret_page = fsp_alloc_free_page(space, zip_size, hint, mtr); + ret_page = fsp_alloc_free_page(space, zip_size, hint, + mtr, init_mtr); ret_descr = NULL; - frag_page_allocated = TRUE; - if (ret_page != FIL_NULL) { /* Put the page in the fragment page array of the segment */ @@ -2724,6 +2773,10 @@ fseg_alloc_free_page_low( fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(ret_page); /*-----------------------------------------------------------*/ } else { /* 7. We allocate a new extent and take its first page @@ -2771,26 +2824,34 @@ fseg_alloc_free_page_low( } } - if (!frag_page_allocated) { +got_hinted_page: + { /* Initialize the allocated page to buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read */ buf_block_t* block; ulint zip_size = dict_table_flags_to_zip_size( mach_read_from_4(FSP_SPACE_FLAGS + space_header)); + mtr_t* block_mtr = init_mtr ? init_mtr : mtr; - block = buf_page_create(space, ret_page, zip_size, mtr); + block = buf_page_create(space, ret_page, zip_size, block_mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); if (UNIV_UNLIKELY(block != buf_page_get(space, zip_size, ret_page, RW_X_LATCH, - mtr))) { + block_mtr))) { ut_error; } - /* The prior contents of the page should be ignored */ - fsp_init_file_page(block, mtr); + if (init_mtr) { + /* The prior contents of the page should be ignored */ + fsp_init_file_page(block, init_mtr); + } + } + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { /* At this point we know the extent and the page offset. The extent is still in the appropriate list (FSEG_NOT_FULL or FSEG_FREE), and the page is not yet marked as used. */ @@ -2803,8 +2864,6 @@ fseg_alloc_free_page_low( fseg_mark_page_used(seg_inode, space, zip_size, ret_page, mtr); } - buf_reset_check_index_page_at_flush(space, ret_page); - return(ret_page); } @@ -2817,7 +2876,7 @@ UNIV_INTERN ulint fseg_alloc_free_page_general( /*=========================*/ - fseg_header_t* seg_header,/*!< in: segment header */ + fseg_header_t* seg_header,/*!< in/out: segment header */ ulint hint, /*!< in: hint of which page would be desirable */ byte direction,/*!< in: if the new page is needed because of an index page split, and records are @@ -2829,7 +2888,11 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr) /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction handle */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized, + or NULL if this is a "fake allocation" of + a page that was previously freed in mtr */ { fseg_inode_t* inode; ulint space; @@ -2871,7 +2934,8 @@ fseg_alloc_free_page_general( } page_no = fseg_alloc_free_page_low(space, zip_size, - inode, hint, direction, mtr); + inode, hint, direction, + mtr, init_mtr); if (!has_done_reservation) { fil_space_release_free_extents(space, n_reserved); } @@ -2880,28 +2944,6 @@ fseg_alloc_free_page_general( } /**********************************************************************//** -Allocates a single free page from a segment. This function implements -the intelligent allocation strategy which tries to minimize file space -fragmentation. -@return allocated page offset, FIL_NULL if no page could be allocated */ -UNIV_INTERN -ulint -fseg_alloc_free_page( -/*=================*/ - fseg_header_t* seg_header,/*!< in: segment header */ - ulint hint, /*!< in: hint of which page would be desirable */ - byte direction,/*!< in: if the new page is needed because - of an index page split, and records are - inserted there in order, into which - direction they go alphabetically: FSP_DOWN, - FSP_UP, FSP_NO_DIR */ - mtr_t* mtr) /*!< in: mtr handle */ -{ - return(fseg_alloc_free_page_general(seg_header, hint, direction, - FALSE, mtr)); -} - -/**********************************************************************//** Checks that we have at least 2 frag pages free in the first extent of a single-table tablespace, and they are also physically initialized to the data file. That is we have already extended the data file so that those pages are diff --git a/storage/innodb_plugin/include/btr0btr.h b/storage/innodb_plugin/include/btr0btr.h index c0a038dd21d..476ad29adac 100644 --- a/storage/innodb_plugin/include/btr0btr.h +++ b/storage/innodb_plugin/include/btr0btr.h @@ -557,7 +557,12 @@ btr_page_alloc( page split is made */ ulint level, /*!< in: level where the page is placed in the tree */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + __attribute__((nonnull, warn_unused_result)); /**************************************************************//** Frees a file page used in an index tree. NOTE: cannot free field external storage pages because the page must contain info on its level. */ @@ -580,6 +585,33 @@ btr_page_free_low( buf_block_t* block, /*!< in: block to be freed, x-latched */ ulint level, /*!< in: page level */ mtr_t* mtr); /*!< in: mtr */ +/**************************************************************//** +Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. +For invoking btr_store_big_rec_extern_fields() after an update, +we must temporarily mark freed clustered index pages allocated, so +that off-page columns will not be allocated from them. Between the +btr_store_big_rec_extern_fields() and mtr_commit() we have to +mark the pages free again, so that no pages will be leaked. */ +UNIV_INTERN +void +btr_mark_freed_leaves( +/*==================*/ + dict_index_t* index, /*!< in/out: clustered index */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ibool nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +/**************************************************************//** +Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. +@see btr_mark_freed_leaves() +@return TRUE */ +UNIV_INTERN +ibool +btr_freed_leaves_validate( +/*======================*/ + mtr_t* mtr) /*!< in: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ diff --git a/storage/innodb_plugin/include/btr0cur.h b/storage/innodb_plugin/include/btr0cur.h index 6094a2a6c7a..1d97c5b9452 100644 --- a/storage/innodb_plugin/include/btr0cur.h +++ b/storage/innodb_plugin/include/btr0cur.h @@ -326,16 +326,6 @@ btr_cur_pessimistic_update( que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr); /*!< in: mtr; must be committed before latching any further pages */ -/***************************************************************** -Commits and restarts a mini-transaction so that it will retain an -x-lock on index->lock and the cursor page. */ -UNIV_INTERN -void -btr_cur_mtr_commit_and_start( -/*=========================*/ - btr_cur_t* cursor, /*!< in: cursor */ - mtr_t* mtr) /*!< in/out: mini-transaction */ - __attribute__((nonnull)); /***********************************************************//** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -540,6 +530,8 @@ btr_store_big_rec_extern_fields_func( the "external storage" flags in offsets will not correspond to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ #ifdef UNIV_DEBUG mtr_t* local_mtr, /*!< in: mtr containing the latch to rec and to the tree */ @@ -548,9 +540,12 @@ btr_store_big_rec_extern_fields_func( ibool update_in_place,/*! in: TRUE if the record is updated in place (not delete+insert) */ #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - const big_rec_t*big_rec_vec) /*!< in: vector containing fields - to be stored externally */ - __attribute__((nonnull)); + mtr_t* alloc_mtr) /*!< in/out: in an insert, NULL; + in an update, local_mtr for + allocating BLOB pages and + updating BLOB pointers; alloc_mtr + must not have freed any leaf pages */ + __attribute__((nonnull(1,2,3,4,5), warn_unused_result)); /** Stores the fields in big_rec_vec to the tablespace and puts pointers to them in rec. The extern flags in rec will have to be set beforehand. @@ -559,21 +554,22 @@ file segment of the index tree. @param index in: clustered index; MUST be X-latched by mtr @param b in/out: block containing rec; MUST be X-latched by mtr @param rec in/out: clustered index record -@param offsets in: rec_get_offsets(rec, index); +@param offs in: rec_get_offsets(rec, index); the "external storage" flags in offsets will not be adjusted +@param big in: vector containing fields to be stored externally @param mtr in: mini-transaction that holds x-latch on index and b @param upd in: TRUE if the record is updated in place (not delete+insert) -@param big in: vector containing fields to be stored externally +@param rmtr in/out: in updates, the mini-transaction that holds rec @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ #ifdef UNIV_DEBUG -# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offsets,mtr,upd,big) +# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,mtr,upd,rmtr) #elif defined UNIV_BLOB_LIGHT_DEBUG -# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offsets,upd,big) +# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,upd,rmtr) #else -# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offsets,big) +# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,rmtr) #endif /*******************************************************************//** diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index 9856bfce409..557bc17d311 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -372,15 +372,6 @@ buf_page_peek( /*==========*/ ulint space, /*!< in: space id */ ulint offset);/*!< in: page number */ -/********************************************************************//** -Resets the check_index_page_at_flush field of a page if found in the buffer -pool. */ -UNIV_INTERN -void -buf_reset_check_index_page_at_flush( -/*================================*/ - ulint space, /*!< in: space id */ - ulint offset);/*!< in: page number */ #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG /********************************************************************//** Sets file_page_was_freed TRUE if the page is found in the buffer pool. diff --git a/storage/innodb_plugin/include/fsp0fsp.h b/storage/innodb_plugin/include/fsp0fsp.h index 7abd3914eda..2221380c9a2 100644 --- a/storage/innodb_plugin/include/fsp0fsp.h +++ b/storage/innodb_plugin/include/fsp0fsp.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -176,19 +176,18 @@ fseg_n_reserved_pages( Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space fragmentation. -@return the allocated page offset FIL_NULL if no page could be allocated */ -UNIV_INTERN -ulint -fseg_alloc_free_page( -/*=================*/ - fseg_header_t* seg_header, /*!< in: segment header */ - ulint hint, /*!< in: hint of which page would be desirable */ - byte direction, /*!< in: if the new page is needed because +@param[in/out] seg_header segment header +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, - FSP_UP, FSP_NO_DIR */ - mtr_t* mtr); /*!< in: mtr handle */ + FSP_UP, FSP_NO_DIR +@param[in/out] mtr mini-transaction +@return the allocated page offset FIL_NULL if no page could be allocated */ +#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \ + fseg_alloc_free_page_general(seg_header, hint, direction, \ + FALSE, mtr, mtr) /**********************************************************************//** Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space @@ -198,7 +197,7 @@ UNIV_INTERN ulint fseg_alloc_free_page_general( /*=========================*/ - fseg_header_t* seg_header,/*!< in: segment header */ + fseg_header_t* seg_header,/*!< in/out: segment header */ ulint hint, /*!< in: hint of which page would be desirable */ byte direction,/*!< in: if the new page is needed because of an index page split, and records are @@ -210,7 +209,12 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr); /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized, + or NULL if this is a "fake allocation" of + a page that was previously freed in mtr */ + __attribute__((warn_unused_result, nonnull(1,5))); /**********************************************************************//** Reserves free pages from a tablespace. All mini-transactions which may use several pages from the tablespace should call this function beforehand diff --git a/storage/innodb_plugin/include/mtr0mtr.h b/storage/innodb_plugin/include/mtr0mtr.h index bc3f1951be9..2a561131c09 100644 --- a/storage/innodb_plugin/include/mtr0mtr.h +++ b/storage/innodb_plugin/include/mtr0mtr.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -53,6 +53,8 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ #define MTR_MEMO_MODIFY 54 #define MTR_MEMO_S_LOCK 55 #define MTR_MEMO_X_LOCK 56 +/** The mini-transaction freed a clustered index leaf page. */ +#define MTR_MEMO_FREE_CLUST_LEAF 57 /** @name Log item types The log items are declared 'byte' so that the compiler can warn if val @@ -387,9 +389,12 @@ struct mtr_struct{ #endif dyn_array_t memo; /*!< memo stack for locks etc. */ dyn_array_t log; /*!< mini-transaction log */ - ibool modifications; - /* TRUE if the mtr made modifications to - buffer pool pages */ + unsigned modifications:1; + /*!< TRUE if the mini-transaction + modified buffer pool pages */ + unsigned freed_clust_leaf:1; + /*!< TRUE if MTR_MEMO_FREE_CLUST_LEAF + was logged in the mini-transaction */ ulint n_log_recs; /* count of how many page initial log records have been written to the mtr log */ diff --git a/storage/innodb_plugin/include/mtr0mtr.ic b/storage/innodb_plugin/include/mtr0mtr.ic index 18f8e87b3cf..9c0ddff9132 100644 --- a/storage/innodb_plugin/include/mtr0mtr.ic +++ b/storage/innodb_plugin/include/mtr0mtr.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -44,6 +44,7 @@ mtr_start( mtr->log_mode = MTR_LOG_ALL; mtr->modifications = FALSE; + mtr->freed_clust_leaf = FALSE; mtr->n_log_recs = 0; ut_d(mtr->state = MTR_ACTIVE); @@ -67,7 +68,8 @@ mtr_memo_push( ut_ad(object); ut_ad(type >= MTR_MEMO_PAGE_S_FIX); - ut_ad(type <= MTR_MEMO_X_LOCK); + ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf); ut_ad(mtr); ut_ad(mtr->magic_n == MTR_MAGIC_N); ut_ad(mtr->state == MTR_ACTIVE); diff --git a/storage/innodb_plugin/include/trx0undo.h b/storage/innodb_plugin/include/trx0undo.h index 4f15cd85833..c95f99d6417 100644 --- a/storage/innodb_plugin/include/trx0undo.h +++ b/storage/innodb_plugin/include/trx0undo.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -204,17 +204,51 @@ trx_undo_add_page( mtr_t* mtr); /*!< in: mtr which does not have a latch to any undo log page; the caller must have reserved the rollback segment mutex */ +/********************************************************************//** +Frees the last undo log page. +The caller must hold the rollback segment mutex. */ +UNIV_INTERN +void +trx_undo_free_last_page_func( +/*==========================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log memory copy */ + mtr_t* mtr) /*!< in/out: mini-transaction which does not + have a latch to any undo log page or which + has allocated the undo log page */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# define trx_undo_free_last_page(trx,undo,mtr) \ + trx_undo_free_last_page_func(trx,undo,mtr) +#else /* UNIV_DEBUG */ +# define trx_undo_free_last_page(trx,undo,mtr) \ + trx_undo_free_last_page_func(undo,mtr) +#endif /* UNIV_DEBUG */ + /***********************************************************************//** Truncates an undo log from the end. This function is used during a rollback to free space from an undo log. */ UNIV_INTERN void -trx_undo_truncate_end( -/*==================*/ - trx_t* trx, /*!< in: transaction whose undo log it is */ - trx_undo_t* undo, /*!< in: undo log */ - undo_no_t limit); /*!< in: all undo records with undo number +trx_undo_truncate_end_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction whose undo log it is */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log */ + undo_no_t limit) /*!< in: all undo records with undo number >= this value should be truncated */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# define trx_undo_truncate_end(trx,undo,limit) \ + trx_undo_truncate_end_func(trx,undo,limit) +#else /* UNIV_DEBUG */ +# define trx_undo_truncate_end(trx,undo,limit) \ + trx_undo_truncate_end_func(undo,limit) +#endif /* UNIV_DEBUG */ + /***********************************************************************//** Truncates an undo log from the start. This function is used during a purge operation. */ diff --git a/storage/innodb_plugin/mtr/mtr0mtr.c b/storage/innodb_plugin/mtr/mtr0mtr.c index 417e97732bb..6dd5b6eb8c3 100644 --- a/storage/innodb_plugin/mtr/mtr0mtr.c +++ b/storage/innodb_plugin/mtr/mtr0mtr.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -58,12 +58,11 @@ mtr_memo_slot_release( buf_page_release((buf_block_t*)object, type, mtr); } else if (type == MTR_MEMO_S_LOCK) { rw_lock_s_unlock((rw_lock_t*)object); -#ifdef UNIV_DEBUG } else if (type != MTR_MEMO_X_LOCK) { - ut_ad(type == MTR_MEMO_MODIFY); + ut_ad(type == MTR_MEMO_MODIFY + || type == MTR_MEMO_FREE_CLUST_LEAF); ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX)); -#endif /* UNIV_DEBUG */ } else { rw_lock_x_unlock((rw_lock_t*)object); } diff --git a/storage/innodb_plugin/row/row0ins.c b/storage/innodb_plugin/row/row0ins.c index ea43cbfb5f1..0f158cdc706 100644 --- a/storage/innodb_plugin/row/row0ins.c +++ b/storage/innodb_plugin/row/row0ins.c @@ -2094,15 +2094,20 @@ row_ins_index_entry_low( if (big_rec) { ut_a(err == DB_SUCCESS); /* Write out the externally stored - columns while still x-latching - index->lock and block->lock. We have - to mtr_commit(mtr) first, so that the - redo log will be written in the - correct order. Otherwise, we would run - into trouble on crash recovery if mtr - freed B-tree pages on which some of - the big_rec fields will be written. */ - btr_cur_mtr_commit_and_start(&cursor, &mtr); + columns, but allocate the pages and + write the pointers using the + mini-transaction of the record update. + If any pages were freed in the update, + temporarily mark them allocated so + that off-page columns will not + overwrite them. We must do this, + because we will write the redo log for + the BLOB writes before writing the + redo log for the record update. Thus, + redo log application at crash recovery + will see BLOBs being written to free pages. */ + + btr_mark_freed_leaves(index, &mtr, TRUE); rec = btr_cur_get_rec(&cursor); offsets = rec_get_offsets( @@ -2111,7 +2116,8 @@ row_ins_index_entry_low( err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(&cursor), - rec, offsets, &mtr, FALSE, big_rec); + rec, offsets, big_rec, &mtr, + FALSE, &mtr); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if @@ -2124,6 +2130,9 @@ row_ins_index_entry_low( undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); + /* Free the pages again + in order to avoid a leak. */ + btr_mark_freed_leaves(index, &mtr, FALSE); goto stored_big_rec; } } else { @@ -2165,7 +2174,7 @@ function_exit: err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(&cursor), - rec, offsets, &mtr, FALSE, big_rec); + rec, offsets, big_rec, &mtr, FALSE, NULL); stored_big_rec: if (modify) { diff --git a/storage/innodb_plugin/row/row0row.c b/storage/innodb_plugin/row/row0row.c index 9cdbbe76e04..e476ffae84e 100644 --- a/storage/innodb_plugin/row/row0row.c +++ b/storage/innodb_plugin/row/row0row.c @@ -243,19 +243,20 @@ row_build( } #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - /* This condition can occur during crash recovery before - trx_rollback_active() has completed execution. - - This condition is possible if the server crashed - during an insert or update before - btr_store_big_rec_extern_fields() did mtr_commit() all - BLOB pointers to the clustered index record. - - If the record contains a null BLOB pointer, look up the - transaction that holds the implicit lock on this record, and - assert that it was recovered (and will soon be rolled back). */ - ut_a(!rec_offs_any_null_extern(rec, offsets) - || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets))); + if (rec_offs_any_null_extern(rec, offsets)) { + /* This condition can occur during crash recovery + before trx_rollback_active() has completed execution. + + This condition is possible if the server crashed + during an insert or update-by-delete-and-insert before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the freshly inserted clustered index + record. */ + ut_a(trx_assert_recovered( + row_get_rec_trx_id(rec, index, offsets))); + ut_a(trx_undo_roll_ptr_is_insert( + row_get_rec_roll_ptr(rec, index, offsets))); + } #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { diff --git a/storage/innodb_plugin/row/row0upd.c b/storage/innodb_plugin/row/row0upd.c index b5952ff0a78..05856687015 100644 --- a/storage/innodb_plugin/row/row0upd.c +++ b/storage/innodb_plugin/row/row0upd.c @@ -1978,21 +1978,22 @@ row_upd_clust_rec( rec_offs_init(offsets_); ut_a(err == DB_SUCCESS); - /* Write out the externally stored columns while still - x-latching index->lock and block->lock. We have to - mtr_commit(mtr) first, so that the redo log will be - written in the correct order. Otherwise, we would run - into trouble on crash recovery if mtr freed B-tree - pages on which some of the big_rec fields will be - written. */ - btr_cur_mtr_commit_and_start(btr_cur, mtr); - + /* Write out the externally stored columns, but + allocate the pages and write the pointers using the + mini-transaction of the record update. If any pages + were freed in the update, temporarily mark them + allocated so that off-page columns will not overwrite + them. We must do this, because we write the redo log + for the BLOB writes before writing the redo log for + the record update. */ + + btr_mark_freed_leaves(index, mtr, TRUE); rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(btr_cur), rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - mtr, TRUE, big_rec); + big_rec, mtr, TRUE, mtr); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if we did not update any externally stored @@ -2002,6 +2003,8 @@ row_upd_clust_rec( to the undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); + /* Free the pages again in order to avoid a leak. */ + btr_mark_freed_leaves(index, mtr, FALSE); } mtr_commit(mtr); diff --git a/storage/innodb_plugin/sync/sync0sync.c b/storage/innodb_plugin/sync/sync0sync.c index 1b97e1f11f3..64aadffdfad 100644 --- a/storage/innodb_plugin/sync/sync0sync.c +++ b/storage/innodb_plugin/sync/sync0sync.c @@ -1248,7 +1248,13 @@ sync_thread_add_level( TRUE)); break; case SYNC_IBUF_TREE_NODE_NEW: - ut_a(sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)); + /* ibuf_add_free_page() allocates new pages for the + change buffer while only holding the tablespace + x-latch. These pre-allocated new pages may only be + taken in use while holding ibuf_mutex, in + btr_page_alloc_for_ibuf(). */ + ut_a(sync_thread_levels_contain(array, SYNC_IBUF_MUTEX) + || sync_thread_levels_contain(array, SYNC_FSP)); break; case SYNC_IBUF_INDEX_TREE: if (sync_thread_levels_contain(array, SYNC_FSP)) { diff --git a/storage/innodb_plugin/trx/trx0rec.c b/storage/innodb_plugin/trx/trx0rec.c index 9f2fd59d82b..a729a39d0cc 100644 --- a/storage/innodb_plugin/trx/trx0rec.c +++ b/storage/innodb_plugin/trx/trx0rec.c @@ -1097,22 +1097,29 @@ trx_undo_rec_get_partial_row( #endif /* !UNIV_HOTBACKUP */ /***********************************************************************//** -Erases the unused undo log page end. */ -static -void +Erases the unused undo log page end. +@return TRUE if the page contained something, FALSE if it was empty */ +static __attribute__((nonnull, warn_unused_result)) +ibool trx_undo_erase_page_end( /*====================*/ - page_t* undo_page, /*!< in: undo page whose end to erase */ - mtr_t* mtr) /*!< in: mtr */ + page_t* undo_page, /*!< in/out: undo page whose end to erase */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint first_free; first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE); + if (first_free == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) { + /* This was an empty page to begin with. + Do nothing here; the caller should free the page. */ + return(FALSE); + } memset(undo_page + first_free, 0xff, (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free); mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr); + return(TRUE); } /***********************************************************//** @@ -1134,7 +1141,11 @@ trx_undo_parse_erase_page_end( return(ptr); } - trx_undo_erase_page_end(page, mtr); + if (!trx_undo_erase_page_end(page, mtr)) { + /* The function trx_undo_erase_page_end() should not + have done anything to an empty page. */ + ut_ad(0); + } return(ptr); } @@ -1180,6 +1191,9 @@ trx_undo_report_row_operation( mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; +#ifdef UNIV_DEBUG + int loop_count = 0; +#endif /* UNIV_DEBUG */ rec_offs_init(offsets_); ut_a(dict_index_is_clust(index)); @@ -1242,7 +1256,7 @@ trx_undo_report_row_operation( mtr_start(&mtr); - for (;;) { + do { buf_block_t* undo_block; page_t* undo_page; ulint offset; @@ -1271,7 +1285,19 @@ trx_undo_report_row_operation( version the replicate page constructed using the log records stays identical to the original page */ - trx_undo_erase_page_end(undo_page, &mtr); + if (!trx_undo_erase_page_end(undo_page, &mtr)) { + /* The record did not fit on an empty + undo page. Discard the freshly allocated + page and return an error. */ + + mutex_enter(&rseg->mutex); + trx_undo_free_last_page(trx, undo, &mtr); + mutex_exit(&rseg->mutex); + + err = DB_TOO_BIG_RECORD; + goto err_exit; + } + mtr_commit(&mtr); } else { /* Success */ @@ -1291,16 +1317,15 @@ trx_undo_report_row_operation( *roll_ptr = trx_undo_build_roll_ptr( op_type == TRX_UNDO_INSERT_OP, rseg->id, page_no, offset); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - return(DB_SUCCESS); + err = DB_SUCCESS; + goto func_exit; } ut_ad(page_no == undo->last_page_no); /* We have to extend the undo log by one page */ + ut_ad(++loop_count < 2); mtr_start(&mtr); /* When we add a page to an undo log, this is analogous to @@ -1312,18 +1337,19 @@ trx_undo_report_row_operation( page_no = trx_undo_add_page(trx, undo, &mtr); mutex_exit(&(rseg->mutex)); + } while (UNIV_LIKELY(page_no != FIL_NULL)); - if (UNIV_UNLIKELY(page_no == FIL_NULL)) { - /* Did not succeed: out of space */ + /* Did not succeed: out of space */ + err = DB_OUT_OF_FILE_SPACE; - mutex_exit(&(trx->undo_mutex)); - mtr_commit(&mtr); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - return(DB_OUT_OF_FILE_SPACE); - } +err_exit: + mutex_exit(&trx->undo_mutex); + mtr_commit(&mtr); +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); } + return(err); } /*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/ diff --git a/storage/innodb_plugin/trx/trx0undo.c b/storage/innodb_plugin/trx/trx0undo.c index 7f03b68fb55..c36f55fbd9c 100644 --- a/storage/innodb_plugin/trx/trx0undo.c +++ b/storage/innodb_plugin/trx/trx0undo.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -912,7 +912,7 @@ trx_undo_add_page( page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, undo->top_page_no + 1, FSP_UP, - TRUE, mtr); + TRUE, mtr, mtr); fil_space_release_free_extents(undo->space, n_reserved); @@ -998,29 +998,28 @@ trx_undo_free_page( } /********************************************************************//** -Frees an undo log page when there is also the memory object for the undo -log. */ -static +Frees the last undo log page. +The caller must hold the rollback segment mutex. */ +UNIV_INTERN void -trx_undo_free_page_in_rollback( -/*===========================*/ - trx_t* trx __attribute__((unused)), /*!< in: transaction */ - trx_undo_t* undo, /*!< in: undo log memory copy */ - ulint page_no,/*!< in: page number to free: must not be the - header page */ - mtr_t* mtr) /*!< in: mtr which does not have a latch to any - undo log page; the caller must have reserved - the rollback segment mutex */ +trx_undo_free_last_page_func( +/*==========================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log memory copy */ + mtr_t* mtr) /*!< in/out: mini-transaction which does not + have a latch to any undo log page or which + has allocated the undo log page */ { - ulint last_page_no; - - ut_ad(undo->hdr_page_no != page_no); - ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&trx->undo_mutex)); + ut_ad(undo->hdr_page_no != undo->last_page_no); + ut_ad(undo->size > 0); - last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space, - undo->hdr_page_no, page_no, mtr); + undo->last_page_no = trx_undo_free_page( + undo->rseg, FALSE, undo->space, + undo->hdr_page_no, undo->last_page_no, mtr); - undo->last_page_no = last_page_no; undo->size--; } @@ -1056,9 +1055,11 @@ Truncates an undo log from the end. This function is used during a rollback to free space from an undo log. */ UNIV_INTERN void -trx_undo_truncate_end( -/*==================*/ - trx_t* trx, /*!< in: transaction whose undo log it is */ +trx_undo_truncate_end_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction whose undo log it is */ +#endif /* UNIV_DEBUG */ trx_undo_t* undo, /*!< in: undo log */ undo_no_t limit) /*!< in: all undo records with undo number >= this value should be truncated */ @@ -1084,18 +1085,7 @@ trx_undo_truncate_end( rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no, undo->hdr_offset); - for (;;) { - if (rec == NULL) { - if (last_page_no == undo->hdr_page_no) { - - goto function_exit; - } - - trx_undo_free_page_in_rollback( - trx, undo, last_page_no, &mtr); - break; - } - + while (rec) { if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit) >= 0) { /* Truncate at least this record off, maybe @@ -1110,6 +1100,14 @@ trx_undo_truncate_end( undo->hdr_offset); } + if (last_page_no == undo->hdr_page_no) { + + goto function_exit; + } + + ut_ad(last_page_no == undo->last_page_no); + trx_undo_free_last_page(trx, undo, &mtr); + mtr_commit(&mtr); } diff --git a/strings/decimal.c b/strings/decimal.c index 43957c7dc19..6c89657004c 100644 --- a/strings/decimal.c +++ b/strings/decimal.c @@ -1423,11 +1423,18 @@ int bin2decimal(const uchar *from, decimal_t *to, int precision, int scale) buf++; } my_afree(d_copy); + + /* + No digits? We have read the number zero, of unspecified precision. + Make it a proper zero, with non-zero precision. + */ + if (to->intg == 0 && to->frac == 0) + decimal_make_zero(to); return error; err: my_afree(d_copy); - decimal_make_zero(((decimal_t*) to)); + decimal_make_zero(to); return(E_DEC_BAD_NUM); } diff --git a/support-files/mysql.spec.sh b/support-files/mysql.spec.sh index 006dea45e64..28f2b0f773a 100644 --- a/support-files/mysql.spec.sh +++ b/support-files/mysql.spec.sh @@ -382,7 +382,7 @@ sh -c "PATH=\"${MYSQL_BUILD_PATH:-$PATH}\" \ --enable-local-infile \ --with-fast-mutexes \ --with-mysqld-user=%{mysqld_user} \ - --with-unix-socket-path=/var/lib/mysql/mysql.sock \ + --with-unix-socket-path=%{mysqldatadir}/mysql.sock \ --with-pic \ --prefix=/ \ %if %{CLUSTER_BUILD} @@ -858,6 +858,13 @@ chown -R %{mysqld_user}:%{mysqld_group} $mysql_datadir # ---------------------------------------------------------------------- chmod -R og-rw $mysql_datadir/mysql +# ---------------------------------------------------------------------- +# Deal with SELinux, if it is installed / used +# ---------------------------------------------------------------------- +if [ -x /sbin/restorecon ] ; then + /sbin/restorecon -R %{mysqldatadir} +fi + # Was the server running before the upgrade? If so, restart the new one. if [ "$SERVER_TO_START" = "true" ] ; then # Restart in the same way that mysqld will be started normally. @@ -1165,6 +1172,15 @@ fi # merging BK trees) ############################################################################## %changelog +* Fri Aug 19 2011 Joerg Bruehe <joerg.bruehe@oracle.com> + +- Fix bug#37165 "((Generic rpm)) fail to install on Fedora 9 x86_64" + On Fedora, certain accesses to "/var/lib/mysql/HOSTNAME.err" were blocked + by SELinux policy, this made the server start fail with the message + Manager of pid-file quit without updating file + Calling "/sbin/restorecon -R /var/lib/mysql" fixes this. +- Replace occurrences of that path name by the spec file variable %{mysqldatadir}. + * Thu Jul 07 2011 Joerg Bruehe <joerg.bruehe@oracle.com> - Fix bug#45415: "rpm upgrade recreates test database" |