From c79b036ed1804bb06073965badf2b2fa18b597c6 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 16 Aug 2011 19:29:06 +0200 Subject: fix for bug 55713 innochecksum is NOT built with large file support enabled --- extra/innochecksum.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/extra/innochecksum.c b/extra/innochecksum.c index 7ad900d16d3..f066cab2b3a 100644 --- a/extra/innochecksum.c +++ b/extra/innochecksum.c @@ -25,12 +25,7 @@ Published with a permission. */ -/* needed to have access to 64 bit file functions */ -#define _LARGEFILE_SOURCE -#define _LARGEFILE64_SOURCE - -#define _XOPEN_SOURCE 500 /* needed to include getopt.h on some platforms. */ - +#include #include #include #include -- cgit v1.2.1 From b213e7f814c7145a98b689710f14a479cf700912 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 17 Aug 2011 00:34:11 +0200 Subject: 2nd fix for Bug #55713 innochecksum is NOT built with large file support enabled --- extra/innochecksum.c | 1 - 1 file changed, 1 deletion(-) diff --git a/extra/innochecksum.c b/extra/innochecksum.c index f066cab2b3a..b55b510b888 100644 --- a/extra/innochecksum.c +++ b/extra/innochecksum.c @@ -48,7 +48,6 @@ /* another argument to specify page ranges... seek to right spot and go from there */ typedef unsigned long int ulint; -typedef unsigned char uchar; /* innodb function in name; modified slightly to not have the ASM version (lots of #ifs that didn't apply) */ ulint mach_read_from_4(uchar *b) -- cgit v1.2.1 From a8ee6e48f7129d35a2d83beab26f920e01092032 Mon Sep 17 00:00:00 2001 From: Sergey Vojtovich Date: Thu, 18 Aug 2011 10:38:51 +0400 Subject: BUG#11763712 - 56458: KILLING A FLUSH TABLE FOR A MERGE/CHILD CRASHES SERVER Flushing of MERGE table or one of its child tables, which was locked by flushing thread using LOCK TABLES, might have caused crashes or assertion failures if the thread failed to reopen child or parent table. Particularly, this might have happened when another connection killed this FLUSH TABLE statement/connection. Also this problem might have occurred when we failed to reopen MERGE table or one of its children when executing DDL statement under LOCK TABLES. The problem was caused by the fact that reopen_tables() might have failed to reopen child table but still tried to reopen, reattach children for and re-lock its parent. Vice versa it might have failed to reopen parent but kept references from children to parent around. Since reopen_tables() closes table it has failed to reopen and therefore frees all associated memory such dangling references led to crashes when followed. This patch solves this problem by ensuring that we always close parent table and all its children if we fail to reopen this table or one of its children. Same happens if we fail to reattach children to parent. Affects 5.1 only. mysql-test/r/merge.result: A test case for BUG#11763712. mysql-test/t/merge.test: A test case for BUG#11763712. sql/sql_base.cc: When flushing tables under LOCK TABLES, all locked and flushed tables are released and then reopened. It may happen that we failed to reopen some tables, in this case we reopen as much tables as possible. If it was not possible to reopen MERGE child, MERGE parent is unusable and must be removed from thread open tables list. If it was not possible to reopen MERGE parent, all MERGE child table objects are unusable as well, at least because their locks are handled by MERGE parent. They must also be removed from thread open tables list. In other words if it was impossible to reopen any object of a MERGE table or reattach child tables, all objects of this MERGE table must be considered unusable and closed. --- mysql-test/r/merge.result | 29 ++++++++ mysql-test/t/merge.test | 45 ++++++++++++ sql/sql_base.cc | 183 +++++++++++++++++++++++++++++++++++----------- 3 files changed, 215 insertions(+), 42 deletions(-) diff --git a/mysql-test/r/merge.result b/mysql-test/r/merge.result index 3af152672ab..a4f1c79dff4 100644 --- a/mysql-test/r/merge.result +++ b/mysql-test/r/merge.result @@ -2341,4 +2341,33 @@ REPAIR TABLE m1; Table Op Msg_type Msg_text test.m1 repair note The storage engine for the table doesn't support repair DROP TABLE m1, t1; +# +# BUG#11763712 - 56458: KILLING A FLUSH TABLE FOR A MERGE/CHILD +# CRASHES SERVER +# +CREATE TABLE t1(a INT); +CREATE TABLE t2(a INT); +CREATE TABLE t3(a INT, b INT); +CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2); +# Test reopen merge parent failure +LOCK TABLES m1 READ; +# Remove 'm1' table using file operations. +FLUSH TABLES; +ERROR 42S02: Table 'test.m1' doesn't exist +UNLOCK TABLES; +CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2); +# Test reopen merge child failure +LOCK TABLES m1 READ; +# Remove 't1' table using file operations. +FLUSH TABLES; +ERROR 42S02: Table 'test.t1' doesn't exist +UNLOCK TABLES; +CREATE TABLE t1(a INT); +# Test reattach merge failure +LOCK TABLES m1 READ; +# Replace 't1' with 't3' table using file operations. +FLUSH TABLES; +ERROR HY000: Can't reopen table: 'm1' +UNLOCK TABLES; +DROP TABLE t1, t2, t3, m1; End of 5.1 tests diff --git a/mysql-test/t/merge.test b/mysql-test/t/merge.test index f290803bbd2..a6affbb0540 100644 --- a/mysql-test/t/merge.test +++ b/mysql-test/t/merge.test @@ -1783,4 +1783,49 @@ REPAIR TABLE m1; # DROP TABLE m1, t1; + +--echo # +--echo # BUG#11763712 - 56458: KILLING A FLUSH TABLE FOR A MERGE/CHILD +--echo # CRASHES SERVER +--echo # +CREATE TABLE t1(a INT); +CREATE TABLE t2(a INT); +CREATE TABLE t3(a INT, b INT); +CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2); + +--echo # Test reopen merge parent failure +LOCK TABLES m1 READ; +--echo # Remove 'm1' table using file operations. +remove_file $MYSQLD_DATADIR/test/m1.MRG; +remove_file $MYSQLD_DATADIR/test/m1.frm; +--error ER_NO_SUCH_TABLE +FLUSH TABLES; +UNLOCK TABLES; +CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2); + +--echo # Test reopen merge child failure +LOCK TABLES m1 READ; +--echo # Remove 't1' table using file operations. +remove_file $MYSQLD_DATADIR/test/t1.frm; +remove_file $MYSQLD_DATADIR/test/t1.MYI; +remove_file $MYSQLD_DATADIR/test/t1.MYD; +--error ER_NO_SUCH_TABLE +FLUSH TABLES; +UNLOCK TABLES; +CREATE TABLE t1(a INT); + +--echo # Test reattach merge failure +LOCK TABLES m1 READ; +--echo # Replace 't1' with 't3' table using file operations. +remove_file $MYSQLD_DATADIR/test/t1.frm; +remove_file $MYSQLD_DATADIR/test/t1.MYI; +remove_file $MYSQLD_DATADIR/test/t1.MYD; +copy_file $MYSQLD_DATADIR/test/t3.frm $MYSQLD_DATADIR/test/t1.frm; +copy_file $MYSQLD_DATADIR/test/t3.MYI $MYSQLD_DATADIR/test/t1.MYI; +copy_file $MYSQLD_DATADIR/test/t3.MYD $MYSQLD_DATADIR/test/t1.MYD; +--error ER_CANT_REOPEN_TABLE +FLUSH TABLES; +UNLOCK TABLES; +DROP TABLE t1, t2, t3, m1; + --echo End of 5.1 tests diff --git a/sql/sql_base.cc b/sql/sql_base.cc index dc78f3b84c6..971a54d88c2 100644 --- a/sql/sql_base.cc +++ b/sql/sql_base.cc @@ -96,6 +96,13 @@ static TABLE_SHARE *oldest_unused_share, end_of_unused_share; static pthread_mutex_t LOCK_table_share; static bool table_def_inited= 0; +/** + Dummy TABLE instance which is used in reopen_tables() and reattach_merge() + functions to mark MERGE tables and their children with which there is some + kind of problem and which therefore we need to close. +*/ +static TABLE bad_merge_marker; + static int open_unireg_entry(THD *thd, TABLE *entry, TABLE_LIST *table_list, const char *alias, char *cache_key, uint cache_key_length, @@ -3214,47 +3221,66 @@ void close_data_files_and_morph_locks(THD *thd, const char *db, } +/** + @brief Mark merge parent and children with bad_merge_marker + + @param[in,out] parent the TABLE object of the parent +*/ + +static void mark_merge_parent_and_children_as_bad(TABLE *parent) +{ + TABLE_LIST *child_l; + DBUG_ENTER("mark_merge_parent_and_children_as_bad"); + parent->parent= &bad_merge_marker; + for (child_l= parent->child_l; ; child_l= child_l->next_global) + { + child_l->table->parent= &bad_merge_marker; + child_l->table= NULL; + if (&child_l->next_global == parent->child_last_l) + break; + } + DBUG_VOID_RETURN; +} + + /** Reattach MERGE children after reopen. @param[in] thd thread context - @param[in,out] err_tables_p pointer to pointer of tables in error + + @note If reattach failed for certain MERGE table, the table (and all + it's children) are marked with bad_merge_marker. @return status - @retval FALSE OK, err_tables_p unchanged - @retval TRUE Error, err_tables_p contains table(s) + @retval FALSE OK + @retval TRUE Error */ -static bool reattach_merge(THD *thd, TABLE **err_tables_p) +static bool reattach_merge(THD *thd) { TABLE *table; - TABLE *next; - TABLE **prv_p= &thd->open_tables; bool error= FALSE; DBUG_ENTER("reattach_merge"); - for (table= thd->open_tables; table; table= next) + for (table= thd->open_tables; table; table= table->next) { - next= table->next; - DBUG_PRINT("tcache", ("check table: '%s'.'%s' 0x%lx next: 0x%lx", + DBUG_PRINT("tcache", ("check table: '%s'.'%s' 0x%lx", table->s->db.str, table->s->table_name.str, - (long) table, (long) next)); - /* Reattach children for MERGE tables with "closed data files" only. */ - if (table->child_l && !table->children_attached) + (long) table)); + /* + Reattach children only for MERGE tables that had children or parent + with "closed data files" and were reopen. For extra safety skip MERGE + tables which we failed to reopen (should not happen with current code). + */ + if (table->child_l && table->parent != &bad_merge_marker && + !table->children_attached) { DBUG_PRINT("tcache", ("MERGE parent, attach children")); - if(table->file->extra(HA_EXTRA_ATTACH_CHILDREN)) + if (table->file->extra(HA_EXTRA_ATTACH_CHILDREN)) { my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias); error= TRUE; - /* Remove table from open_tables. */ - *prv_p= next; - if (next) - prv_p= &next->next; - /* Stack table on error list. */ - table->next= *err_tables_p; - *err_tables_p= table; - continue; + mark_merge_parent_and_children_as_bad(table); } else { @@ -3264,7 +3290,6 @@ static bool reattach_merge(THD *thd, TABLE **err_tables_p) table->s->table_name.str, (long) table)); } } - prv_p= &table->next; } DBUG_RETURN(error); } @@ -3294,7 +3319,6 @@ bool reopen_tables(THD *thd, bool get_locks, bool mark_share_as_old) { TABLE *table,*next,**prev; TABLE **tables,**tables_ptr; // For locks - TABLE *err_tables= NULL; bool error=0, not_used; bool merge_table_found= FALSE; const uint flags= MYSQL_LOCK_NOTIFY_IF_NEED_REOPEN | @@ -3328,29 +3352,69 @@ bool reopen_tables(THD *thd, bool get_locks, bool mark_share_as_old) for (table=thd->open_tables; table ; table=next) { uint db_stat=table->db_stat; + TABLE *parent= table->child_l ? table : table->parent; next=table->next; DBUG_PRINT("tcache", ("open table: '%s'.'%s' 0x%lx " "parent: 0x%lx db_stat: %u", table->s->db.str, table->s->table_name.str, (long) table, (long) table->parent, db_stat)); - if (table->child_l && !db_stat) + /* + If we need to reopen child or parent table in a MERGE table, then + children in this MERGE table has to be already detached at this + point. + */ + DBUG_ASSERT(db_stat || !parent || !parent->children_attached); + /* + Thanks to the above assumption the below condition will guarantee that + merge_table_found is TRUE when we need to reopen child or parent table. + Note that it works even in situation when it is only a child and not a + parent that needs reopen (this can happen when get_locks == FALSE). + */ + if (table->child_l && !table->children_attached) merge_table_found= TRUE; - if (!tables || (!db_stat && reopen_table(table))) + + if (!tables) { - my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias); /* - If we could not allocate 'tables', we may close open tables - here. If a MERGE table is affected, detach the children first. - It is not necessary to clear the child or parent table reference - of this table because the TABLE is freed. But we need to clear - the child or parent references of the other belonging tables so - that they cannot be moved into the unused_tables chain with - these pointers set. + If we could not allocate 'tables' we close ALL open tables here. + Before closing MERGE child or parent we need to detach children + and/or clear references in/to them. */ - if (table->child_l || table->parent) + if (parent) detach_merge_children(table, TRUE); - VOID(hash_delete(&open_cache,(uchar*) table)); - error=1; + } + else if (table->parent == &bad_merge_marker) + { + /* + This is either a child or a parent of a MERGE table for which + we already decided that we are unable to reopen it. Close it. + + Reset parent reference, it may be used while freeing the table. + */ + table->parent= NULL; + } + else if (!db_stat && reopen_table(table)) + { + /* + If we fail to reopen a child or a parent in a MERGE table and the + MERGE table is affected for the first time, mark all relevant tables + invalid. Otherwise handle it as usual. + + All in all we must end up with: + - child tables are detached from parent. This was done earlier, + but child<->parent references were kept valid for reopen. + - parent is not in the to-be-locked tables + - all child tables and parent are not in the THD::open_tables. + - all child tables and parent are not in the open_cache. + + Please note that below we do additional pass through THD::open_tables + list to achieve the last three points. + */ + if (parent) + { + mark_merge_parent_and_children_as_bad(parent); + table->parent= NULL; + } } else { @@ -3366,21 +3430,56 @@ bool reopen_tables(THD *thd, bool get_locks, bool mark_share_as_old) table->s->version=0; table->open_placeholder= 0; } + continue; } + my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias); + VOID(hash_delete(&open_cache, (uchar *) table)); + error= 1; } *prev=0; /* When all tables are open again, we can re-attach MERGE children to - their parents. All TABLE objects are still present. + their parents. + + If there was an error while reopening a child or a parent of a MERGE + table, or while reattaching child tables to their parents, some tables + may have been kept open but marked for close with bad_merge_marker. + Close these tables now. */ - DBUG_PRINT("tcache", ("re-attaching MERGE tables: %d", merge_table_found)); - if (!error && merge_table_found && reattach_merge(thd, &err_tables)) + if (tables && merge_table_found && (error|= reattach_merge(thd))) { - while (err_tables) + prev= &thd->open_tables; + for (table= thd->open_tables; table; table= next) { - VOID(hash_delete(&open_cache, (uchar*) err_tables)); - err_tables= err_tables->next; + next= table->next; + if (table->parent == &bad_merge_marker) + { + /* Remove merge parent from to-be-locked tables array. */ + if (get_locks && table->child_l) + { + TABLE **t; + for (t= tables; t < tables_ptr; t++) + { + if (*t == table) + { + tables_ptr--; + memmove(t, t + 1, (tables_ptr - t) * sizeof(TABLE *)); + break; + } + } + } + /* Reset parent reference, it may be used while freeing the table. */ + table->parent= NULL; + /* Free table. */ + VOID(hash_delete(&open_cache, (uchar *) table)); + } + else + { + *prev= table; + prev= &table->next; + } } + *prev= 0; } DBUG_PRINT("tcache", ("open tables to lock: %u", (uint) (tables_ptr - tables))); -- cgit v1.2.1 From e8f2e3c2176a1625220b3189e73bba91dae7684c Mon Sep 17 00:00:00 2001 From: Joerg Bruehe Date: Fri, 19 Aug 2011 18:48:14 +0200 Subject: Fix bug#37165 "((Generic rpm)) fail to install on Fedora 9 x86_64" On Fedora, certain accesses to "/var/lib/mysql/HOSTNAME.err" were blocked by SELinux policy, this made the server start fail with the message Manager of pid-file quit without updating file Calling "/sbin/restorecon -R /var/lib/mysql" fixes this. --- support-files/mysql.spec.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/support-files/mysql.spec.sh b/support-files/mysql.spec.sh index 006dea45e64..28f2b0f773a 100644 --- a/support-files/mysql.spec.sh +++ b/support-files/mysql.spec.sh @@ -382,7 +382,7 @@ sh -c "PATH=\"${MYSQL_BUILD_PATH:-$PATH}\" \ --enable-local-infile \ --with-fast-mutexes \ --with-mysqld-user=%{mysqld_user} \ - --with-unix-socket-path=/var/lib/mysql/mysql.sock \ + --with-unix-socket-path=%{mysqldatadir}/mysql.sock \ --with-pic \ --prefix=/ \ %if %{CLUSTER_BUILD} @@ -858,6 +858,13 @@ chown -R %{mysqld_user}:%{mysqld_group} $mysql_datadir # ---------------------------------------------------------------------- chmod -R og-rw $mysql_datadir/mysql +# ---------------------------------------------------------------------- +# Deal with SELinux, if it is installed / used +# ---------------------------------------------------------------------- +if [ -x /sbin/restorecon ] ; then + /sbin/restorecon -R %{mysqldatadir} +fi + # Was the server running before the upgrade? If so, restart the new one. if [ "$SERVER_TO_START" = "true" ] ; then # Restart in the same way that mysqld will be started normally. @@ -1165,6 +1172,15 @@ fi # merging BK trees) ############################################################################## %changelog +* Fri Aug 19 2011 Joerg Bruehe + +- Fix bug#37165 "((Generic rpm)) fail to install on Fedora 9 x86_64" + On Fedora, certain accesses to "/var/lib/mysql/HOSTNAME.err" were blocked + by SELinux policy, this made the server start fail with the message + Manager of pid-file quit without updating file + Calling "/sbin/restorecon -R /var/lib/mysql" fixes this. +- Replace occurrences of that path name by the spec file variable %{mysqldatadir}. + * Thu Jul 07 2011 Joerg Bruehe - Fix bug#45415: "rpm upgrade recreates test database" -- cgit v1.2.1 From 49ee12d03b763a460cfcc371566465019a4dc067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 22 Aug 2011 17:03:07 +0300 Subject: Bug #11766591 59733: POSSIBLE DEADLOCK WHEN BUFFERED CHANGES ARE TO BE DISCARDED The fix in revision id marko.makela@oracle.com-20110815091143-h3zbvm0pv8ni3qql introduced a false UNIV_SYNC_DEBUG alarm. Relax the assertion. --- storage/innodb_plugin/sync/sync0sync.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/storage/innodb_plugin/sync/sync0sync.c b/storage/innodb_plugin/sync/sync0sync.c index 1b97e1f11f3..64aadffdfad 100644 --- a/storage/innodb_plugin/sync/sync0sync.c +++ b/storage/innodb_plugin/sync/sync0sync.c @@ -1248,7 +1248,13 @@ sync_thread_add_level( TRUE)); break; case SYNC_IBUF_TREE_NODE_NEW: - ut_a(sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)); + /* ibuf_add_free_page() allocates new pages for the + change buffer while only holding the tablespace + x-latch. These pre-allocated new pages may only be + taken in use while holding ibuf_mutex, in + btr_page_alloc_for_ibuf(). */ + ut_a(sync_thread_levels_contain(array, SYNC_IBUF_MUTEX) + || sync_thread_levels_contain(array, SYNC_FSP)); break; case SYNC_IBUF_INDEX_TREE: if (sync_thread_levels_contain(array, SYNC_FSP)) { -- cgit v1.2.1 From 0c7db7839b052269e034d91a7597a74b3d1f90ba Mon Sep 17 00:00:00 2001 From: Tor Didriksen Date: Thu, 25 Aug 2011 10:38:07 +0200 Subject: Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX Suppress the known warnings generated by filesort(). The real fix belongs to worklog 1509: Pack values of non-sorted fields in the sort buffer (which is basically the same issue, but in an optimization context: We are writing the entire sort buffer to disk, including un-used space for varchar columns.) mysql-test/valgrind.supp: Add new Memcheck suppressions for filesort. sql/filesort.cc: Remove the ifdef HAVE_purify/bzero code, use valgrind suppressions instead. --- mysql-test/valgrind.supp | 34 ++++++++++++++++++++++++++++++++++ sql/filesort.cc | 11 ----------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/mysql-test/valgrind.supp b/mysql-test/valgrind.supp index 8720cd511b9..3751a339a1a 100644 --- a/mysql-test/valgrind.supp +++ b/mysql-test/valgrind.supp @@ -791,3 +791,37 @@ fun:fil_delete_tablespace fun:row_drop_table_for_mysql } + +{ + Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / one + Memcheck:Param + write(buf) + obj:*/libpthread*.so + fun:my_write + fun:my_b_flush_io_cache + fun:_my_b_write + fun:_ZL10write_keysP13st_sort_paramPPhjP11st_io_cacheS4_ + fun:_ZL13find_all_keysP13st_sort_paramP10SQL_SELECTPPhP11st_io_cacheS6_S6_ + fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy +} + +{ + Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / two + Memcheck:Param + write(buf) + obj:*/libpthread*.so + fun:my_write + fun:my_b_flush_io_cache + fun:_Z15merge_many_buffP13st_sort_paramPhP10st_buffpekPjP11st_io_cache + fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy +} + +{ + Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / three + Memcheck:Param + write(buf) + obj:*/libpthread*.so + fun:my_write + fun:my_b_flush_io_cache + fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy +} diff --git a/sql/filesort.cc b/sql/filesort.cc index 99e5156427a..0ff354b334c 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -959,21 +959,10 @@ static void make_sortkey(register SORTPARAM *param, if (addonf->null_bit && field->is_null()) { nulls[addonf->null_offset]|= addonf->null_bit; -#ifdef HAVE_purify - bzero(to, addonf->length); -#endif } else { -#ifdef HAVE_purify - uchar *end= field->pack(to, field->ptr); - uint length= (uint) ((to + addonf->length) - end); - DBUG_ASSERT((int) length >= 0); - if (length) - bzero(end, length); -#else (void) field->pack(to, field->ptr); -#endif } to+= addonf->length; } -- cgit v1.2.1 From e46b3453bfd379f7eef4fcc01853f085007c012b Mon Sep 17 00:00:00 2001 From: Rohit Kalhans Date: Fri, 26 Aug 2011 15:27:29 +0530 Subject: BUG#11878104: FIXES OF BUG 11752963 - 44312 TO BACKPORT TO MYSQL-5.1 Background: Backporting fix for BUG 11752963 to Mysql5.1 branch. Problem: Fix of bug 11752963 was only available for trunk and 5.5 branch. Partial fix has been pushed to 5.1 branch as well. Fix: backporting the fixes of bug 11752963 to 5.1 branch. 1. Made all major changes to make 5.1 branch in line with 5.5 and the trunk. 2. skipped the partial patch that was already applied to the 5.1 branch. sql/rpl_rli.h: Made inited Volatile (find inline comments) sql/slave.cc: backported all changes from the fix of BUG#11752963. --- sql/rpl_rli.h | 8 +++++++- sql/slave.cc | 11 ++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h index 811ad8eb864..ae8639c69ac 100644 --- a/sql/rpl_rli.h +++ b/sql/rpl_rli.h @@ -221,7 +221,13 @@ public: #endif /* if not set, the value of other members of the structure are undefined */ - bool inited; + /* + inited changes its value within LOCK_active_mi-guarded critical + sections at times of start_slave_threads() (0->1) and end_slave() (1->0). + Readers may not acquire the mutex while they realize potential concurrency + issue. + */ + volatile bool inited; volatile bool abort_slave; volatile uint slave_running; diff --git a/sql/slave.cc b/sql/slave.cc index 6c375238ce4..02d8cc2c199 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -598,11 +598,15 @@ int start_slave_thread(pthread_handler h_func, pthread_mutex_t *start_lock, DBUG_PRINT("sleep",("Waiting for slave thread to start")); const char* old_msg = thd->enter_cond(start_cond,cond_lock, "Waiting for slave thread to start"); - pthread_cond_wait(start_cond,cond_lock); + pthread_cond_wait(start_cond, cond_lock); thd->exit_cond(old_msg); pthread_mutex_lock(cond_lock); // re-acquire it as exit_cond() released if (thd->killed) + { + if (start_lock) + pthread_mutex_unlock(start_lock); DBUG_RETURN(thd->killed_errno()); + } } } if (start_lock) @@ -2531,6 +2535,7 @@ pthread_handler_t handle_slave_io(void *arg) thd= new THD; // note that contructor of THD uses DBUG_ ! THD_CHECK_SENTRY(thd); + DBUG_ASSERT(mi->io_thd == 0); mi->io_thd = thd; pthread_detach_this_thread(); @@ -4489,9 +4494,6 @@ int rotate_relay_log(Master_info* mi) Relay_log_info* rli= &mi->rli; int error= 0; - /* We don't lock rli->run_lock. This would lead to deadlocks. */ - pthread_mutex_lock(&mi->run_lock); - /* We need to test inited because otherwise, new_file() will attempt to lock LOCK_log, which may not be inited (if we're not a slave). @@ -4521,7 +4523,6 @@ int rotate_relay_log(Master_info* mi) */ rli->relay_log.harvest_bytes_written(&rli->log_space_total); end: - pthread_mutex_unlock(&mi->run_lock); DBUG_RETURN(error); } -- cgit v1.2.1 From 41f229cd9ed5a1549fb6438bdc1614e48d003625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 29 Aug 2011 11:16:42 +0300 Subject: Bug#12704861 Corruption after a crash during BLOB update The fix of Bug#12612184 broke crash recovery. When a record that contains off-page columns (BLOBs) is updated, we must first write redo log about the BLOB page writes, and only after that write the redo log about the B-tree changes. The buggy fix would log the B-tree changes first, meaning that after recovery, we could end up having a record that contains a null BLOB pointer. Because we will be redo logging the writes off the off-page columns before the B-tree changes, we must make sure that the pages chosen for the off-page columns are free both before and after the B-tree changes. In this way, the worst thing that can happen in crash recovery is that the BLOBs are written to free pages, but the B-tree changes are not applied. The BLOB pages would correctly remain free in this case. To achieve this, we must allocate the BLOB pages in the mini-transaction of the B-tree operation. A further quirk is that BLOB pages are allocated from the same file segment as leaf pages. Because of this, we must temporarily "hide" any leaf pages that were freed during the B-tree operation by "fake allocating" them prior to writing the BLOBs, and freeing them again before the mtr_commit() of the B-tree operation, in btr_mark_freed_leaves(). btr_cur_mtr_commit_and_start(): Remove this faulty function that was introduced in the Bug#12612184 fix. The problem that this function was trying to address was that when we did mtr_commit() the BLOB writes before the mtr_commit() of the update, the new BLOB pages could have overwritten clustered index B-tree leaf pages that were freed during the update. If recovery applied the redo log of the BLOB writes but did not see the log of the record update, the index tree would be corrupted. The correct solution is to make the freed clustered index pages unavailable to the BLOB allocation. This function is also a likely culprit of InnoDB hangs that were observed when testing the Bug#12612184 fix. btr_mark_freed_leaves(): Mark all freed clustered index leaf pages of a mini-transaction allocated (nonfree=TRUE) before storing the BLOBs, or freed (nonfree=FALSE) before committing the mini-transaction. btr_freed_leaves_validate(): A debug function for checking that all clustered index leaf pages that have been marked free in the mini-transaction are consistent (have not been zeroed out). btr_page_alloc_low(): Refactored from btr_page_alloc(). Return the number of the allocated page, or FIL_NULL if out of space. Add the parameter "mtr_t* init_mtr" for specifying the mini-transaction where the page should be initialized, or if this is a "fake allocation" (init_mtr=NULL) by btr_mark_freed_leaves(nonfree=TRUE). btr_page_alloc(): Add the parameter init_mtr, allowing the page to be initialized and X-latched in a different mini-transaction than the one that is used for the allocation. Invoke btr_page_alloc_low(). If a clustered index leaf page was previously freed in mtr, remove it from the memo of previously freed pages. btr_page_free(): Assert that the page is a B-tree page and it has been X-latched by the mini-transaction. If the freed page was a leaf page of a clustered index, link it by a MTR_MEMO_FREE_CLUST_LEAF marker to the mini-transaction. btr_store_big_rec_extern_fields_func(): Add the parameter alloc_mtr, which is NULL (old behaviour in inserts) and the same as local_mtr in updates. If alloc_mtr!=NULL, the BLOB pages will be allocated from it instead of the mini-transaction that is used for writing the BLOBs. fsp_alloc_from_free_frag(): Refactored from fsp_alloc_free_page(). Allocate the specified page from a partially free extent. fseg_alloc_free_page_low(), fseg_alloc_free_page_general(): Add the parameter "mtr_t* init_mtr" for specifying the mini-transaction where the page should be initialized, or NULL if this is a "fake allocation" that prevents the reuse of a previously freed B-tree page for BLOB storage. If init_mtr==NULL, try harder to reallocate the specified page and assert that it succeeded. fsp_alloc_free_page(): Add the parameter "mtr_t* init_mtr" for specifying the mini-transaction where the page should be initialized. Do not allow init_mtr == NULL, because this function is never to be used for "fake allocations". mtr_t: Add the operation MTR_MEMO_FREE_CLUST_LEAF and the flag mtr->freed_clust_leaf for quickly determining if any MTR_MEMO_FREE_CLUST_LEAF operations have been posted. row_ins_index_entry_low(): When columns are being made off-page in insert-by-update, invoke btr_mark_freed_leaves(nonfree=TRUE) and pass the mini-transaction as the alloc_mtr to btr_store_big_rec_extern_fields(). Finally, invoke btr_mark_freed_leaves(nonfree=FALSE) to avoid leaking pages. row_build(): Correct a comment, and add a debug assertion that a record that contains NULL BLOB pointers must be a fresh insert. row_upd_clust_rec(): When columns are being moved off-page, invoke btr_mark_freed_leaves(nonfree=TRUE) and pass the mini-transaction as the alloc_mtr to btr_store_big_rec_extern_fields(). Finally, invoke btr_mark_freed_leaves(nonfree=FALSE) to avoid leaking pages. buf_reset_check_index_page_at_flush(): Remove. The function fsp_init_file_page_low() already sets bpage->check_index_page_at_flush=FALSE. There is a known issue in tablespace extension. If the request to allocate a BLOB page leads to the tablespace being extended, crash recovery could see BLOB writes to pages that are off the tablespace file bounds. This should trigger an assertion failure in fil_io() at crash recovery. The safe thing would be to write redo log about the tablespace extension to the mini-transaction of the BLOB write, not to the mini-transaction of the record update. However, there is no redo log record for file extension in the current redo log format. rb:693 approved by Sunny Bains --- .../suite/innodb_plugin/r/innodb-index.result | 9 + mysql-test/suite/innodb_plugin/t/innodb-index.test | 13 ++ storage/innobase/btr/btr0btr.c | 222 +++++++++++++++++-- storage/innobase/btr/btr0cur.c | 92 ++++---- storage/innobase/buf/buf0buf.c | 23 -- storage/innobase/fsp/fsp0fsp.c | 168 ++++++++++----- storage/innobase/include/btr0btr.h | 31 ++- storage/innobase/include/btr0cur.h | 14 +- storage/innobase/include/buf0buf.h | 9 - storage/innobase/include/fsp0fsp.h | 8 +- storage/innobase/include/mtr0mtr.h | 7 +- storage/innobase/include/mtr0mtr.ic | 4 +- storage/innobase/mtr/mtr0mtr.c | 10 +- storage/innobase/row/row0ins.c | 32 ++- storage/innobase/row/row0row.c | 38 ++-- storage/innobase/row/row0upd.c | 23 +- storage/innobase/trx/trx0undo.c | 2 +- storage/innodb_plugin/ChangeLog | 14 ++ storage/innodb_plugin/btr/btr0btr.c | 220 +++++++++++++++++-- storage/innodb_plugin/btr/btr0cur.c | 114 +++++----- storage/innodb_plugin/buf/buf0buf.c | 23 -- storage/innodb_plugin/fsp/fsp0fsp.c | 234 ++++++++++++--------- storage/innodb_plugin/include/btr0btr.h | 34 ++- storage/innodb_plugin/include/btr0cur.h | 38 ++-- storage/innodb_plugin/include/buf0buf.h | 9 - storage/innodb_plugin/include/fsp0fsp.h | 30 +-- storage/innodb_plugin/include/mtr0mtr.h | 13 +- storage/innodb_plugin/include/mtr0mtr.ic | 6 +- storage/innodb_plugin/include/trx0undo.h | 46 +++- storage/innodb_plugin/mtr/mtr0mtr.c | 7 +- storage/innodb_plugin/row/row0ins.c | 31 ++- storage/innodb_plugin/row/row0row.c | 27 +-- storage/innodb_plugin/row/row0upd.c | 23 +- storage/innodb_plugin/trx/trx0rec.c | 68 ++++-- storage/innodb_plugin/trx/trx0undo.c | 70 +++--- 35 files changed, 1154 insertions(+), 558 deletions(-) diff --git a/mysql-test/suite/innodb_plugin/r/innodb-index.result b/mysql-test/suite/innodb_plugin/r/innodb-index.result index b24f282dfc4..5be1460d2b7 100644 --- a/mysql-test/suite/innodb_plugin/r/innodb-index.result +++ b/mysql-test/suite/innodb_plugin/r/innodb-index.result @@ -1024,6 +1024,15 @@ INSERT INTO t1 VALUES(9,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r); UPDATE t1 SET a=1000; DELETE FROM t1; DROP TABLE t1; +CREATE TABLE bug12547647( +a INT NOT NULL, b BLOB NOT NULL, c TEXT, +PRIMARY KEY (b(10), a), INDEX (c(10)) +) ENGINE=InnoDB ROW_FORMAT=DYNAMIC; +INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731)); +COMMIT; +UPDATE bug12547647 SET c = REPEAT('b',16928); +ERROR 42000: Row size too large. The maximum row size for the used table type, not counting BLOBs, is 8126. You have to change some columns to TEXT or BLOBs +DROP TABLE bug12547647; set global innodb_file_per_table=0; set global innodb_file_format=Antelope; set global innodb_file_format_check=Antelope; diff --git a/mysql-test/suite/innodb_plugin/t/innodb-index.test b/mysql-test/suite/innodb_plugin/t/innodb-index.test index 52f94990b15..b4e2aae09e9 100644 --- a/mysql-test/suite/innodb_plugin/t/innodb-index.test +++ b/mysql-test/suite/innodb_plugin/t/innodb-index.test @@ -480,6 +480,19 @@ DELETE FROM t1; -- sleep 10 DROP TABLE t1; +# Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE +CREATE TABLE bug12547647( +a INT NOT NULL, b BLOB NOT NULL, c TEXT, +PRIMARY KEY (b(10), a), INDEX (c(10)) +) ENGINE=InnoDB ROW_FORMAT=DYNAMIC; + +INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731)); +COMMIT; +# The following used to cause infinite undo log allocation. +--error ER_TOO_BIG_ROWSIZE +UPDATE bug12547647 SET c = REPEAT('b',16928); +DROP TABLE bug12547647; + eval set global innodb_file_per_table=$per_table; eval set global innodb_file_format=$format; eval set global innodb_file_format_check=$format; diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c index 790582815a3..ad99913cf3b 100644 --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -300,29 +300,30 @@ btr_page_alloc_for_ibuf( /****************************************************************** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! */ - -page_t* -btr_page_alloc( -/*===========*/ - /* out: new allocated page, x-latched; - NULL if out of space */ +static +ulint +btr_page_alloc_low( +/*===============*/ + /* out: allocated page number, + FIL_NULL if out of space */ dict_index_t* index, /* in: index */ ulint hint_page_no, /* in: hint of a good page */ byte file_direction, /* in: direction where a possible page split is made */ ulint level, /* in: level where the page is placed in the tree */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr, /* in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /* in/out: mini-transaction + in which the page should be + initialized (may be the same + as mtr), or NULL if it should + not be initialized (the page + at hint was previously freed + in mtr) */ { fseg_header_t* seg_header; page_t* root; - page_t* new_page; - ulint new_page_no; - - if (index->type & DICT_IBUF) { - - return(btr_page_alloc_for_ibuf(index, mtr)); - } root = btr_root_get(index, mtr); @@ -336,19 +337,61 @@ btr_page_alloc( reservation for free extents, and thus we know that a page can be allocated: */ - new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, - file_direction, TRUE, mtr); + return(fseg_alloc_free_page_general(seg_header, hint_page_no, + file_direction, TRUE, + mtr, init_mtr)); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! */ + +page_t* +btr_page_alloc( +/*===========*/ + /* out: new allocated block, x-latched; + NULL if out of space */ + dict_index_t* index, /* in: index */ + ulint hint_page_no, /* in: hint of a good page */ + byte file_direction, /* in: direction where a possible + page split is made */ + ulint level, /* in: level where the page is placed + in the tree */ + mtr_t* mtr, /* in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /* in/out: mini-transaction + for x-latching and initializing + the page */ +{ + page_t* new_page; + ulint new_page_no; + + if (index->type & DICT_IBUF) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + new_page_no = btr_page_alloc_low( + index, hint_page_no, file_direction, level, mtr, init_mtr); + if (new_page_no == FIL_NULL) { return(NULL); } new_page = buf_page_get(dict_index_get_space(index), new_page_no, - RW_X_LATCH, mtr); + RW_X_LATCH, init_mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW); #endif /* UNIV_SYNC_DEBUG */ + if (mtr->freed_clust_leaf) { + mtr_memo_release(mtr, new_page, MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(!mtr_memo_contains(mtr, buf_block_align(new_page), + MTR_MEMO_FREE_CLUST_LEAF)); + } + + ut_ad(btr_freed_leaves_validate(mtr)); return(new_page); } @@ -464,6 +507,16 @@ btr_page_free_low( page_no = buf_frame_get_page_no(page); fseg_free_page(seg_header, space, page_no, mtr); + + /* The page was marked free in the allocation bitmap, but it + should remain buffer-fixed until mtr_commit(mtr) or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), + MTR_MEMO_PAGE_X_FIX)); + /* TODO: Discard any operations on the page from the redo log + and remove the block from the flush list and the buffer pool. + This would free up buffer pool earlier and reduce writes to + both the tablespace and the redo log. */ } /****************************************************************** @@ -479,13 +532,144 @@ btr_page_free( { ulint level; + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); level = btr_page_get_level(page, mtr); btr_page_free_low(index, page, level, mtr); + + /* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */ + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), + MTR_MEMO_PAGE_X_FIX)); + + if (level == 0 && (index->type & DICT_CLUSTERED)) { + /* We may have to call btr_mark_freed_leaves() to + temporarily mark the block nonfree for invoking + btr_store_big_rec_extern_fields() after an + update. Remember that the block was freed. */ + mtr->freed_clust_leaf = TRUE; + mtr_memo_push(mtr, buf_block_align(page), + MTR_MEMO_FREE_CLUST_LEAF); + } + + ut_ad(btr_freed_leaves_validate(mtr)); } +/**************************************************************//** +Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. +For invoking btr_store_big_rec_extern_fields() after an update, +we must temporarily mark freed clustered index pages allocated, so +that off-page columns will not be allocated from them. Between the +btr_store_big_rec_extern_fields() and mtr_commit() we have to +mark the pages free again, so that no pages will be leaked. */ + +void +btr_mark_freed_leaves( +/*==================*/ + dict_index_t* index, /* in/out: clustered index */ + mtr_t* mtr, /* in/out: mini-transaction */ + ibool nonfree)/* in: TRUE=mark nonfree, FALSE=mark freed */ +{ + /* This is loosely based on mtr_memo_release(). */ + + ulint offset; + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + if (!mtr->freed_clust_leaf) { + return; + } + + offset = dyn_array_get_data_size(&mtr->memo); + + while (offset > 0) { + mtr_memo_slot_t* slot; + buf_block_t* block; + + offset -= sizeof *slot; + + slot = dyn_array_get_element(&mtr->memo, offset); + + if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { + continue; + } + + /* Because btr_page_alloc() does invoke + mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all + blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the + memo must still be clustered index leaf tree pages. */ + block = slot->object; + ut_a(buf_block_get_space(block) + == dict_index_get_space(index)); + ut_a(fil_page_get_type(buf_block_get_frame(block)) + == FIL_PAGE_INDEX); + ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0); + + if (nonfree) { + /* Allocate the same page again. */ + ulint page_no; + page_no = btr_page_alloc_low( + index, buf_block_get_page_no(block), + FSP_NO_DIR, 0, mtr, NULL); + ut_a(page_no == buf_block_get_page_no(block)); + } else { + /* Assert that the page is allocated and free it. */ + btr_page_free_low(index, buf_block_get_frame(block), + 0, mtr); + } + } + + ut_ad(btr_freed_leaves_validate(mtr)); +} + +#ifdef UNIV_DEBUG +/**************************************************************//** +Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. +See btr_mark_freed_leaves(). */ + +ibool +btr_freed_leaves_validate( +/*======================*/ + /* out: TRUE if valid */ + mtr_t* mtr) /* in: mini-transaction */ +{ + ulint offset; + + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + offset = dyn_array_get_data_size(&mtr->memo); + + while (offset > 0) { + mtr_memo_slot_t* slot; + buf_block_t* block; + + offset -= sizeof *slot; + + slot = dyn_array_get_element(&mtr->memo, offset); + + if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { + continue; + } + + ut_a(mtr->freed_clust_leaf); + /* Because btr_page_alloc() does invoke + mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all + blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the + memo must still be clustered index leaf tree pages. */ + block = slot->object; + ut_a(fil_page_get_type(buf_block_get_frame(block)) + == FIL_PAGE_INDEX); + ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + /****************************************************************** Sets the child node file address in a node pointer. */ UNIV_INLINE @@ -1015,7 +1199,7 @@ btr_root_raise_and_insert( a node pointer to the new page, and then splitting the new page. */ new_page = btr_page_alloc(index, 0, FSP_NO_DIR, - btr_page_get_level(root, mtr), mtr); + btr_page_get_level(root, mtr), mtr, mtr); btr_page_create(new_page, index, mtr); @@ -1636,7 +1820,7 @@ func_start: /* 2. Allocate a new page to the index */ new_page = btr_page_alloc(cursor->index, hint_page_no, direction, - btr_page_get_level(page, mtr), mtr); + btr_page_get_level(page, mtr), mtr, mtr); btr_page_create(new_page, cursor->index, mtr); /* 3. Calculate the first record on the upper half-page, and the diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index 9ce09929f9a..a1dda8edf69 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -2051,43 +2051,6 @@ return_after_reservations: return(err); } -/***************************************************************** -Commits and restarts a mini-transaction so that it will retain an -x-lock on index->lock and the cursor page. */ - -void -btr_cur_mtr_commit_and_start( -/*=========================*/ - btr_cur_t* cursor, /* in: cursor */ - mtr_t* mtr) /* in/out: mini-transaction */ -{ - buf_block_t* block; - - block = buf_block_align(btr_cur_get_rec(cursor)); - - ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), - MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - /* Keep the locks across the mtr_commit(mtr). */ - rw_lock_x_lock(dict_index_get_lock(cursor->index)); - rw_lock_x_lock(&block->lock); - mutex_enter(&block->mutex); -#ifdef UNIV_SYNC_DEBUG - buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__); -#else - buf_block_buf_fix_inc(block); -#endif - mutex_exit(&block->mutex); - /* Write out the redo log. */ - mtr_commit(mtr); - mtr_start(mtr); - /* Reassociate the locks with the mini-transaction. - They will be released on mtr_commit(mtr). */ - mtr_memo_push(mtr, dict_index_get_lock(cursor->index), - MTR_MEMO_X_LOCK); - mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); -} - /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /******************************************************************** @@ -3494,6 +3457,11 @@ btr_store_big_rec_extern_fields( this function returns */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ + mtr_t* alloc_mtr, /* in/out: in an insert, NULL; + in an update, local_mtr for + allocating BLOB pages and + updating BLOB pointers; alloc_mtr + must not have freed any leaf pages */ mtr_t* local_mtr __attribute__((unused))) /* in: mtr containing the latch to rec and to the tree */ @@ -3514,6 +3482,8 @@ btr_store_big_rec_extern_fields( ulint i; mtr_t mtr; + ut_ad(local_mtr); + ut_ad(!alloc_mtr || alloc_mtr == local_mtr); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); @@ -3523,6 +3493,25 @@ btr_store_big_rec_extern_fields( space_id = buf_frame_get_space_id(rec); + if (alloc_mtr) { + /* Because alloc_mtr will be committed after + mtr, it is possible that the tablespace has been + extended when the B-tree record was updated or + inserted, or it will be extended while allocating + pages for big_rec. + + TODO: In mtr (not alloc_mtr), write a redo log record + about extending the tablespace to its current size, + and remember the current size. Whenever the tablespace + grows as pages are allocated, write further redo log + records to mtr. (Currently tablespace extension is not + covered by the redo log. If it were, the record would + only be written to alloc_mtr, which is committed after + mtr.) */ + } else { + alloc_mtr = &mtr; + } + /* We have to create a file segment to the tablespace for each field and put the pointer to the field in rec */ @@ -3549,7 +3538,7 @@ btr_store_big_rec_extern_fields( } page = btr_page_alloc(index, hint_page_no, - FSP_NO_DIR, 0, &mtr); + FSP_NO_DIR, 0, alloc_mtr, &mtr); if (page == NULL) { mtr_commit(&mtr); @@ -3603,37 +3592,42 @@ btr_store_big_rec_extern_fields( extern_len -= store_len; + if (alloc_mtr == &mtr) { #ifdef UNIV_SYNC_DEBUG - rec_page = + rec_page = #endif /* UNIV_SYNC_DEBUG */ - buf_page_get(space_id, - buf_frame_get_page_no(data), - RW_X_LATCH, &mtr); + buf_page_get( + space_id, + buf_frame_get_page_no(data), + RW_X_LATCH, &mtr); #ifdef UNIV_SYNC_DEBUG - buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK); + buf_page_dbg_add_level( + rec_page, SYNC_NO_ORDER_CHECK); #endif /* UNIV_SYNC_DEBUG */ + } + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4, big_rec_vec->fields[i].len - extern_len, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); if (prev_page_no == FIL_NULL) { mlog_write_ulint(data + local_len + BTR_EXTERN_SPACE_ID, space_id, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_PAGE_NO, page_no, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_OFFSET, FIL_PAGE_DATA, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); /* Set the bit denoting that this field in rec is stored externally */ @@ -3641,7 +3635,7 @@ btr_store_big_rec_extern_fields( rec_set_nth_field_extern_bit( rec, index, big_rec_vec->fields[i].field_no, - TRUE, &mtr); + TRUE, alloc_mtr); } prev_page_no = page_no; diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c index 08e033e7a63..78b39812cff 100644 --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -1008,29 +1008,6 @@ buf_page_peek_block( return(block); } -/************************************************************************ -Resets the check_index_page_at_flush field of a page if found in the buffer -pool. */ - -void -buf_reset_check_index_page_at_flush( -/*================================*/ - ulint space, /* in: space id */ - ulint offset) /* in: page number */ -{ - buf_block_t* block; - - mutex_enter_fast(&(buf_pool->mutex)); - - block = buf_page_hash_get(space, offset); - - if (block) { - block->check_index_page_at_flush = FALSE; - } - - mutex_exit(&(buf_pool->mutex)); -} - /************************************************************************ Returns the current state of is_hashed of a page. FALSE if the page is not in the pool. NOTE that this operation does not fix the page in the diff --git a/storage/innobase/fsp/fsp0fsp.c b/storage/innobase/fsp/fsp0fsp.c index d228e683957..d5be8fca38f 100644 --- a/storage/innobase/fsp/fsp0fsp.c +++ b/storage/innobase/fsp/fsp0fsp.c @@ -293,15 +293,19 @@ fseg_alloc_free_page_low( /* out: the allocated page number, FIL_NULL if no page could be allocated */ ulint space, /* in: space */ - fseg_inode_t* seg_inode, /* in: segment inode */ + fseg_inode_t* seg_inode, /* in/out: segment inode */ ulint hint, /* in: hint of which page would be desirable */ byte direction, /* in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr); /* in: mtr handle */ - + mtr_t* mtr, /* in/out: mini-transaction */ + mtr_t* init_mtr);/* in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr), or NULL if it + should not be initialized (the page at hint + was previously freed in mtr) */ /************************************************************************** Reads the file space size stored in the header page. */ @@ -1371,6 +1375,43 @@ fsp_alloc_free_extent( return(descr); } +/**********************************************************************//** +Allocates a single free page from a space. */ +static __attribute__((nonnull)) +void +fsp_alloc_from_free_frag( +/*=====================*/ + fsp_header_t* header, /* in/out: tablespace header */ + xdes_t* descr, /* in/out: extent descriptor */ + ulint bit, /* in: slot to allocate in the extent */ + mtr_t* mtr) /* in/out: mini-transaction */ +{ + ulint frag_n_used; + + ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr)); + xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr); + + /* Update the FRAG_N_USED field */ + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + frag_n_used++; + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, + mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FULL_FRAG, mtr); + + flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, + mtr); + } +} + /************************************************************************** Allocates a single free page from a space. The page is marked as used. */ static @@ -1381,19 +1422,22 @@ fsp_alloc_free_page( be allocated */ ulint space, /* in: space id */ ulint hint, /* in: hint of which page would be desirable */ - mtr_t* mtr) /* in: mtr handle */ + mtr_t* mtr, /* in/out: mini-transaction */ + mtr_t* init_mtr)/* in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr) */ { fsp_header_t* header; fil_addr_t first; xdes_t* descr; page_t* page; ulint free; - ulint frag_n_used; ulint page_no; ulint space_size; ibool success; ut_ad(mtr); + ut_ad(init_mtr); header = fsp_get_space_header(space, mtr); @@ -1441,6 +1485,7 @@ fsp_alloc_free_page( if (free == ULINT_UNDEFINED) { ut_print_buf(stderr, ((byte*)descr) - 500, 1000); + putc('\n', stderr); ut_error; } @@ -1472,40 +1517,21 @@ fsp_alloc_free_page( } } - xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); - - /* Update the FRAG_N_USED field */ - frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, - mtr); - frag_n_used++; - mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, - mtr); - if (xdes_is_full(descr, mtr)) { - /* The fragment is full: move it to another list */ - flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, - mtr); - xdes_set_state(descr, XDES_FULL_FRAG, mtr); - - flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, - mtr); - mlog_write_ulint(header + FSP_FRAG_N_USED, - frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, - mtr); - } + fsp_alloc_from_free_frag(header, descr, free, mtr); /* Initialize the allocated page to the buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read. */ - buf_page_create(space, page_no, mtr); + buf_page_create(space, page_no, init_mtr); - page = buf_page_get(space, page_no, RW_X_LATCH, mtr); + page = buf_page_get(space, page_no, RW_X_LATCH, init_mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_FSP_PAGE); #endif /* UNIV_SYNC_DEBUG */ /* Prior contents of the page should be ignored */ - fsp_init_file_page(page, mtr); + fsp_init_file_page(page, init_mtr); return(page_no); } @@ -1724,7 +1750,7 @@ fsp_alloc_seg_inode_page( space = buf_frame_get_space_id(space_header); - page_no = fsp_alloc_free_page(space, 0, mtr); + page_no = fsp_alloc_free_page(space, 0, mtr, mtr); if (page_no == FIL_NULL) { @@ -2094,7 +2120,8 @@ fseg_create_general( } if (page == 0) { - page = fseg_alloc_free_page_low(space, inode, 0, FSP_UP, mtr); + page = fseg_alloc_free_page_low(space, + inode, 0, FSP_UP, mtr, mtr); if (page == FIL_NULL) { @@ -2331,14 +2358,19 @@ fseg_alloc_free_page_low( /* out: the allocated page number, FIL_NULL if no page could be allocated */ ulint space, /* in: space */ - fseg_inode_t* seg_inode, /* in: segment inode */ + fseg_inode_t* seg_inode, /* in/out: segment inode */ ulint hint, /* in: hint of which page would be desirable */ byte direction, /* in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr) /* in: mtr handle */ + mtr_t* mtr, /* in/out: mini-transaction */ + mtr_t* init_mtr)/* in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr), or NULL if it + should not be initialized (the page at hint + was previously freed in mtr) */ { fsp_header_t* space_header; ulint space_size; @@ -2350,7 +2382,6 @@ fseg_alloc_free_page_low( if could not be allocated */ xdes_t* ret_descr; /* the extent of the allocated page */ page_t* page; - ibool frag_page_allocated = FALSE; ibool success; ulint n; @@ -2371,6 +2402,8 @@ fseg_alloc_free_page_low( if (descr == NULL) { /* Hint outside space or too high above free limit: reset hint */ + ut_a(init_mtr); + /* The file space header page is always allocated. */ hint = 0; descr = xdes_get_descriptor(space, hint, mtr); } @@ -2382,15 +2415,20 @@ fseg_alloc_free_page_low( mtr), seg_id)) && (xdes_get_bit(descr, XDES_FREE_BIT, hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { - +take_hinted_page: /* 1. We can take the hinted page =================================*/ ret_descr = descr; ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; /*-----------------------------------------------------------*/ - } else if ((xdes_get_state(descr, mtr) == XDES_FREE) - && ((reserved - used) < reserved / FSEG_FILLFACTOR) - && (used >= FSEG_FRAG_LIMIT)) { + } else if (xdes_get_state(descr, mtr) == XDES_FREE + && (!init_mtr + || ((reserved - used < reserved / FSEG_FILLFACTOR) + && used >= FSEG_FRAG_LIMIT))) { /* 2. We allocate the free extent from space and can take ========================================================= @@ -2408,8 +2446,20 @@ fseg_alloc_free_page_low( /* Try to fill the segment free list */ fseg_fill_free_list(seg_inode, space, hint + FSP_EXTENT_SIZE, mtr); - ret_page = hint; + goto take_hinted_page; /*-----------------------------------------------------------*/ + } else if (!init_mtr) { + ut_a(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + fsp_alloc_from_free_frag(space_header, descr, + hint % FSP_EXTENT_SIZE, mtr); + ret_page = hint; + ret_descr = NULL; + + /* Put the page in the fragment page array of the segment */ + n = fseg_find_free_frag_page_slot(seg_inode, mtr); + ut_a(n != FIL_NULL); + fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); + goto got_hinted_page; } else if ((direction != FSP_NO_DIR) && ((reserved - used) < reserved / FSEG_FILLFACTOR) && (used >= FSEG_FRAG_LIMIT) @@ -2467,11 +2517,9 @@ fseg_alloc_free_page_low( } else if (used < FSEG_FRAG_LIMIT) { /* 6. We allocate an individual page from the space ===================================================*/ - ret_page = fsp_alloc_free_page(space, hint, mtr); + ret_page = fsp_alloc_free_page(space, hint, mtr, init_mtr); ret_descr = NULL; - frag_page_allocated = TRUE; - if (ret_page != FIL_NULL) { /* Put the page in the fragment page array of the segment */ @@ -2481,6 +2529,10 @@ fseg_alloc_free_page_low( fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(ret_page); /*-----------------------------------------------------------*/ } else { /* 7. We allocate a new extent and take its first page @@ -2527,22 +2579,31 @@ fseg_alloc_free_page_low( } } - if (!frag_page_allocated) { +got_hinted_page: + { /* Initialize the allocated page to buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read */ + mtr_t* block_mtr = init_mtr ? init_mtr : mtr; - page = buf_page_create(space, ret_page, mtr); + page = buf_page_create(space, ret_page, block_mtr); - ut_a(page == buf_page_get(space, ret_page, RW_X_LATCH, mtr)); + ut_a(page == buf_page_get(space, ret_page, RW_X_LATCH, + block_mtr)); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_FSP_PAGE); #endif /* UNIV_SYNC_DEBUG */ - /* The prior contents of the page should be ignored */ - fsp_init_file_page(page, mtr); + if (init_mtr) { + /* The prior contents of the page should be ignored */ + fsp_init_file_page(page, init_mtr); + } + } + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { /* At this point we know the extent and the page offset. The extent is still in the appropriate list (FSEG_NOT_FULL or FSEG_FREE), and the page is not yet marked as used. */ @@ -2554,8 +2615,6 @@ fseg_alloc_free_page_low( fseg_mark_page_used(seg_inode, space, ret_page, mtr); } - buf_reset_check_index_page_at_flush(space, ret_page); - return(ret_page); } @@ -2569,7 +2628,7 @@ fseg_alloc_free_page_general( /*=========================*/ /* out: allocated page offset, FIL_NULL if no page could be allocated */ - fseg_header_t* seg_header,/* in: segment header */ + fseg_header_t* seg_header,/* in/out: segment header */ ulint hint, /* in: hint of which page would be desirable */ byte direction,/* in: if the new page is needed because of an index page split, and records are @@ -2581,7 +2640,11 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr) /* in: mtr handle */ + mtr_t* mtr, /* in/out: mini-transaction handle */ + mtr_t* init_mtr)/* in/out: mtr or another mini-transaction + in which the page should be initialized, + or NULL if this is a "fake allocation" of + a page that was previously freed in mtr */ { fseg_inode_t* inode; ulint space; @@ -2619,7 +2682,8 @@ fseg_alloc_free_page_general( } page_no = fseg_alloc_free_page_low(buf_frame_get_space_id(inode), - inode, hint, direction, mtr); + inode, hint, direction, + mtr, init_mtr); if (!has_done_reservation) { fil_space_release_free_extents(space, n_reserved); } @@ -2647,7 +2711,7 @@ fseg_alloc_free_page( mtr_t* mtr) /* in: mtr handle */ { return(fseg_alloc_free_page_general(seg_header, hint, direction, - FALSE, mtr)); + FALSE, mtr, mtr)); } /************************************************************************** diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 269fa355558..3988019589d 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -379,7 +379,11 @@ btr_page_alloc( page split is made */ ulint level, /* in: level where the page is placed in the tree */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr, /* in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr); /* in/out: mini-transaction + for x-latching and initializing + the page */ /****************************************************************** Frees a file page used in an index tree. NOTE: cannot free field external storage pages because the page must contain info on its level. */ @@ -402,6 +406,31 @@ btr_page_free_low( page_t* page, /* in: page to be freed, x-latched */ ulint level, /* in: page level */ mtr_t* mtr); /* in: mtr */ +/**************************************************************//** +Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. +For invoking btr_store_big_rec_extern_fields() after an update, +we must temporarily mark freed clustered index pages allocated, so +that off-page columns will not be allocated from them. Between the +btr_store_big_rec_extern_fields() and mtr_commit() we have to +mark the pages free again, so that no pages will be leaked. */ + +void +btr_mark_freed_leaves( +/*==================*/ + dict_index_t* index, /* in/out: clustered index */ + mtr_t* mtr, /* in/out: mini-transaction */ + ibool nonfree);/* in: TRUE=mark nonfree, FALSE=mark freed */ +#ifdef UNIV_DEBUG +/**************************************************************//** +Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. +See btr_mark_freed_leaves(). */ + +ibool +btr_freed_leaves_validate( +/*======================*/ + /* out: TRUE if valid */ + mtr_t* mtr); /* in: mini-transaction */ +#endif /* UNIV_DEBUG */ #ifdef UNIV_BTR_PRINT /***************************************************************** Prints size info of a B-tree. */ diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index c068d8d3318..c2bf84ef9cb 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -252,15 +252,6 @@ btr_cur_pessimistic_update( updates */ que_thr_t* thr, /* in: query thread */ mtr_t* mtr); /* in: mtr */ -/***************************************************************** -Commits and restarts a mini-transaction so that it will retain an -x-lock on index->lock and the cursor page. */ - -void -btr_cur_mtr_commit_and_start( -/*=========================*/ - btr_cur_t* cursor, /* in: cursor */ - mtr_t* mtr); /* in/out: mini-transaction */ /*************************************************************** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -471,6 +462,11 @@ btr_store_big_rec_extern_fields( this function returns */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ + mtr_t* alloc_mtr, /* in/out: in an insert, NULL; + in an update, local_mtr for + allocating BLOB pages and + updating BLOB pointers; alloc_mtr + must not have freed any leaf pages */ mtr_t* local_mtr); /* in: mtr containing the latch to rec and to the tree */ /*********************************************************************** diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 7479ce9cbf0..87b2f6172de 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -294,15 +294,6 @@ buf_page_peek_block( ulint space, /* in: space id */ ulint offset);/* in: page number */ /************************************************************************ -Resets the check_index_page_at_flush field of a page if found in the buffer -pool. */ - -void -buf_reset_check_index_page_at_flush( -/*================================*/ - ulint space, /* in: space id */ - ulint offset);/* in: page number */ -/************************************************************************ Sets file_page_was_freed TRUE if the page is found in the buffer pool. This function should be called when we free a file page and want the debug version to check that it is not accessed any more unless diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 17bfbeec2c1..4c58d6075e6 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -167,7 +167,7 @@ fseg_alloc_free_page_general( /*=========================*/ /* out: allocated page offset, FIL_NULL if no page could be allocated */ - fseg_header_t* seg_header,/* in: segment header */ + fseg_header_t* seg_header,/* in/out: segment header */ ulint hint, /* in: hint of which page would be desirable */ byte direction,/* in: if the new page is needed because of an index page split, and records are @@ -179,7 +179,11 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr); /* in: mtr handle */ + mtr_t* mtr, /* in/out: mini-transaction */ + mtr_t* init_mtr);/* in/out: mtr or another mini-transaction + in which the page should be initialized, + or NULL if this is a "fake allocation" of + a page that was previously freed in mtr */ /************************************************************************** Reserves free pages from a tablespace. All mini-transactions which may use several pages from the tablespace should call this function beforehand diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index a6e2976830b..a0a51dbbd17 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -36,6 +36,8 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ #define MTR_MEMO_MODIFY 54 #define MTR_MEMO_S_LOCK 55 #define MTR_MEMO_X_LOCK 56 +/* The mini-transaction freed a clustered index leaf page. */ +#define MTR_MEMO_FREE_CLUST_LEAF 57 /* Log item types: we have made them to be of the type 'byte' for the compiler to warn if val and type parameters are switched @@ -325,9 +327,12 @@ struct mtr_struct{ ulint state; /* MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */ dyn_array_t memo; /* memo stack for locks etc. */ dyn_array_t log; /* mini-transaction log */ - ibool modifications; + unsigned modifications:1; /* TRUE if the mtr made modifications to buffer pool pages */ + unsigned freed_clust_leaf:1; + /* TRUE if MTR_MEMO_FREE_CLUST_LEAF + was logged in the mini-transaction */ ulint n_log_recs; /* count of how many page initial log records have been written to the mtr log */ diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index 81eec3bfc92..6b4cacf0766 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -26,6 +26,7 @@ mtr_start( mtr->log_mode = MTR_LOG_ALL; mtr->modifications = FALSE; + mtr->freed_clust_leaf = FALSE; mtr->n_log_recs = 0; #ifdef UNIV_DEBUG @@ -50,7 +51,8 @@ mtr_memo_push( ut_ad(object); ut_ad(type >= MTR_MEMO_PAGE_S_FIX); - ut_ad(type <= MTR_MEMO_X_LOCK); + ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf); ut_ad(mtr); ut_ad(mtr->magic_n == MTR_MAGIC_N); diff --git a/storage/innobase/mtr/mtr0mtr.c b/storage/innobase/mtr/mtr0mtr.c index 365fa15878a..a11e20ca661 100644 --- a/storage/innobase/mtr/mtr0mtr.c +++ b/storage/innobase/mtr/mtr0mtr.c @@ -53,17 +53,13 @@ mtr_memo_slot_release( buf_page_release((buf_block_t*)object, type, mtr); } else if (type == MTR_MEMO_S_LOCK) { rw_lock_s_unlock((rw_lock_t*)object); -#ifdef UNIV_DEBUG - } else if (type == MTR_MEMO_X_LOCK) { - rw_lock_x_unlock((rw_lock_t*)object); - } else { - ut_ad(type == MTR_MEMO_MODIFY); + } else if (type != MTR_MEMO_X_LOCK) { + ut_ad(type == MTR_MEMO_MODIFY + || type == MTR_MEMO_FREE_CLUST_LEAF); ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX)); -#else } else { rw_lock_x_unlock((rw_lock_t*)object); -#endif } } diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c index 7ff443a11ad..6366beb6b47 100644 --- a/storage/innobase/row/row0ins.c +++ b/storage/innobase/row/row0ins.c @@ -2089,15 +2089,20 @@ row_ins_index_entry_low( if (big_rec) { ut_a(err == DB_SUCCESS); /* Write out the externally stored - columns while still x-latching - index->lock and block->lock. We have - to mtr_commit(mtr) first, so that the - redo log will be written in the - correct order. Otherwise, we would run - into trouble on crash recovery if mtr - freed B-tree pages on which some of - the big_rec fields will be written. */ - btr_cur_mtr_commit_and_start(&cursor, &mtr); + columns, but allocate the pages and + write the pointers using the + mini-transaction of the record update. + If any pages were freed in the update, + temporarily mark them allocated so + that off-page columns will not + overwrite them. We must do this, + because we will write the redo log for + the BLOB writes before writing the + redo log for the record update. Thus, + redo log application at crash recovery + will see BLOBs being written to free pages. */ + + btr_mark_freed_leaves(index, &mtr, TRUE); rec = btr_cur_get_rec(&cursor); offsets = rec_get_offsets(rec, index, offsets, @@ -2105,7 +2110,8 @@ row_ins_index_entry_low( &heap); err = btr_store_big_rec_extern_fields( - index, rec, offsets, big_rec, &mtr); + index, rec, offsets, big_rec, + &mtr, &mtr); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if @@ -2118,6 +2124,9 @@ row_ins_index_entry_low( undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); + /* Free the pages again + in order to avoid a leak. */ + btr_mark_freed_leaves(index, &mtr, FALSE); goto stored_big_rec; } } else { @@ -2165,7 +2174,8 @@ function_exit: ULINT_UNDEFINED, &heap); err = btr_store_big_rec_extern_fields(index, rec, - offsets, big_rec, &mtr); + offsets, big_rec, + NULL, &mtr); stored_big_rec: if (modify) { dtuple_big_rec_free(big_rec); diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c index 171039e34ac..ccb3c1f7781 100644 --- a/storage/innobase/row/row0row.c +++ b/storage/innobase/row/row0row.c @@ -212,23 +212,27 @@ row_build( } #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - /* This condition can occur during crash recovery before - trx_rollback_or_clean_all_without_sess() has completed - execution. - - This condition is possible if the server crashed - during an insert or update before - btr_store_big_rec_extern_fields() did mtr_commit() all - BLOB pointers to the clustered index record. - - If the record contains a null BLOB pointer, look up the - transaction that holds the implicit lock on this record, and - assert that it is active. (In this version of InnoDB, we - cannot assert that it was recovered, because there is no - trx->is_recovered field.) */ - - ut_a(!rec_offs_any_null_extern(rec, offsets) - || trx_assert_active(row_get_rec_trx_id(rec, index, offsets))); + if (rec_offs_any_null_extern(rec, offsets)) { + /* This condition can occur during crash recovery + before trx_rollback_or_clean_all_without_sess() has + completed execution. + + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. + + If the record contains a null BLOB pointer, look up the + transaction that holds the implicit lock on this record, and + assert that it is active. (In this version of InnoDB, we + cannot assert that it was recovered, because there is no + trx->is_recovered field.) */ + + ut_a(trx_assert_active( + row_get_rec_trx_id(rec, index, offsets))); + ut_a(trx_undo_roll_ptr_is_insert( + row_get_rec_roll_ptr(rec, index, offsets))); + } #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c index 694b00ea265..58739edfd98 100644 --- a/storage/innobase/row/row0upd.c +++ b/storage/innobase/row/row0upd.c @@ -1591,21 +1591,22 @@ row_upd_clust_rec( *offsets_ = (sizeof offsets_) / sizeof *offsets_; ut_a(err == DB_SUCCESS); - /* Write out the externally stored columns while still - x-latching index->lock and block->lock. We have to - mtr_commit(mtr) first, so that the redo log will be - written in the correct order. Otherwise, we would run - into trouble on crash recovery if mtr freed B-tree - pages on which some of the big_rec fields will be - written. */ - btr_cur_mtr_commit_and_start(btr_cur, mtr); - + /* Write out the externally stored columns, but + allocate the pages and write the pointers using the + mini-transaction of the record update. If any pages + were freed in the update, temporarily mark them + allocated so that off-page columns will not overwrite + them. We must do this, because we write the redo log + for the BLOB writes before writing the redo log for + the record update. */ + + btr_mark_freed_leaves(index, mtr, TRUE); rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - big_rec, mtr); + big_rec, mtr, mtr); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -1618,6 +1619,8 @@ row_upd_clust_rec( to the undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); + /* Free the pages again in order to avoid a leak. */ + btr_mark_freed_leaves(index, mtr, FALSE); } mtr_commit(mtr); diff --git a/storage/innobase/trx/trx0undo.c b/storage/innobase/trx/trx0undo.c index 329565943c8..ce09862f317 100644 --- a/storage/innobase/trx/trx0undo.c +++ b/storage/innobase/trx/trx0undo.c @@ -864,7 +864,7 @@ trx_undo_add_page( page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, undo->top_page_no + 1, FSP_UP, - TRUE, mtr); + TRUE, mtr, mtr); fil_space_release_free_extents(undo->space, n_reserved); diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 0b90b5729d5..96b6a47085a 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,17 @@ +2011-08-29 The InnoDB Team + + * btr/btr0btr.c, btr/btr0cur.c, fsp/fsp0fsp.c, + include/btr0btr.h, include/btr0cur.h, include/fsp0fsp.h, + include/mtr0mtr.h, include/mtr0mtr.ic, mtr/mtr0mtr.c, + row/row0ins.c, row/row0row.c, row/row0upd.c, trx/trx0undo.c: + Fix Bug#12704861 Corruption after a crash during BLOB update + and other regressions from the fix of Bug#12612184 + +2011-08-23 The InnoDB Team + + * include/trx0undo.h, trx/trx0rec.c, trx/trx0undo.c: + Fix Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE + 2011-08-15 The InnoDB Team * btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, btr/btr0sea.c, diff --git a/storage/innodb_plugin/btr/btr0btr.c b/storage/innodb_plugin/btr/btr0btr.c index 5e6724bbd54..71e1599d19e 100644 --- a/storage/innodb_plugin/btr/btr0btr.c +++ b/storage/innodb_plugin/btr/btr0btr.c @@ -906,28 +906,29 @@ btr_page_alloc_for_ibuf( /**************************************************************//** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! -@return new allocated block, x-latched; NULL if out of space */ -UNIV_INTERN -buf_block_t* -btr_page_alloc( -/*===========*/ +@return allocated page number, FIL_NULL if out of space */ +static __attribute__((nonnull(1,5), warn_unused_result)) +ulint +btr_page_alloc_low( +/*===============*/ dict_index_t* index, /*!< in: index */ ulint hint_page_no, /*!< in: hint of a good page */ byte file_direction, /*!< in: direction where a possible page split is made */ ulint level, /*!< in: level where the page is placed in the tree */ - mtr_t* mtr) /*!< in: mtr */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + in which the page should be + initialized (may be the same + as mtr), or NULL if it should + not be initialized (the page + at hint was previously freed + in mtr) */ { fseg_header_t* seg_header; page_t* root; - buf_block_t* new_block; - ulint new_page_no; - - if (dict_index_is_ibuf(index)) { - - return(btr_page_alloc_for_ibuf(index, mtr)); - } root = btr_root_get(index, mtr); @@ -941,8 +942,42 @@ btr_page_alloc( reservation for free extents, and thus we know that a page can be allocated: */ - new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, - file_direction, TRUE, mtr); + return(fseg_alloc_free_page_general( + seg_header, hint_page_no, file_direction, + TRUE, mtr, init_mtr)); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@return new allocated block, x-latched; NULL if out of space */ +UNIV_INTERN +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + ulint hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ +{ + buf_block_t* new_block; + ulint new_page_no; + + if (dict_index_is_ibuf(index)) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + new_page_no = btr_page_alloc_low( + index, hint_page_no, file_direction, level, mtr, init_mtr); + if (new_page_no == FIL_NULL) { return(NULL); @@ -950,9 +985,16 @@ btr_page_alloc( new_block = buf_page_get(dict_index_get_space(index), dict_table_zip_size(index->table), - new_page_no, RW_X_LATCH, mtr); + new_page_no, RW_X_LATCH, init_mtr); buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + if (mtr->freed_clust_leaf) { + mtr_memo_release(mtr, new_block, MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(!mtr_memo_contains(mtr, new_block, + MTR_MEMO_FREE_CLUST_LEAF)); + } + + ut_ad(btr_freed_leaves_validate(mtr)); return(new_block); } @@ -1065,6 +1107,15 @@ btr_page_free_low( fseg_free_page(seg_header, buf_block_get_space(block), buf_block_get_page_no(block), mtr); + + /* The page was marked free in the allocation bitmap, but it + should remain buffer-fixed until mtr_commit(mtr) or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* TODO: Discard any operations on the page from the redo log + and remove the block from the flush list and the buffer pool. + This would free up buffer pool earlier and reduce writes to + both the tablespace and the redo log. */ } /**************************************************************//** @@ -1078,13 +1129,140 @@ btr_page_free( buf_block_t* block, /*!< in: block to be freed, x-latched */ mtr_t* mtr) /*!< in: mtr */ { - ulint level; - - level = btr_page_get_level(buf_block_get_frame(block), mtr); + const page_t* page = buf_block_get_frame(block); + ulint level = btr_page_get_level(page, mtr); + ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX); btr_page_free_low(index, block, level, mtr); + + /* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */ + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + if (level == 0 && dict_index_is_clust(index)) { + /* We may have to call btr_mark_freed_leaves() to + temporarily mark the block nonfree for invoking + btr_store_big_rec_extern_fields_func() after an + update. Remember that the block was freed. */ + mtr->freed_clust_leaf = TRUE; + mtr_memo_push(mtr, block, MTR_MEMO_FREE_CLUST_LEAF); + } + + ut_ad(btr_freed_leaves_validate(mtr)); } +/**************************************************************//** +Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. +For invoking btr_store_big_rec_extern_fields() after an update, +we must temporarily mark freed clustered index pages allocated, so +that off-page columns will not be allocated from them. Between the +btr_store_big_rec_extern_fields() and mtr_commit() we have to +mark the pages free again, so that no pages will be leaked. */ +UNIV_INTERN +void +btr_mark_freed_leaves( +/*==================*/ + dict_index_t* index, /*!< in/out: clustered index */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ibool nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */ +{ + /* This is loosely based on mtr_memo_release(). */ + + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + if (!mtr->freed_clust_leaf) { + return; + } + + offset = dyn_array_get_data_size(&mtr->memo); + + while (offset > 0) { + mtr_memo_slot_t* slot; + buf_block_t* block; + + offset -= sizeof *slot; + + slot = dyn_array_get_element(&mtr->memo, offset); + + if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { + continue; + } + + /* Because btr_page_alloc() does invoke + mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all + blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the + memo must still be clustered index leaf tree pages. */ + block = slot->object; + ut_a(buf_block_get_space(block) + == dict_index_get_space(index)); + ut_a(fil_page_get_type(buf_block_get_frame(block)) + == FIL_PAGE_INDEX); + ut_a(page_is_leaf(buf_block_get_frame(block))); + + if (nonfree) { + /* Allocate the same page again. */ + ulint page_no; + page_no = btr_page_alloc_low( + index, buf_block_get_page_no(block), + FSP_NO_DIR, 0, mtr, NULL); + ut_a(page_no == buf_block_get_page_no(block)); + } else { + /* Assert that the page is allocated and free it. */ + btr_page_free_low(index, block, 0, mtr); + } + } + + ut_ad(btr_freed_leaves_validate(mtr)); +} + +#ifdef UNIV_DEBUG +/**************************************************************//** +Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. +@see btr_mark_freed_leaves() +@return TRUE */ +UNIV_INTERN +ibool +btr_freed_leaves_validate( +/*======================*/ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ulint offset; + + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + offset = dyn_array_get_data_size(&mtr->memo); + + while (offset > 0) { + const mtr_memo_slot_t* slot; + const buf_block_t* block; + + offset -= sizeof *slot; + + slot = dyn_array_get_element(&mtr->memo, offset); + + if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { + continue; + } + + ut_a(mtr->freed_clust_leaf); + /* Because btr_page_alloc() does invoke + mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all + blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the + memo must still be clustered index leaf tree pages. */ + block = slot->object; + ut_a(fil_page_get_type(buf_block_get_frame(block)) + == FIL_PAGE_INDEX); + ut_a(page_is_leaf(buf_block_get_frame(block))); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + /**************************************************************//** Sets the child node file address in a node pointer. */ UNIV_INLINE @@ -1806,7 +1984,7 @@ btr_root_raise_and_insert( level = btr_page_get_level(root, mtr); - new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr); + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr); new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); ut_a(!new_page_zip == !root_page_zip); @@ -2542,7 +2720,7 @@ func_start: /* 2. Allocate a new page to the index */ new_block = btr_page_alloc(cursor->index, hint_page_no, direction, - btr_page_get_level(page, mtr), mtr); + btr_page_get_level(page, mtr), mtr, mtr); new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index 5cefa51bcd5..f1c2c2ddd5e 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -2414,39 +2414,6 @@ return_after_reservations: return(err); } -/**************************************************************//** -Commits and restarts a mini-transaction so that it will retain an -x-lock on index->lock and the cursor page. */ -UNIV_INTERN -void -btr_cur_mtr_commit_and_start( -/*=========================*/ - btr_cur_t* cursor, /*!< in: cursor */ - mtr_t* mtr) /*!< in/out: mini-transaction */ -{ - buf_block_t* block; - - block = btr_cur_get_block(cursor); - - ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), - MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - /* Keep the locks across the mtr_commit(mtr). */ - rw_lock_x_lock(dict_index_get_lock(cursor->index)); - rw_lock_x_lock(&block->lock); - mutex_enter(&block->mutex); - buf_block_buf_fix_inc(block, __FILE__, __LINE__); - mutex_exit(&block->mutex); - /* Write out the redo log. */ - mtr_commit(mtr); - mtr_start(mtr); - /* Reassociate the locks with the mini-transaction. - They will be released on mtr_commit(mtr). */ - mtr_memo_push(mtr, dict_index_get_lock(cursor->index), - MTR_MEMO_X_LOCK); - mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); -} - /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /****************************************************************//** @@ -3901,6 +3868,9 @@ btr_store_big_rec_extern_fields_func( the "external storage" flags in offsets will not correspond to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + #ifdef UNIV_DEBUG mtr_t* local_mtr, /*!< in: mtr containing the latch to rec and to the tree */ @@ -3909,9 +3879,11 @@ btr_store_big_rec_extern_fields_func( ibool update_in_place,/*! in: TRUE if the record is updated in place (not delete+insert) */ #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - const big_rec_t*big_rec_vec) /*!< in: vector containing fields - to be stored externally */ - + mtr_t* alloc_mtr) /*!< in/out: in an insert, NULL; + in an update, local_mtr for + allocating BLOB pages and + updating BLOB pointers; alloc_mtr + must not have freed any leaf pages */ { ulint rec_page_no; byte* field_ref; @@ -3930,6 +3902,9 @@ btr_store_big_rec_extern_fields_func( ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(rec_offs_any_extern(offsets)); + ut_ad(local_mtr); + ut_ad(!alloc_mtr || alloc_mtr == local_mtr); + ut_ad(!update_in_place || alloc_mtr); ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX)); @@ -3945,6 +3920,25 @@ btr_store_big_rec_extern_fields_func( rec_page_no = buf_block_get_page_no(rec_block); ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); + if (alloc_mtr) { + /* Because alloc_mtr will be committed after + mtr, it is possible that the tablespace has been + extended when the B-tree record was updated or + inserted, or it will be extended while allocating + pages for big_rec. + + TODO: In mtr (not alloc_mtr), write a redo log record + about extending the tablespace to its current size, + and remember the current size. Whenever the tablespace + grows as pages are allocated, write further redo log + records to mtr. (Currently tablespace extension is not + covered by the redo log. If it were, the record would + only be written to alloc_mtr, which is committed after + mtr.) */ + } else { + alloc_mtr = &mtr; + } + if (UNIV_LIKELY_NULL(page_zip)) { int err; @@ -4021,7 +4015,7 @@ btr_store_big_rec_extern_fields_func( } block = btr_page_alloc(index, hint_page_no, - FSP_NO_DIR, 0, &mtr); + FSP_NO_DIR, 0, alloc_mtr, &mtr); if (UNIV_UNLIKELY(block == NULL)) { mtr_commit(&mtr); @@ -4148,11 +4142,15 @@ btr_store_big_rec_extern_fields_func( goto next_zip_page; } - rec_block = buf_page_get(space_id, zip_size, - rec_page_no, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level(rec_block, - SYNC_NO_ORDER_CHECK); + if (alloc_mtr == &mtr) { + rec_block = buf_page_get( + space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level( + rec_block, + SYNC_NO_ORDER_CHECK); + } if (err == Z_STREAM_END) { mach_write_to_4(field_ref @@ -4186,7 +4184,8 @@ btr_store_big_rec_extern_fields_func( page_zip_write_blob_ptr( page_zip, rec, index, offsets, - big_rec_vec->fields[i].field_no, &mtr); + big_rec_vec->fields[i].field_no, + alloc_mtr); next_zip_page: prev_page_no = page_no; @@ -4231,19 +4230,23 @@ next_zip_page: extern_len -= store_len; - rec_block = buf_page_get(space_id, zip_size, - rec_page_no, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level(rec_block, - SYNC_NO_ORDER_CHECK); + if (alloc_mtr == &mtr) { + rec_block = buf_page_get( + space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level( + rec_block, + SYNC_NO_ORDER_CHECK); + } mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4, big_rec_vec->fields[i].len - extern_len, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, alloc_mtr); if (prev_page_no == FIL_NULL) { btr_blob_dbg_add_blob( @@ -4253,18 +4256,19 @@ next_zip_page: mlog_write_ulint(field_ref + BTR_EXTERN_SPACE_ID, - space_id, - MLOG_4BYTES, &mtr); + space_id, MLOG_4BYTES, + alloc_mtr); mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO, - page_no, - MLOG_4BYTES, &mtr); + page_no, MLOG_4BYTES, + alloc_mtr); mlog_write_ulint(field_ref + BTR_EXTERN_OFFSET, FIL_PAGE_DATA, - MLOG_4BYTES, &mtr); + MLOG_4BYTES, + alloc_mtr); } prev_page_no = page_no; diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index cd1461d22b7..47300627acc 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -1174,29 +1174,6 @@ buf_page_set_accessed_make_young( } } -/********************************************************************//** -Resets the check_index_page_at_flush field of a page if found in the buffer -pool. */ -UNIV_INTERN -void -buf_reset_check_index_page_at_flush( -/*================================*/ - ulint space, /*!< in: space id */ - ulint offset) /*!< in: page number */ -{ - buf_block_t* block; - - buf_pool_mutex_enter(); - - block = (buf_block_t*) buf_page_hash_get(space, offset); - - if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) { - block->check_index_page_at_flush = FALSE; - } - - buf_pool_mutex_exit(); -} - /********************************************************************//** Returns the current state of is_hashed of a page. FALSE if the page is not in the pool. NOTE that this operation does not fix the page in the diff --git a/storage/innodb_plugin/fsp/fsp0fsp.c b/storage/innodb_plugin/fsp/fsp0fsp.c index d091a14c474..19846b63d5b 100644 --- a/storage/innodb_plugin/fsp/fsp0fsp.c +++ b/storage/innodb_plugin/fsp/fsp0fsp.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -312,8 +312,9 @@ fsp_fill_free_list( descriptor page and ibuf bitmap page; then we do not allocate more extents */ ulint space, /*!< in: space */ - fsp_header_t* header, /*!< in: space header */ - mtr_t* mtr); /*!< in: mtr */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /**********************************************************************//** Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space @@ -326,14 +327,20 @@ fseg_alloc_free_page_low( ulint space, /*!< in: space */ ulint zip_size,/*!< in: compressed page size in bytes or 0 for uncompressed pages */ - fseg_inode_t* seg_inode, /*!< in: segment inode */ + fseg_inode_t* seg_inode, /*!< in/out: segment inode */ ulint hint, /*!< in: hint of which page would be desirable */ byte direction, /*!< in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr); /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr), or NULL if it + should not be initialized (the page at hint + was previously freed in mtr) */ + __attribute__((warn_unused_result, nonnull(3,6))); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** @@ -701,17 +708,18 @@ list, if not free limit == space size. This adding is necessary to make the descriptor defined, as they are uninitialized above the free limit. @return pointer to the extent descriptor, NULL if the page does not exist in the space or if the offset exceeds the free limit */ -UNIV_INLINE +UNIV_INLINE __attribute__((nonnull, warn_unused_result)) xdes_t* xdes_get_descriptor_with_space_hdr( /*===============================*/ - fsp_header_t* sp_header,/*!< in/out: space header, x-latched */ - ulint space, /*!< in: space id */ - ulint offset, /*!< in: page offset; - if equal to the free limit, - we try to add new extents to - the space free list */ - mtr_t* mtr) /*!< in: mtr handle */ + fsp_header_t* sp_header, /*!< in/out: space header, x-latched + in mtr */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page offset; if equal + to the free limit, we try to + add new extents to the space + free list */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint limit; ulint size; @@ -719,11 +727,9 @@ xdes_get_descriptor_with_space_hdr( ulint descr_page_no; page_t* descr_page; - ut_ad(mtr); ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL), MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_S_FIX) - || mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX)); ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET); /* Read free limit and space size */ limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT); @@ -773,7 +779,7 @@ is necessary to make the descriptor defined, as they are uninitialized above the free limit. @return pointer to the extent descriptor, NULL if the page does not exist in the space or if the offset exceeds the free limit */ -static +static __attribute__((nonnull, warn_unused_result)) xdes_t* xdes_get_descriptor( /*================*/ @@ -782,7 +788,7 @@ xdes_get_descriptor( or 0 for uncompressed pages */ ulint offset, /*!< in: page offset; if equal to the free limit, we try to add new extents to the space free list */ - mtr_t* mtr) /*!< in: mtr handle */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { buf_block_t* block; fsp_header_t* sp_header; @@ -1160,14 +1166,14 @@ fsp_header_get_tablespace_size(void) Tries to extend a single-table tablespace so that a page would fit in the data file. @return TRUE if success */ -static +static __attribute__((nonnull, warn_unused_result)) ibool fsp_try_extend_data_file_with_pages( /*================================*/ ulint space, /*!< in: space */ ulint page_no, /*!< in: page number */ - fsp_header_t* header, /*!< in: space header */ - mtr_t* mtr) /*!< in: mtr */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ibool success; ulint actual_size; @@ -1192,7 +1198,7 @@ fsp_try_extend_data_file_with_pages( /***********************************************************************//** Tries to extend the last data file of a tablespace if it is auto-extending. @return FALSE if not auto-extending */ -static +static __attribute__((nonnull)) ibool fsp_try_extend_data_file( /*=====================*/ @@ -1202,8 +1208,8 @@ fsp_try_extend_data_file( the actual file size rounded down to megabyte */ ulint space, /*!< in: space */ - fsp_header_t* header, /*!< in: space header */ - mtr_t* mtr) /*!< in: mtr */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint size; ulint zip_size; @@ -1339,7 +1345,7 @@ fsp_fill_free_list( then we do not allocate more extents */ ulint space, /*!< in: space */ fsp_header_t* header, /*!< in/out: space header */ - mtr_t* mtr) /*!< in: mtr */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint limit; ulint size; @@ -1537,10 +1543,47 @@ fsp_alloc_free_extent( return(descr); } +/**********************************************************************//** +Allocates a single free page from a space. */ +static __attribute__((nonnull)) +void +fsp_alloc_from_free_frag( +/*=====================*/ + fsp_header_t* header, /*!< in/out: tablespace header */ + xdes_t* descr, /*!< in/out: extent descriptor */ + ulint bit, /*!< in: slot to allocate in the extent */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint frag_n_used; + + ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr)); + xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr); + + /* Update the FRAG_N_USED field */ + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + frag_n_used++; + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, + mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FULL_FRAG, mtr); + + flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, + mtr); + } +} + /**********************************************************************//** Allocates a single free page from a space. The page is marked as used. @return the page offset, FIL_NULL if no page could be allocated */ -static +static __attribute__((nonnull, warn_unused_result)) ulint fsp_alloc_free_page( /*================*/ @@ -1548,19 +1591,22 @@ fsp_alloc_free_page( ulint zip_size,/*!< in: compressed page size in bytes or 0 for uncompressed pages */ ulint hint, /*!< in: hint of which page would be desirable */ - mtr_t* mtr) /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr) */ { fsp_header_t* header; fil_addr_t first; xdes_t* descr; buf_block_t* block; ulint free; - ulint frag_n_used; ulint page_no; ulint space_size; ibool success; ut_ad(mtr); + ut_ad(init_mtr); header = fsp_get_space_header(space, zip_size, mtr); @@ -1642,38 +1688,19 @@ fsp_alloc_free_page( } } - xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); - - /* Update the FRAG_N_USED field */ - frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, - mtr); - frag_n_used++; - mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, - mtr); - if (xdes_is_full(descr, mtr)) { - /* The fragment is full: move it to another list */ - flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, - mtr); - xdes_set_state(descr, XDES_FULL_FRAG, mtr); - - flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, - mtr); - mlog_write_ulint(header + FSP_FRAG_N_USED, - frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, - mtr); - } + fsp_alloc_from_free_frag(header, descr, free, mtr); /* Initialize the allocated page to the buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read. */ - buf_page_create(space, page_no, zip_size, mtr); + buf_page_create(space, page_no, zip_size, init_mtr); - block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, init_mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); /* Prior contents of the page should be ignored */ - fsp_init_file_page(block, mtr); + fsp_init_file_page(block, init_mtr); return(page_no); } @@ -1909,7 +1936,7 @@ fsp_alloc_seg_inode_page( zip_size = dict_table_flags_to_zip_size( mach_read_from_4(FSP_SPACE_FLAGS + space_header)); - page_no = fsp_alloc_free_page(space, zip_size, 0, mtr); + page_no = fsp_alloc_free_page(space, zip_size, 0, mtr, mtr); if (page_no == FIL_NULL) { @@ -2323,7 +2350,7 @@ fseg_create_general( if (page == 0) { page = fseg_alloc_free_page_low(space, zip_size, - inode, 0, FSP_UP, mtr); + inode, 0, FSP_UP, mtr, mtr); if (page == FIL_NULL) { @@ -2572,14 +2599,19 @@ fseg_alloc_free_page_low( ulint space, /*!< in: space */ ulint zip_size,/*!< in: compressed page size in bytes or 0 for uncompressed pages */ - fseg_inode_t* seg_inode, /*!< in: segment inode */ + fseg_inode_t* seg_inode, /*!< in/out: segment inode */ ulint hint, /*!< in: hint of which page would be desirable */ byte direction, /*!< in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr) /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mini-transaction in which the + page should be initialized + (may be the same as mtr), or NULL if it + should not be initialized (the page at hint + was previously freed in mtr) */ { fsp_header_t* space_header; ulint space_size; @@ -2590,7 +2622,6 @@ fseg_alloc_free_page_low( ulint ret_page; /*!< the allocated page offset, FIL_NULL if could not be allocated */ xdes_t* ret_descr; /*!< the extent of the allocated page */ - ibool frag_page_allocated = FALSE; ibool success; ulint n; @@ -2612,6 +2643,8 @@ fseg_alloc_free_page_low( if (descr == NULL) { /* Hint outside space or too high above free limit: reset hint */ + ut_a(init_mtr); + /* The file space header page is always allocated. */ hint = 0; descr = xdes_get_descriptor(space, zip_size, hint, mtr); } @@ -2623,15 +2656,20 @@ fseg_alloc_free_page_low( mtr), seg_id)) && (xdes_get_bit(descr, XDES_FREE_BIT, hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { - +take_hinted_page: /* 1. We can take the hinted page =================================*/ ret_descr = descr; ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; /*-----------------------------------------------------------*/ - } else if ((xdes_get_state(descr, mtr) == XDES_FREE) - && ((reserved - used) < reserved / FSEG_FILLFACTOR) - && (used >= FSEG_FRAG_LIMIT)) { + } else if (xdes_get_state(descr, mtr) == XDES_FREE + && (!init_mtr + || ((reserved - used < reserved / FSEG_FILLFACTOR) + && used >= FSEG_FRAG_LIMIT))) { /* 2. We allocate the free extent from space and can take ========================================================= @@ -2649,8 +2687,20 @@ fseg_alloc_free_page_low( /* Try to fill the segment free list */ fseg_fill_free_list(seg_inode, space, zip_size, hint + FSP_EXTENT_SIZE, mtr); - ret_page = hint; + goto take_hinted_page; /*-----------------------------------------------------------*/ + } else if (!init_mtr) { + ut_a(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + fsp_alloc_from_free_frag(space_header, descr, + hint % FSP_EXTENT_SIZE, mtr); + ret_page = hint; + ret_descr = NULL; + + /* Put the page in the fragment page array of the segment */ + n = fseg_find_free_frag_page_slot(seg_inode, mtr); + ut_a(n != FIL_NULL); + fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); + goto got_hinted_page; } else if ((direction != FSP_NO_DIR) && ((reserved - used) < reserved / FSEG_FILLFACTOR) && (used >= FSEG_FRAG_LIMIT) @@ -2710,11 +2760,10 @@ fseg_alloc_free_page_low( } else if (used < FSEG_FRAG_LIMIT) { /* 6. We allocate an individual page from the space ===================================================*/ - ret_page = fsp_alloc_free_page(space, zip_size, hint, mtr); + ret_page = fsp_alloc_free_page(space, zip_size, hint, + mtr, init_mtr); ret_descr = NULL; - frag_page_allocated = TRUE; - if (ret_page != FIL_NULL) { /* Put the page in the fragment page array of the segment */ @@ -2724,6 +2773,10 @@ fseg_alloc_free_page_low( fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(ret_page); /*-----------------------------------------------------------*/ } else { /* 7. We allocate a new extent and take its first page @@ -2771,26 +2824,34 @@ fseg_alloc_free_page_low( } } - if (!frag_page_allocated) { +got_hinted_page: + { /* Initialize the allocated page to buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read */ buf_block_t* block; ulint zip_size = dict_table_flags_to_zip_size( mach_read_from_4(FSP_SPACE_FLAGS + space_header)); + mtr_t* block_mtr = init_mtr ? init_mtr : mtr; - block = buf_page_create(space, ret_page, zip_size, mtr); + block = buf_page_create(space, ret_page, zip_size, block_mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); if (UNIV_UNLIKELY(block != buf_page_get(space, zip_size, ret_page, RW_X_LATCH, - mtr))) { + block_mtr))) { ut_error; } - /* The prior contents of the page should be ignored */ - fsp_init_file_page(block, mtr); + if (init_mtr) { + /* The prior contents of the page should be ignored */ + fsp_init_file_page(block, init_mtr); + } + } + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { /* At this point we know the extent and the page offset. The extent is still in the appropriate list (FSEG_NOT_FULL or FSEG_FREE), and the page is not yet marked as used. */ @@ -2803,8 +2864,6 @@ fseg_alloc_free_page_low( fseg_mark_page_used(seg_inode, space, zip_size, ret_page, mtr); } - buf_reset_check_index_page_at_flush(space, ret_page); - return(ret_page); } @@ -2817,7 +2876,7 @@ UNIV_INTERN ulint fseg_alloc_free_page_general( /*=========================*/ - fseg_header_t* seg_header,/*!< in: segment header */ + fseg_header_t* seg_header,/*!< in/out: segment header */ ulint hint, /*!< in: hint of which page would be desirable */ byte direction,/*!< in: if the new page is needed because of an index page split, and records are @@ -2829,7 +2888,11 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr) /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction handle */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized, + or NULL if this is a "fake allocation" of + a page that was previously freed in mtr */ { fseg_inode_t* inode; ulint space; @@ -2871,7 +2934,8 @@ fseg_alloc_free_page_general( } page_no = fseg_alloc_free_page_low(space, zip_size, - inode, hint, direction, mtr); + inode, hint, direction, + mtr, init_mtr); if (!has_done_reservation) { fil_space_release_free_extents(space, n_reserved); } @@ -2879,28 +2943,6 @@ fseg_alloc_free_page_general( return(page_no); } -/**********************************************************************//** -Allocates a single free page from a segment. This function implements -the intelligent allocation strategy which tries to minimize file space -fragmentation. -@return allocated page offset, FIL_NULL if no page could be allocated */ -UNIV_INTERN -ulint -fseg_alloc_free_page( -/*=================*/ - fseg_header_t* seg_header,/*!< in: segment header */ - ulint hint, /*!< in: hint of which page would be desirable */ - byte direction,/*!< in: if the new page is needed because - of an index page split, and records are - inserted there in order, into which - direction they go alphabetically: FSP_DOWN, - FSP_UP, FSP_NO_DIR */ - mtr_t* mtr) /*!< in: mtr handle */ -{ - return(fseg_alloc_free_page_general(seg_header, hint, direction, - FALSE, mtr)); -} - /**********************************************************************//** Checks that we have at least 2 frag pages free in the first extent of a single-table tablespace, and they are also physically initialized to the data diff --git a/storage/innodb_plugin/include/btr0btr.h b/storage/innodb_plugin/include/btr0btr.h index c0a038dd21d..476ad29adac 100644 --- a/storage/innodb_plugin/include/btr0btr.h +++ b/storage/innodb_plugin/include/btr0btr.h @@ -557,7 +557,12 @@ btr_page_alloc( page split is made */ ulint level, /*!< in: level where the page is placed in the tree */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + __attribute__((nonnull, warn_unused_result)); /**************************************************************//** Frees a file page used in an index tree. NOTE: cannot free field external storage pages because the page must contain info on its level. */ @@ -580,6 +585,33 @@ btr_page_free_low( buf_block_t* block, /*!< in: block to be freed, x-latched */ ulint level, /*!< in: page level */ mtr_t* mtr); /*!< in: mtr */ +/**************************************************************//** +Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. +For invoking btr_store_big_rec_extern_fields() after an update, +we must temporarily mark freed clustered index pages allocated, so +that off-page columns will not be allocated from them. Between the +btr_store_big_rec_extern_fields() and mtr_commit() we have to +mark the pages free again, so that no pages will be leaked. */ +UNIV_INTERN +void +btr_mark_freed_leaves( +/*==================*/ + dict_index_t* index, /*!< in/out: clustered index */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ibool nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +/**************************************************************//** +Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. +@see btr_mark_freed_leaves() +@return TRUE */ +UNIV_INTERN +ibool +btr_freed_leaves_validate( +/*======================*/ + mtr_t* mtr) /*!< in: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ diff --git a/storage/innodb_plugin/include/btr0cur.h b/storage/innodb_plugin/include/btr0cur.h index 6094a2a6c7a..1d97c5b9452 100644 --- a/storage/innodb_plugin/include/btr0cur.h +++ b/storage/innodb_plugin/include/btr0cur.h @@ -326,16 +326,6 @@ btr_cur_pessimistic_update( que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr); /*!< in: mtr; must be committed before latching any further pages */ -/***************************************************************** -Commits and restarts a mini-transaction so that it will retain an -x-lock on index->lock and the cursor page. */ -UNIV_INTERN -void -btr_cur_mtr_commit_and_start( -/*=========================*/ - btr_cur_t* cursor, /*!< in: cursor */ - mtr_t* mtr) /*!< in/out: mini-transaction */ - __attribute__((nonnull)); /***********************************************************//** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -540,6 +530,8 @@ btr_store_big_rec_extern_fields_func( the "external storage" flags in offsets will not correspond to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ #ifdef UNIV_DEBUG mtr_t* local_mtr, /*!< in: mtr containing the latch to rec and to the tree */ @@ -548,9 +540,12 @@ btr_store_big_rec_extern_fields_func( ibool update_in_place,/*! in: TRUE if the record is updated in place (not delete+insert) */ #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - const big_rec_t*big_rec_vec) /*!< in: vector containing fields - to be stored externally */ - __attribute__((nonnull)); + mtr_t* alloc_mtr) /*!< in/out: in an insert, NULL; + in an update, local_mtr for + allocating BLOB pages and + updating BLOB pointers; alloc_mtr + must not have freed any leaf pages */ + __attribute__((nonnull(1,2,3,4,5), warn_unused_result)); /** Stores the fields in big_rec_vec to the tablespace and puts pointers to them in rec. The extern flags in rec will have to be set beforehand. @@ -559,21 +554,22 @@ file segment of the index tree. @param index in: clustered index; MUST be X-latched by mtr @param b in/out: block containing rec; MUST be X-latched by mtr @param rec in/out: clustered index record -@param offsets in: rec_get_offsets(rec, index); +@param offs in: rec_get_offsets(rec, index); the "external storage" flags in offsets will not be adjusted +@param big in: vector containing fields to be stored externally @param mtr in: mini-transaction that holds x-latch on index and b @param upd in: TRUE if the record is updated in place (not delete+insert) -@param big in: vector containing fields to be stored externally +@param rmtr in/out: in updates, the mini-transaction that holds rec @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ #ifdef UNIV_DEBUG -# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offsets,mtr,upd,big) +# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,mtr,upd,rmtr) #elif defined UNIV_BLOB_LIGHT_DEBUG -# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offsets,upd,big) +# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,upd,rmtr) #else -# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offsets,big) +# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,rmtr) #endif /*******************************************************************//** diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index 9856bfce409..557bc17d311 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -372,15 +372,6 @@ buf_page_peek( /*==========*/ ulint space, /*!< in: space id */ ulint offset);/*!< in: page number */ -/********************************************************************//** -Resets the check_index_page_at_flush field of a page if found in the buffer -pool. */ -UNIV_INTERN -void -buf_reset_check_index_page_at_flush( -/*================================*/ - ulint space, /*!< in: space id */ - ulint offset);/*!< in: page number */ #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG /********************************************************************//** Sets file_page_was_freed TRUE if the page is found in the buffer pool. diff --git a/storage/innodb_plugin/include/fsp0fsp.h b/storage/innodb_plugin/include/fsp0fsp.h index 7abd3914eda..2221380c9a2 100644 --- a/storage/innodb_plugin/include/fsp0fsp.h +++ b/storage/innodb_plugin/include/fsp0fsp.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -176,19 +176,18 @@ fseg_n_reserved_pages( Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space fragmentation. -@return the allocated page offset FIL_NULL if no page could be allocated */ -UNIV_INTERN -ulint -fseg_alloc_free_page( -/*=================*/ - fseg_header_t* seg_header, /*!< in: segment header */ - ulint hint, /*!< in: hint of which page would be desirable */ - byte direction, /*!< in: if the new page is needed because +@param[in/out] seg_header segment header +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, - FSP_UP, FSP_NO_DIR */ - mtr_t* mtr); /*!< in: mtr handle */ + FSP_UP, FSP_NO_DIR +@param[in/out] mtr mini-transaction +@return the allocated page offset FIL_NULL if no page could be allocated */ +#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \ + fseg_alloc_free_page_general(seg_header, hint, direction, \ + FALSE, mtr, mtr) /**********************************************************************//** Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space @@ -198,7 +197,7 @@ UNIV_INTERN ulint fseg_alloc_free_page_general( /*=========================*/ - fseg_header_t* seg_header,/*!< in: segment header */ + fseg_header_t* seg_header,/*!< in/out: segment header */ ulint hint, /*!< in: hint of which page would be desirable */ byte direction,/*!< in: if the new page is needed because of an index page split, and records are @@ -210,7 +209,12 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr); /*!< in: mtr handle */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized, + or NULL if this is a "fake allocation" of + a page that was previously freed in mtr */ + __attribute__((warn_unused_result, nonnull(1,5))); /**********************************************************************//** Reserves free pages from a tablespace. All mini-transactions which may use several pages from the tablespace should call this function beforehand diff --git a/storage/innodb_plugin/include/mtr0mtr.h b/storage/innodb_plugin/include/mtr0mtr.h index bc3f1951be9..2a561131c09 100644 --- a/storage/innodb_plugin/include/mtr0mtr.h +++ b/storage/innodb_plugin/include/mtr0mtr.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -53,6 +53,8 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ #define MTR_MEMO_MODIFY 54 #define MTR_MEMO_S_LOCK 55 #define MTR_MEMO_X_LOCK 56 +/** The mini-transaction freed a clustered index leaf page. */ +#define MTR_MEMO_FREE_CLUST_LEAF 57 /** @name Log item types The log items are declared 'byte' so that the compiler can warn if val @@ -387,9 +389,12 @@ struct mtr_struct{ #endif dyn_array_t memo; /*!< memo stack for locks etc. */ dyn_array_t log; /*!< mini-transaction log */ - ibool modifications; - /* TRUE if the mtr made modifications to - buffer pool pages */ + unsigned modifications:1; + /*!< TRUE if the mini-transaction + modified buffer pool pages */ + unsigned freed_clust_leaf:1; + /*!< TRUE if MTR_MEMO_FREE_CLUST_LEAF + was logged in the mini-transaction */ ulint n_log_recs; /* count of how many page initial log records have been written to the mtr log */ diff --git a/storage/innodb_plugin/include/mtr0mtr.ic b/storage/innodb_plugin/include/mtr0mtr.ic index 18f8e87b3cf..9c0ddff9132 100644 --- a/storage/innodb_plugin/include/mtr0mtr.ic +++ b/storage/innodb_plugin/include/mtr0mtr.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -44,6 +44,7 @@ mtr_start( mtr->log_mode = MTR_LOG_ALL; mtr->modifications = FALSE; + mtr->freed_clust_leaf = FALSE; mtr->n_log_recs = 0; ut_d(mtr->state = MTR_ACTIVE); @@ -67,7 +68,8 @@ mtr_memo_push( ut_ad(object); ut_ad(type >= MTR_MEMO_PAGE_S_FIX); - ut_ad(type <= MTR_MEMO_X_LOCK); + ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf); ut_ad(mtr); ut_ad(mtr->magic_n == MTR_MAGIC_N); ut_ad(mtr->state == MTR_ACTIVE); diff --git a/storage/innodb_plugin/include/trx0undo.h b/storage/innodb_plugin/include/trx0undo.h index 4f15cd85833..c95f99d6417 100644 --- a/storage/innodb_plugin/include/trx0undo.h +++ b/storage/innodb_plugin/include/trx0undo.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -204,17 +204,51 @@ trx_undo_add_page( mtr_t* mtr); /*!< in: mtr which does not have a latch to any undo log page; the caller must have reserved the rollback segment mutex */ +/********************************************************************//** +Frees the last undo log page. +The caller must hold the rollback segment mutex. */ +UNIV_INTERN +void +trx_undo_free_last_page_func( +/*==========================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log memory copy */ + mtr_t* mtr) /*!< in/out: mini-transaction which does not + have a latch to any undo log page or which + has allocated the undo log page */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# define trx_undo_free_last_page(trx,undo,mtr) \ + trx_undo_free_last_page_func(trx,undo,mtr) +#else /* UNIV_DEBUG */ +# define trx_undo_free_last_page(trx,undo,mtr) \ + trx_undo_free_last_page_func(undo,mtr) +#endif /* UNIV_DEBUG */ + /***********************************************************************//** Truncates an undo log from the end. This function is used during a rollback to free space from an undo log. */ UNIV_INTERN void -trx_undo_truncate_end( -/*==================*/ - trx_t* trx, /*!< in: transaction whose undo log it is */ - trx_undo_t* undo, /*!< in: undo log */ - undo_no_t limit); /*!< in: all undo records with undo number +trx_undo_truncate_end_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction whose undo log it is */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log */ + undo_no_t limit) /*!< in: all undo records with undo number >= this value should be truncated */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# define trx_undo_truncate_end(trx,undo,limit) \ + trx_undo_truncate_end_func(trx,undo,limit) +#else /* UNIV_DEBUG */ +# define trx_undo_truncate_end(trx,undo,limit) \ + trx_undo_truncate_end_func(undo,limit) +#endif /* UNIV_DEBUG */ + /***********************************************************************//** Truncates an undo log from the start. This function is used during a purge operation. */ diff --git a/storage/innodb_plugin/mtr/mtr0mtr.c b/storage/innodb_plugin/mtr/mtr0mtr.c index 417e97732bb..6dd5b6eb8c3 100644 --- a/storage/innodb_plugin/mtr/mtr0mtr.c +++ b/storage/innodb_plugin/mtr/mtr0mtr.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -58,12 +58,11 @@ mtr_memo_slot_release( buf_page_release((buf_block_t*)object, type, mtr); } else if (type == MTR_MEMO_S_LOCK) { rw_lock_s_unlock((rw_lock_t*)object); -#ifdef UNIV_DEBUG } else if (type != MTR_MEMO_X_LOCK) { - ut_ad(type == MTR_MEMO_MODIFY); + ut_ad(type == MTR_MEMO_MODIFY + || type == MTR_MEMO_FREE_CLUST_LEAF); ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX)); -#endif /* UNIV_DEBUG */ } else { rw_lock_x_unlock((rw_lock_t*)object); } diff --git a/storage/innodb_plugin/row/row0ins.c b/storage/innodb_plugin/row/row0ins.c index ea43cbfb5f1..0f158cdc706 100644 --- a/storage/innodb_plugin/row/row0ins.c +++ b/storage/innodb_plugin/row/row0ins.c @@ -2094,15 +2094,20 @@ row_ins_index_entry_low( if (big_rec) { ut_a(err == DB_SUCCESS); /* Write out the externally stored - columns while still x-latching - index->lock and block->lock. We have - to mtr_commit(mtr) first, so that the - redo log will be written in the - correct order. Otherwise, we would run - into trouble on crash recovery if mtr - freed B-tree pages on which some of - the big_rec fields will be written. */ - btr_cur_mtr_commit_and_start(&cursor, &mtr); + columns, but allocate the pages and + write the pointers using the + mini-transaction of the record update. + If any pages were freed in the update, + temporarily mark them allocated so + that off-page columns will not + overwrite them. We must do this, + because we will write the redo log for + the BLOB writes before writing the + redo log for the record update. Thus, + redo log application at crash recovery + will see BLOBs being written to free pages. */ + + btr_mark_freed_leaves(index, &mtr, TRUE); rec = btr_cur_get_rec(&cursor); offsets = rec_get_offsets( @@ -2111,7 +2116,8 @@ row_ins_index_entry_low( err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(&cursor), - rec, offsets, &mtr, FALSE, big_rec); + rec, offsets, big_rec, &mtr, + FALSE, &mtr); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if @@ -2124,6 +2130,9 @@ row_ins_index_entry_low( undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); + /* Free the pages again + in order to avoid a leak. */ + btr_mark_freed_leaves(index, &mtr, FALSE); goto stored_big_rec; } } else { @@ -2165,7 +2174,7 @@ function_exit: err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(&cursor), - rec, offsets, &mtr, FALSE, big_rec); + rec, offsets, big_rec, &mtr, FALSE, NULL); stored_big_rec: if (modify) { diff --git a/storage/innodb_plugin/row/row0row.c b/storage/innodb_plugin/row/row0row.c index 9cdbbe76e04..e476ffae84e 100644 --- a/storage/innodb_plugin/row/row0row.c +++ b/storage/innodb_plugin/row/row0row.c @@ -243,19 +243,20 @@ row_build( } #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - /* This condition can occur during crash recovery before - trx_rollback_active() has completed execution. - - This condition is possible if the server crashed - during an insert or update before - btr_store_big_rec_extern_fields() did mtr_commit() all - BLOB pointers to the clustered index record. - - If the record contains a null BLOB pointer, look up the - transaction that holds the implicit lock on this record, and - assert that it was recovered (and will soon be rolled back). */ - ut_a(!rec_offs_any_null_extern(rec, offsets) - || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets))); + if (rec_offs_any_null_extern(rec, offsets)) { + /* This condition can occur during crash recovery + before trx_rollback_active() has completed execution. + + This condition is possible if the server crashed + during an insert or update-by-delete-and-insert before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the freshly inserted clustered index + record. */ + ut_a(trx_assert_recovered( + row_get_rec_trx_id(rec, index, offsets))); + ut_a(trx_undo_roll_ptr_is_insert( + row_get_rec_roll_ptr(rec, index, offsets))); + } #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { diff --git a/storage/innodb_plugin/row/row0upd.c b/storage/innodb_plugin/row/row0upd.c index b5952ff0a78..05856687015 100644 --- a/storage/innodb_plugin/row/row0upd.c +++ b/storage/innodb_plugin/row/row0upd.c @@ -1978,21 +1978,22 @@ row_upd_clust_rec( rec_offs_init(offsets_); ut_a(err == DB_SUCCESS); - /* Write out the externally stored columns while still - x-latching index->lock and block->lock. We have to - mtr_commit(mtr) first, so that the redo log will be - written in the correct order. Otherwise, we would run - into trouble on crash recovery if mtr freed B-tree - pages on which some of the big_rec fields will be - written. */ - btr_cur_mtr_commit_and_start(btr_cur, mtr); - + /* Write out the externally stored columns, but + allocate the pages and write the pointers using the + mini-transaction of the record update. If any pages + were freed in the update, temporarily mark them + allocated so that off-page columns will not overwrite + them. We must do this, because we write the redo log + for the BLOB writes before writing the redo log for + the record update. */ + + btr_mark_freed_leaves(index, mtr, TRUE); rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(btr_cur), rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - mtr, TRUE, big_rec); + big_rec, mtr, TRUE, mtr); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if we did not update any externally stored @@ -2002,6 +2003,8 @@ row_upd_clust_rec( to the undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); + /* Free the pages again in order to avoid a leak. */ + btr_mark_freed_leaves(index, mtr, FALSE); } mtr_commit(mtr); diff --git a/storage/innodb_plugin/trx/trx0rec.c b/storage/innodb_plugin/trx/trx0rec.c index 9f2fd59d82b..a729a39d0cc 100644 --- a/storage/innodb_plugin/trx/trx0rec.c +++ b/storage/innodb_plugin/trx/trx0rec.c @@ -1097,22 +1097,29 @@ trx_undo_rec_get_partial_row( #endif /* !UNIV_HOTBACKUP */ /***********************************************************************//** -Erases the unused undo log page end. */ -static -void +Erases the unused undo log page end. +@return TRUE if the page contained something, FALSE if it was empty */ +static __attribute__((nonnull, warn_unused_result)) +ibool trx_undo_erase_page_end( /*====================*/ - page_t* undo_page, /*!< in: undo page whose end to erase */ - mtr_t* mtr) /*!< in: mtr */ + page_t* undo_page, /*!< in/out: undo page whose end to erase */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint first_free; first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE); + if (first_free == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) { + /* This was an empty page to begin with. + Do nothing here; the caller should free the page. */ + return(FALSE); + } memset(undo_page + first_free, 0xff, (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free); mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr); + return(TRUE); } /***********************************************************//** @@ -1134,7 +1141,11 @@ trx_undo_parse_erase_page_end( return(ptr); } - trx_undo_erase_page_end(page, mtr); + if (!trx_undo_erase_page_end(page, mtr)) { + /* The function trx_undo_erase_page_end() should not + have done anything to an empty page. */ + ut_ad(0); + } return(ptr); } @@ -1180,6 +1191,9 @@ trx_undo_report_row_operation( mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; +#ifdef UNIV_DEBUG + int loop_count = 0; +#endif /* UNIV_DEBUG */ rec_offs_init(offsets_); ut_a(dict_index_is_clust(index)); @@ -1242,7 +1256,7 @@ trx_undo_report_row_operation( mtr_start(&mtr); - for (;;) { + do { buf_block_t* undo_block; page_t* undo_page; ulint offset; @@ -1271,7 +1285,19 @@ trx_undo_report_row_operation( version the replicate page constructed using the log records stays identical to the original page */ - trx_undo_erase_page_end(undo_page, &mtr); + if (!trx_undo_erase_page_end(undo_page, &mtr)) { + /* The record did not fit on an empty + undo page. Discard the freshly allocated + page and return an error. */ + + mutex_enter(&rseg->mutex); + trx_undo_free_last_page(trx, undo, &mtr); + mutex_exit(&rseg->mutex); + + err = DB_TOO_BIG_RECORD; + goto err_exit; + } + mtr_commit(&mtr); } else { /* Success */ @@ -1291,16 +1317,15 @@ trx_undo_report_row_operation( *roll_ptr = trx_undo_build_roll_ptr( op_type == TRX_UNDO_INSERT_OP, rseg->id, page_no, offset); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - return(DB_SUCCESS); + err = DB_SUCCESS; + goto func_exit; } ut_ad(page_no == undo->last_page_no); /* We have to extend the undo log by one page */ + ut_ad(++loop_count < 2); mtr_start(&mtr); /* When we add a page to an undo log, this is analogous to @@ -1312,18 +1337,19 @@ trx_undo_report_row_operation( page_no = trx_undo_add_page(trx, undo, &mtr); mutex_exit(&(rseg->mutex)); + } while (UNIV_LIKELY(page_no != FIL_NULL)); - if (UNIV_UNLIKELY(page_no == FIL_NULL)) { - /* Did not succeed: out of space */ + /* Did not succeed: out of space */ + err = DB_OUT_OF_FILE_SPACE; - mutex_exit(&(trx->undo_mutex)); - mtr_commit(&mtr); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - return(DB_OUT_OF_FILE_SPACE); - } +err_exit: + mutex_exit(&trx->undo_mutex); + mtr_commit(&mtr); +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); } + return(err); } /*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/ diff --git a/storage/innodb_plugin/trx/trx0undo.c b/storage/innodb_plugin/trx/trx0undo.c index 7f03b68fb55..c36f55fbd9c 100644 --- a/storage/innodb_plugin/trx/trx0undo.c +++ b/storage/innodb_plugin/trx/trx0undo.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -912,7 +912,7 @@ trx_undo_add_page( page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, undo->top_page_no + 1, FSP_UP, - TRUE, mtr); + TRUE, mtr, mtr); fil_space_release_free_extents(undo->space, n_reserved); @@ -998,29 +998,28 @@ trx_undo_free_page( } /********************************************************************//** -Frees an undo log page when there is also the memory object for the undo -log. */ -static +Frees the last undo log page. +The caller must hold the rollback segment mutex. */ +UNIV_INTERN void -trx_undo_free_page_in_rollback( -/*===========================*/ - trx_t* trx __attribute__((unused)), /*!< in: transaction */ - trx_undo_t* undo, /*!< in: undo log memory copy */ - ulint page_no,/*!< in: page number to free: must not be the - header page */ - mtr_t* mtr) /*!< in: mtr which does not have a latch to any - undo log page; the caller must have reserved - the rollback segment mutex */ +trx_undo_free_last_page_func( +/*==========================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log memory copy */ + mtr_t* mtr) /*!< in/out: mini-transaction which does not + have a latch to any undo log page or which + has allocated the undo log page */ { - ulint last_page_no; - - ut_ad(undo->hdr_page_no != page_no); - ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&trx->undo_mutex)); + ut_ad(undo->hdr_page_no != undo->last_page_no); + ut_ad(undo->size > 0); - last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space, - undo->hdr_page_no, page_no, mtr); + undo->last_page_no = trx_undo_free_page( + undo->rseg, FALSE, undo->space, + undo->hdr_page_no, undo->last_page_no, mtr); - undo->last_page_no = last_page_no; undo->size--; } @@ -1056,9 +1055,11 @@ Truncates an undo log from the end. This function is used during a rollback to free space from an undo log. */ UNIV_INTERN void -trx_undo_truncate_end( -/*==================*/ - trx_t* trx, /*!< in: transaction whose undo log it is */ +trx_undo_truncate_end_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction whose undo log it is */ +#endif /* UNIV_DEBUG */ trx_undo_t* undo, /*!< in: undo log */ undo_no_t limit) /*!< in: all undo records with undo number >= this value should be truncated */ @@ -1084,18 +1085,7 @@ trx_undo_truncate_end( rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no, undo->hdr_offset); - for (;;) { - if (rec == NULL) { - if (last_page_no == undo->hdr_page_no) { - - goto function_exit; - } - - trx_undo_free_page_in_rollback( - trx, undo, last_page_no, &mtr); - break; - } - + while (rec) { if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit) >= 0) { /* Truncate at least this record off, maybe @@ -1110,6 +1100,14 @@ trx_undo_truncate_end( undo->hdr_offset); } + if (last_page_no == undo->hdr_page_no) { + + goto function_exit; + } + + ut_ad(last_page_no == undo->last_page_no); + trx_undo_free_last_page(trx, undo, &mtr); + mtr_commit(&mtr); } -- cgit v1.2.1 From f610c5658748ae97a5e2c1e1afbd229f2121a082 Mon Sep 17 00:00:00 2001 From: Tor Didriksen Date: Mon, 29 Aug 2011 11:24:36 +0200 Subject: BUG#12911710 - VALGRIND FAILURE IN ROW-DEBUG:PERFSCHEMA.SOCKET_SUMMARY_BY_INSTANCE_FUNC Converting the number zero to binary and back yielded the number zero, but with no digits, i.e. zero precision. This made the multiply algorithm go haywire in various ways. include/decimal.h: Document struct st_decimal_t mysql-test/r/type_newdecimal.result: New test case (valgrind warnings) mysql-test/t/type_newdecimal.test: New test case (valgrind warnings) sql/my_decimal.h: Remove the HAVE_purify enabled/disabled code. strings/decimal.c: Make a proper zero, with non-zero precision. --- include/decimal.h | 9 +++++++++ mysql-test/r/type_newdecimal.result | 11 +++++++++++ mysql-test/t/type_newdecimal.test | 14 ++++++++++++++ sql/my_decimal.h | 6 +----- strings/decimal.c | 9 ++++++++- 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/include/decimal.h b/include/decimal.h index 530ed9e1757..c377bd4a400 100644 --- a/include/decimal.h +++ b/include/decimal.h @@ -21,6 +21,15 @@ typedef enum decimal_round_mode; typedef int32 decimal_digit_t; +/** + intg is the number of *decimal* digits (NOT number of decimal_digit_t's !) + before the point + frac is the number of decimal digits after the point + len is the length of buf (length of allocated space) in decimal_digit_t's, + not in bytes + sign false means positive, true means negative + buf is an array of decimal_digit_t's + */ typedef struct st_decimal_t { int intg, frac, len; my_bool sign; diff --git a/mysql-test/r/type_newdecimal.result b/mysql-test/r/type_newdecimal.result index c301a7dd629..0c6c1333e9b 100644 --- a/mysql-test/r/type_newdecimal.result +++ b/mysql-test/r/type_newdecimal.result @@ -1927,3 +1927,14 @@ f1 0.000000000000000000000000 DROP TABLE IF EXISTS t1; End of 5.1 tests +# +# BUG#12911710 - VALGRIND FAILURE IN +# ROW-DEBUG:PERFSCHEMA.SOCKET_SUMMARY_BY_INSTANCE_FUNC +# +CREATE TABLE t1(d1 DECIMAL(60,0) NOT NULL, +d2 DECIMAL(60,0) NOT NULL); +INSERT INTO t1 (d1, d2) VALUES(0.0, 0.0); +SELECT d1 * d2 FROM t1; +d1 * d2 +0 +DROP TABLE t1; diff --git a/mysql-test/t/type_newdecimal.test b/mysql-test/t/type_newdecimal.test index 31a8808da55..567d6c0b6a1 100644 --- a/mysql-test/t/type_newdecimal.test +++ b/mysql-test/t/type_newdecimal.test @@ -1526,3 +1526,17 @@ DROP TABLE IF EXISTS t1; --echo End of 5.1 tests + +--echo # +--echo # BUG#12911710 - VALGRIND FAILURE IN +--echo # ROW-DEBUG:PERFSCHEMA.SOCKET_SUMMARY_BY_INSTANCE_FUNC +--echo # + +CREATE TABLE t1(d1 DECIMAL(60,0) NOT NULL, + d2 DECIMAL(60,0) NOT NULL); + +INSERT INTO t1 (d1, d2) VALUES(0.0, 0.0); +SELECT d1 * d2 FROM t1; + +DROP TABLE t1; + diff --git a/sql/my_decimal.h b/sql/my_decimal.h index c7a99e10233..21f485560da 100644 --- a/sql/my_decimal.h +++ b/sql/my_decimal.h @@ -101,12 +101,8 @@ public: { len= DECIMAL_BUFF_LENGTH; buf= buffer; -#if !defined (HAVE_purify) && !defined(DBUG_OFF) - /* Set buffer to 'random' value to find wrong buffer usage */ - for (uint i= 0; i < DECIMAL_BUFF_LENGTH; i++) - buffer[i]= i; -#endif } + my_decimal() { init(); diff --git a/strings/decimal.c b/strings/decimal.c index 43957c7dc19..6c89657004c 100644 --- a/strings/decimal.c +++ b/strings/decimal.c @@ -1423,11 +1423,18 @@ int bin2decimal(const uchar *from, decimal_t *to, int precision, int scale) buf++; } my_afree(d_copy); + + /* + No digits? We have read the number zero, of unspecified precision. + Make it a proper zero, with non-zero precision. + */ + if (to->intg == 0 && to->frac == 0) + decimal_make_zero(to); return error; err: my_afree(d_copy); - decimal_make_zero(((decimal_t*) to)); + decimal_make_zero(to); return(E_DEC_BAD_NUM); } -- cgit v1.2.1