diff options
author | Inaam Rana <inaam.rana@oracle.com> | 2011-12-07 09:12:53 -0500 |
---|---|---|
committer | Inaam Rana <inaam.rana@oracle.com> | 2011-12-07 09:12:53 -0500 |
commit | 358a31df435d2e18f93a48be76bc2224e479948a (patch) | |
tree | 7ac495202178d7737e53b80160183f2026b60477 | |
parent | 0cd9228124266a1e8cf41e74994cdba1380ac2e2 (diff) | |
download | mariadb-git-358a31df435d2e18f93a48be76bc2224e479948a.tar.gz |
Bug#11759044 - 51325: DROPPING AN EMPTY INNODB TABLE TAKES A LONG TIME
WITH LARGE BUFFER POOL
(Note: this is a backport of revno:3472 from mysql-trunk)
rb://845
approved by: Marko
When dropping a table (with an .ibd file, i.e. with
innodb_file_per_table set) we scan the entire LRU to invalidate pages from
that table. This can be painful in case of large buffer pools as we hold
the buf_pool->mutex for the scan. Note that the gravity of the problem does
not depend on the size of the table. Even with an empty table but a
large and filled up buffer pool we'll end up scanning a very long LRU
list.
The fix is to scan flush_list and just remove the blocks belonging to
the table from the flush_list, marking them as non-dirty. The blocks
are left in the LRU list for eventual eviction due to aging. The
flush_list is typically much smaller than the LRU list but for cases
where it is very long we have the solution of releasing the
buf_pool->mutex after scanning 1K pages.
buf_page_[set|unset]_sticky(): Use new IO-state BUF_IO_PIN to ensure
that a block stays in the flush_list and LRU list when we release
buf_pool->mutex. Previously we have been abusing BUF_IO_READ to achieve
this.
-rw-r--r-- | mysql-test/suite/innodb/r/innodb_cmp_drop_table.result | 1 | ||||
-rw-r--r-- | mysql-test/suite/innodb/t/innodb_cmp_drop_table.test | 4 | ||||
-rw-r--r-- | storage/innobase/buf/buf0buf.c | 5 | ||||
-rw-r--r-- | storage/innobase/buf/buf0lru.c | 159 | ||||
-rw-r--r-- | storage/innobase/include/buf0buf.h | 22 | ||||
-rw-r--r-- | storage/innobase/include/buf0buf.ic | 44 | ||||
-rw-r--r-- | storage/innobase/include/buf0types.h | 5 |
7 files changed, 162 insertions, 78 deletions
diff --git a/mysql-test/suite/innodb/r/innodb_cmp_drop_table.result b/mysql-test/suite/innodb/r/innodb_cmp_drop_table.result index bae2a17bd02..1f6d6948756 100644 --- a/mysql-test/suite/innodb/r/innodb_cmp_drop_table.result +++ b/mysql-test/suite/innodb/r/innodb_cmp_drop_table.result @@ -7,6 +7,7 @@ page_size drop table t1; SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0; page_size +8192 create table t2(a text) engine=innodb; SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0; page_size diff --git a/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test b/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test index 481ccd646f8..92f4f715241 100644 --- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test +++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test @@ -26,7 +26,7 @@ while ($i) drop table t1; -# no lazy eviction at drop table in 5.1 and 5.5 there should be no +# because of lazy eviction at drop table in 5.5 there should be some # used 8K pages -- eval $query_i_s @@ -36,7 +36,7 @@ create table t2(a text) engine=innodb; -- disable_query_log --- let $i = 200 +-- let $i = 400 while ($i) { insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000)); diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c index fbb6fecadf6..c3191e677f7 100644 --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -3888,6 +3888,9 @@ buf_pool_validate_instance( ut_a(rw_lock_is_locked(&block->lock, RW_LOCK_EX)); break; + + case BUF_IO_PIN: + break; } n_lru++; @@ -3917,6 +3920,7 @@ buf_pool_validate_instance( ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); switch (buf_page_get_io_fix(b)) { case BUF_IO_NONE: + case BUF_IO_PIN: /* All clean blocks should be I/O-unfixed. 
*/ break; case BUF_IO_READ: @@ -3956,6 +3960,7 @@ buf_pool_validate_instance( switch (buf_page_get_io_fix(b)) { case BUF_IO_NONE: case BUF_IO_READ: + case BUF_IO_PIN: break; case BUF_IO_WRITE: switch (buf_page_get_flush_type(b)) { diff --git a/storage/innobase/buf/buf0lru.c b/storage/innobase/buf/buf0lru.c index 510f6eefba5..15b0ad40aaa 100644 --- a/storage/innobase/buf/buf0lru.c +++ b/storage/innobase/buf/buf0lru.c @@ -68,8 +68,12 @@ allowed to point to either end of the LRU list. */ /** When dropping the search hash index entries before deleting an ibd file, we build a local array of pages belonging to that tablespace -in the buffer pool. Following is the size of that array. */ -#define BUF_LRU_DROP_SEARCH_HASH_SIZE 1024 +in the buffer pool. Following is the size of that array. +We also release buf_pool->mutex after scanning this many pages of the +flush_list when dropping a table. This is to ensure that other threads +are not blocked for extended period of time when using very large +buffer pools. */ +#define BUF_LRU_DROP_SEARCH_SIZE 1024 /** If we switch on the InnoDB monitor because there are too few available frames in the buffer pool, we set this to TRUE */ @@ -210,7 +214,7 @@ buf_LRU_drop_page_hash_batch( ulint i; ut_ad(arr != NULL); - ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE); + ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE); for (i = 0; i < count; ++i) { btr_search_drop_page_hash_when_freed(space_id, zip_size, @@ -244,7 +248,7 @@ buf_LRU_drop_page_hash_for_tablespace( } page_arr = ut_malloc( - sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); + sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE); buf_pool_mutex_enter(buf_pool); num_entries = 0; @@ -283,10 +287,10 @@ next_page: /* Store the page number so that we can drop the hash index in a batch later. 
*/ page_arr[num_entries] = bpage->offset; - ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); + ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE); ++num_entries; - if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { + if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) { goto next_page; } @@ -331,37 +335,40 @@ next_page: } /******************************************************************//** -Invalidates all pages belonging to a given tablespace inside a specific +Remove all dirty pages belonging to a given tablespace inside a specific buffer pool instance when we are deleting the data file(s) of that -tablespace. */ +tablespace. The pages still remain a part of LRU and are evicted from +the list as they age towards the tail of the LRU. */ static void -buf_LRU_invalidate_tablespace_buf_pool_instance( -/*============================================*/ +buf_LRU_remove_dirty_pages_for_tablespace( +/*======================================*/ buf_pool_t* buf_pool, /*!< buffer pool instance */ ulint id) /*!< in: space id */ { buf_page_t* bpage; ibool all_freed; + ulint i; scan_again: buf_pool_mutex_enter(buf_pool); + buf_flush_list_mutex_enter(buf_pool); all_freed = TRUE; - bpage = UT_LIST_GET_LAST(buf_pool->LRU); + for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0; + bpage != NULL; ++i) { - while (bpage != NULL) { buf_page_t* prev_bpage; mutex_t* block_mutex = NULL; ut_a(buf_page_in_file(bpage)); - prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + prev_bpage = UT_LIST_GET_PREV(list, bpage); /* bpage->space and bpage->io_fix are protected by - buf_pool_mutex and block_mutex. It is safe to check - them while holding buf_pool_mutex only. */ + buf_pool->mutex and block_mutex. It is safe to check + them while holding buf_pool->mutex only. 
*/ if (buf_page_get_space(bpage) != id) { /* Skip this block, as it does not belong to @@ -374,79 +381,83 @@ scan_again: all_freed = FALSE; goto next_page; - } else { - block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); + } - if (bpage->buf_fix_count > 0) { + /* We have to release the flush_list_mutex to obey the + latching order. We are however guaranteed that the page + will stay in the flush_list because buf_flush_remove() + needs buf_pool->mutex as well. */ + buf_flush_list_mutex_exit(buf_pool); + block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); - mutex_exit(block_mutex); - /* We cannot remove this page during - this scan yet; maybe the system is - currently reading it in, or flushing - the modifications to the file */ + if (bpage->buf_fix_count > 0) { + mutex_exit(block_mutex); + buf_flush_list_mutex_enter(buf_pool); - all_freed = FALSE; + /* We cannot remove this page during + this scan yet; maybe the system is + currently reading it in, or flushing + the modifications to the file */ - goto next_page; - } + all_freed = FALSE; + goto next_page; } - ut_ad(mutex_own(block_mutex)); + ut_ad(bpage->oldest_modification != 0); -#ifdef UNIV_DEBUG - if (buf_debug_prints) { - fprintf(stderr, - "Dropping space %lu page %lu\n", - (ulong) buf_page_get_space(bpage), - (ulong) buf_page_get_page_no(bpage)); - } -#endif - if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { - /* This is a compressed-only block - descriptor. Do nothing. */ - } else if (((buf_block_t*) bpage)->index) { - ulint page_no; - ulint zip_size; + buf_flush_remove(bpage); - buf_pool_mutex_exit(buf_pool); - - zip_size = buf_page_get_zip_size(bpage); - page_no = buf_page_get_page_no(bpage); + mutex_exit(block_mutex); + buf_flush_list_mutex_enter(buf_pool); +next_page: + bpage = prev_bpage; - mutex_exit(block_mutex); + if (!bpage) { + break; + } - /* Note that the following call will acquire - and release an X-latch on the page. 
*/ + /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the + loop we release buf_pool->mutex to let other threads + do their job. */ + if (i < BUF_LRU_DROP_SEARCH_SIZE) { + continue; + } - btr_search_drop_page_hash_when_freed( - id, zip_size, page_no); - goto scan_again; + /* We IO-fix the block to make sure that the block + stays in its position in the flush_list. */ + if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + /* Block is already IO-fixed. We don't + want to change the value. Lets leave + this block alone. */ + continue; } - if (bpage->oldest_modification != 0) { + buf_flush_list_mutex_exit(buf_pool); + block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); + buf_page_set_sticky(bpage); + mutex_exit(block_mutex); - buf_flush_remove(bpage); - } + /* Now it is safe to release the buf_pool->mutex. */ + buf_pool_mutex_exit(buf_pool); + os_thread_yield(); + buf_pool_mutex_enter(buf_pool); - /* Remove from the LRU list. */ + mutex_enter(block_mutex); + buf_page_unset_sticky(bpage); + mutex_exit(block_mutex); - if (buf_LRU_block_remove_hashed_page(bpage, TRUE) - != BUF_BLOCK_ZIP_FREE) { - buf_LRU_block_free_hashed_page((buf_block_t*) bpage); - mutex_exit(block_mutex); - } else { - /* The block_mutex should have been released - by buf_LRU_block_remove_hashed_page() when it - returns BUF_BLOCK_ZIP_FREE. 
*/ - ut_ad(block_mutex == &buf_pool->zip_mutex); - ut_ad(!mutex_own(block_mutex)); - } -next_page: - bpage = prev_bpage; + buf_flush_list_mutex_enter(buf_pool); + ut_ad(bpage->in_flush_list); + + i = 0; } buf_pool_mutex_exit(buf_pool); + buf_flush_list_mutex_exit(buf_pool); + + ut_ad(buf_flush_validate(buf_pool)); if (!all_freed) { os_thread_sleep(20000); @@ -477,7 +488,7 @@ buf_LRU_invalidate_tablespace( buf_pool = buf_pool_from_array(i); buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); - buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id); + buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id); } } @@ -1532,8 +1543,9 @@ alloc: /* Prevent buf_page_get_gen() from decompressing the block while we release buf_pool->mutex and block_mutex. */ - b->buf_fix_count++; - b->io_fix = BUF_IO_READ; + mutex_enter(&buf_pool->zip_mutex); + buf_page_set_sticky(b); + mutex_exit(&buf_pool->zip_mutex); } buf_pool_mutex_exit(buf_pool); @@ -1573,8 +1585,7 @@ alloc: if (b) { mutex_enter(&buf_pool->zip_mutex); - b->buf_fix_count--; - buf_page_set_io_fix(b, BUF_IO_NONE); + buf_page_unset_sticky(b); mutex_exit(&buf_pool->zip_mutex); } diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index c0ff7b1766b..456f077a13d 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -910,7 +910,27 @@ buf_block_set_io_fix( /*=================*/ buf_block_t* block, /*!< in/out: control block */ enum buf_io_fix io_fix);/*!< in: io_fix state */ - +/*********************************************************************//** +Makes a block sticky. A sticky block implies that even after we release +the buf_pool->mutex and the block->mutex: +* it cannot be removed from the flush_list +* the block descriptor cannot be relocated +* it cannot be removed from the LRU list +Note that: +* the block can still change its position in the LRU list +* the next and previous pointers can change. 
*/ +UNIV_INLINE +void +buf_page_set_sticky( +/*================*/ + buf_page_t* bpage); /*!< in/out: control block */ +/*********************************************************************//** +Removes stickiness of a block. */ +UNIV_INLINE +void +buf_page_unset_sticky( +/*==================*/ + buf_page_t* bpage); /*!< in/out: control block */ /********************************************************************//** Determine if a buffer block can be relocated in memory. The block can be dirty, but it must not be I/O-fixed or bufferfixed. */ diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index b65b5133c15..99e55df3312 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -414,6 +414,7 @@ buf_page_get_io_fix( case BUF_IO_NONE: case BUF_IO_READ: case BUF_IO_WRITE: + case BUF_IO_PIN: return(io_fix); } ut_error; @@ -464,6 +465,49 @@ buf_block_set_io_fix( buf_page_set_io_fix(&block->page, io_fix); } +/*********************************************************************//** +Makes a block sticky. A sticky block implies that even after we release +the buf_pool->mutex and the block->mutex: +* it cannot be removed from the flush_list +* the block descriptor cannot be relocated +* it cannot be removed from the LRU list +Note that: +* the block can still change its position in the LRU list +* the next and previous pointers can change. */ +UNIV_INLINE +void +buf_page_set_sticky( +/*================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE); + + bpage->io_fix = BUF_IO_PIN; +} + +/*********************************************************************//** +Removes stickiness of a block. 
*/ +UNIV_INLINE +void +buf_page_unset_sticky( +/*==================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN); + + bpage->io_fix = BUF_IO_NONE; +} + /********************************************************************//** Determine if a buffer block can be relocated in memory. The block can be dirty, but it must not be I/O-fixed or bufferfixed. */ diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index 0cc2defb3ff..12b9e22f673 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -57,7 +57,10 @@ enum buf_flush { enum buf_io_fix { BUF_IO_NONE = 0, /**< no pending I/O */ BUF_IO_READ, /**< read pending */ - BUF_IO_WRITE /**< write pending */ + BUF_IO_WRITE, /**< write pending */ + BUF_IO_PIN /**< disallow relocation of + block and its removal of from + the flush_list */ }; /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ |