summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorInaam Rana <inaam.rana@oracle.com>2011-12-07 09:12:53 -0500
committerInaam Rana <inaam.rana@oracle.com>2011-12-07 09:12:53 -0500
commit358a31df435d2e18f93a48be76bc2224e479948a (patch)
tree7ac495202178d7737e53b80160183f2026b60477
parent0cd9228124266a1e8cf41e74994cdba1380ac2e2 (diff)
downloadmariadb-git-358a31df435d2e18f93a48be76bc2224e479948a.tar.gz
Bug#11759044 - 51325: DROPPING AN EMPTY INNODB TABLE TAKES A LONG TIME
WITH LARGE BUFFER POOL (Note: this is a backport of revno:3472 from mysql-trunk) rb://845 approved by: Marko When dropping a table (with an .ibd file, i.e. with innodb_file_per_table set) we scan the entire LRU to invalidate pages from that table. This can be painful in case of large buffer pools as we hold the buf_pool->mutex for the scan. Note that the gravity of the problem does not depend on the size of the table. Even with an empty table but a large and filled up buffer pool we'll end up scanning a very long LRU list. The fix is to scan flush_list and just remove the blocks belonging to the table from the flush_list, marking them as non-dirty. The blocks are left in the LRU list for eventual eviction due to aging. The flush_list is typically much smaller than the LRU list but for cases where it is very long we have the solution of releasing the buf_pool->mutex after scanning 1K pages. buf_page_[set|unset]_sticky(): Use the new IO-state BUF_IO_PIN to ensure that a block stays in the flush_list and LRU list when we release buf_pool->mutex. Previously we have been abusing BUF_IO_READ to achieve this.
-rw-r--r--mysql-test/suite/innodb/r/innodb_cmp_drop_table.result1
-rw-r--r--mysql-test/suite/innodb/t/innodb_cmp_drop_table.test4
-rw-r--r--storage/innobase/buf/buf0buf.c5
-rw-r--r--storage/innobase/buf/buf0lru.c159
-rw-r--r--storage/innobase/include/buf0buf.h22
-rw-r--r--storage/innobase/include/buf0buf.ic44
-rw-r--r--storage/innobase/include/buf0types.h5
7 files changed, 162 insertions, 78 deletions
diff --git a/mysql-test/suite/innodb/r/innodb_cmp_drop_table.result b/mysql-test/suite/innodb/r/innodb_cmp_drop_table.result
index bae2a17bd02..1f6d6948756 100644
--- a/mysql-test/suite/innodb/r/innodb_cmp_drop_table.result
+++ b/mysql-test/suite/innodb/r/innodb_cmp_drop_table.result
@@ -7,6 +7,7 @@ page_size
drop table t1;
SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0;
page_size
+8192
create table t2(a text) engine=innodb;
SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0;
page_size
diff --git a/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test b/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
index 481ccd646f8..92f4f715241 100644
--- a/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
+++ b/mysql-test/suite/innodb/t/innodb_cmp_drop_table.test
@@ -26,7 +26,7 @@ while ($i)
drop table t1;
-# no lazy eviction at drop table in 5.1 and 5.5 there should be no
+# because of lazy eviction at drop table in 5.5 there should be some
# used 8K pages
-- eval $query_i_s
@@ -36,7 +36,7 @@ create table t2(a text) engine=innodb;
-- disable_query_log
--- let $i = 200
+-- let $i = 400
while ($i)
{
insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000));
diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
index fbb6fecadf6..c3191e677f7 100644
--- a/storage/innobase/buf/buf0buf.c
+++ b/storage/innobase/buf/buf0buf.c
@@ -3888,6 +3888,9 @@ buf_pool_validate_instance(
ut_a(rw_lock_is_locked(&block->lock,
RW_LOCK_EX));
break;
+
+ case BUF_IO_PIN:
+ break;
}
n_lru++;
@@ -3917,6 +3920,7 @@ buf_pool_validate_instance(
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
+ case BUF_IO_PIN:
/* All clean blocks should be I/O-unfixed. */
break;
case BUF_IO_READ:
@@ -3956,6 +3960,7 @@ buf_pool_validate_instance(
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
case BUF_IO_READ:
+ case BUF_IO_PIN:
break;
case BUF_IO_WRITE:
switch (buf_page_get_flush_type(b)) {
diff --git a/storage/innobase/buf/buf0lru.c b/storage/innobase/buf/buf0lru.c
index 510f6eefba5..15b0ad40aaa 100644
--- a/storage/innobase/buf/buf0lru.c
+++ b/storage/innobase/buf/buf0lru.c
@@ -68,8 +68,12 @@ allowed to point to either end of the LRU list. */
/** When dropping the search hash index entries before deleting an ibd
file, we build a local array of pages belonging to that tablespace
-in the buffer pool. Following is the size of that array. */
-#define BUF_LRU_DROP_SEARCH_HASH_SIZE 1024
+in the buffer pool. Following is the size of that array.
+We also release buf_pool->mutex after scanning this many pages of the
+flush_list when dropping a table. This is to ensure that other threads
+are not blocked for an extended period of time when using very large
+buffer pools. */
+#define BUF_LRU_DROP_SEARCH_SIZE 1024
/** If we switch on the InnoDB monitor because there are too few available
frames in the buffer pool, we set this to TRUE */
@@ -210,7 +214,7 @@ buf_LRU_drop_page_hash_batch(
ulint i;
ut_ad(arr != NULL);
- ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
for (i = 0; i < count; ++i) {
btr_search_drop_page_hash_when_freed(space_id, zip_size,
@@ -244,7 +248,7 @@ buf_LRU_drop_page_hash_for_tablespace(
}
page_arr = ut_malloc(
- sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
buf_pool_mutex_enter(buf_pool);
num_entries = 0;
@@ -283,10 +287,10 @@ next_page:
/* Store the page number so that we can drop the hash
index in a batch later. */
page_arr[num_entries] = bpage->offset;
- ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
++num_entries;
- if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+ if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
goto next_page;
}
@@ -331,37 +335,40 @@ next_page:
}
/******************************************************************//**
-Invalidates all pages belonging to a given tablespace inside a specific
+Remove all dirty pages belonging to a given tablespace inside a specific
buffer pool instance when we are deleting the data file(s) of that
-tablespace. */
+tablespace. The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU. */
static
void
-buf_LRU_invalidate_tablespace_buf_pool_instance(
-/*============================================*/
+buf_LRU_remove_dirty_pages_for_tablespace(
+/*======================================*/
buf_pool_t* buf_pool, /*!< buffer pool instance */
ulint id) /*!< in: space id */
{
buf_page_t* bpage;
ibool all_freed;
+ ulint i;
scan_again:
buf_pool_mutex_enter(buf_pool);
+ buf_flush_list_mutex_enter(buf_pool);
all_freed = TRUE;
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0;
+ bpage != NULL; ++i) {
- while (bpage != NULL) {
buf_page_t* prev_bpage;
mutex_t* block_mutex = NULL;
ut_a(buf_page_in_file(bpage));
- prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+ prev_bpage = UT_LIST_GET_PREV(list, bpage);
/* bpage->space and bpage->io_fix are protected by
- buf_pool_mutex and block_mutex. It is safe to check
- them while holding buf_pool_mutex only. */
+ buf_pool->mutex and block_mutex. It is safe to check
+ them while holding buf_pool->mutex only. */
if (buf_page_get_space(bpage) != id) {
/* Skip this block, as it does not belong to
@@ -374,79 +381,83 @@ scan_again:
all_freed = FALSE;
goto next_page;
- } else {
- block_mutex = buf_page_get_mutex(bpage);
- mutex_enter(block_mutex);
+ }
- if (bpage->buf_fix_count > 0) {
+ /* We have to release the flush_list_mutex to obey the
+ latching order. We are however guaranteed that the page
+ will stay in the flush_list because buf_flush_remove()
+ needs buf_pool->mutex as well. */
+ buf_flush_list_mutex_exit(buf_pool);
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
- mutex_exit(block_mutex);
- /* We cannot remove this page during
- this scan yet; maybe the system is
- currently reading it in, or flushing
- the modifications to the file */
+ if (bpage->buf_fix_count > 0) {
+ mutex_exit(block_mutex);
+ buf_flush_list_mutex_enter(buf_pool);
- all_freed = FALSE;
+ /* We cannot remove this page during
+ this scan yet; maybe the system is
+ currently reading it in, or flushing
+ the modifications to the file */
- goto next_page;
- }
+ all_freed = FALSE;
+ goto next_page;
}
- ut_ad(mutex_own(block_mutex));
+ ut_ad(bpage->oldest_modification != 0);
-#ifdef UNIV_DEBUG
- if (buf_debug_prints) {
- fprintf(stderr,
- "Dropping space %lu page %lu\n",
- (ulong) buf_page_get_space(bpage),
- (ulong) buf_page_get_page_no(bpage));
- }
-#endif
- if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
- /* This is a compressed-only block
- descriptor. Do nothing. */
- } else if (((buf_block_t*) bpage)->index) {
- ulint page_no;
- ulint zip_size;
+ buf_flush_remove(bpage);
- buf_pool_mutex_exit(buf_pool);
-
- zip_size = buf_page_get_zip_size(bpage);
- page_no = buf_page_get_page_no(bpage);
+ mutex_exit(block_mutex);
+ buf_flush_list_mutex_enter(buf_pool);
+next_page:
+ bpage = prev_bpage;
- mutex_exit(block_mutex);
+ if (!bpage) {
+ break;
+ }
- /* Note that the following call will acquire
- and release an X-latch on the page. */
+ /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
+ loop we release buf_pool->mutex to let other threads
+ do their job. */
+ if (i < BUF_LRU_DROP_SEARCH_SIZE) {
+ continue;
+ }
- btr_search_drop_page_hash_when_freed(
- id, zip_size, page_no);
- goto scan_again;
+ /* We IO-fix the block to make sure that the block
+ stays in its position in the flush_list. */
+ if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* Block is already IO-fixed. We don't
+ want to change the value. Let's leave
+ this block alone. */
+ continue;
}
- if (bpage->oldest_modification != 0) {
+ buf_flush_list_mutex_exit(buf_pool);
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+ buf_page_set_sticky(bpage);
+ mutex_exit(block_mutex);
- buf_flush_remove(bpage);
- }
+ /* Now it is safe to release the buf_pool->mutex. */
+ buf_pool_mutex_exit(buf_pool);
+ os_thread_yield();
+ buf_pool_mutex_enter(buf_pool);
- /* Remove from the LRU list. */
+ mutex_enter(block_mutex);
+ buf_page_unset_sticky(bpage);
+ mutex_exit(block_mutex);
- if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
- != BUF_BLOCK_ZIP_FREE) {
- buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
- mutex_exit(block_mutex);
- } else {
- /* The block_mutex should have been released
- by buf_LRU_block_remove_hashed_page() when it
- returns BUF_BLOCK_ZIP_FREE. */
- ut_ad(block_mutex == &buf_pool->zip_mutex);
- ut_ad(!mutex_own(block_mutex));
- }
-next_page:
- bpage = prev_bpage;
+ buf_flush_list_mutex_enter(buf_pool);
+ ut_ad(bpage->in_flush_list);
+
+ i = 0;
}
buf_pool_mutex_exit(buf_pool);
+ buf_flush_list_mutex_exit(buf_pool);
+
+ ut_ad(buf_flush_validate(buf_pool));
if (!all_freed) {
os_thread_sleep(20000);
@@ -477,7 +488,7 @@ buf_LRU_invalidate_tablespace(
buf_pool = buf_pool_from_array(i);
buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
- buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id);
+ buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id);
}
}
@@ -1532,8 +1543,9 @@ alloc:
/* Prevent buf_page_get_gen() from
decompressing the block while we release
buf_pool->mutex and block_mutex. */
- b->buf_fix_count++;
- b->io_fix = BUF_IO_READ;
+ mutex_enter(&buf_pool->zip_mutex);
+ buf_page_set_sticky(b);
+ mutex_exit(&buf_pool->zip_mutex);
}
buf_pool_mutex_exit(buf_pool);
@@ -1573,8 +1585,7 @@ alloc:
if (b) {
mutex_enter(&buf_pool->zip_mutex);
- b->buf_fix_count--;
- buf_page_set_io_fix(b, BUF_IO_NONE);
+ buf_page_unset_sticky(b);
mutex_exit(&buf_pool->zip_mutex);
}
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index c0ff7b1766b..456f077a13d 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -910,7 +910,27 @@ buf_block_set_io_fix(
/*=================*/
buf_block_t* block, /*!< in/out: control block */
enum buf_io_fix io_fix);/*!< in: io_fix state */
-
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+ buf_page_t* bpage); /*!< in/out: control block */
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+ buf_page_t* bpage); /*!< in/out: control block */
/********************************************************************//**
Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
index b65b5133c15..99e55df3312 100644
--- a/storage/innobase/include/buf0buf.ic
+++ b/storage/innobase/include/buf0buf.ic
@@ -414,6 +414,7 @@ buf_page_get_io_fix(
case BUF_IO_NONE:
case BUF_IO_READ:
case BUF_IO_WRITE:
+ case BUF_IO_PIN:
return(io_fix);
}
ut_error;
@@ -464,6 +465,49 @@ buf_block_set_io_fix(
buf_page_set_io_fix(&block->page, io_fix);
}
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+
+ bpage->io_fix = BUF_IO_PIN;
+}
+
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
+
+ bpage->io_fix = BUF_IO_NONE;
+}
+
/********************************************************************//**
Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
index 0cc2defb3ff..12b9e22f673 100644
--- a/storage/innobase/include/buf0types.h
+++ b/storage/innobase/include/buf0types.h
@@ -57,7 +57,10 @@ enum buf_flush {
enum buf_io_fix {
BUF_IO_NONE = 0, /**< no pending I/O */
BUF_IO_READ, /**< read pending */
- BUF_IO_WRITE /**< write pending */
+ BUF_IO_WRITE, /**< write pending */
+ BUF_IO_PIN /**< disallow relocation of
+ block and its removal from
+ the flush_list */
};
/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */