path: root/storage/innodb_plugin/buf
author	Sergey Vojtovich <svoj@sun.com>	2010-04-01 16:56:22 +0400
committer	Sergey Vojtovich <svoj@sun.com>	2010-04-01 16:56:22 +0400
commit	3520a0bd7ce771cdc05cb0abf6457ad5fb8a194e (patch)
tree	14eed13dad977fcfa6598a89777a3fd65da645f6 /storage/innodb_plugin/buf
parent	fdb9161579dc2ed30c535a15e30dc6f5b685e9ca (diff)
download	mariadb-git-3520a0bd7ce771cdc05cb0abf6457ad5fb8a194e.tar.gz
Applying InnoDB snapshot
Detailed revision comments: r6860 | jyang | 2010-03-23 18:20:36 +0200 (Tue, 23 Mar 2010) | 5 lines branches/zip: This is a patch from Inaam that uses a red-black tree to speed up insertions into the flush_list and thus the recovery process. The patch has been tested by Nokia.
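
Why this helps: in normal operation a newly modified block always carries the newest oldest_modification, so buf_flush_insert_into_flush_list() can simply add it at the head of the flush_list in O(1). During recovery, blocks are hooked into the flush_list in whatever order their page reads complete, not in LSN order, so each block has to be placed at its sorted position; before this patch that meant a linear scan of the flush_list, i.e. O(n) per block and roughly O(n^2) over a large recovery. The patch keeps an auxiliary red-black tree (buf_pool->flush_rbt), alive only while recovery runs, keyed on <oldest_modification, space, offset>, so the insertion point is found in O(log n).

The following is a minimal standalone sketch of that ordering key, assuming hypothetical names (struct dirty_page, dirty_page_cmp) in place of buf_page_t and buf_flush_block_cmp(); it models only the three key fields and demonstrates the ordering with qsort().

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for buf_page_t: only the three fields that form
   the flush_rbt ordering key <oldest_modification, space, offset>. */
struct dirty_page {
    uint64_t oldest_modification;   /* LSN of the oldest unflushed change */
    uint32_t space;                 /* tablespace id: tie-breaker */
    uint32_t offset;                /* page number:   tie-breaker */
};

/* Same ordering rule as buf_flush_block_cmp(): the LSN decides the order,
   space and offset only make the key unique.  As in ib_rbt, the arguments
   are pointers to stored values, i.e. pointers to dirty_page pointers. */
static int
dirty_page_cmp(const void* p1, const void* p2)
{
    const struct dirty_page* b1 = *(const struct dirty_page* const*) p1;
    const struct dirty_page* b2 = *(const struct dirty_page* const*) p2;

    if (b2->oldest_modification != b1->oldest_modification) {
        return(b2->oldest_modification > b1->oldest_modification ? 1 : -1);
    }
    if (b2->space != b1->space) {
        return(b2->space > b1->space ? 1 : -1);
    }
    if (b2->offset != b1->offset) {
        return(b2->offset > b1->offset ? 1 : -1);
    }
    return(0);
}

int
main(void)
{
    struct dirty_page a = {100, 0, 7};
    struct dirty_page b = {100, 0, 3};
    struct dirty_page c = { 90, 1, 5};
    const struct dirty_page* pages[] = { &a, &b, &c };

    /* Sorting with this comparator reproduces the flush_rbt order:
       (100,0,7), (100,0,3), (90,1,5) -- newest LSN first. */
    qsort(pages, 3, sizeof pages[0], dirty_page_cmp);

    for (int i = 0; i < 3; i++) {
        printf("lsn=%llu space=%u page=%u\n",
               (unsigned long long) pages[i]->oldest_modification,
               pages[i]->space, pages[i]->offset);
    }
    return(0);
}

The comparator is written so that the resulting order matches the flush_list order (largest oldest_modification first, i.e. the newest changes at the head), which is what later allows buf_flush_validate_low() to walk the list and the tree in lockstep.
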
Diffstat (limited to 'storage/innodb_plugin/buf')
-rw-r--r--	storage/innodb_plugin/buf/buf0buddy.c	2
-rw-r--r--	storage/innodb_plugin/buf/buf0buf.c	23
-rw-r--r--	storage/innodb_plugin/buf/buf0flu.c	254
-rw-r--r--	storage/innodb_plugin/buf/buf0lru.c	22
-rw-r--r--	storage/innodb_plugin/buf/buf0rea.c	6
5 files changed, 259 insertions, 48 deletions
diff --git a/storage/innodb_plugin/buf/buf0buddy.c b/storage/innodb_plugin/buf/buf0buddy.c
index f0e1395c307..b879e97a989 100644
--- a/storage/innodb_plugin/buf/buf0buddy.c
+++ b/storage/innodb_plugin/buf/buf0buddy.c
@@ -391,6 +391,8 @@ buf_buddy_relocate_block(
UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage);
}
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
mutex_exit(&buf_pool_zip_mutex);
return(TRUE);
}
diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c
index 40f7925f520..d4a88565570 100644
--- a/storage/innodb_plugin/buf/buf0buf.c
+++ b/storage/innodb_plugin/buf/buf0buf.c
@@ -1191,8 +1191,6 @@ buf_relocate(
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
-
- UNIV_MEM_INVALID(bpage, sizeof *bpage);
}
/********************************************************************//**
@@ -2224,22 +2222,8 @@ wait_until_unfixed:
ut_ad(!block->page.in_flush_list);
} else {
/* Relocate buf_pool->flush_list. */
- buf_page_t* b;
-
- b = UT_LIST_GET_PREV(list, &block->page);
- ut_ad(block->page.in_flush_list);
- UT_LIST_REMOVE(list, buf_pool->flush_list,
- &block->page);
-
- if (b) {
- UT_LIST_INSERT_AFTER(
- list, buf_pool->flush_list, b,
- &block->page);
- } else {
- UT_LIST_ADD_FIRST(
- list, buf_pool->flush_list,
- &block->page);
- }
+ buf_flush_relocate_on_flush_list(bpage,
+ &block->page);
}
/* Buffer-fix, I/O-fix, and X-latch the block
@@ -2253,6 +2237,9 @@ wait_until_unfixed:
block->page.buf_fix_count = 1;
buf_block_set_io_fix(block, BUF_IO_READ);
rw_lock_x_lock(&block->lock);
+
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
mutex_exit(&block->mutex);
mutex_exit(&buf_pool_zip_mutex);
buf_pool->n_pend_unzip++;
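
The hunk above replaces an open-coded "unlink the old control block and splice its replacement into the same slot of buf_pool->flush_list" sequence with a call to the new buf_flush_relocate_on_flush_list(), which additionally keeps buf_pool->flush_rbt consistent while recovery is running. Below is a minimal standalone sketch of the list half of that operation, assuming hypothetical types and names (struct node, struct list, list_relocate) rather than InnoDB's UT_LIST macros; the tree half is simply the rbt_delete()/rbt_insert() pair visible in buf0flu.c further down.

#include <stddef.h>

/* Minimal doubly-linked list, a stand-in for UT_LIST and buf_page_t. */
struct node {
    struct node* prev;
    struct node* next;
};

struct list {
    struct node* first;
};

/* Replace src with dst at the same position in the list.  This models
   the UT_LIST part of buf_flush_relocate_on_flush_list(); the real
   function also swaps the control blocks inside buf_pool->flush_rbt
   when the tree is active. */
static void
list_relocate(struct list* l, struct node* src, struct node* dst)
{
    struct node* prev = src->prev;

    /* Unlink src. */
    if (src->prev != NULL) {
        src->prev->next = src->next;
    } else {
        l->first = src->next;
    }
    if (src->next != NULL) {
        src->next->prev = src->prev;
    }

    /* Link dst exactly where src used to be. */
    dst->prev = prev;
    if (prev != NULL) {
        dst->next = prev->next;
        prev->next = dst;
    } else {
        dst->next = l->first;
        l->first = dst;
    }
    if (dst->next != NULL) {
        dst->next->prev = dst;
    }
}

The same helper is reused in buf0lru.c below, so the "remove and re-insert at the same position" logic now lives in exactly one place.
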
diff --git a/storage/innodb_plugin/buf/buf0flu.c b/storage/innodb_plugin/buf/buf0flu.c
index 8b614ce90e5..86b9ced73fa 100644
--- a/storage/innodb_plugin/buf/buf0flu.c
+++ b/storage/innodb_plugin/buf/buf0flu.c
@@ -88,6 +88,138 @@ buf_flush_validate_low(void);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/********************************************************************//**
+Inserts a block into the flush_rbt and returns a pointer to its
+predecessor or NULL if no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key.
+@return pointer to the predecessor or NULL if no predecessor. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+ buf_page_t* bpage) /*!< in: bpage to be inserted. */
+{
+ buf_page_t* prev = NULL;
+ const ib_rbt_node_t* c_node;
+ const ib_rbt_node_t* p_node;
+
+ ut_ad(buf_pool_mutex_own());
+
+ /* Insert this buffer into the rbt. */
+ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+ ut_a(c_node != NULL);
+
+ /* Get the predecessor. */
+ p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+ if (p_node != NULL) {
+ prev = *rbt_value(buf_page_t*, p_node);
+ ut_a(prev != NULL);
+ }
+
+ return(prev);
+}
+
+/********************************************************************//**
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+ buf_page_t* bpage) /*!< in: bpage to be removed. */
+{
+
+ ibool ret = FALSE;
+
+ ut_ad(buf_pool_mutex_own());
+ ret = rbt_delete(buf_pool->flush_rbt, &bpage);
+ ut_ad(ret);
+}
+
+/********************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintain ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+ const void* p1, /*!< in: block1 */
+ const void* p2) /*!< in: block2 */
+{
+ int ret;
+
+ ut_ad(p1 != NULL);
+ ut_ad(p2 != NULL);
+
+ const buf_page_t* b1 = *(const buf_page_t**) p1;
+ const buf_page_t* b2 = *(const buf_page_t**) p2;
+
+ ut_ad(b1 != NULL);
+ ut_ad(b2 != NULL);
+
+ ut_ad(b1->in_flush_list);
+ ut_ad(b2->in_flush_list);
+
+ if (b2->oldest_modification
+ > b1->oldest_modification) {
+ return(1);
+ }
+
+ if (b2->oldest_modification
+ < b1->oldest_modification) {
+ return(-1);
+ }
+
+ /* If oldest_modification is same then decide on the space. */
+ ret = (int)(b2->space - b1->space);
+
+ /* Or else decide ordering on the offset field. */
+ return(ret ? ret : (int)(b2->offset - b1->offset));
+}
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+ buf_pool_mutex_enter();
+
+ /* Create red black tree for speedy insertions in flush list. */
+ buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
+ buf_flush_block_cmp);
+ buf_pool_mutex_exit();
+}
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+ buf_pool_mutex_enter();
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ rbt_free(buf_pool->flush_rbt);
+ buf_pool->flush_rbt = NULL;
+
+ buf_pool_mutex_exit();
+}
+
+/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
@@ -100,6 +232,13 @@ buf_flush_insert_into_flush_list(
|| (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
<= block->page.oldest_modification));
+ /* If we are in the recovery then we need to update the flush
+ red-black tree as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_insert_sorted_into_flush_list(block);
+ return;
+ }
+
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_ad(block->page.in_LRU_list);
ut_ad(block->page.in_page_hash);
@@ -136,12 +275,27 @@ buf_flush_insert_sorted_into_flush_list(
ut_d(block->page.in_flush_list = TRUE);
prev_b = NULL;
- b = UT_LIST_GET_FIRST(buf_pool->flush_list);
- while (b && b->oldest_modification > block->page.oldest_modification) {
- ut_ad(b->in_flush_list);
- prev_b = b;
- b = UT_LIST_GET_NEXT(list, b);
+ /* For the most part when this function is called the flush_rbt
+ should not be NULL. In a very rare boundary case it is possible
+ that the flush_rbt has already been freed by the recovery thread
+ before the last page was hooked up in the flush_list by the
+ io-handler thread. In that case we'll just do a simple
+ linear search in the else block. */
+ if (buf_pool->flush_rbt) {
+
+ prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
+ } else {
+
+ b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ while (b && b->oldest_modification
+ > block->page.oldest_modification) {
+ ut_ad(b->in_flush_list);
+ prev_b = b;
+ b = UT_LIST_GET_NEXT(list, b);
+ }
}
if (prev_b == NULL) {
@@ -237,7 +391,6 @@ buf_flush_remove(
ut_ad(buf_pool_mutex_own());
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
ut_ad(bpage->in_flush_list);
- ut_d(bpage->in_flush_list = FALSE);
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE:
@@ -259,6 +412,15 @@ buf_flush_remove(
break;
}
+ /* If the flush_rbt is active then delete from it as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+ because we assert on in_flush_list in comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
bpage->oldest_modification = 0;
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
@@ -266,6 +428,63 @@ buf_flush_remove(
}
/********************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+ buf_page_t* prev_b = NULL;
+
+ ut_ad(buf_pool_mutex_own());
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ ut_ad(bpage->in_flush_list);
+ ut_ad(dpage->in_flush_list);
+
+ /* If recovery is active we must swap the control blocks in
+ the flush_rbt as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ prev_b = buf_flush_insert_in_flush_rbt(dpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+ because we assert on in_flush_list in comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+
+ if (prev) {
+ ut_ad(prev->in_flush_list);
+ UT_LIST_INSERT_AFTER(
+ list,
+ buf_pool->flush_list,
+ prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(
+ list,
+ buf_pool->flush_list,
+ dpage);
+ }
+
+ /* Just an extra check. Previous in flush_list
+ should be the same control block as in flush_rbt. */
+ ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
@@ -1367,24 +1586,45 @@ ibool
buf_flush_validate_low(void)
/*========================*/
{
- buf_page_t* bpage;
+ buf_page_t* bpage;
+ const ib_rbt_node_t* rnode = NULL;
UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
ut_ad(ut_list_node_313->in_flush_list));
bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+ /* If we are in recovery mode i.e.: flush_rbt != NULL
+ then each block in the flush_list must also be present
+ in the flush_rbt. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ rnode = rbt_first(buf_pool->flush_rbt);
+ }
+
while (bpage != NULL) {
const ib_uint64_t om = bpage->oldest_modification;
ut_ad(bpage->in_flush_list);
ut_a(buf_page_in_file(bpage));
ut_a(om > 0);
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ ut_a(rnode);
+ buf_page_t* rpage = *rbt_value(buf_page_t*,
+ rnode);
+ ut_a(rpage);
+ ut_a(rpage == bpage);
+ rnode = rbt_next(buf_pool->flush_rbt, rnode);
+ }
+
bpage = UT_LIST_GET_NEXT(list, bpage);
ut_a(!bpage || om >= bpage->oldest_modification);
}
+ /* By this time we must have exhausted the traversal of
+ flush_rbt (if active) as well. */
+ ut_a(rnode == NULL);
+
return(TRUE);
}
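
The core of the buf0flu.c change is the pair buf_flush_insert_in_flush_rbt() / buf_flush_insert_sorted_into_flush_list(): instead of scanning the flush_list from the head, the sorted insert asks the red-black tree for the new page's predecessor and links the page right after it. The sketch below is a simplified standalone model of that lookup, assuming a single uint64_t key and hypothetical names (struct flush_node, insert_with_pred); a plain unbalanced binary search tree stands in for InnoDB's balanced ib_rbt, and ties on the LSN (which ib_rbt breaks by space and offset) are ignored.

#include <stdint.h>
#include <stdlib.h>

/* Stand-in for a flush_rbt entry, keyed only on the LSN.  Its in-order
   traversal matches the flush_list order: largest oldest_modification
   first.  An unbalanced BST is used for brevity; InnoDB's ib_rbt is a
   balanced red-black tree, which is what keeps the descent O(log n). */
struct flush_node {
    uint64_t           oldest_modification;
    struct flush_node* left;
    struct flush_node* right;
};

/* Insert a dirty page's LSN and return the node after which the page
   should be linked into the flush_list (NULL means "add at the head").
   Each time the descent turns right, the node being passed over is the
   best predecessor seen so far; the final value is the same answer that
   rbt_insert() + rbt_prev() give buf_flush_insert_in_flush_rbt(). */
static struct flush_node*
insert_with_pred(struct flush_node** root, uint64_t lsn)
{
    struct flush_node*  pred = NULL;
    struct flush_node** link = root;

    while (*link != NULL) {
        if (lsn < (*link)->oldest_modification) {
            /* The new page's oldest modification is older, so it goes
               after this node in flush_list order (newest first). */
            pred = *link;
            link = &(*link)->right;
        } else {
            link = &(*link)->left;
        }
    }

    *link = calloc(1, sizeof(struct flush_node));
    if (*link != NULL) {
        (*link)->oldest_modification = lsn;
    }

    return(pred);
}

The caller then does exactly what buf_flush_insert_sorted_into_flush_list() does with prev_b: UT_LIST_INSERT_AFTER() when the predecessor is non-NULL, UT_LIST_ADD_FIRST() otherwise. The old linear while-loop survives only as a fallback for the rare window where the recovery thread has already freed the flush_rbt while an i/o handler thread is still hooking up its last page.
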
diff --git a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c
index 09878f789d4..9cfa02ba3ac 100644
--- a/storage/innodb_plugin/buf/buf0lru.c
+++ b/storage/innodb_plugin/buf/buf0lru.c
@@ -1530,26 +1530,8 @@ alloc:
if (b->state == BUF_BLOCK_ZIP_PAGE) {
buf_LRU_insert_zip_clean(b);
} else {
- buf_page_t* prev;
-
- ut_ad(b->in_flush_list);
- ut_d(bpage->in_flush_list = FALSE);
-
- prev = UT_LIST_GET_PREV(list, b);
- UT_LIST_REMOVE(list, buf_pool->flush_list, b);
-
- if (prev) {
- ut_ad(prev->in_flush_list);
- UT_LIST_INSERT_AFTER(
- list,
- buf_pool->flush_list,
- prev, b);
- } else {
- UT_LIST_ADD_FIRST(
- list,
- buf_pool->flush_list,
- b);
- }
+ /* Relocate on buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage, b);
}
bpage->zip.data = NULL;
diff --git a/storage/innodb_plugin/buf/buf0rea.c b/storage/innodb_plugin/buf/buf0rea.c
index dd98ea17eb5..a973b1b2d26 100644
--- a/storage/innodb_plugin/buf/buf0rea.c
+++ b/storage/innodb_plugin/buf/buf0rea.c
@@ -608,14 +608,14 @@ buf_read_recv_pages(
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
os_aio_simulated_wake_handler_threads();
- os_thread_sleep(500000);
+ os_thread_sleep(10000);
count++;
- if (count > 100) {
+ if (count > 1000) {
fprintf(stderr,
"InnoDB: Error: InnoDB has waited for"
- " 50 seconds for pending\n"
+ " 10 seconds for pending\n"
"InnoDB: reads to the buffer pool to"
" be finished.\n"
"InnoDB: Number of pending reads %lu,"