Diffstat (limited to 'storage/xtradb/buf')
-rw-r--r--   storage/xtradb/buf/buf0buddy.c |   9
-rw-r--r--   storage/xtradb/buf/buf0buf.c   | 101
-rw-r--r--   storage/xtradb/buf/buf0flu.c   | 284
-rw-r--r--   storage/xtradb/buf/buf0lru.c   | 141
-rw-r--r--   storage/xtradb/buf/buf0rea.c   |  54
5 files changed, 478 insertions, 111 deletions
diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c index d5e45745757..e4a79026d3a 100644 --- a/storage/xtradb/buf/buf0buddy.c +++ b/storage/xtradb/buf/buf0buddy.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -430,6 +430,8 @@ buf_buddy_relocate_block( } mutex_exit(&flush_list_mutex); + UNIV_MEM_INVALID(bpage, sizeof *bpage); + mutex_exit(&buf_pool_zip_mutex); mutex_exit(&zip_free_mutex); return(TRUE); @@ -567,7 +569,12 @@ success: } } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) { /* This must be a buf_page_t object. */ +#if UNIV_WORD_SIZE == 4 + /* On 32-bit systems, there is no padding in + buf_page_t. On other systems, Valgrind could complain + about uninitialized pad bytes. */ UNIV_MEM_ASSERT_RW(src, size); +#endif mutex_exit(&zip_free_mutex); diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c index 79a9488e339..a5a9e1d9004 100644 --- a/storage/xtradb/buf/buf0buf.c +++ b/storage/xtradb/buf/buf0buf.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -277,6 +277,8 @@ the read requests for the whole area. #ifndef UNIV_HOTBACKUP /** Value in microseconds */ static const int WAIT_FOR_READ = 5000; +/** Number of attemtps made to read in a page in the buffer pool */ +static const ulint BUF_PAGE_READ_MAX_RETRIES = 100; /** The buffer buf_pool of the database */ UNIV_INTERN buf_pool_t* buf_pool = NULL; @@ -826,7 +828,7 @@ buf_chunk_init( buf_block_init(block, frame); -#ifdef HAVE_purify +#ifdef HAVE_valgrind /* Wipe contents of frame to eliminate a Purify warning */ memset(block->frame, '\0', UNIV_PAGE_SIZE); #endif @@ -1150,7 +1152,9 @@ buf_pool_drop_hash_index(void) when we have an x-latch on btr_search_latch; see the comment in buf0buf.h */ - if (!block->is_hashed) { + if (buf_block_get_state(block) + != BUF_BLOCK_FILE_PAGE + || !block->is_hashed) { continue; } @@ -1283,8 +1287,6 @@ buf_relocate( HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage); HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage); - - UNIV_MEM_INVALID(bpage, sizeof *bpage); } /********************************************************************//** @@ -1980,14 +1982,14 @@ buf_zip_decompress( buf_block_t* block, /*!< in/out: block */ ibool check) /*!< in: TRUE=verify the page checksum */ { - const byte* frame = block->page.zip.data; + const byte* frame = block->page.zip.data; + ulint stamp_checksum = mach_read_from_4( + frame + FIL_PAGE_SPACE_OR_CHKSUM); ut_ad(buf_block_get_zip_size(block)); ut_a(buf_block_get_space(block) != 0); - if (UNIV_LIKELY(check)) { - ulint stamp_checksum = mach_read_from_4( - frame + FIL_PAGE_SPACE_OR_CHKSUM); + if (UNIV_LIKELY(check && stamp_checksum != BUF_NO_CHECKSUM_MAGIC)) { ulint calc_checksum = page_zip_calc_checksum( frame, page_zip_get_size(&block->page.zip)); @@ -2196,6 +2198,7 @@ buf_page_get_gen( unsigned access_time; ulint fix_type; ibool must_read; + ulint retries = 0; mutex_t* block_mutex; trx_t* trx = NULL; 
ulint sec; @@ -2204,6 +2207,7 @@ buf_page_get_gen( ib_uint64_t finish_time; ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH) || (rw_latch == RW_NO_LATCH)); @@ -2271,7 +2275,29 @@ loop2: return(NULL); } - buf_read_page(space, zip_size, offset, trx); + if (buf_read_page(space, zip_size, offset, trx)) { + retries = 0; + } else if (retries < BUF_PAGE_READ_MAX_RETRIES) { + ++retries; + } else { + fprintf(stderr, "InnoDB: Error: Unable" + " to read tablespace %lu page no" + " %lu into the buffer pool after" + " %lu attempts\n" + "InnoDB: The most probable cause" + " of this error may be that the" + " table has been corrupted.\n" + "InnoDB: You can try to fix this" + " problem by using" + " innodb_force_recovery.\n" + "InnoDB: Please see reference manual" + " for more details.\n" + "InnoDB: Aborting...\n", + space, offset, + BUF_PAGE_READ_MAX_RETRIES); + + ut_error; + } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(++buf_dbg_counter % 37 || buf_validate()); @@ -2414,22 +2440,8 @@ wait_until_unfixed: ut_ad(!block->page.in_flush_list); } else { /* Relocate buf_pool->flush_list. */ - buf_page_t* b; - - b = UT_LIST_GET_PREV(flush_list, &block->page); - ut_ad(block->page.in_flush_list); - UT_LIST_REMOVE(flush_list, buf_pool->flush_list, - &block->page); - - if (b) { - UT_LIST_INSERT_AFTER( - flush_list, buf_pool->flush_list, b, - &block->page); - } else { - UT_LIST_ADD_FIRST( - flush_list, buf_pool->flush_list, - &block->page); - } + buf_flush_relocate_on_flush_list(bpage, + &block->page); } mutex_exit(&flush_list_mutex); @@ -2447,6 +2459,9 @@ wait_until_unfixed: block->page.buf_fix_count = 1; buf_block_set_io_fix(block, BUF_IO_READ); rw_lock_x_lock(&block->lock); + + UNIV_MEM_INVALID(bpage, sizeof *bpage); + mutex_exit(block_mutex); mutex_exit(&buf_pool_zip_mutex); @@ -2461,8 +2476,9 @@ wait_until_unfixed: /* Decompress the page and apply buffered operations while not holding buf_pool_mutex or block->mutex. */ success = buf_zip_decompress(block, srv_use_checksums); + ut_a(success); - if (UNIV_LIKELY(success)) { + if (UNIV_LIKELY(!recv_no_ibuf_operations)) { ibuf_merge_or_delete_for_page(block, space, offset, zip_size, TRUE); } @@ -2478,14 +2494,6 @@ wait_until_unfixed: buf_pool->n_pend_unzip--; mutex_exit(&buf_pool_mutex); rw_lock_x_unlock(&block->lock); - - if (UNIV_UNLIKELY(!success)) { - - //buf_pool_mutex_exit(); - mutex_exit(block_mutex); - return(NULL); - } - break; case BUF_BLOCK_ZIP_FREE: @@ -2500,7 +2508,12 @@ wait_until_unfixed: ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); //mutex_enter(&block->mutex); +#if UNIV_WORD_SIZE == 4 + /* On 32-bit systems, there is no padding in buf_page_t. On + other systems, Valgrind could complain about uninitialized pad + bytes. */ UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page); +#endif buf_block_buf_fix_inc(block, file, line); @@ -2603,8 +2616,8 @@ page. 
@return TRUE if success */ UNIV_INTERN ibool -buf_page_optimistic_get_func( -/*=========================*/ +buf_page_optimistic_get( +/*====================*/ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ buf_block_t* block, /*!< in: guessed buffer block */ ib_uint64_t modify_clock,/*!< in: modify clock value if mode is @@ -2618,7 +2631,9 @@ buf_page_optimistic_get_func( ulint fix_type; trx_t* trx = NULL; - ut_ad(mtr && block); + ut_ad(block); + ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); mutex_enter(&block->mutex); @@ -2738,6 +2753,7 @@ buf_page_get_known_nowait( trx_t* trx = NULL; ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); mutex_enter(&block->mutex); @@ -2846,6 +2862,9 @@ buf_page_try_get_func( ibool success; ulint fix_type; + ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); + //buf_pool_mutex_enter(); rw_lock_s_lock(&page_hash_latch); block = buf_block_hash_get(space_id, page_no); @@ -3249,6 +3268,7 @@ buf_page_create( ulint time_ms = ut_time_ms(); ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); ut_ad(space || !zip_size); free_block = buf_LRU_get_free_block(0); @@ -3431,7 +3451,8 @@ buf_page_io_complete( read_space_id = mach_read_from_4( frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - if (bpage->space == TRX_SYS_SPACE + if ((bpage->space == TRX_SYS_SPACE + || (srv_doublewrite_file && bpage->space == TRX_DOUBLEWRITE_SPACE)) && trx_doublewrite_page_inside(bpage->offset)) { ut_print_timestamp(stderr); @@ -3503,7 +3524,7 @@ corrupt: REFMAN "forcing-recovery.html\n" "InnoDB: about forcing recovery.\n", stderr); - if (srv_pass_corrupt_table && bpage->space > 0 + if (srv_pass_corrupt_table && !trx_sys_sys_space(bpage->space) && bpage->space < SRV_LOG_SPACE_FIRST_ID) { fprintf(stderr, "InnoDB: space %u will be treated as corrupt.\n", diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c index 1735f6ac3cb..17588475bbf 100644 --- a/storage/xtradb/buf/buf0flu.c +++ b/storage/xtradb/buf/buf0flu.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -88,6 +88,146 @@ buf_flush_validate_low(void); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ /********************************************************************//** +Insert a block in the flush_rbt and returns a pointer to its +predecessor or NULL if no predecessor. The ordering is maintained +on the basis of the <oldest_modification, space, offset> key. +@return pointer to the predecessor or NULL if no predecessor. */ +static +buf_page_t* +buf_flush_insert_in_flush_rbt( +/*==========================*/ + buf_page_t* bpage) /*!< in: bpage to be inserted. */ +{ + buf_page_t* prev = NULL; + const ib_rbt_node_t* c_node; + const ib_rbt_node_t* p_node; + + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&flush_list_mutex)); + + /* Insert this buffer into the rbt. */ + c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage); + ut_a(c_node != NULL); + + /* Get the predecessor. 
*/ + p_node = rbt_prev(buf_pool->flush_rbt, c_node); + + if (p_node != NULL) { + prev = *rbt_value(buf_page_t*, p_node); + ut_a(prev != NULL); + } + + return(prev); +} + +/********************************************************************//** +Delete a bpage from the flush_rbt. */ +static +void +buf_flush_delete_from_flush_rbt( +/*============================*/ + buf_page_t* bpage) /*!< in: bpage to be removed. */ +{ + + ibool ret = FALSE; + + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&flush_list_mutex)); + ret = rbt_delete(buf_pool->flush_rbt, &bpage); + ut_ad(ret); +} + +/********************************************************************//** +Compare two modified blocks in the buffer pool. The key for comparison +is: +key = <oldest_modification, space, offset> +This comparison is used to maintian ordering of blocks in the +buf_pool->flush_rbt. +Note that for the purpose of flush_rbt, we only need to order blocks +on the oldest_modification. The other two fields are used to uniquely +identify the blocks. +@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */ +static +int +buf_flush_block_cmp( +/*================*/ + const void* p1, /*!< in: block1 */ + const void* p2) /*!< in: block2 */ +{ + int ret; + const buf_page_t* b1; + const buf_page_t* b2; + + ut_ad(p1 != NULL); + ut_ad(p2 != NULL); + + b1 = *(const buf_page_t**) p1; + b2 = *(const buf_page_t**) p2; + + ut_ad(b1 != NULL); + ut_ad(b2 != NULL); + + ut_ad(b1->in_flush_list); + ut_ad(b2->in_flush_list); + + if (b2->oldest_modification + > b1->oldest_modification) { + return(1); + } + + if (b2->oldest_modification + < b1->oldest_modification) { + return(-1); + } + + /* If oldest_modification is same then decide on the space. */ + ret = (int)(b2->space - b1->space); + + /* Or else decide ordering on the offset field. */ + return(ret ? ret : (int)(b2->offset - b1->offset)); +} + +/********************************************************************//** +Initialize the red-black tree to speed up insertions into the flush_list +during recovery process. Should be called at the start of recovery +process before any page has been read/written. */ +UNIV_INTERN +void +buf_flush_init_flush_rbt(void) +/*==========================*/ +{ + //buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); + + /* Create red black tree for speedy insertions in flush list. */ + buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*), + buf_flush_block_cmp); + //buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); +} + +/********************************************************************//** +Frees up the red-black tree. */ +UNIV_INTERN +void +buf_flush_free_flush_rbt(void) +/*==========================*/ +{ + //buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_low()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + rbt_free(buf_pool->flush_rbt); + buf_pool->flush_rbt = NULL; + + //buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); +} + +/********************************************************************//** Inserts a modified block into the flush list. */ UNIV_INTERN void @@ -102,6 +242,13 @@ buf_flush_insert_into_flush_list( || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification <= block->page.oldest_modification)); + /* If we are in the recovery then we need to update the flush + red-black tree as well. 
*/ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_insert_sorted_into_flush_list(block); + return; + } + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.in_LRU_list); ut_ad(block->page.in_page_hash); @@ -140,26 +287,27 @@ buf_flush_insert_sorted_into_flush_list( ut_d(block->page.in_flush_list = TRUE); prev_b = NULL; - b = UT_LIST_GET_FIRST(buf_pool->flush_list); - if (srv_fast_recovery) { - /* speed hack */ - if (b == NULL || b->oldest_modification < block->page.oldest_modification) { - UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page); + /* For the most part when this function is called the flush_rbt + should not be NULL. In a very rare boundary case it is possible + that the flush_rbt has already been freed by the recovery thread + before the last page was hooked up in the flush_list by the + io-handler thread. In that case we'll just do a simple + linear search in the else block. */ + if (buf_pool->flush_rbt) { + + prev_b = buf_flush_insert_in_flush_rbt(&block->page); + } else { - b = UT_LIST_GET_LAST(buf_pool->flush_list); - if (b->oldest_modification < block->page.oldest_modification) { - /* align oldest_modification not to sort */ - block->page.oldest_modification = b->oldest_modification; + + b = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (b && b->oldest_modification + > block->page.oldest_modification) { + ut_ad(b->in_flush_list); + prev_b = b; + b = UT_LIST_GET_NEXT(flush_list, b); } - UT_LIST_ADD_LAST(flush_list, buf_pool->flush_list, &block->page); - } - } else { - /* normal */ - while (b && b->oldest_modification > block->page.oldest_modification) { - ut_ad(b->in_flush_list); - prev_b = b; - b = UT_LIST_GET_NEXT(flush_list, b); } if (prev_b == NULL) { @@ -168,7 +316,6 @@ buf_flush_insert_sorted_into_flush_list( UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b, &block->page); } - } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_flush_validate_low()); @@ -262,7 +409,6 @@ buf_flush_remove( mutex_enter(&flush_list_mutex); ut_ad(bpage->in_flush_list); - ut_d(bpage->in_flush_list = FALSE); switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_PAGE: @@ -285,6 +431,15 @@ buf_flush_remove( break; } + /* If the flush_rbt is active then delete from it as well. */ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_delete_from_flush_rbt(bpage); + } + + /* Must be done after we have removed it from the flush_rbt + because we assert on in_flush_list in comparison function. */ + ut_d(bpage->in_flush_list = FALSE); + bpage->oldest_modification = 0; ut_d(UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list, @@ -293,6 +448,64 @@ buf_flush_remove( } /********************************************************************//** +Relocates a buffer control block on the flush_list. +Note that it is assumed that the contents of bpage has already been +copied to dpage. */ +UNIV_INTERN +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage) /*!< in/out: destination block */ +{ + buf_page_t* prev; + buf_page_t* prev_b = NULL; + + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&flush_list_mutex)); + + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + ut_ad(bpage->in_flush_list); + ut_ad(dpage->in_flush_list); + + /* If recovery is active we must swap the control blocks in + the flush_rbt as well. 
*/ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_delete_from_flush_rbt(bpage); + prev_b = buf_flush_insert_in_flush_rbt(dpage); + } + + /* Must be done after we have removed it from the flush_rbt + because we assert on in_flush_list in comparison function. */ + ut_d(bpage->in_flush_list = FALSE); + + prev = UT_LIST_GET_PREV(flush_list, bpage); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); + + if (prev) { + ut_ad(prev->in_flush_list); + UT_LIST_INSERT_AFTER( + flush_list, + buf_pool->flush_list, + prev, dpage); + } else { + UT_LIST_ADD_FIRST( + flush_list, + buf_pool->flush_list, + dpage); + } + + /* Just an extra check. Previous in flush_list + should be the same control block as in flush_rbt. */ + ut_a(!buf_pool->flush_rbt || prev_b == prev); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_low()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +} + +/********************************************************************//** Updates the flush system data structures when a write is completed. */ UNIV_INTERN void @@ -452,7 +665,8 @@ corrupted_page: write_buf = trx_doublewrite->write_buf; i = 0; - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + fil_io(OS_FILE_WRITE, TRUE, + (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0, trx_doublewrite->block1, 0, len, (void*) write_buf, NULL); @@ -489,7 +703,8 @@ corrupted_page: + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE); - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + fil_io(OS_FILE_WRITE, TRUE, + (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0, trx_doublewrite->block2, 0, len, (void*) write_buf, NULL); @@ -519,7 +734,7 @@ corrupted_page: flush: /* Now flush the doublewrite buffer data to disk */ - fil_flush(TRX_SYS_SPACE); + fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE); /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer @@ -1473,24 +1688,45 @@ ibool buf_flush_validate_low(void) /*========================*/ { - buf_page_t* bpage; + buf_page_t* bpage; + const ib_rbt_node_t* rnode = NULL; UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list, ut_ad(ut_list_node_313->in_flush_list)); bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + /* If we are in recovery mode i.e.: flush_rbt != NULL + then each block in the flush_list must also be present + in the flush_rbt. */ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + rnode = rbt_first(buf_pool->flush_rbt); + } + while (bpage != NULL) { const ib_uint64_t om = bpage->oldest_modification; ut_ad(bpage->in_flush_list); //ut_a(buf_page_in_file(bpage)); /* optimistic */ ut_a(om > 0); + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + ut_a(rnode); + buf_page_t* rpage = *rbt_value(buf_page_t*, + rnode); + ut_a(rpage); + ut_a(rpage == bpage); + rnode = rbt_next(buf_pool->flush_rbt, rnode); + } + bpage = UT_LIST_GET_NEXT(flush_list, bpage); ut_a(!bpage || om >= bpage->oldest_modification); } + /* By this time we must have exhausted the traversal of + flush_rbt (if active) as well. */ + ut_a(rnode == NULL); + return(TRUE); } diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c index 58e2c23275b..65fdf342e4f 100644 --- a/storage/xtradb/buf/buf0lru.c +++ b/storage/xtradb/buf/buf0lru.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. 
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -371,21 +371,39 @@ scan_again: bpage = UT_LIST_GET_LAST(buf_pool->LRU); while (bpage != NULL) { - mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); buf_page_t* prev_bpage; + ibool prev_bpage_buf_fix = FALSE; ut_a(buf_page_in_file(bpage)); prev_bpage = UT_LIST_GET_PREV(LRU, bpage); - if (!block_mutex) { - bpage = prev_bpage; - continue; - } + /* bpage->space and bpage->io_fix are protected by + buf_pool_mutex and block_mutex. It is safe to check + them while holding buf_pool_mutex only. */ + + if (buf_page_get_space(bpage) != id) { + /* Skip this block, as it does not belong to + the space that is being invalidated. */ + } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + /* We cannot remove this page during this scan + yet; maybe the system is currently reading it + in, or flushing the modifications to the file */ + + all_freed = FALSE; + } else { + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); - if (buf_page_get_space(bpage) == id) { - if (bpage->buf_fix_count > 0 - || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + if (!block_mutex) { + /* It may be impossible case... + Something wrong, so will be scan_again */ + + all_freed = FALSE; + + goto next_page_no_mutex; + } + + if (bpage->buf_fix_count > 0) { /* We cannot remove this page during this scan yet; maybe the system is @@ -405,8 +423,40 @@ scan_again: (ulong) buf_page_get_page_no(bpage)); } #endif - if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE - && ((buf_block_t*) bpage)->is_hashed) { + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + /* This is a compressed-only block + descriptor. Ensure that prev_bpage + cannot be relocated when bpage is freed. */ + if (UNIV_LIKELY(prev_bpage != NULL)) { + switch (buf_page_get_state( + prev_bpage)) { + case BUF_BLOCK_FILE_PAGE: + /* Descriptors of uncompressed + blocks will not be relocated, + because we are holding the + buf_pool_mutex. */ + break; + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + /* Descriptors of compressed- + only blocks can be relocated, + unless they are buffer-fixed. + Because both bpage and + prev_bpage are protected by + buf_pool_zip_mutex, it is + not necessary to acquire + further mutexes. */ + ut_ad(&buf_pool_zip_mutex + == block_mutex); + ut_ad(mutex_own(block_mutex)); + prev_bpage_buf_fix = TRUE; + prev_bpage->buf_fix_count++; + break; + default: + ut_error; + } + } + } else if (((buf_block_t*) bpage)->is_hashed) { ulint page_no; ulint zip_size; @@ -432,7 +482,8 @@ scan_again: buf_flush_remove(bpage); } - /* Remove from the LRU list */ + /* Remove from the LRU list. */ + if (buf_LRU_block_remove_hashed_page(bpage, TRUE) != BUF_BLOCK_ZIP_FREE) { buf_LRU_block_free_hashed_page((buf_block_t*) @@ -444,18 +495,27 @@ scan_again: ut_ad(block_mutex == &buf_pool_zip_mutex); ut_ad(!mutex_own(block_mutex)); - /* The compressed block descriptor - (bpage) has been deallocated and - block_mutex released. Also, - buf_buddy_free() may have relocated - prev_bpage. Rescan the LRU list. */ + if (prev_bpage_buf_fix) { + /* We temporarily buffer-fixed + prev_bpage, so that + buf_buddy_free() could not + relocate it, in case it was a + compressed-only block + descriptor. 
*/ + + mutex_enter(block_mutex); + ut_ad(prev_bpage->buf_fix_count > 0); + prev_bpage->buf_fix_count--; + mutex_exit(block_mutex); + } - bpage = UT_LIST_GET_LAST(buf_pool->LRU); - continue; + goto next_page_no_mutex; } - } next_page: - mutex_exit(block_mutex); + mutex_exit(block_mutex); + } + +next_page_no_mutex: bpage = prev_bpage; } @@ -1425,7 +1485,12 @@ buf_LRU_free_block( ut_ad(buf_page_in_file(bpage)); //ut_ad(bpage->in_LRU_list); ut_ad(!bpage->in_flush_list == !bpage->oldest_modification); +#if UNIV_WORD_SIZE == 4 + /* On 32-bit systems, there is no padding in buf_page_t. On + other systems, Valgrind could complain about uninitialized pad + bytes. */ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage); +#endif if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) { @@ -1558,8 +1623,13 @@ not_freed: ut_ad(prev_b->in_LRU_list); ut_ad(buf_page_in_file(prev_b)); +#if UNIV_WORD_SIZE == 4 + /* On 32-bit systems, there is no + padding in buf_page_t. On other + systems, Valgrind could complain about + uninitialized pad bytes. */ UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b); - +#endif UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, prev_b, b); @@ -1600,26 +1670,8 @@ not_freed: if (b->state == BUF_BLOCK_ZIP_PAGE) { buf_LRU_insert_zip_clean(b); } else { - buf_page_t* prev; - - ut_ad(b->in_flush_list); - ut_d(bpage->in_flush_list = FALSE); - - prev = UT_LIST_GET_PREV(flush_list, b); - UT_LIST_REMOVE(flush_list, buf_pool->flush_list, b); - - if (prev) { - ut_ad(prev->in_flush_list); - UT_LIST_INSERT_AFTER( - flush_list, - buf_pool->flush_list, - prev, b); - } else { - UT_LIST_ADD_FIRST( - flush_list, - buf_pool->flush_list, - b); - } + /* Relocate on buf_pool->flush_list. */ + buf_flush_relocate_on_flush_list(bpage, b); } mutex_exit(&flush_list_mutex); @@ -1792,7 +1844,12 @@ buf_LRU_block_remove_hashed_page( ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); ut_a(bpage->buf_fix_count == 0); +#if UNIV_WORD_SIZE == 4 + /* On 32-bit systems, there is no padding in + buf_page_t. On other systems, Valgrind could complain + about uninitialized pad bytes. */ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage); +#endif buf_LRU_remove_block(bpage); diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.c index e5d04df797f..59de70d9a8a 100644 --- a/storage/xtradb/buf/buf0rea.c +++ b/storage/xtradb/buf/buf0rea.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -86,7 +86,9 @@ buf_read_page_low( wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; - if (trx_doublewrite && space == TRX_SYS_SPACE + if (trx_doublewrite + && (space == TRX_SYS_SPACE + || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE)) && ( (offset >= trx_doublewrite->block1 && offset < trx_doublewrite->block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) @@ -656,6 +658,50 @@ buf_read_recv_pages( /* It is a single table tablespace and the .ibd file is missing: do nothing */ + /* the log records should be treated here same reason + for http://bugs.mysql.com/bug.php?id=43948 */ + + if (recv_recovery_is_on()) { + recv_addr_t* recv_addr; + + mutex_enter(&(recv_sys->mutex)); + + if (recv_sys->apply_log_recs == FALSE) { + mutex_exit(&(recv_sys->mutex)); + goto not_to_recover; + } + + for (i = 0; i < n_stored; i++) { + /* recv_get_fil_addr_struct() */ + recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, + hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]), + recv_sys->addr_hash)); + while (recv_addr) { + if ((recv_addr->space == space) + && (recv_addr->page_no == page_nos[i])) { + break; + } + recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); + } + + if ((recv_addr == NULL) + || (recv_addr->state == RECV_BEING_PROCESSED) + || (recv_addr->state == RECV_PROCESSED)) { + continue; + } + + recv_addr->state = RECV_PROCESSED; + + ut_a(recv_sys->n_addrs); + recv_sys->n_addrs--; + } + + mutex_exit(&(recv_sys->mutex)); + + fprintf(stderr, " (cannot find space: %lu)", space); + } +not_to_recover: + return; } @@ -674,10 +720,10 @@ buf_read_recv_pages( count++; - if (count > 5000) { + if (count > 1000) { fprintf(stderr, "InnoDB: Error: InnoDB has waited for" - " 50 seconds for pending\n" + " 10 seconds for pending\n" "InnoDB: reads to the buffer pool to" " be finished.\n" "InnoDB: Number of pending reads %lu," |
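
The core of the buf0flu.c changes above is the recovery-time red-black tree (flush_rbt) keyed on <oldest_modification, space, offset>, which replaces the old linear / "fast recovery" insertion paths in buf_flush_insert_sorted_into_flush_list(). The standalone C sketch below only illustrates that ordering; the struct, the names and the qsort() driver are invented for illustration and are not InnoDB/XtraDB code, and the sign convention follows qsort() rather than the comparator documented in the patch (which returns > 0 when b2 > b1).

/* Simplified, standalone sketch (not the XtraDB code itself) of the
   three-field ordering that the new buf_flush_block_cmp() establishes
   for the recovery-time flush_rbt: blocks are ordered primarily by
   oldest_modification, with space id and page offset used only to
   make the key unique.  fake_page_t and the qsort() driver are
   illustrative stand-ins, not InnoDB types. */

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long lsn_t;

typedef struct {
	lsn_t		oldest_modification;	/* LSN of oldest unflushed change */
	unsigned long	space;			/* tablespace id */
	unsigned long	offset;			/* page number within the space */
} fake_page_t;

/* Same ordering idea as buf_flush_block_cmp(): compare on
   oldest_modification first, then space, then offset.  Note that the
   sign convention here is qsort()'s; the comparator in the patch
   documents the opposite sign for use with rbt_insert(). */
static int
fake_flush_block_cmp(const void *p1, const void *p2)
{
	const fake_page_t *b1 = p1;
	const fake_page_t *b2 = p2;

	if (b1->oldest_modification != b2->oldest_modification) {
		return (b1->oldest_modification
			< b2->oldest_modification) ? -1 : 1;
	}

	if (b1->space != b2->space) {
		return (b1->space < b2->space) ? -1 : 1;
	}

	if (b1->offset != b2->offset) {
		return (b1->offset < b2->offset) ? -1 : 1;
	}

	return 0;
}

int
main(void)
{
	/* A few dirty pages in arbitrary order; during recovery the
	   flush_rbt lets the insert position in the flush_list be
	   found without a linear scan. */
	fake_page_t pages[] = {
		{300, 0, 45},
		{100, 5, 10},
		{300, 0, 7},
		{100, 0, 99},
	};
	size_t n = sizeof(pages) / sizeof(pages[0]);
	size_t i;

	qsort(pages, n, sizeof(pages[0]), fake_flush_block_cmp);

	for (i = 0; i < n; i++) {
		printf("lsn=%llu space=%lu page=%lu\n",
		       pages[i].oldest_modification,
		       pages[i].space, pages[i].offset);
	}

	return 0;
}

Ordering primarily on oldest_modification is what lets recovery find the insert position in the flush_list in logarithmic time; as the patch's own comment notes, space and offset only break ties so that the tree key is unique.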
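
A second behavioural change worth noting is in buf0buf.c: buf_page_get_gen() now retries a failed synchronous read up to BUF_PAGE_READ_MAX_RETRIES (100) times and then aborts with ut_error instead of looping forever on an unreadable page. The sketch below is a simplified, self-contained illustration of that control flow; try_read_page() is a made-up stand-in for buf_read_page(), not a real InnoDB function.

/* Simplified sketch (not InnoDB code) of the bounded retry loop added
   to buf_page_get_gen(): a read that keeps failing is retried a fixed
   number of times and then treated as fatal corruption. */

#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>

#define PAGE_READ_MAX_RETRIES 100	/* mirrors BUF_PAGE_READ_MAX_RETRIES */

/* Made-up read primitive: pretend the read always fails, so that the
   retry path is exercised. */
static bool
try_read_page(unsigned long space, unsigned long offset)
{
	(void) space;
	(void) offset;
	return false;
}

int
main(void)
{
	unsigned long	space = 42;
	unsigned long	offset = 7;
	unsigned	retries = 0;

	while (!try_read_page(space, offset)) {
		if (++retries > PAGE_READ_MAX_RETRIES) {
			fprintf(stderr,
				"Unable to read space %lu page %lu after %d"
				" attempts; giving up (the patch calls"
				" ut_error here)\n",
				space, offset, PAGE_READ_MAX_RETRIES);
			return EXIT_FAILURE;
		}
	}

	puts("page read");
	return EXIT_SUCCESS;
}

In the real code a successful read resets the retry counter and the function loops back to re-check the page hash; the sketch collapses that into a single loop.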