From aa3d29681e7ac0e005d04b2c18127746b7bd9f9e Mon Sep 17 00:00:00 2001 From: Inaam Rana Date: Tue, 19 Jul 2011 10:37:37 -0400 Subject: Bug 12356373 - PERFORMANCE REGRESSION FROM 5.1 TO 5.5 : GROUP BY: The title of the bug is a little confusing. The actual fix is to reintroduce random readahead inside InnoDB with a dynamic, global switch innodb_random_read_ahead [default = off]. Approved by: Sunny Bains rb://696 --- storage/innodb_plugin/ChangeLog | 7 ++ storage/innodb_plugin/buf/buf0buf.c | 6 +- storage/innodb_plugin/buf/buf0rea.c | 170 +++++++++++++++++++++++++++++ storage/innodb_plugin/handler/ha_innodb.cc | 8 ++ storage/innodb_plugin/include/buf0buf.h | 14 +++ storage/innodb_plugin/include/buf0buf.ic | 28 ++++- storage/innodb_plugin/include/srv0srv.h | 2 + storage/innodb_plugin/srv/srv0srv.c | 4 + 8 files changed, 232 insertions(+), 7 deletions(-) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 853ae782a44..617df0a61fe 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,10 @@ +2011-07-19 The InnoDB Team + + * buf/buf0buf.c, buf/buf0rea.c, handler/ha_innodb.cc, + include/buf0buf.h, include/buf0buf.ic, include/srv0srv.h, + srv/srv0srv.c: + Fix bug#Bug 12356373 by reintroducing random readahead + 2011-06-30 The InnoDB Team * row/row0row.c: diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index c0da7dbdbe5..cd1461d22b7 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -3590,12 +3590,16 @@ buf_print_io( /* Statistics about read ahead algorithm */ fprintf(file, "Pages read ahead %.2f/s," - " evicted without access %.2f/s\n", + " evicted without access %.2f/s," + " Random read ahead %.2f/s\n", (buf_pool->stat.n_ra_pages_read - buf_pool->old_stat.n_ra_pages_read) / time_elapsed, (buf_pool->stat.n_ra_pages_evicted - buf_pool->old_stat.n_ra_pages_evicted) + / time_elapsed, + (buf_pool->stat.n_ra_pages_read_rnd + - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed); /* Print some values to help us with visualizing what is diff --git a/storage/innodb_plugin/buf/buf0rea.c b/storage/innodb_plugin/buf/buf0rea.c index 81f788baac2..0b36eeb5cd3 100644 --- a/storage/innodb_plugin/buf/buf0rea.c +++ b/storage/innodb_plugin/buf/buf0rea.c @@ -38,6 +38,14 @@ Created 11/5/1995 Heikki Tuuri #include "srv0start.h" #include "srv0srv.h" +/** The size in blocks of the area where the random read-ahead algorithm counts +the accessed pages when deciding whether to read-ahead */ +#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA + +/** There must be at least this many pages in buf_pool in the area to start +a random read-ahead */ +#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + BUF_READ_AHEAD_RANDOM_AREA / 8) + /** The linear read-ahead area size */ #define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA @@ -157,6 +165,165 @@ buf_read_page_low( return(1); } +/********************************************************************//** +Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. +@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! */ +static +ulint +buf_read_ahead_random( +/*==================*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint offset) /*!< in: page number of a page which the current thread + wants to access */ +{ + ib_int64_t tablespace_version; + ulint recent_blocks = 0; + ulint count; + ulint ibuf_mode; + ulint low, high; + ulint err; + ulint i; + ulint buf_read_ahead_random_area; + + if (!srv_random_read_ahead) { + /* Disabled by user */ + return(0); + } + + if (srv_startup_is_before_trx_rollback_phase) { + /* No read-ahead to avoid thread deadlocks */ + return(0); + } + + if (ibuf_bitmap_page(zip_size, offset) + || trx_sys_hdr_page(space, offset)) { + + /* If it is an ibuf bitmap page or trx sys hdr, we do + no read-ahead, as that could break the ibuf page access + order */ + + return(0); + } + + /* Remember the tablespace version before we ask the tablespace size + below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + do not try to read outside the bounds of the tablespace! */ + + tablespace_version = fil_space_get_version(space); + + buf_read_ahead_random_area = BUF_READ_AHEAD_RANDOM_AREA; + + low = (offset / buf_read_ahead_random_area) + * buf_read_ahead_random_area; + high = (offset / buf_read_ahead_random_area + 1) + * buf_read_ahead_random_area; + if (high > fil_space_get_size(space)) { + + high = fil_space_get_size(space); + } + + buf_pool_mutex_enter(); + + if (buf_pool->n_pend_reads + > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + buf_pool_mutex_exit(); + + return(0); + } + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + + for (i = low; i < high; i++) { + const buf_page_t* bpage = buf_page_hash_get(space, i); + + if (bpage + && buf_page_is_accessed(bpage) + && buf_page_peek_if_young(bpage)) { + + recent_blocks++; + + if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD) { + + buf_pool_mutex_exit(); + goto read_ahead; + } + } + } + + buf_pool_mutex_exit(); + /* Do nothing */ + return(0); + +read_ahead: + /* Read all the suitable blocks within the area */ + + if (ibuf_inside()) { + ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; + } else { + ibuf_mode = BUF_READ_ANY_PAGE; + } + + count = 0; + + for (i = low; i < high; i++) { + /* It is only sensible to do read-ahead in the non-sync aio + mode: hence FALSE as the first parameter */ + + if (!ibuf_bitmap_page(zip_size, i)) { + count += buf_read_page_low( + &err, FALSE, + ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, + space, zip_size, FALSE, + tablespace_version, i); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: in random" + " readahead trying to access\n" + "InnoDB: tablespace %lu page %lu,\n" + "InnoDB: but the tablespace does not" + " exist or is just being dropped.\n", + (ulong) space, (ulong) i); + } + } + } + + /* In simulated aio we wake the aio handler threads only after + queuing all aio requests, in native aio the following call does + nothing: */ + + os_aio_simulated_wake_handler_threads(); + +#ifdef UNIV_DEBUG + if (buf_debug_prints && (count > 0)) { + fprintf(stderr, + "Random read-ahead space %lu offset %lu pages %lu\n", + (ulong) space, (ulong) offset, + (ulong) count); + } +#endif /* UNIV_DEBUG */ + + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + + buf_pool->stat.n_ra_pages_read_rnd += count; + return(count); +} + + /********************************************************************//** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets @@ -175,6 +342,9 @@ buf_read_page( ulint count; ulint err; + count = buf_read_ahead_random(space, zip_size, offset); + srv_buf_pool_reads += count; + tablespace_version = fil_space_get_version(space); /* We do the i/o in the synchronous aio mode to save thread diff --git a/storage/innodb_plugin/handler/ha_innodb.cc b/storage/innodb_plugin/handler/ha_innodb.cc index a9c9f379528..6f88ea3fde1 100644 --- a/storage/innodb_plugin/handler/ha_innodb.cc +++ b/storage/innodb_plugin/handler/ha_innodb.cc @@ -501,6 +501,8 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_buffer_pool_pages_misc, SHOW_LONG}, {"buffer_pool_pages_total", (char*) &export_vars.innodb_buffer_pool_pages_total, SHOW_LONG}, + {"buffer_pool_read_ahead_rnd", + (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG}, {"buffer_pool_read_ahead", (char*) &export_vars.innodb_buffer_pool_read_ahead, SHOW_LONG}, {"buffer_pool_read_ahead_evicted", @@ -11027,6 +11029,11 @@ static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, NULL, NULL, 0, 0, 1, 0); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ +static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead, + PLUGIN_VAR_NOCMDARG, + "Whether to use read ahead for random access within an extent.", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold, PLUGIN_VAR_RQCMDARG, "Number of pages that must be accessed sequentially for InnoDB to " @@ -11091,6 +11098,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG MYSQL_SYSVAR(change_buffering_debug), #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + MYSQL_SYSVAR(random_read_ahead), MYSQL_SYSVAR(read_ahead_threshold), MYSQL_SYSVAR(io_capacity), NULL diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index 86c47a6edba..9856bfce409 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -427,6 +427,18 @@ buf_block_get_freed_page_clock( __attribute__((pure)); /********************************************************************//** +Tells if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@return TRUE if block is close to MRU end of LRU */ +UNIV_INLINE +ibool +buf_page_peek_if_young( +/*===================*/ + const buf_page_t* bpage); /*!< in: block */ +/********************************************************************//** Recommends a move of a block to the start of the LRU list if there is danger of dropping from the buffer pool. NOTE: does not reserve the buffer pool mutex. @@ -1334,6 +1346,8 @@ struct buf_pool_stat_struct{ ulint n_pages_written;/*!< number write operations */ ulint n_pages_created;/*!< number of pages created in the pool with no read */ + ulint n_ra_pages_read_rnd;/*!< number of pages read in + as part of random read ahead */ ulint n_ra_pages_read;/*!< number of pages read in as part of read ahead */ ulint n_ra_pages_evicted;/*!< number of read ahead diff --git a/storage/innodb_plugin/include/buf0buf.ic b/storage/innodb_plugin/include/buf0buf.ic index 35c63f034f5..2f3d65e80bd 100644 --- a/storage/innodb_plugin/include/buf0buf.ic +++ b/storage/innodb_plugin/include/buf0buf.ic @@ -61,6 +61,27 @@ buf_block_get_freed_page_clock( return(buf_page_get_freed_page_clock(&block->page)); } +/********************************************************************//** +Tells if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@return TRUE if block is close to MRU end of LRU */ +UNIV_INLINE +ibool +buf_page_peek_if_young( +/*===================*/ + const buf_page_t* bpage) /*!< in: block */ +{ + /* FIXME: bpage->freed_page_clock is 31 bits */ + return((buf_pool->freed_page_clock & ((1UL << 31) - 1)) + < ((ulint) bpage->freed_page_clock + + (buf_pool->curr_size + * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio) + / (BUF_LRU_OLD_RATIO_DIV * 4)))); +} + /********************************************************************//** Recommends a move of a block to the start of the LRU list if there is danger of dropping from the buffer pool. NOTE: does not reserve the buffer pool @@ -89,12 +110,7 @@ buf_page_peek_if_too_old( buf_pool->stat.n_pages_not_made_young++; return(FALSE); } else { - /* FIXME: bpage->freed_page_clock is 31 bits */ - return((buf_pool->freed_page_clock & ((1UL << 31) - 1)) - > ((ulint) bpage->freed_page_clock - + (buf_pool->curr_size - * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio) - / (BUF_LRU_OLD_RATIO_DIV * 4)))); + return(!buf_page_peek_if_young(bpage)); } } diff --git a/storage/innodb_plugin/include/srv0srv.h b/storage/innodb_plugin/include/srv0srv.h index 91ae895040c..7c63a5f0d45 100644 --- a/storage/innodb_plugin/include/srv0srv.h +++ b/storage/innodb_plugin/include/srv0srv.h @@ -143,6 +143,7 @@ extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; extern ulint srv_n_file_io_threads; +extern my_bool srv_random_read_ahead; extern ulong srv_read_ahead_threshold; extern ulint srv_n_read_io_threads; extern ulint srv_n_write_io_threads; @@ -618,6 +619,7 @@ struct export_var_struct{ ulint innodb_buffer_pool_wait_free; /*!< srv_buf_pool_wait_free */ ulint innodb_buffer_pool_pages_flushed; /*!< srv_buf_pool_flushed */ ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */ + ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */ ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */ ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/ ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */ diff --git a/storage/innodb_plugin/srv/srv0srv.c b/storage/innodb_plugin/srv/srv0srv.c index b1fc1ac67fd..8ad4c02e322 100644 --- a/storage/innodb_plugin/srv/srv0srv.c +++ b/storage/innodb_plugin/srv/srv0srv.c @@ -203,6 +203,8 @@ UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX; UNIV_INTERN ulint srv_n_read_io_threads = ULINT_MAX; UNIV_INTERN ulint srv_n_write_io_threads = ULINT_MAX; +/* Switch to enable random read ahead. */ +UNIV_INTERN my_bool srv_random_read_ahead = FALSE; /* User settable value of the number of pages that must be present in the buffer cache and accessed sequentially for InnoDB to trigger a readahead request. */ @@ -1906,6 +1908,8 @@ srv_export_innodb_status(void) export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free; export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed; export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads; + export_vars.innodb_buffer_pool_read_ahead_rnd + = buf_pool->stat.n_ra_pages_read_rnd; export_vars.innodb_buffer_pool_read_ahead = buf_pool->stat.n_ra_pages_read; export_vars.innodb_buffer_pool_read_ahead_evicted -- cgit v1.2.1