diff options
author | Sergei Golubchik <sergii@pisem.net> | 2012-10-16 10:36:28 +0200 |
---|---|---|
committer | Sergei Golubchik <sergii@pisem.net> | 2012-10-16 10:36:28 +0200 |
commit | d9a8799205d160688f81362356dd2323eb8a91ea (patch) | |
tree | a25584d7adfc190bb0312b6d10cdcb62e17a20d1 /storage | |
parent | abefaab57b4b884b74ff9bd3c63f86c018d0e5de (diff) | |
parent | 96d3a797eedfe9304cc6416c7d71c7e543695870 (diff) | |
download | mariadb-git-d9a8799205d160688f81362356dd2323eb8a91ea.tar.gz |
XtraDB 1.1.8-29.0
Diffstat (limited to 'storage')
-rw-r--r-- | storage/xtradb/CMakeLists.txt | 31 | ||||
-rw-r--r-- | storage/xtradb/btr/btr0sea.c | 9 | ||||
-rw-r--r-- | storage/xtradb/buf/buf0buf.c | 13 | ||||
-rw-r--r-- | storage/xtradb/buf/buf0lru.c | 112 | ||||
-rw-r--r-- | storage/xtradb/handler/ha_innodb.cc | 36 | ||||
-rw-r--r-- | storage/xtradb/handler/i_s.cc | 295 | ||||
-rw-r--r-- | storage/xtradb/handler/i_s.h | 1 | ||||
-rw-r--r-- | storage/xtradb/include/buf0lru.h | 11 | ||||
-rw-r--r-- | storage/xtradb/include/log0log.h | 5 | ||||
-rw-r--r-- | storage/xtradb/include/log0online.h | 111 | ||||
-rw-r--r-- | storage/xtradb/include/log0recv.h | 37 | ||||
-rw-r--r-- | storage/xtradb/include/os0file.h | 9 | ||||
-rw-r--r-- | storage/xtradb/include/os0sync.h | 28 | ||||
-rw-r--r-- | storage/xtradb/include/srv0srv.h | 23 | ||||
-rw-r--r-- | storage/xtradb/include/univ.i | 2 | ||||
-rw-r--r-- | storage/xtradb/include/ut0rbt.h | 22 | ||||
-rw-r--r-- | storage/xtradb/log/log0log.c | 126 | ||||
-rw-r--r-- | storage/xtradb/log/log0online.c | 1085 | ||||
-rw-r--r-- | storage/xtradb/log/log0recv.c | 8 | ||||
-rw-r--r-- | storage/xtradb/os/os0file.c | 20 | ||||
-rw-r--r-- | storage/xtradb/srv/srv0srv.c | 52 | ||||
-rw-r--r-- | storage/xtradb/srv/srv0start.c | 19 | ||||
-rw-r--r-- | storage/xtradb/ut/ut0rbt.c | 29 |
23 files changed, 2019 insertions, 65 deletions
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 1d2b0b29dea..4c098049fd4 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -91,12 +91,41 @@ IF(NOT CMAKE_CROSSCOMPILING) }" HAVE_IB_GCC_ATOMIC_BUILTINS ) + CHECK_C_SOURCE_RUNS( + " + #include <stdint.h> + int main() + { + int64_t x, y, res; + + x = 10; + y = 123; + res = __sync_bool_compare_and_swap(&x, x, y); + if (!res || x != y) { + return(1); + } + + x = 10; + y = 123; + res = __sync_add_and_fetch(&x, y); + if (res != 123 + 10 || x != 123 + 10) { + return(1); + } + + return(0); + }" + HAVE_IB_GCC_ATOMIC_BUILTINS_64 + ) ENDIF() IF(HAVE_IB_GCC_ATOMIC_BUILTINS) ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1) ENDIF() +IF(HAVE_IB_GCC_ATOMIC_BUILTINS_64) + ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS_64=1) +ENDIF() + # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not IF(NOT CMAKE_CROSSCOMPILING) CHECK_C_SOURCE_RUNS( @@ -240,7 +269,7 @@ SET(INNOBASE_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c ibuf/ibuf0ibuf.c pars/lexyy.c pars/pars0grm.c pars/pars0opt.c pars/pars0pars.c pars/pars0sym.c lock/lock0lock.c lock/lock0iter.c - log/log0log.c log/log0recv.c + log/log0log.c log/log0recv.c log/log0online.c mach/mach0data.c mem/mem0mem.c mem/mem0pool.c mtr/mtr0log.c mtr/mtr0mtr.c diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.c index 855ab62c42f..7e9449a6474 100644 --- a/storage/xtradb/btr/btr0sea.c +++ b/storage/xtradb/btr/btr0sea.c @@ -183,6 +183,15 @@ btr_search_sys_create( //rw_lock_create(btr_search_latch_key, &btr_search_latch, // SYNC_SEARCH_SYS); + /* PS bug lp:1018264 - Multiple hash index partitions causes overly + large hash index: When multiple adaptive hash index partitions are + specified, _each_ partition was being created with hash_size which + should be 1/64 of the total size of all buffer pools which is + incorrect and can cause overly high memory usage. 
hash_size + should be representing the _total_ size of all partitions, not the + individual size of each partition. */ + hash_size /= btr_search_index_num; + btr_search_sys = mem_alloc(sizeof(btr_search_sys_t)); /* btr_search_index_num should be <= 32. (bits of trx->has_search_latch) */ diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c index a2ff171e0c5..6ce77372dac 100644 --- a/storage/xtradb/buf/buf0buf.c +++ b/storage/xtradb/buf/buf0buf.c @@ -2838,6 +2838,7 @@ wait_until_unfixed: && ibuf_debug) { /* Try to evict the block from the buffer pool, to use the insert buffer (change buffer) as much as possible. */ + ulint page_no = buf_block_get_page_no(block); if (buf_LRU_free_block(&block->page, TRUE, FALSE)) { mutex_exit(block_mutex); @@ -2864,6 +2865,18 @@ wait_until_unfixed: "innodb_change_buffering_debug evict %u %u\n", (unsigned) space, (unsigned) offset); return(NULL); + } else if (UNIV_UNLIKELY(buf_block_get_state(block) + != BUF_BLOCK_FILE_PAGE + || (buf_block_get_page_no(block) != page_no) + || (buf_block_get_space(block) != space))) { + + /* buf_LRU_free_block temporarily releases the + block mutex, and now block points to something + else. */ + mutex_exit(block_mutex); + block = NULL; + goto loop2; + } else if (buf_flush_page_try(buf_pool, block)) { fprintf(stderr, "innodb_change_buffering_debug flush %u %u\n", diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c index 16a91358080..e8300107f3d 100644 --- a/storage/xtradb/buf/buf0lru.c +++ b/storage/xtradb/buf/buf0lru.c @@ -2531,6 +2531,14 @@ func_exit: Dump the LRU page list to the specific file. 
*/ #define LRU_DUMP_FILE "ib_lru_dump" #define LRU_DUMP_TEMP_FILE "ib_lru_dump.tmp" +#define LRU_OS_FILE_WRITE() \ + os_file_write(LRU_DUMP_FILE, dump_file, buffer, \ + (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, \ + (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), \ + buffer_size) +#define LRU_DUMP_PAGE_COUNT 1 /* Specifies how many dump pages + should be filled for each hold + of the LRU_list_mutex. */ UNIV_INTERN ibool @@ -2541,23 +2549,30 @@ buf_LRU_file_dump(void) ibool success; byte* buffer_base = NULL; byte* buffer = NULL; + const ulint buffer_size = LRU_DUMP_PAGE_COUNT * UNIV_PAGE_SIZE; buf_page_t* bpage; + buf_page_t* first_bpage; ulint buffers; ulint offset; - ibool ret = FALSE; + ulint pages_written; ulint i; + ulint total_pages; + + /* Sanity test to make sure page size is a multiple of + assumed dump record size */ + ut_a(UNIV_PAGE_SIZE % 8 == 0); for (i = 0; i < srv_n_data_files; i++) { if (strstr(srv_data_file_names[i], LRU_DUMP_FILE) != NULL) { fprintf(stderr, " InnoDB: The name '%s' seems to be used for" - " innodb_data_file_path. For safety, dumping of the LRU list" - " is not being done.\n", LRU_DUMP_FILE); + " innodb_data_file_path. 
Dumping LRU list is" + " not done for safeness.\n", LRU_DUMP_FILE); goto end; } } - buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE); + buffer_base = ut_malloc(UNIV_PAGE_SIZE + buffer_size); buffer = ut_align(buffer_base, UNIV_PAGE_SIZE); if (!buffer) { fprintf(stderr, @@ -2577,18 +2592,28 @@ buf_LRU_file_dump(void) } buffers = offset = 0; - for (i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; buf_pool = buf_pool_from_array(i); mutex_enter(&buf_pool->LRU_list_mutex); - bpage = UT_LIST_GET_LAST(buf_pool->LRU); + bpage = first_bpage = UT_LIST_GET_FIRST(buf_pool->LRU); + total_pages = UT_LIST_GET_LEN(buf_pool->LRU); - while (bpage != NULL) { - if (offset == 0) { - memset(buffer, 0, UNIV_PAGE_SIZE); + pages_written = 0; + while (bpage != NULL && (pages_written++ < total_pages)) { + + buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage); + + if (next_bpage == first_bpage) { + /* Do not release list mutex here, it will be + released just outside this while loop */ + fprintf(stderr, + "InnoDB: detected cycle in LRU for" + " buffer pool %lu, skipping to next" + " buffer pool.\n", i); + break; } mach_write_to_4(buffer + offset * 4, bpage->space); @@ -2596,52 +2621,71 @@ buf_LRU_file_dump(void) mach_write_to_4(buffer + offset * 4, bpage->offset); offset++; - if (offset == UNIV_PAGE_SIZE/4) { + ut_a(offset <= buffer_size); + if (offset == buffer_size/4) { + mutex_t *next_block_mutex = NULL; + if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { - success = 0; + mutex_exit(&buf_pool->LRU_list_mutex); + success = FALSE; fprintf(stderr, " InnoDB: stopped dumping lru" " pages because of server" " shutdown.\n"); + goto end; + } + + /* While writing file, release buffer pool + mutex but keep the next page fixed so we + don't worry about our list iterator becoming + invalid */ + if (next_bpage) { + next_block_mutex = buf_page_get_mutex( + next_bpage); + + mutex_enter(next_block_mutex); + next_bpage->buf_fix_count++; + mutex_exit(next_block_mutex); + } + 
mutex_exit(&buf_pool->LRU_list_mutex); + + success = LRU_OS_FILE_WRITE(); + + /* Grab this here so that next_bpage can't + be purged when we drop the fix_count */ + mutex_enter(&buf_pool->LRU_list_mutex); + + if (next_bpage) { + mutex_enter(next_block_mutex); + next_bpage->buf_fix_count--; + mutex_exit(next_block_mutex); } - success = os_file_write(LRU_DUMP_FILE, dump_file, buffer, - (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, - (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), - UNIV_PAGE_SIZE); + if (!success) { mutex_exit(&buf_pool->LRU_list_mutex); fprintf(stderr, - " InnoDB: cannot write page %lu of %s\n", + " InnoDB: cannot write page" + " %lu of %s\n", buffers, LRU_DUMP_FILE); goto end; } buffers++; offset = 0; - } - bpage = UT_LIST_GET_PREV(LRU, bpage); - } + bpage = next_bpage; + } else { + bpage = UT_LIST_GET_NEXT(LRU, bpage); + } + } /* while(bpage ...) */ mutex_exit(&buf_pool->LRU_list_mutex); - } - - if (offset == 0) { - memset(buffer, 0, UNIV_PAGE_SIZE); - } + } /* for(srv_buf_pool_instances ...) 
*/ mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL); offset++; mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL); offset++; - success = os_file_write(LRU_DUMP_FILE, dump_file, buffer, - (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, - (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), - UNIV_PAGE_SIZE); - if (!success) { - goto end; - } - - ret = TRUE; + success = LRU_OS_FILE_WRITE(); end: if (dump_file != (os_file_t) -1) { if (success) { @@ -2656,7 +2700,7 @@ end: if (buffer_base) ut_free(buffer_base); - return(ret); + return(success); } typedef struct { diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index d37bccc8150..32ec2d9d858 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -358,7 +358,8 @@ static PSI_thread_info all_innodb_threads[] = { {&srv_error_monitor_thread_key, "srv_error_monitor_thread", 0}, {&srv_monitor_thread_key, "srv_monitor_thread", 0}, {&srv_master_thread_key, "srv_master_thread", 0}, - {&srv_purge_thread_key, "srv_purge_thread", 0} + {&srv_purge_thread_key, "srv_purge_thread", 0}, + {&srv_log_tracking_thread_key, "srv_redo_log_follow_thread", 0} }; # endif /* UNIV_PFS_THREAD */ @@ -368,7 +369,8 @@ performance schema instrumented if "UNIV_PFS_IO" is defined */ static PSI_file_info all_innodb_files[] = { {&innodb_file_data_key, "innodb_data_file", 0}, {&innodb_file_log_key, "innodb_log_file", 0}, - {&innodb_file_temp_key, "innodb_temp_file", 0} + {&innodb_file_temp_key, "innodb_temp_file", 0}, + {&innodb_file_bmp_key, "innodb_bmp_file", 0} }; # endif /* UNIV_PFS_IO */ #endif /* HAVE_PSI_INTERFACE */ @@ -12629,9 +12631,9 @@ static MYSQL_SYSVAR_BOOL(use_sys_stats_table, innobase_use_sys_stats_table, "So you should use ANALYZE TABLE command intentionally.", NULL, NULL, FALSE); -#ifdef UNIV_DEBUG_never -static MYSQL_SYSVAR_ULONG(sys_stats_root_page, innobase_sys_stats_root_page, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, +#ifdef UNIV_DEBUG +static 
MYSQL_SYSVAR_ULONG(persistent_stats_root_page, + innobase_sys_stats_root_page, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Override the SYS_STATS root page id, 0 = no override (for testing only)", NULL, NULL, 0, 0, ULONG_MAX, 0); #endif @@ -12834,6 +12836,18 @@ static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method, "NULLS_UNEQUAL and NULLS_IGNORED", NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib); +static MYSQL_SYSVAR_BOOL(track_changed_pages, srv_track_changed_pages, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Track the redo log for changed pages and output a changed page bitmap", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONGLONG(changed_pages_limit, srv_changed_pages_limit, + PLUGIN_VAR_RQCMDARG, + "The maximum number of rows for " + "INFORMATION_SCHEMA.INNODB_CHANGED_PAGES table, " + "0 - unlimited", + NULL, NULL, 1000000, 0, ~0ULL, 0); + #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, PLUGIN_VAR_RQCMDARG, @@ -12998,7 +13012,7 @@ static MYSQL_SYSVAR_UINT(buffer_pool_restore_at_startup, srv_auto_lru_dump, static MYSQL_SYSVAR_BOOL(blocking_buffer_pool_restore, innobase_blocking_lru_restore, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, "Block XtraDB startup process until buffer pool is full restored from a " "dump file (if present). Disabled by default.", NULL, NULL, FALSE); @@ -13085,8 +13099,8 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(stats_auto_update), MYSQL_SYSVAR(stats_update_need_lock), MYSQL_SYSVAR(use_sys_stats_table), -#ifdef UNIV_DEBUG_never /* disable this flag. 
--innodb-sys-stats becomes ambiguous */ - MYSQL_SYSVAR(sys_stats_root_page), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(persistent_stats_root_page), #endif MYSQL_SYSVAR(stats_sample_pages), MYSQL_SYSVAR(adaptive_hash_index), @@ -13118,6 +13132,8 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(use_sys_malloc), MYSQL_SYSVAR(use_native_aio), MYSQL_SYSVAR(change_buffering), + MYSQL_SYSVAR(track_changed_pages), + MYSQL_SYSVAR(changed_pages_limit), #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG MYSQL_SYSVAR(change_buffering_debug), #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ @@ -13177,10 +13193,10 @@ i_s_innodb_index_stats, i_s_innodb_buffer_pool_pages, i_s_innodb_buffer_pool_pages_index, i_s_innodb_buffer_pool_pages_blob, -i_s_innodb_admin_command +i_s_innodb_admin_command, +i_s_innodb_changed_pages maria_declare_plugin_end; - /** @brief Initialize the default value of innodb_commit_concurrency. Once InnoDB is running, the innodb_commit_concurrency must not change diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc index 7af0c88c73a..57a091ea80d 100644 --- a/storage/xtradb/handler/i_s.cc +++ b/storage/xtradb/handler/i_s.cc @@ -22,6 +22,14 @@ InnoDB INFORMATION SCHEMA tables interface to MySQL. 
Created July 18, 2007 Vasil Dimov *******************************************************/ +#ifndef MYSQL_SERVER +#define MYSQL_SERVER /* For Item_* classes */ +#include <item.h> +/* Prevent influence of this definition to other headers */ +#undef MYSQL_SERVER +#else +#include <mysql_priv.h> +#endif //MYSQL_SERVER #include <ctype.h> /*toupper*/ #include <mysqld_error.h> @@ -45,6 +53,7 @@ extern "C" { #include "dict0mem.h" #include "dict0types.h" #include "ha_prototypes.h" /* for innobase_convert_name() */ +#include "srv0srv.h" /* for srv_track_changed_pages */ #include "srv0start.h" /* for srv_was_started */ #include "trx0i_s.h" #include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */ @@ -54,6 +63,7 @@ extern "C" { #include "dict0dict.h" /* for dict_sys */ #include "buf0lru.h" /* for XTRA_LRU_[DUMP/RESTORE] */ #include "btr0btr.h" /* for btr_page_get_index_id */ +#include "log0online.h" } #define OK(expr) \ @@ -5266,3 +5276,288 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_undo_logs = INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE }; + +static ST_FIELD_INFO i_s_innodb_changed_pages_info[] = +{ + {STRUCT_FLD(field_name, "space_id"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "page_id"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "start_lsn"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, 
"end_lsn"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*********************************************************************** + This function parses condition and gets upper bounds for start and end LSN's + if condition corresponds to certain pattern. + + We can't know right position to avoid scanning bitmap files from the beginning + to the lower bound. But we can stop scanning bitmap files if we reach upper bound. + + It's expected the most used queries will be like the following: + + SELECT * FROM INNODB_CHANGED_PAGES WHERE START_LSN > num1 AND start_lsn < num2; + + That's why the pattern is: + + pattern: comp | and_comp; + comp: lsn < int_num | lsn <= int_num | int_num > lsn | int_num >= lsn; + lsn: start_lsn | end_lsn; + and_comp: some_expression AND some_expression | some_expression AND and_comp; + some_expression: comp | any_other_expression; + + Suppose the condition is start_lsn < 100, this means we have to read all + blocks with start_lsn < 100. Which is equivalent to reading all the blocks + with end_lsn <= 99, or just end_lsn < 100. That's why it's enough to find + maximum lsn value, doesn't matter if this is start or end lsn and compare + it with "start_lsn" field. + + Example: + + SELECT * FROM INNODB_CHANGED_PAGES + WHERE + start_lsn > 10 AND + end_lsn <= 1111 AND + 555 > end_lsn AND + page_id = 100; + + max_lsn will be set to 555. 
+*/ +static +void +limit_lsn_range_from_condition( +/*===========================*/ + TABLE* table, /*!<in: table */ + COND* cond, /*!<in: condition */ + ib_uint64_t* max_lsn) /*!<in/out: maximum LSN + (must be initialized with maximum + available value) */ +{ + if (cond->type() != Item::COND_ITEM && + cond->type() != Item::FUNC_ITEM) + return; + + switch (((Item_func*) cond)->functype()) + { + case Item_func::COND_AND_FUNC: + { + List_iterator<Item> li(*((Item_cond*) cond)-> + argument_list()); + Item *item; + while ((item= li++)) + limit_lsn_range_from_condition(table, + item, + max_lsn); + break; + } + case Item_func::LT_FUNC: + case Item_func::LE_FUNC: + case Item_func::GT_FUNC: + case Item_func::GE_FUNC: + { + Item *left; + Item *right; + Item_field *item_field; + ib_uint64_t tmp_result; + + /* + a <= b equals to b >= a that's why we just exchange + "left" and "right" in the case of ">" or ">=" + function + */ + if (((Item_func*) cond)->functype() == + Item_func::LT_FUNC || + ((Item_func*) cond)->functype() == + Item_func::LE_FUNC) + { + left = ((Item_func*) cond)->arguments()[0]; + right = ((Item_func*) cond)->arguments()[1]; + } else { + left = ((Item_func*) cond)->arguments()[1]; + right = ((Item_func*) cond)->arguments()[0]; + } + + if (!left || !right) + return; + if (left->type() != Item::FIELD_ITEM) + return; + if (right->type() != Item::INT_ITEM) + return; + + item_field = (Item_field*)left; + + if (/* START_LSN */ + table->field[2] != item_field->field && + /* END_LSN */ + table->field[3] != item_field->field) + { + return; + } + + /* Check if the current field belongs to our table */ + if (table != item_field->field->table) + return; + + tmp_result = right->val_int(); + if (tmp_result < *max_lsn) + *max_lsn = tmp_result; + + break; + } + default:; + } + +} + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_changed_pages. 
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_changed_pages_fill( +/*==========================*/ + THD* thd, /*!<in: thread */ + TABLE_LIST* tables, /*!<in/out: tables to fill */ + COND* cond) /*!<in: condition */ +{ + TABLE* table = (TABLE *) tables->table; + log_bitmap_iterator_t i; + ib_uint64_t output_rows_num = 0UL; + ib_uint64_t max_lsn = ~0ULL; + + if (!srv_track_changed_pages) + return 0; + + if (!log_online_bitmap_iterator_init(&i)) + return 1; + + if (cond) + limit_lsn_range_from_condition(table, cond, &max_lsn); + + while(log_online_bitmap_iterator_next(&i) && + (!srv_changed_pages_limit || + output_rows_num < srv_changed_pages_limit) && + /* + There is no need to compare both start LSN and end LSN fields + with maximum value. It's enough to compare only start LSN. + Example: + + max_lsn = 100 + \\\\\\\\\\\\\\\\\\\\\\\\\|\\\\\\\\ - Query 1 + I------I I-------I I-------------I I----I + ////////////////// | - Query 2 + 1 2 3 4 + + Query 1: + SELECT * FROM INNODB_CHANGED_PAGES WHERE start_lsn < 100 + will select 1,2,3 bitmaps + Query 2: + SELECT * FROM INNODB_CHANGED_PAGES WHERE end_lsn < 100 + will select 1,2 bitmaps + + The condition start_lsn <= 100 will be false after reading + 1,2,3 bitmaps which suits for both cases. + */ + LOG_BITMAP_ITERATOR_START_LSN(i) <= max_lsn) + { + if (!LOG_BITMAP_ITERATOR_PAGE_CHANGED(i)) + continue; + + /* SPACE_ID */ + table->field[0]->store( + LOG_BITMAP_ITERATOR_SPACE_ID(i)); + /* PAGE_ID */ + table->field[1]->store( + LOG_BITMAP_ITERATOR_PAGE_NUM(i)); + /* START_LSN */ + table->field[2]->store( + LOG_BITMAP_ITERATOR_START_LSN(i)); + /* END_LSN */ + table->field[3]->store( + LOG_BITMAP_ITERATOR_END_LSN(i)); + + /* + I_S tables are in-memory tables. If bitmap file is big enough + a lot of memory can be used to store the table. But the size + of used memory can be diminished if we store only data which + corresponds to some conditions (in WHERE sql clause). 
Here + conditions are checked for the field values stored above. + + Conditions are checked twice. The first is here (during table + generation) and the second during query execution. Maybe it + makes sense to use some flag in THD object to avoid double + checking. + */ + if (cond && !cond->val_int()) + continue; + + if (schema_table_store_record(thd, table)) + { + log_online_bitmap_iterator_release(&i); + return 1; + } + + ++output_rows_num; + } + + log_online_bitmap_iterator_release(&i); + return 0; +} + +static +int +i_s_innodb_changed_pages_init( +/*==========================*/ + void* p) +{ + DBUG_ENTER("i_s_innodb_changed_pages_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_changed_pages_info; + schema->fill_table = i_s_innodb_changed_pages_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_maria_plugin i_s_innodb_changed_pages = +{ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(name, "INNODB_CHANGED_PAGES"), + STRUCT_FLD(author, "Percona"), + STRUCT_FLD(descr, "InnoDB CHANGED_PAGES table"), + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + STRUCT_FLD(init, i_s_innodb_changed_pages_init), + STRUCT_FLD(deinit, i_s_common_deinit), + STRUCT_FLD(version, 0x0100 /* 1.0 */), + STRUCT_FLD(status_vars, NULL), + STRUCT_FLD(system_vars, NULL), + INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE +}; diff --git a/storage/xtradb/handler/i_s.h b/storage/xtradb/handler/i_s.h index 7e9d47571cc..a8964356747 100644 --- a/storage/xtradb/handler/i_s.h +++ b/storage/xtradb/handler/i_s.h @@ -51,5 +51,6 @@ extern struct st_maria_plugin i_s_innodb_admin_command; extern struct st_maria_plugin i_s_innodb_buffer_pool_pages; extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_index; extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_blob; +extern struct st_maria_plugin i_s_innodb_changed_pages; #endif /* i_s_h */ diff --git a/storage/xtradb/include/buf0lru.h 
b/storage/xtradb/include/buf0lru.h index c3672a65ed7..efaa758f27a 100644 --- a/storage/xtradb/include/buf0lru.h +++ b/storage/xtradb/include/buf0lru.h @@ -94,13 +94,12 @@ buf_LRU_insert_zip_clean( Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns TRUE, it will temporarily -release buf_pool->mutex. Furthermore, the page frame will no longer be -accessible via bpage. +NOTE: This will temporarily release buf_pool_mutex. Furthermore, the +page frame will no longer be accessible via bpage. -The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and -release these two mutexes after the call. No other -buf_page_get_mutex() may be held when calling this function. +The caller must hold buf_page_get_mutex(bpage) and release this mutex +after the call. No other buf_page_get_mutex() may be held when +calling this function. @return TRUE if freed, FALSE otherwise. */ UNIV_INTERN ibool diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h index 857ec0946c2..96c4b81695a 100644 --- a/storage/xtradb/include/log0log.h +++ b/storage/xtradb/include/log0log.h @@ -977,6 +977,11 @@ struct log_struct{ become signaled */ /* @} */ #endif /* UNIV_LOG_ARCHIVE */ + ib_uint64_t tracked_lsn; /*!< log tracking has advanced to this + lsn. Field accessed atomically where + 64-bit atomic ops are supported, + protected by the log sys mutex + otherwise. */ }; /** Test if flush order mutex is owned. */ diff --git a/storage/xtradb/include/log0online.h b/storage/xtradb/include/log0online.h new file mode 100644 index 00000000000..0e0ca169f6f --- /dev/null +++ b/storage/xtradb/include/log0online.h @@ -0,0 +1,111 @@ +/***************************************************************************** + +Copyright (c) 2011-2012, Percona Inc. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0online.h +Online database log parsing for changed page tracking +*******************************************************/ + +#ifndef log0online_h +#define log0online_h + +#include "univ.i" +#include "os0file.h" + +/*********************************************************************//** +Initializes the online log following subsystem. */ +UNIV_INTERN +void +log_online_read_init(); +/*===================*/ + +/*********************************************************************//** +Shuts down the online log following subsystem. */ +UNIV_INTERN +void +log_online_read_shutdown(); +/*=======================*/ + +/*********************************************************************//** +Reads and parses the redo log up to last checkpoint LSN to build the changed +page bitmap which is then written to disk.
*/ +UNIV_INTERN +void +log_online_follow_redo_log(); +/*=========================*/ + +/** The iterator through all bits of changed pages bitmap blocks */ +struct log_bitmap_iterator_struct +{ + char in_name[FN_REFLEN]; /*!< the file name for bitmap + input */ + os_file_t in; /*!< the bitmap input file */ + ib_uint64_t in_offset; /*!< the next write position in the + bitmap output file */ + ib_uint32_t bit_offset; /*!< bit offset inside of bitmap + block*/ + ib_uint64_t start_lsn; /*!< Start lsn of the block */ + ib_uint64_t end_lsn; /*!< End lsn of the block */ + ib_uint32_t space_id; /*!< Block space id */ + ib_uint32_t first_page_id; /*!< First block page id */ + ibool changed; /*!< true if current page was changed */ + byte* page; /*!< Bitmap block */ +}; + +typedef struct log_bitmap_iterator_struct log_bitmap_iterator_t; + +#define LOG_BITMAP_ITERATOR_START_LSN(i) \ + ((i).start_lsn) +#define LOG_BITMAP_ITERATOR_END_LSN(i) \ + ((i).end_lsn) +#define LOG_BITMAP_ITERATOR_SPACE_ID(i) \ + ((i).space_id) +#define LOG_BITMAP_ITERATOR_PAGE_NUM(i) \ + ((i).first_page_id + (i).bit_offset) +#define LOG_BITMAP_ITERATOR_PAGE_CHANGED(i) \ + ((i).changed) + +/*********************************************************************//** +Initializes log bitmap iterator. +@return TRUE if the iterator is initialized OK, FALSE otherwise. */ +UNIV_INTERN +ibool +log_online_bitmap_iterator_init( +/*============================*/ + log_bitmap_iterator_t *i); /*!<in/out: iterator */ + +/*********************************************************************//** +Releases log bitmap iterator. */ +UNIV_INTERN +void +log_online_bitmap_iterator_release( +/*===============================*/ + log_bitmap_iterator_t *i); /*!<in/out: iterator */ + +/*********************************************************************//** +Iterates through bits of saved bitmap blocks. +Sequentially reads blocks from bitmap file(s) and iterates through +their bits. Ignores blocks with wrong checksum.
+@return TRUE if iteration is successful, FALSE if all bits are iterated. */ +UNIV_INTERN +ibool +log_online_bitmap_iterator_next( +/*============================*/ + log_bitmap_iterator_t *i); /*!<in/out: iterator */ + +#endif diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h index 15065267250..fdffd86e4c4 100644 --- a/storage/xtradb/include/log0recv.h +++ b/storage/xtradb/include/log0recv.h @@ -32,6 +32,28 @@ Created 9/20/1997 Heikki Tuuri #include "hash0hash.h" #include "log0log.h" +/******************************************************//** +Checks the 4-byte checksum to the trailer checksum field of a log +block. We also accept a log block in the old format before +InnoDB-3.23.52 where the checksum field contains the log block number. +@return TRUE if ok, or if the log block may be in the format of InnoDB +version predating 3.23.52 */ +UNIV_INTERN +ibool +log_block_checksum_is_ok_or_old_format( +/*===================================*/ + const byte* block); /*!< in: pointer to a log block */ + +/*******************************************************//** +Calculates the new value for lsn when more data is added to the log. */ +UNIV_INTERN +ib_uint64_t +recv_calc_lsn_on_data_add( +/*======================*/ + ib_uint64_t lsn, /*!< in: old lsn */ + ib_uint64_t len); /*!< in: this many bytes of data is + added, log block headers not included */ + #ifdef UNIV_HOTBACKUP extern ibool recv_replay_file_ops; @@ -182,6 +204,21 @@ UNIV_INTERN void recv_recovery_rollback_active(void); /*===============================*/ + +/*******************************************************************//** +Tries to parse a single log record and returns its length. 
+@return length of the record, or 0 if the record was not complete */ +UNIV_INTERN +ulint +recv_parse_log_rec( +/*===============*/ + byte* ptr, /*!< in: pointer to a buffer */ + byte* end_ptr,/*!< in: pointer to the buffer end */ + byte* type, /*!< out: type */ + ulint* space, /*!< out: space id */ + ulint* page_no,/*!< out: page number */ + byte** body); /*!< out: log record body start */ + /*******************************************************//** Scans log from a buffer and stores new log data to the parsing buffer. Parses and hashes the log records if new data found. Unless diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index 5b1f9339845..4c795d93141 100644 --- a/storage/xtradb/include/os0file.h +++ b/storage/xtradb/include/os0file.h @@ -197,6 +197,7 @@ extern ulint srv_log_block_size; extern mysql_pfs_key_t innodb_file_data_key; extern mysql_pfs_key_t innodb_file_log_key; extern mysql_pfs_key_t innodb_file_temp_key; +extern mysql_pfs_key_t innodb_file_bmp_key; /* Following four macros are instumentations to register various file I/O operations with performance schema. @@ -867,6 +868,14 @@ os_file_set_eof( /*============*/ FILE* file); /*!< in: file to be truncated */ /***********************************************************************//** +Truncates a file at the specified position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof_at( + os_file_t file, /*!< in: handle to a file */ + ib_uint64_t new_len);/*!< in: new file length */ +/***********************************************************************//** NOTE! Use the corresponding macro os_file_flush(), not directly this function! Flushes the write buffers of a given file to the disk. 
@return TRUE if success */ diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h index 6a99c60226b..887a40c64ea 100644 --- a/storage/xtradb/include/os0sync.h +++ b/storage/xtradb/include/os0sync.h @@ -265,7 +265,11 @@ Atomic compare-and-swap and increment for InnoDB. */ #if defined(HAVE_IB_GCC_ATOMIC_BUILTINS) -#define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS + +# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_64 +# define HAVE_ATOMIC_BUILTINS_64 +# endif /**********************************************************//** Returns true if swapped, ptr is pointer to target, old_val is value to @@ -304,6 +308,9 @@ amount of increment. */ # define os_atomic_increment_ulint(ptr, amount) \ os_atomic_increment(ptr, amount) +# define os_atomic_increment_uint64(ptr, amount) \ + os_atomic_increment(ptr, amount) + /**********************************************************//** Returns the old value of *ptr, atomically sets *ptr to new_val */ @@ -312,12 +319,13 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */ #elif defined(HAVE_IB_SOLARIS_ATOMICS) -#define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS_64 /* If not compiling with GCC or GCC doesn't support the atomic intrinsics and running on Solaris >= 10 use Solaris atomics */ -#include <atomic.h> +# include <atomic.h> /**********************************************************//** Returns true if swapped, ptr is pointer to target, old_val is value to @@ -357,6 +365,9 @@ amount of increment. 
*/ # define os_atomic_increment_ulint(ptr, amount) \ atomic_add_long_nv(ptr, amount) +# define os_atomic_increment_uint64(ptr, amount) \ + atomic_add_64_nv(ptr, amount) + /**********************************************************//** Returns the old value of *ptr, atomically sets *ptr to new_val */ @@ -365,7 +376,11 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */ #elif defined(HAVE_WINDOWS_ATOMICS) -#define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS + +# ifndef _WIN32 +# define HAVE_ATOMIC_BUILTINS_64 +# endif /* On Windows, use Windows atomics / interlocked */ # ifdef _WIN64 @@ -403,6 +418,11 @@ amount of increment. */ # define os_atomic_increment_ulint(ptr, amount) \ ((ulint) (win_xchg_and_add(ptr, amount) + amount)) +# define os_atomic_increment_uint64(ptr, amount) \ + ((ib_uint64_t) (InterlockedExchangeAdd64( \ + (ib_int64_t*) ptr, \ + (ib_int64_t) amount) + amount)) + /**********************************************************//** Returns the old value of *ptr, atomically sets *ptr to new_val. 
InterlockedExchange() operates on LONG, and the LONG will be diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index a40683e00f1..b8820f1b7c9 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -69,6 +69,14 @@ extern os_event_t srv_error_event; /* This event is set at shutdown to wakeup threads from sleep */ extern os_event_t srv_shutdown_event; +/* This event is set on checkpoint completion to wake the redo log parser +thread */ +extern os_event_t srv_checkpoint_completed_event; + +/* This event is set on the online redo log following thread exit to signal +that the (slow) shutdown may proceed */ +extern os_event_t srv_redo_log_thread_finished_event; + /* If the last data file is auto-extended, we add this many pages to it at a time */ #define SRV_AUTO_EXTEND_INCREMENT \ @@ -136,6 +144,11 @@ extern char* srv_doublewrite_file; extern ibool srv_recovery_stats; +extern my_bool srv_track_changed_pages; + +extern +ulonglong srv_changed_pages_limit; + extern ibool srv_auto_extend_last_data_file; extern ulint srv_last_file_size_max; extern char** srv_log_group_home_dirs; @@ -402,6 +415,7 @@ extern mysql_pfs_key_t srv_error_monitor_thread_key; extern mysql_pfs_key_t srv_monitor_thread_key; extern mysql_pfs_key_t srv_master_thread_key; extern mysql_pfs_key_t srv_purge_thread_key; +extern mysql_pfs_key_t srv_log_tracking_thread_key; /* This macro register the current thread and its key with performance schema */ @@ -697,6 +711,15 @@ srv_LRU_dump_restore_thread( void* arg); /*!< in: a dummy parameter required by os_thread_create */ /******************************************************************//** +A thread which follows the redo log and outputs the changed page bitmap. 
+@return a dummy value */ +UNIV_INTERN +os_thread_ret_t +srv_redo_log_follow_thread( +/*=======================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ +/******************************************************************//** Outputs to a file the output of the InnoDB Monitor. @return FALSE if not all information printed due to failure to obtain necessary mutex */ diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i index ce59a3f2741..9aee2a0c7f9 100644 --- a/storage/xtradb/include/univ.i +++ b/storage/xtradb/include/univ.i @@ -54,7 +54,7 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_BUGFIX 8 #ifndef PERCONA_INNODB_VERSION -#define PERCONA_INNODB_VERSION 28.1 +#define PERCONA_INNODB_VERSION 29.0 #endif /* The following is the InnoDB version as shown in diff --git a/storage/xtradb/include/ut0rbt.h b/storage/xtradb/include/ut0rbt.h index e26b637ae13..cd9df1c1a3d 100644 --- a/storage/xtradb/include/ut0rbt.h +++ b/storage/xtradb/include/ut0rbt.h @@ -116,6 +116,10 @@ struct ib_rbt_bound_struct { /* Compare a key with the node value (t is tree, k is key, n is node)*/ #define rbt_compare(t, k, n) (t->compare(k, n->value)) +/* Node size. FIXME: name might clash, but currently it does not, so for easier + maintenance do not rename it for now. */ +#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1) + /**********************************************************************//** Free an instance of a red black tree */ UNIV_INTERN @@ -187,6 +191,17 @@ rbt_add_node( ib_rbt_bound_t* parent, /*!< in: parent */ const void* value); /*!< in: this value is copied to the node */ +/****************************************************************//** +Add a new caller-provided node to tree at the specified position. +The node must have its key fields initialized correctly. 
+@return added node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_add_preallocated_node( +/*======================*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: parent */ + ib_rbt_node_t* node); /*!< in: node */ /**********************************************************************//** Return the left most data node in the tree @return left most node */ @@ -273,6 +288,13 @@ void rbt_clear( /*======*/ ib_rbt_t* tree); /*!< in: rb tree */ +/****************************************************************//** +Clear the tree without deleting and freeing its nodes. */ +UNIV_INTERN +void +rbt_reset( +/*======*/ + ib_rbt_t* tree); /*!< in: rb tree */ /**********************************************************************//** Merge the node from dst into src. Return the number of nodes merged. @return no. of recs merged */ diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c index dcaf951a0ed..e3e023c0c5a 100644 --- a/storage/xtradb/log/log0log.c +++ b/storage/xtradb/log/log0log.c @@ -216,6 +216,54 @@ log_buf_pool_get_oldest_modification(void) return(lsn); } +/****************************************************************//** +Safely reads the log_sys->tracked_lsn value. Uses atomic operations +if available, otherwise this field is protected with the log system +mutex. The writer counterpart function is log_set_tracked_lsn() in +log0online.c. + +@return log_sys->tracked_lsn value. */ +UNIV_INLINE +ib_uint64_t +log_get_tracked_lsn() +{ +#ifdef HAVE_ATOMIC_BUILTINS_64 + return os_atomic_increment_uint64(&log_sys->tracked_lsn, 0); +#else + ut_ad(mutex_own(&(log_sys->mutex))); + return log_sys->tracked_lsn; +#endif +} + +/****************************************************************//** +Checks if the log groups have a big enough margin of free space in +so that a new log entry can be written without overwriting log data +that is not read by the changed page bitmap thread. +@return TRUE if there is not enough free space. 
*/ +static +ibool +log_check_tracking_margin( + ulint lsn_advance) /*!< in: an upper limit on how much log data we + plan to write. If zero, the margin will be + checked for the already-written log. */ +{ + ib_uint64_t tracked_lsn; + ulint tracked_lsn_age; + + if (!srv_track_changed_pages) { + return FALSE; + } + + ut_ad(mutex_own(&(log_sys->mutex))); + + tracked_lsn = log_get_tracked_lsn(); + tracked_lsn_age = log_sys->lsn - tracked_lsn; + + /* The overwrite would happen when log_sys->log_group_capacity is + exceeded, but we use max_checkpoint_age for an extra safety margin. */ + return tracked_lsn_age + lsn_advance > log_sys->max_checkpoint_age; +} + /************************************************************//** Opens the log for log_write_low. The log must be closed with log_close and released with log_release. @@ -232,9 +280,7 @@ log_reserve_and_open( ulint archived_lsn_age; ulint dummy; #endif /* UNIV_LOG_ARCHIVE */ -#ifdef UNIV_DEBUG ulint count = 0; -#endif /* UNIV_DEBUG */ ut_a(len < log->buf_size / 2); loop: @@ -262,6 +308,19 @@ loop: goto loop; } + if (log_check_tracking_margin(len_upper_limit) && (++count < 50)) { + + /* This log write would violate the untracked LSN free space + margin. Limit this to 50 retries as there might be situations + where we have no choice but to proceed anyway, i.e. if the log + is about to be overflown, log tracking or not. 
*/ + mutex_exit(&(log->mutex)); + + os_thread_sleep(10000); + + goto loop; + } + #ifdef UNIV_LOG_ARCHIVE if (log->archiving_state != LOG_ARCH_OFF) { @@ -400,6 +459,8 @@ log_close(void) ulint first_rec_group; ib_uint64_t oldest_lsn; ib_uint64_t lsn; + ib_uint64_t tracked_lsn; + ulint tracked_lsn_age; log_t* log = log_sys; ib_uint64_t checkpoint_age; @@ -426,6 +487,19 @@ log_close(void) log->check_flush_or_checkpoint = TRUE; } + if (srv_track_changed_pages) { + + tracked_lsn = log_get_tracked_lsn(); + tracked_lsn_age = lsn - tracked_lsn; + + if (tracked_lsn_age >= log->log_group_capacity) { + + fprintf(stderr, " InnoDB: Error: the age of the " + "oldest untracked record exceeds the log " + "group capacity!\n"); + } + } + checkpoint_age = lsn - log->last_checkpoint_lsn; if (checkpoint_age >= log->log_group_capacity) { @@ -893,6 +967,8 @@ log_init(void) log_sys->archiving_on = os_event_create(NULL); #endif /* UNIV_LOG_ARCHIVE */ + log_sys->tracked_lsn = 0; + /*----------------------------*/ log_block_init(log_sys->buf, log_sys->lsn); @@ -1742,6 +1818,12 @@ log_io_complete_checkpoint(void) } mutex_exit(&(log_sys->mutex)); + + /* Wake the redo log watching thread to parse the log up to this + checkpoint. 
*/ + if (srv_track_changed_pages) { + os_event_set(srv_checkpoint_completed_event); + } } /*******************************************************************//** @@ -3169,6 +3251,15 @@ loop: log_checkpoint_margin(); + mutex_enter(&(log_sys->mutex)); + if (log_check_tracking_margin(0)) { + + mutex_exit(&(log_sys->mutex)); + os_thread_sleep(10000); + goto loop; + } + mutex_exit(&(log_sys->mutex)); + #ifdef UNIV_LOG_ARCHIVE log_archive_margin(); #endif /* UNIV_LOG_ARCHIVE */ @@ -3197,6 +3288,7 @@ logs_empty_and_mark_files_at_shutdown(void) /*=======================================*/ { ib_uint64_t lsn; + ib_uint64_t tracked_lsn; ulint arch_log_no; ibool server_busy; ulint count = 0; @@ -3388,6 +3480,12 @@ loop: } srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + /* Wake the log tracking thread which will then immediatelly + quit because of srv_shutdown_state value */ + if (srv_track_changed_pages) { + os_event_set(srv_checkpoint_completed_event); + os_event_wait(srv_redo_log_thread_finished_event); + } fil_close_all_files(); ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED); return; @@ -3397,9 +3495,12 @@ loop: mutex_enter(&log_sys->mutex); + tracked_lsn = log_get_tracked_lsn(); + lsn = log_sys->lsn; if (lsn != log_sys->last_checkpoint_lsn + || (srv_track_changed_pages && (tracked_lsn != log_sys->last_checkpoint_lsn)) #ifdef UNIV_LOG_ARCHIVE || (srv_log_archive_on && lsn != log_sys->archived_lsn + LOG_BLOCK_HDR_SIZE) @@ -3457,6 +3558,11 @@ loop: srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + /* Signal the log following thread to quit */ + if (srv_track_changed_pages) { + os_event_set(srv_checkpoint_completed_event); + } + /* Make some checks that the server really is quiet */ ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED); @@ -3477,6 +3583,10 @@ loop: fil_flush_file_spaces(FIL_TABLESPACE); + if (srv_track_changed_pages) { + os_event_wait(srv_redo_log_thread_finished_event); + } + fil_close_all_files(); /* Make some checks that the server really is quiet */ 
@@ -3603,6 +3713,18 @@ log_print( ((log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed)); + if (srv_track_changed_pages) { + + /* The maximum tracked LSN age is equal to the maximum + checkpoint age */ + fprintf(file, + "Log tracking enabled\n" + "Log tracked up to %llu\n" + "Max tracked LSN age %lu\n", + log_get_tracked_lsn(), + log_sys->max_checkpoint_age); + } + log_sys->n_log_ios_old = log_sys->n_log_ios; log_sys->last_printout_time = current_time; diff --git a/storage/xtradb/log/log0online.c b/storage/xtradb/log/log0online.c new file mode 100644 index 00000000000..1d478c467e6 --- /dev/null +++ b/storage/xtradb/log/log0online.c @@ -0,0 +1,1085 @@ +/***************************************************************************** + +Copyright (c) 2011-2012 Percona Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file log/log0online.c +Online database log parsing for changed page tracking + +*******************************************************/ + +#include "log0online.h" + +#include "my_dbug.h" + +#include "log0recv.h" +#include "mach0data.h" +#include "mtr0log.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0sys.h" +#include "ut0rbt.h" + +enum { FOLLOW_SCAN_SIZE = 4 * (UNIV_PAGE_SIZE_MAX) }; + +/** Log parsing and bitmap output data structure */ +struct log_bitmap_struct { + byte read_buf[FOLLOW_SCAN_SIZE]; + /*!< log read buffer */ + byte parse_buf[RECV_PARSING_BUF_SIZE]; + /*!< log parse buffer */ + byte* parse_buf_end; /*!< parse buffer position where the + next read log data should be copied to. + If the previous log records were fully + parsed, it points to the start, + otherwise points immediatelly past the + end of the incomplete log record. */ + char* out_name; /*!< the file name for bitmap output */ + os_file_t out; /*!< the bitmap output file */ + ib_uint64_t out_offset; /*!< the next write position in the + bitmap output file */ + ib_uint64_t start_lsn; /*!< the LSN of the next unparsed + record and the start of the next LSN + interval to be parsed. 
*/ + ib_uint64_t end_lsn; /*!< the end of the LSN interval to be + parsed, equal to the next checkpoint + LSN at the time of parse */ + ib_uint64_t next_parse_lsn; /*!< the LSN of the next unparsed + record in the current parse */ + ib_rbt_t* modified_pages; /*!< the current modified page set, + organized as the RB-tree with the keys + of (space, 4KB-block-start-page-id) + pairs */ + ib_rbt_node_t* page_free_list; /*!< Singly-linked list of freed nodes + of modified_pages tree for later + reuse. Nodes are linked through + ib_rbt_node_t.left as this field has + both the correct type and the tree does + not mind its overwrite during + rbt_next() tree traversal. */ +}; + +/* The log parsing and bitmap output struct instance */ +static struct log_bitmap_struct* log_bmp_sys; + +/* File name stem for modified page bitmaps */ +static const char* modified_page_stem = "ib_modified_log."; + +/* On server startup with empty database srv_start_lsn == 0, in +which case the first LSN of actual log records will be this. */ +#define MIN_TRACKED_LSN ((LOG_START_LSN) + (LOG_BLOCK_HDR_SIZE)) + +/* Tests if num bit of bitmap is set */ +#define IS_BIT_SET(bitmap, num) \ + (*((bitmap) + ((num) >> 3)) & (1UL << ((num) & 7UL))) + +/** The bitmap file block size in bytes. All writes will be multiples of this. + */ +enum { + MODIFIED_PAGE_BLOCK_SIZE = 4096 +}; + + +/** Offsets in a file bitmap block */ +enum { + MODIFIED_PAGE_IS_LAST_BLOCK = 0,/* 1 if last block in the current + write, 0 otherwise. 
*/ + MODIFIED_PAGE_START_LSN = 4, /* The starting tracked LSN of this and + other blocks in the same write */ + MODIFIED_PAGE_END_LSN = 12, /* The ending tracked LSN of this and + other blocks in the same write */ + MODIFIED_PAGE_SPACE_ID = 20, /* The space ID of tracked pages in + this block */ + MODIFIED_PAGE_1ST_PAGE_ID = 24, /* The page ID of the first tracked + page in this block */ + MODIFIED_PAGE_BLOCK_UNUSED_1 = 28,/* Unused in order to align the start + of bitmap at 8 byte boundary */ + MODIFIED_PAGE_BLOCK_BITMAP = 32,/* Start of the bitmap itself */ + MODIFIED_PAGE_BLOCK_UNUSED_2 = MODIFIED_PAGE_BLOCK_SIZE - 8, + /* Unused in order to align the end of + bitmap at 8 byte boundary */ + MODIFIED_PAGE_BLOCK_CHECKSUM = MODIFIED_PAGE_BLOCK_SIZE - 4 + /* The checksum of the current block */ +}; + +/** Length of the bitmap data in a block in bytes */ +enum { MODIFIED_PAGE_BLOCK_BITMAP_LEN + = MODIFIED_PAGE_BLOCK_UNUSED_2 - MODIFIED_PAGE_BLOCK_BITMAP }; + +/** Length of the bitmap data in a block in page ids */ +enum { MODIFIED_PAGE_BLOCK_ID_COUNT = MODIFIED_PAGE_BLOCK_BITMAP_LEN * 8 }; + +/****************************************************************//** +Provide a comparisson function for the RB-tree tree (space, +block_start_page) pairs. Actual implementation does not matter as +long as the ordering is full. +@return -1 if p1 < p2, 0 if p1 == p2, 1 if p1 > p2 +*/ +static +int +log_online_compare_bmp_keys( +/*========================*/ + const void* p1, /*!<in: 1st key to compare */ + const void* p2) /*!<in: 2nd key to compare */ +{ + const byte *k1 = (const byte *)p1; + const byte *k2 = (const byte *)p2; + + ulint k1_space = mach_read_from_4(k1 + MODIFIED_PAGE_SPACE_ID); + ulint k2_space = mach_read_from_4(k2 + MODIFIED_PAGE_SPACE_ID); + if (k1_space == k2_space) { + ulint k1_start_page + = mach_read_from_4(k1 + MODIFIED_PAGE_1ST_PAGE_ID); + ulint k2_start_page + = mach_read_from_4(k2 + MODIFIED_PAGE_1ST_PAGE_ID); + return k1_start_page < k2_start_page + ? 
-1 : k1_start_page > k2_start_page ? 1 : 0; + } + return k1_space < k2_space ? -1 : 1; +} + +/****************************************************************//** +Set a bit for tracked page in the bitmap. Expand the bitmap tree as +necessary. */ +static +void +log_online_set_page_bit( +/*====================*/ + ulint space, /*!<in: log record space id */ + ulint page_no)/*!<in: log record page id */ +{ + ulint block_start_page; + ulint block_pos; + uint bit_pos; + ib_rbt_bound_t tree_search_pos; + byte search_page[MODIFIED_PAGE_BLOCK_SIZE]; + byte *page_ptr; + + ut_a(space != ULINT_UNDEFINED); + ut_a(page_no != ULINT_UNDEFINED); + + block_start_page = page_no / MODIFIED_PAGE_BLOCK_ID_COUNT + * MODIFIED_PAGE_BLOCK_ID_COUNT; + block_pos = block_start_page ? (page_no % block_start_page / 8) + : (page_no / 8); + bit_pos = page_no % 8; + + mach_write_to_4(search_page + MODIFIED_PAGE_SPACE_ID, space); + mach_write_to_4(search_page + MODIFIED_PAGE_1ST_PAGE_ID, + block_start_page); + + if (!rbt_search(log_bmp_sys->modified_pages, &tree_search_pos, + search_page)) { + page_ptr = rbt_value(byte, tree_search_pos.last); + } + else { + ib_rbt_node_t *new_node; + + if (log_bmp_sys->page_free_list) { + new_node = log_bmp_sys->page_free_list; + log_bmp_sys->page_free_list = new_node->left; + } + else { + new_node = ut_malloc(SIZEOF_NODE( + log_bmp_sys->modified_pages)); + } + memset(new_node, 0, SIZEOF_NODE(log_bmp_sys->modified_pages)); + + page_ptr = rbt_value(byte, new_node); + mach_write_to_4(page_ptr + MODIFIED_PAGE_SPACE_ID, space); + mach_write_to_4(page_ptr + MODIFIED_PAGE_1ST_PAGE_ID, + block_start_page); + + rbt_add_preallocated_node(log_bmp_sys->modified_pages, + &tree_search_pos, new_node); + } + page_ptr[MODIFIED_PAGE_BLOCK_BITMAP + block_pos] |= (1U << bit_pos); +} + +/****************************************************************//** +Calculate a bitmap block checksum. Algorithm borrowed from +log_block_calc_checksum. 
+@return checksum */ +UNIV_INLINE +ulint +log_online_calc_checksum( +/*=====================*/ + const byte* block) /*!<in: bitmap block */ +{ + ulint sum; + ulint sh; + ulint i; + + sum = 1; + sh = 0; + + for (i = 0; i < MODIFIED_PAGE_BLOCK_CHECKSUM; i++) { + + ulint b = block[i]; + sum &= 0x7FFFFFFFUL; + sum += b; + sum += b << sh; + sh++; + if (sh > 24) { + sh = 0; + } + } + + return sum; +} + +/****************************************************************//** +Get the last tracked fully LSN from the bitmap file by reading +backwards untile a correct end page is found. Detects incomplete +writes and corrupted data. Sets the start output position for the +written bitmap data. +@return the last fully tracked LSN */ +static +ib_uint64_t +log_online_read_last_tracked_lsn() +/*==============================*/ +{ + byte page[MODIFIED_PAGE_BLOCK_SIZE]; + ib_uint64_t read_offset = log_bmp_sys->out_offset; + /* Initialize these to nonequal values so that file size == 0 case with + zero loop repetitions is handled correctly */ + ulint checksum = 0; + ulint actual_checksum = !checksum; + ibool is_last_page = FALSE; + ib_uint64_t result; + + ut_ad(log_bmp_sys->out_offset % MODIFIED_PAGE_BLOCK_SIZE == 0); + + while (checksum != actual_checksum && read_offset > 0 && !is_last_page) + { + + ulint offset_low, offset_high; + ibool success; + + read_offset -= MODIFIED_PAGE_BLOCK_SIZE; + offset_high = (ulint)(read_offset >> 32); + offset_low = (ulint)(read_offset & 0xFFFFFFFF); + + success = os_file_read(log_bmp_sys->out, page, offset_low, + offset_high, MODIFIED_PAGE_BLOCK_SIZE); + if (!success) { + + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + /* Here and below assume that bitmap file names do not + contain apostrophes, thus no need for + ut_print_filename(). 
*/ + fprintf(stderr, "InnoDB: Warning: failed reading " + "changed page bitmap file \'%s\'\n", + log_bmp_sys->out_name); + return MIN_TRACKED_LSN; + } + + is_last_page + = mach_read_from_4(page + MODIFIED_PAGE_IS_LAST_BLOCK); + checksum = mach_read_from_4(page + + MODIFIED_PAGE_BLOCK_CHECKSUM); + actual_checksum = log_online_calc_checksum(page); + if (checksum != actual_checksum) { + + fprintf(stderr, "InnoDB: Warning: corruption " + "detected in \'%s\' at offset %llu\n", + log_bmp_sys->out_name, read_offset); + } + + }; + + if (UNIV_LIKELY(checksum == actual_checksum && is_last_page)) { + + log_bmp_sys->out_offset = read_offset + + MODIFIED_PAGE_BLOCK_SIZE; + result = mach_read_from_8(page + MODIFIED_PAGE_END_LSN); + } + else { + log_bmp_sys->out_offset = read_offset; + result = 0; + } + + /* Truncate the output file to discard the corrupted bitmap data, if + any */ + if (!os_file_set_eof_at(log_bmp_sys->out, + log_bmp_sys->out_offset)) { + fprintf(stderr, "InnoDB: Warning: failed truncating " + "changed page bitmap file \'%s\' to %llu bytes\n", + log_bmp_sys->out_name, log_bmp_sys->out_offset); + result = 0; + } + return result; +} + +/****************************************************************//** +Safely write the log_sys->tracked_lsn value. Uses atomic operations +if available, otherwise this field is protected with the log system +mutex. The reader counterpart function is log_get_tracked_lsn() in +log0log.c. 
*/ +UNIV_INLINE +void +log_set_tracked_lsn( +/*================*/ + ib_uint64_t tracked_lsn) /*!<in: new value */ +{ +#ifdef HAVE_ATOMIC_BUILTINS_64 + /* Single writer, no data race here */ + ib_uint64_t old_value + = os_atomic_increment_uint64(&log_sys->tracked_lsn, 0); + (void) os_atomic_increment_uint64(&log_sys->tracked_lsn, + tracked_lsn - old_value); +#else + mutex_enter(&log_sys->mutex); + log_sys->tracked_lsn = tracked_lsn; + mutex_exit(&log_sys->mutex); +#endif +} + +/****************************************************************//** +Diagnose a gap in tracked LSN range on server startup due to crash or +very fast shutdown and try to close it by tracking the data +immediatelly, if possible. */ +static +void +log_online_track_missing_on_startup( +/*================================*/ + ib_uint64_t last_tracked_lsn, /*!<in: last tracked LSN read + from the bitmap file */ + ib_uint64_t tracking_start_lsn) /*!<in: last checkpoint LSN of + the current server startup */ +{ + ut_ad(last_tracked_lsn != tracking_start_lsn); + + fprintf(stderr, "InnoDB: last tracked LSN in \'%s\' is %llu, but " + "last checkpoint LSN is %llu. This might be due to a server " + "crash or a very fast shutdown. ", log_bmp_sys->out_name, + last_tracked_lsn, tracking_start_lsn); + + /* last_tracked_lsn might be < MIN_TRACKED_LSN in the case of empty + bitmap file, handle this too. 
*/ + last_tracked_lsn = ut_max(last_tracked_lsn, MIN_TRACKED_LSN); + + /* See if we can fully recover the missing interval */ + if (log_sys->lsn - last_tracked_lsn < log_sys->log_group_capacity) { + + fprintf(stderr, + "Reading the log to advance the last tracked LSN.\n"); + + log_bmp_sys->start_lsn = last_tracked_lsn; + log_set_tracked_lsn(log_bmp_sys->start_lsn); + log_online_follow_redo_log(); + ut_ad(log_bmp_sys->end_lsn >= tracking_start_lsn); + + fprintf(stderr, + "InnoDB: continuing tracking changed pages from LSN " + "%llu\n", log_bmp_sys->end_lsn); + } + else { + fprintf(stderr, + "The age of last tracked LSN exceeds log capacity, " + "tracking-based incremental backups will work only " + "from the higher LSN!\n"); + + log_bmp_sys->end_lsn = log_bmp_sys->start_lsn + = tracking_start_lsn; + log_set_tracked_lsn(log_bmp_sys->start_lsn); + + fprintf(stderr, + "InnoDB: starting tracking changed pages from LSN " + "%llu\n", log_bmp_sys->end_lsn); + } +} + +/*********************************************************************//** +Initialize the online log following subsytem. 
*/ +UNIV_INTERN +void +log_online_read_init() +/*==================*/ +{ + char buf[FN_REFLEN]; + ibool success; + ib_uint64_t tracking_start_lsn + = ut_max(log_sys->last_checkpoint_lsn, MIN_TRACKED_LSN); + + /* Assert (could be compile-time assert) that bitmap data start and end + in a bitmap block is 8-byte aligned */ + ut_a(MODIFIED_PAGE_BLOCK_BITMAP % 8 == 0); + ut_a(MODIFIED_PAGE_BLOCK_BITMAP_LEN % 8 == 0); + + log_bmp_sys = ut_malloc(sizeof(*log_bmp_sys)); + + ut_snprintf(buf, FN_REFLEN, "%s%s%d", srv_data_home, + modified_page_stem, 1); + log_bmp_sys->out_name = ut_malloc(strlen(buf) + 1); + ut_strcpy(log_bmp_sys->out_name, buf); + + log_bmp_sys->modified_pages = rbt_create(MODIFIED_PAGE_BLOCK_SIZE, + log_online_compare_bmp_keys); + log_bmp_sys->page_free_list = NULL; + + log_bmp_sys->out + = os_file_create_simple_no_error_handling + (innodb_file_bmp_key, log_bmp_sys->out_name, OS_FILE_OPEN, + OS_FILE_READ_WRITE, &success); + + if (!success) { + + /* New file, tracking from scratch */ + log_bmp_sys->out + = os_file_create_simple_no_error_handling + (innodb_file_bmp_key, log_bmp_sys->out_name, + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + if (!success) { + + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + fprintf(stderr, + "InnoDB: Error: Cannot create \'%s\'\n", + log_bmp_sys->out_name); + exit(1); + } + + log_bmp_sys->out_offset = 0; + } + else { + + /* Old file, read last tracked LSN and continue from there */ + ulint size_low; + ulint size_high; + ib_uint64_t last_tracked_lsn; + + success = os_file_get_size(log_bmp_sys->out, &size_low, + &size_high); + ut_a(success); + + log_bmp_sys->out_offset + = ((ib_uint64_t)size_high << 32) | size_low; + + if (log_bmp_sys->out_offset % MODIFIED_PAGE_BLOCK_SIZE != 0) { + + fprintf(stderr, + "InnoDB: Warning: truncated block detected " + "in \'%s\' at offset %llu\n", + log_bmp_sys->out_name, + log_bmp_sys->out_offset); + log_bmp_sys->out_offset -= + log_bmp_sys->out_offset + % 
MODIFIED_PAGE_BLOCK_SIZE; + } + + last_tracked_lsn = log_online_read_last_tracked_lsn(); + + if (last_tracked_lsn < tracking_start_lsn) { + + log_online_track_missing_on_startup(last_tracked_lsn, + tracking_start_lsn); + return; + } + + if (last_tracked_lsn > tracking_start_lsn) { + + fprintf(stderr, "InnoDB: last tracked LSN in \'%s\' " + "is %llu, but last checkpoint LSN is %llu. " + "The tracking-based incremental backups will " + "work only from the latter LSN!\n", + log_bmp_sys->out_name, last_tracked_lsn, + tracking_start_lsn); + } + + } + + fprintf(stderr, "InnoDB: starting tracking changed pages from " + "LSN %llu\n", tracking_start_lsn); + log_bmp_sys->start_lsn = tracking_start_lsn; + log_set_tracked_lsn(tracking_start_lsn); +} + +/*********************************************************************//** +Shut down the online log following subsystem. */ +UNIV_INTERN +void +log_online_read_shutdown() +/*======================*/ +{ + ib_rbt_node_t *free_list_node = log_bmp_sys->page_free_list; + + os_file_close(log_bmp_sys->out); + + rbt_free(log_bmp_sys->modified_pages); + + while (free_list_node) { + ib_rbt_node_t *next = free_list_node->left; + ut_free(free_list_node); + free_list_node = next; + } + + ut_free(log_bmp_sys->out_name); + ut_free(log_bmp_sys); +} + +/*********************************************************************//** +For the given minilog record type determine if the record has (space; page) +associated with it. +@return TRUE if the record has (space; page) in it */ +static +ibool +log_online_rec_has_page( +/*====================*/ + byte type) /*!<in: the minilog record type */ +{ + return type != MLOG_MULTI_REC_END && type != MLOG_DUMMY_RECORD; +} + +/*********************************************************************//** +Check if a page field for a given log record type actually contains a page +id. It does not for file operations and MLOG_LSN. 
+@return TRUE if page field contains actual page id, FALSE otherwise */ +static +ibool +log_online_rec_page_means_page( +/*===========================*/ + byte type) /*!<in: log record type */ +{ + return log_online_rec_has_page(type) +#ifdef UNIV_LOG_LSN_DEBUG + && type != MLOG_LSN +#endif + && type != MLOG_FILE_CREATE + && type != MLOG_FILE_RENAME + && type != MLOG_FILE_DELETE + && type != MLOG_FILE_CREATE2; +} + +/*********************************************************************//** +Parse the log data in the parse buffer for the (space, page) pairs and add +them to the modified page set as necessary. Removes the fully-parsed records +from the buffer. If an incomplete record is found, moves it to the end of the +buffer. */ +static +void +log_online_parse_redo_log() +/*=======================*/ +{ + byte *ptr = log_bmp_sys->parse_buf; + byte *end = log_bmp_sys->parse_buf_end; + + ulint len = 0; + + while (ptr != end + && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) { + + byte type; + ulint space; + ulint page_no; + byte* body; + + /* recv_sys is not initialized, so on corrupt log we will + SIGSEGV. But the log of a live database should not be + corrupt. */ + len = recv_parse_log_rec(ptr, end, &type, &space, &page_no, + &body); + if (len > 0) { + + if (log_online_rec_page_means_page(type) + && (space != TRX_DOUBLEWRITE_SPACE)) { + + ut_a(len >= 3); + log_online_set_page_bit(space, page_no); + } + + ptr += len; + ut_ad(ptr <= end); + log_bmp_sys->next_parse_lsn + = recv_calc_lsn_on_data_add + (log_bmp_sys->next_parse_lsn, len); + } + else { + + /* Incomplete log record. Shift it to the + beginning of the parse buffer and leave it to be + completed on the next read. 
*/ + ut_memmove(log_bmp_sys->parse_buf, ptr, end - ptr); + log_bmp_sys->parse_buf_end + = log_bmp_sys->parse_buf + (end - ptr); + ptr = end; + } + } + + if (len > 0) { + + log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf; + } +} + +/*********************************************************************//** +Check the log block checksum. +@return TRUE if the log block checksum is OK, FALSE otherwise. */ +static +ibool +log_online_is_valid_log_seg( +/*========================*/ + const byte* log_block) /*!< in: read log data */ +{ + ibool checksum_is_ok + = log_block_checksum_is_ok_or_old_format(log_block); + + if (!checksum_is_ok) { + + fprintf(stderr, + "InnoDB Error: log block checksum mismatch" + "expected %lu, calculated checksum %lu\n", + (ulong) log_block_get_checksum(log_block), + (ulong) log_block_calc_checksum(log_block)); + } + + return checksum_is_ok; +} + +/*********************************************************************//** +Copy new log data to the parse buffer while skipping log block header, +trailer and already parsed data. */ +static +void +log_online_add_to_parse_buf( +/*========================*/ + const byte* log_block, /*!< in: read log data */ + ulint data_len, /*!< in: length of read log data */ + ulint skip_len) /*!< in: how much of log data to + skip */ +{ + ulint start_offset = skip_len ? skip_len : LOG_BLOCK_HDR_SIZE; + ulint end_offset + = (data_len == OS_FILE_LOG_BLOCK_SIZE) + ? data_len - LOG_BLOCK_TRL_SIZE + : data_len; + ulint actual_data_len = (end_offset >= start_offset) + ? 
end_offset - start_offset : 0; + + ut_memcpy(log_bmp_sys->parse_buf_end, log_block + start_offset, + actual_data_len); + + log_bmp_sys->parse_buf_end += actual_data_len; + + ut_a(log_bmp_sys->parse_buf_end - log_bmp_sys->parse_buf + <= RECV_PARSING_BUF_SIZE); +} + +/*********************************************************************//** +Parse the log block: first copies the read log data to the parse buffer while +skipping log block header, trailer and already parsed data. Then it actually +parses the log to add to the modified page bitmap. */ +static +void +log_online_parse_redo_log_block( +/*============================*/ + const byte* log_block, /*!< in: read log data */ + ulint skip_already_parsed_len) /*!< in: how many bytes of + log data should be skipped as + they were parsed before */ +{ + ulint block_data_len; + + block_data_len = log_block_get_data_len(log_block); + + ut_ad(block_data_len % OS_FILE_LOG_BLOCK_SIZE == 0 + || block_data_len < OS_FILE_LOG_BLOCK_SIZE); + + log_online_add_to_parse_buf(log_block, block_data_len, + skip_already_parsed_len); + log_online_parse_redo_log(); +} + +/*********************************************************************//** +Read and parse one redo log chunk and updates the modified page bitmap. 
*/ +static +void +log_online_follow_log_seg( +/*======================*/ + log_group_t* group, /*!< in: the log group to use */ + ib_uint64_t block_start_lsn, /*!< in: the LSN to read from */ + ib_uint64_t block_end_lsn) /*!< in: the LSN to read to */ +{ + /* Pointer to the current OS_FILE_LOG_BLOCK-sized chunk of the read log + data to parse */ + byte* log_block = log_bmp_sys->read_buf; + byte* log_block_end = log_bmp_sys->read_buf + + (block_end_lsn - block_start_lsn); + + mutex_enter(&log_sys->mutex); + log_group_read_log_seg(LOG_RECOVER, log_bmp_sys->read_buf, + group, block_start_lsn, block_end_lsn); + mutex_exit(&log_sys->mutex); + + while (log_block < log_block_end + && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) { + + /* How many bytes of log data should we skip in the current log + block. Skipping is necessary because we round down the next + parse LSN thus it is possible to read the already-processed log + data many times */ + ulint skip_already_parsed_len = 0; + + if (!log_online_is_valid_log_seg(log_block)) { + break; + } + + if ((block_start_lsn <= log_bmp_sys->next_parse_lsn) + && (block_start_lsn + OS_FILE_LOG_BLOCK_SIZE + > log_bmp_sys->next_parse_lsn)) { + + /* The next parse LSN is inside the current block, skip + data preceding it. */ + skip_already_parsed_len + = log_bmp_sys->next_parse_lsn + - block_start_lsn; + } + else { + + /* If the next parse LSN is not inside the current + block, then the only option is that we have processed + ahead already. 
*/ + ut_a(block_start_lsn > log_bmp_sys->next_parse_lsn); + } + + /* TODO: merge the copying to the parse buf code with + skip_already_len calculations */ + log_online_parse_redo_log_block(log_block, + skip_already_parsed_len); + + log_block += OS_FILE_LOG_BLOCK_SIZE; + block_start_lsn += OS_FILE_LOG_BLOCK_SIZE; + } + + return; +} + +/*********************************************************************//** +Read and parse the redo log in a given group in FOLLOW_SCAN_SIZE-sized +chunks and updates the modified page bitmap. */ +static +void +log_online_follow_log_group( +/*========================*/ + log_group_t* group, /*!< in: the log group to use */ + ib_uint64_t contiguous_lsn) /*!< in: the LSN of log block start + containing the log_parse_start_lsn */ +{ + ib_uint64_t block_start_lsn = contiguous_lsn; + ib_uint64_t block_end_lsn; + + log_bmp_sys->next_parse_lsn = log_bmp_sys->start_lsn; + log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf; + + do { + block_end_lsn = block_start_lsn + FOLLOW_SCAN_SIZE; + + log_online_follow_log_seg(group, block_start_lsn, + block_end_lsn); + + /* Next parse LSN can become higher than the last read LSN + only in the case when the read LSN falls right on the block + boundary, in which case next parse lsn is bumped to the actual + data LSN on the next (not yet read) block. This assert is + slightly conservative. */ + ut_a(log_bmp_sys->next_parse_lsn + <= block_end_lsn + LOG_BLOCK_HDR_SIZE + + LOG_BLOCK_TRL_SIZE); + + block_start_lsn = block_end_lsn; + } while (block_end_lsn < log_bmp_sys->end_lsn); + + /* Assert that the last read log record is a full one */ + ut_a(log_bmp_sys->parse_buf_end == log_bmp_sys->parse_buf); +} + +/*********************************************************************//** +Write, flush one bitmap block to disk and advance the output position if +successful. 
*/
/* MODIFIED_PAGE_BLOCK_SIZE bytes of block are written at
log_bmp_sys->out_offset of the bitmap file and the file is then flushed. On
a write or flush failure an error is printed and out_offset is left
unchanged. */
static
void
log_online_write_bitmap_page(
/*=========================*/
	const byte *block)	/*!< in: block to write */
{
	ibool	success;

	/* The 64-bit output offset is handed over as a low and a high
	32-bit half, the same way log_online_bitmap_iterator_next() splits
	in_offset for os_file_read(). The high half is obtained by shifting
	right; shifting left would never yield the upper 32 bits. */
	success = os_file_write(log_bmp_sys->out_name, log_bmp_sys->out,
				block,
				(ulint)(log_bmp_sys->out_offset & 0xFFFFFFFF),
				(ulint)(log_bmp_sys->out_offset >> 32),
				MODIFIED_PAGE_BLOCK_SIZE);
	if (UNIV_UNLIKELY(!success)) {

		/* The following call prints an error message */
		os_file_get_last_error(TRUE);
		fprintf(stderr, "InnoDB: Error: failed writing changed page "
			"bitmap file \'%s\'\n", log_bmp_sys->out_name);
		return;
	}

	success = os_file_flush(log_bmp_sys->out, FALSE);
	if (UNIV_UNLIKELY(!success)) {

		/* The following call prints an error message */
		os_file_get_last_error(TRUE);
		fprintf(stderr, "InnoDB: Error: failed flushing "
			"changed page bitmap file \'%s\'\n",
			log_bmp_sys->out_name);
		return;
	}

	/* Only a block that was both written and flushed moves the output
	position forward */
	log_bmp_sys->out_offset += MODIFIED_PAGE_BLOCK_SIZE;
}

/*********************************************************************//**
Append the current changed page bitmap to the bitmap file. Clears the
bitmap tree and recycles its nodes to the free list.
*/
/* Every node of log_bmp_sys->modified_pages is visited from rbt_first()
onwards via rbt_next(). Each page gets the tracked LSN interval and its
checksum stamped in (the last one also gets the last-block flag), is written
out, and its node is pushed onto log_bmp_sys->page_free_list. */
static
void
log_online_write_bitmap()
/*=====================*/
{
	ib_rbt_node_t		*node;
	const ib_rbt_node_t	*tail;

	node = (ib_rbt_node_t *) rbt_first(log_bmp_sys->modified_pages);
	tail = rbt_last(log_bmp_sys->modified_pages);

	for (; node;
	     node = (ib_rbt_node_t*)
		     rbt_next(log_bmp_sys->modified_pages, node)) {

		byte	*bmp_page = rbt_value(byte, node);

		if (node == tail) {
			mach_write_to_4(bmp_page
					+ MODIFIED_PAGE_IS_LAST_BLOCK, 1);
		}

		mach_write_to_8(bmp_page + MODIFIED_PAGE_START_LSN,
				log_bmp_sys->start_lsn);
		mach_write_to_8(bmp_page + MODIFIED_PAGE_END_LSN,
				log_bmp_sys->end_lsn);

		/* The checksum is stored last, after the header fields
		above have been filled in */
		mach_write_to_4(bmp_page + MODIFIED_PAGE_BLOCK_CHECKSUM,
				log_online_calc_checksum(bmp_page));

		log_online_write_bitmap_page(bmp_page);

		/* Push the node onto the free list, reusing its left
		pointer as the link. As in the original statement order,
		this happens before the loop step asks the tree for the
		successor of this node. */
		node->left = log_bmp_sys->page_free_list;
		log_bmp_sys->page_free_list = node;
	}

	/* Empty the tree without freeing the nodes that were just moved to
	the free list */
	rbt_reset(log_bmp_sys->modified_pages);
}

/*********************************************************************//**
Read and parse the redo log up to last checkpoint LSN to build the changed
page bitmap which is then written to disk.
*/
/* The interval parsed is [log_bmp_sys->start_lsn, last checkpoint LSN);
nothing is done when the two are equal. After the bitmap is written the
start LSN is advanced to the end LSN and published through
log_set_tracked_lsn(). */
UNIV_INTERN
void
log_online_follow_redo_log()
/*========================*/
{
	ib_uint64_t	aligned_start_lsn;
	log_group_t*	cur_group;

	/* Grab the LSN of the last checkpoint, we will parse up to it */
	mutex_enter(&(log_sys->mutex));
	log_bmp_sys->end_lsn = log_sys->last_checkpoint_lsn;
	mutex_exit(&(log_sys->mutex));

	if (log_bmp_sys->end_lsn == log_bmp_sys->start_lsn) {
		return;
	}

	cur_group = UT_LIST_GET_FIRST(log_sys->log_groups);
	ut_a(cur_group);

	/* Reading starts at the beginning of the log block that contains
	the start LSN */
	aligned_start_lsn = ut_uint64_align_down(log_bmp_sys->start_lsn,
						 OS_FILE_LOG_BLOCK_SIZE);

	for (; cur_group;
	     cur_group = UT_LIST_GET_NEXT(log_groups, cur_group)) {

		log_online_follow_log_group(cur_group, aligned_start_lsn);
	}

	/* A crash injection site that ensures last checkpoint LSN > last
	tracked LSN, so that LSN tracking for this interval is tested. */
	DBUG_EXECUTE_IF("crash_before_bitmap_write", DBUG_SUICIDE(););

	log_online_write_bitmap();
	log_bmp_sys->start_lsn = log_bmp_sys->end_lsn;
	log_set_tracked_lsn(log_bmp_sys->start_lsn);
}

/*********************************************************************//**
Initializes log bitmap iterator.
@return TRUE if the iterator is initialized OK, FALSE otherwise.
*/ +UNIV_INTERN +ibool +log_online_bitmap_iterator_init( +/*============================*/ + log_bitmap_iterator_t *i) /*!<in/out: iterator */ +{ + ibool success; + + ut_a(i); + ut_snprintf(i->in_name, FN_REFLEN, "%s%s%d", srv_data_home, + modified_page_stem, 1); + i->in_offset = 0; + /* + Set up bit offset out of the reasonable limit + to intiate reading block from file in + log_online_bitmap_iterator_next() + */ + i->bit_offset = MODIFIED_PAGE_BLOCK_BITMAP_LEN; + i->in = + os_file_create_simple_no_error_handling(innodb_file_bmp_key, + i->in_name, + OS_FILE_OPEN, + OS_FILE_READ_ONLY, + &success); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + fprintf(stderr, + "InnoDB: Error: Cannot open \'%s\'\n", + i->in_name); + return FALSE; + } + + i->page = ut_malloc(MODIFIED_PAGE_BLOCK_SIZE); + + i->start_lsn = i->end_lsn = 0; + i->space_id = 0; + i->first_page_id = 0; + i->changed = FALSE; + + return TRUE; +} + +/*********************************************************************//** +Releases log bitmap iterator. */ +UNIV_INTERN +void +log_online_bitmap_iterator_release( +/*===============================*/ + log_bitmap_iterator_t *i) /*!<in/out: iterator */ +{ + ut_a(i); + os_file_close(i->in); + ut_free(i->page); +} + +/*********************************************************************//** +Iterates through bits of saved bitmap blocks. +Sequentially reads blocks from bitmap file(s) and interates through +their bits. Ignores blocks with wrong checksum. +@return TRUE if iteration is successful, FALSE if all bits are iterated. 
*/ +UNIV_INTERN +ibool +log_online_bitmap_iterator_next( +/*============================*/ + log_bitmap_iterator_t *i) /*!<in/out: iterator */ +{ + ulint offset_low; + ulint offset_high; + ulint size_low; + ulint size_high; + ulint checksum = 0; + ulint actual_checksum = !checksum; + + ibool success; + + ut_a(i); + + if (i->bit_offset < MODIFIED_PAGE_BLOCK_BITMAP_LEN) + { + ++i->bit_offset; + i->changed = + IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP, + i->bit_offset); + return TRUE; + } + + while (checksum != actual_checksum) + { + success = os_file_get_size(i->in, + &size_low, + &size_high); + if (!success) { + os_file_get_last_error(TRUE); + fprintf(stderr, + "InnoDB: Warning: can't get size of " + "page bitmap file \'%s\'\n", + i->in_name); + return FALSE; + } + + if (i->in_offset >= + (ib_uint64_t)(size_low) + + ((ib_uint64_t)(size_high) << 32)) + return FALSE; + + offset_high = (ulint)(i->in_offset >> 32); + offset_low = (ulint)(i->in_offset & 0xFFFFFFFF); + + success = os_file_read( + i->in, + i->page, + offset_low, + offset_high, + MODIFIED_PAGE_BLOCK_SIZE); + + if (!success) { + os_file_get_last_error(TRUE); + fprintf(stderr, + "InnoDB: Warning: failed reading " + "changed page bitmap file \'%s\'\n", + i->in_name); + return FALSE; + } + + checksum = mach_read_from_4( + i->page + MODIFIED_PAGE_BLOCK_CHECKSUM); + + actual_checksum = log_online_calc_checksum(i->page); + + i->in_offset += MODIFIED_PAGE_BLOCK_SIZE; + } + + i->start_lsn = + mach_read_from_8(i->page + MODIFIED_PAGE_START_LSN); + i->end_lsn = + mach_read_from_8(i->page + MODIFIED_PAGE_END_LSN); + i->space_id = + mach_read_from_4(i->page + MODIFIED_PAGE_SPACE_ID); + i->first_page_id = + mach_read_from_4(i->page + MODIFIED_PAGE_1ST_PAGE_ID); + i->bit_offset = + 0; + i->changed = + IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP, + i->bit_offset); + + return TRUE; +} + diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.c index a554c576b6d..5ab8c14ae2e 100644 --- 
a/storage/xtradb/log/log0recv.c +++ b/storage/xtradb/log/log0recv.c @@ -857,7 +857,7 @@ block. We also accept a log block in the old format before InnoDB-3.23.52 where the checksum field contains the log block number. @return TRUE if ok, or if the log block may be in the format of InnoDB version predating 3.23.52 */ -static +UNIV_INTERN ibool log_block_checksum_is_ok_or_old_format( /*===================================*/ @@ -2102,7 +2102,7 @@ skip_this_recv_addr: /*******************************************************************//** Tries to parse a single log record and returns its length. @return length of the record, or 0 if the record was not complete */ -static +UNIV_INTERN ulint recv_parse_log_rec( /*===============*/ @@ -2173,7 +2173,7 @@ recv_parse_log_rec( /*******************************************************//** Calculates the new value for lsn when more data is added to the log. */ -static +UNIV_INTERN ib_uint64_t recv_calc_lsn_on_data_add( /*======================*/ @@ -3570,6 +3570,8 @@ recv_reset_logs( log_sys->archived_lsn = log_sys->lsn; #endif /* UNIV_LOG_ARCHIVE */ + log_sys->tracked_lsn = log_sys->lsn; + log_block_init(log_sys->buf, log_sys->lsn); log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE); diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c index 061d556c6e7..8fa2cdb4a28 100644 --- a/storage/xtradb/os/os0file.c +++ b/storage/xtradb/os/os0file.c @@ -158,6 +158,7 @@ UNIV_INTERN ibool os_aio_print_debug = FALSE; UNIV_INTERN mysql_pfs_key_t innodb_file_data_key; UNIV_INTERN mysql_pfs_key_t innodb_file_log_key; UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_bmp_key; #endif /* UNIV_PFS_IO */ /** The asynchronous i/o array slot structure */ @@ -2147,6 +2148,25 @@ os_file_set_eof( #endif /* __WIN__ */ } +/***********************************************************************//** +Truncates a file at the specified position. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof_at( + os_file_t file, /*!< in: handle to a file */ + ib_uint64_t new_len)/*!< in: new file length */ +{ +#ifdef __WIN__ + /* TODO: untested! */ + return(!_chsize_s(file, new_len)); +#else + /* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */ + return(!ftruncate(file, new_len)); +#endif +} + + #ifndef __WIN__ /***********************************************************************//** Wrapper to fsync(2) that retries the call on some errors. diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c index 0a545a8247a..dff9fc07a7f 100644 --- a/storage/xtradb/srv/srv0srv.c +++ b/storage/xtradb/srv/srv0srv.c @@ -67,6 +67,7 @@ Created 10/8/1995 Heikki Tuuri #include "mem0pool.h" #include "sync0sync.h" #include "que0que.h" +#include "log0online.h" #include "log0recv.h" #include "pars0pars.h" #include "usr0sess.h" @@ -176,6 +177,10 @@ UNIV_INTERN char* srv_doublewrite_file = NULL; UNIV_INTERN ibool srv_recovery_stats = FALSE; +UNIV_INTERN my_bool srv_track_changed_pages = TRUE; + +UNIV_INTERN ulonglong srv_changed_pages_limit = 0; + /* if TRUE, then we auto-extend the last data file */ UNIV_INTERN ibool srv_auto_extend_last_data_file = FALSE; /* if != 0, this tells the max size auto-extending may increase the @@ -771,6 +776,10 @@ UNIV_INTERN os_event_t srv_lock_timeout_thread_event; UNIV_INTERN os_event_t srv_shutdown_event; +UNIV_INTERN os_event_t srv_checkpoint_completed_event; + +UNIV_INTERN os_event_t srv_redo_log_thread_finished_event; + UNIV_INTERN srv_sys_t* srv_sys = NULL; /* padding to prevent other memory update hotspots from residing on @@ -1110,6 +1119,9 @@ srv_init(void) srv_lock_timeout_thread_event = os_event_create(NULL); srv_shutdown_event = os_event_create(NULL); + srv_checkpoint_completed_event = os_event_create(NULL); + srv_redo_log_thread_finished_event = os_event_create(NULL); + for (i = 0; i < SRV_MASTER + 1; i++) { srv_n_threads_active[i] = 0; srv_n_threads[i] = 0; @@ 
-3034,6 +3046,46 @@ srv_shutdown_print_master_pending( } } +/******************************************************************//** +A thread which follows the redo log and outputs the changed page bitmap. +@return a dummy value */ +os_thread_ret_t +srv_redo_log_follow_thread( +/*=======================*/ + void* arg __attribute__((unused))) /*!< in: a dummy parameter + required by + os_thread_create */ +{ +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Redo log follower thread starts, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(srv_log_tracking_thread_key); +#endif + + my_thread_init(); + + do { + os_event_wait(srv_checkpoint_completed_event); + os_event_reset(srv_checkpoint_completed_event); + + if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) { + log_online_follow_redo_log(); + } + + } while (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE); + + log_online_read_shutdown(); + os_event_set(srv_redo_log_thread_finished_event); + + my_thread_end(); + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} + /*******************************************************************//** Tells the InnoDB server that there has been activity in the database and wakes up the master thread if it is suspended (not sleeping). 
Used diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c index d1329f445aa..7c98f74909e 100644 --- a/storage/xtradb/srv/srv0start.c +++ b/storage/xtradb/srv/srv0start.c @@ -51,6 +51,7 @@ Created 2/16/1996 Heikki Tuuri #include "rem0rec.h" #include "mtr0mtr.h" #include "log0log.h" +#include "log0online.h" #include "log0recv.h" #include "page0page.h" #include "page0cur.h" @@ -121,9 +122,9 @@ UNIV_INTERN enum srv_shutdown_state srv_shutdown_state = SRV_SHUTDOWN_NONE; static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ -static ulint n[SRV_MAX_N_IO_THREADS + 7]; +static ulint n[SRV_MAX_N_IO_THREADS + 8]; /** io_handler_thread identifiers */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7]; +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 8]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. */ @@ -145,6 +146,7 @@ UNIV_INTERN mysql_pfs_key_t srv_error_monitor_thread_key; UNIV_INTERN mysql_pfs_key_t srv_monitor_thread_key; UNIV_INTERN mysql_pfs_key_t srv_master_thread_key; UNIV_INTERN mysql_pfs_key_t srv_purge_thread_key; +UNIV_INTERN mysql_pfs_key_t srv_log_tracking_thread_key; #endif /* UNIV_PFS_THREAD */ /*********************************************************************//** @@ -2037,6 +2039,19 @@ innobase_start_or_create_for_mysql(void) if (srv_auto_lru_dump && srv_blocking_lru_restore) buf_LRU_file_restore(); + if (srv_track_changed_pages) { + + /* Initialize the log tracking subsystem here to block + server startup until it's completed due to the potential + need to re-read previous server run's log. 
*/ + log_online_read_init(); + + /* Create the thread that follows the redo log to output the + changed page bitmap */ + os_thread_create(&srv_redo_log_follow_thread, NULL, + thread_ids + 6 + SRV_MAX_N_IO_THREADS); + } + srv_is_being_started = FALSE; err = dict_create_or_check_foreign_constraint_tables(); diff --git a/storage/xtradb/ut/ut0rbt.c b/storage/xtradb/ut/ut0rbt.c index 3d7cfa7636f..a5e9081b951 100644 --- a/storage/xtradb/ut/ut0rbt.c +++ b/storage/xtradb/ut/ut0rbt.c @@ -55,7 +55,6 @@ red-black properties: #endif #define ROOT(t) (t->root->left) -#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1) /**********************************************************************//** Print out the sub-tree recursively. */ @@ -834,6 +833,21 @@ rbt_add_node( node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree)); memcpy(node->value, value, tree->sizeof_value); + return(rbt_add_preallocated_node(tree, parent, node)); +} + +/****************************************************************//** +Add a new caller-provided node to tree at the specified position. +The node must have its key fields initialized correctly. +@return added node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_add_preallocated_node( +/*======================*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: parent */ + ib_rbt_node_t* node) /*!< in: node */ +{ node->parent = node->left = node->right = tree->nil; /* If tree is empty */ @@ -842,7 +856,7 @@ rbt_add_node( } /* Append the node, the hope here is that the caller knows - what s/he is doing. */ + what s/he is doing. */ rbt_tree_add_child(tree, parent, node); rbt_balance_tree(tree, node); @@ -854,6 +868,7 @@ rbt_add_node( return(node); } + /**********************************************************************//** Find a matching node in the rb tree. 
@return NULL if not found else the node where key was found */ @@ -1142,7 +1157,17 @@ rbt_clear( ib_rbt_t* tree) /*!< in: rb tree */ { rbt_free_node(ROOT(tree), tree->nil); + rbt_reset(tree); +} +/****************************************************************//** +Clear the tree without deleting and freeing its nodes. */ +UNIV_INTERN +void +rbt_reset( +/*======*/ + ib_rbt_t* tree) /*!< in: rb tree */ +{ tree->n_nodes = 0; tree->root->left = tree->root->right = tree->nil; } |