summaryrefslogtreecommitdiff
path: root/storage
diff options
context:
space:
mode:
authorSergei Golubchik <sergii@pisem.net>2012-10-16 10:36:28 +0200
committerSergei Golubchik <sergii@pisem.net>2012-10-16 10:36:28 +0200
commitd9a8799205d160688f81362356dd2323eb8a91ea (patch)
treea25584d7adfc190bb0312b6d10cdcb62e17a20d1 /storage
parentabefaab57b4b884b74ff9bd3c63f86c018d0e5de (diff)
parent96d3a797eedfe9304cc6416c7d71c7e543695870 (diff)
downloadmariadb-git-d9a8799205d160688f81362356dd2323eb8a91ea.tar.gz
XtraDB 1.1.8-29.0
Diffstat (limited to 'storage')
-rw-r--r--storage/xtradb/CMakeLists.txt31
-rw-r--r--storage/xtradb/btr/btr0sea.c9
-rw-r--r--storage/xtradb/buf/buf0buf.c13
-rw-r--r--storage/xtradb/buf/buf0lru.c112
-rw-r--r--storage/xtradb/handler/ha_innodb.cc36
-rw-r--r--storage/xtradb/handler/i_s.cc295
-rw-r--r--storage/xtradb/handler/i_s.h1
-rw-r--r--storage/xtradb/include/buf0lru.h11
-rw-r--r--storage/xtradb/include/log0log.h5
-rw-r--r--storage/xtradb/include/log0online.h111
-rw-r--r--storage/xtradb/include/log0recv.h37
-rw-r--r--storage/xtradb/include/os0file.h9
-rw-r--r--storage/xtradb/include/os0sync.h28
-rw-r--r--storage/xtradb/include/srv0srv.h23
-rw-r--r--storage/xtradb/include/univ.i2
-rw-r--r--storage/xtradb/include/ut0rbt.h22
-rw-r--r--storage/xtradb/log/log0log.c126
-rw-r--r--storage/xtradb/log/log0online.c1085
-rw-r--r--storage/xtradb/log/log0recv.c8
-rw-r--r--storage/xtradb/os/os0file.c20
-rw-r--r--storage/xtradb/srv/srv0srv.c52
-rw-r--r--storage/xtradb/srv/srv0start.c19
-rw-r--r--storage/xtradb/ut/ut0rbt.c29
23 files changed, 2019 insertions, 65 deletions
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
index 1d2b0b29dea..4c098049fd4 100644
--- a/storage/xtradb/CMakeLists.txt
+++ b/storage/xtradb/CMakeLists.txt
@@ -91,12 +91,41 @@ IF(NOT CMAKE_CROSSCOMPILING)
}"
HAVE_IB_GCC_ATOMIC_BUILTINS
)
+ CHECK_C_SOURCE_RUNS(
+ "
+ #include <stdint.h>
+ int main()
+ {
+ int64_t x, y, res;
+
+ x = 10;
+ y = 123;
+ res = __sync_bool_compare_and_swap(&x, x, y);
+ if (!res || x != y) {
+ return(1);
+ }
+
+ x = 10;
+ y = 123;
+ res = __sync_add_and_fetch(&x, y);
+ if (res != 123 + 10 || x != 123 + 10) {
+ return(1);
+ }
+
+ return(0);
+ }"
+ HAVE_IB_GCC_ATOMIC_BUILTINS_64
+ )
ENDIF()
IF(HAVE_IB_GCC_ATOMIC_BUILTINS)
ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1)
ENDIF()
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS_64)
+ ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS_64=1)
+ENDIF()
+
# either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not
IF(NOT CMAKE_CROSSCOMPILING)
CHECK_C_SOURCE_RUNS(
@@ -240,7 +269,7 @@ SET(INNOBASE_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
ibuf/ibuf0ibuf.c
pars/lexyy.c pars/pars0grm.c pars/pars0opt.c pars/pars0pars.c pars/pars0sym.c
lock/lock0lock.c lock/lock0iter.c
- log/log0log.c log/log0recv.c
+ log/log0log.c log/log0recv.c log/log0online.c
mach/mach0data.c
mem/mem0mem.c mem/mem0pool.c
mtr/mtr0log.c mtr/mtr0mtr.c
diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.c
index 855ab62c42f..7e9449a6474 100644
--- a/storage/xtradb/btr/btr0sea.c
+++ b/storage/xtradb/btr/btr0sea.c
@@ -183,6 +183,15 @@ btr_search_sys_create(
//rw_lock_create(btr_search_latch_key, &btr_search_latch,
// SYNC_SEARCH_SYS);
+ /* PS bug lp:1018264 - Multiple hash index partitions cause an overly
+ large hash index: when multiple adaptive hash index partitions are
+ specified, _each_ partition was being created with the full
+ hash_size (1/64 of the total size of all buffer pools), which is
+ incorrect and can cause overly high memory usage. hash_size
+ should represent the _total_ size of all partitions, not the
+ individual size of each partition. */
+ hash_size /= btr_search_index_num;
+
btr_search_sys = mem_alloc(sizeof(btr_search_sys_t));
/* btr_search_index_num should be <= 32. (bits of trx->has_search_latch) */
diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c
index a2ff171e0c5..6ce77372dac 100644
--- a/storage/xtradb/buf/buf0buf.c
+++ b/storage/xtradb/buf/buf0buf.c
@@ -2838,6 +2838,7 @@ wait_until_unfixed:
&& ibuf_debug) {
/* Try to evict the block from the buffer pool, to use the
insert buffer (change buffer) as much as possible. */
+ ulint page_no = buf_block_get_page_no(block);
if (buf_LRU_free_block(&block->page, TRUE, FALSE)) {
mutex_exit(block_mutex);
@@ -2864,6 +2865,18 @@ wait_until_unfixed:
"innodb_change_buffering_debug evict %u %u\n",
(unsigned) space, (unsigned) offset);
return(NULL);
+ } else if (UNIV_UNLIKELY(buf_block_get_state(block)
+ != BUF_BLOCK_FILE_PAGE
+ || (buf_block_get_page_no(block) != page_no)
+ || (buf_block_get_space(block) != space))) {
+
+ /* buf_LRU_free_block temporarily releases the
+ block mutex, and now block points to something
+ else. */
+ mutex_exit(block_mutex);
+ block = NULL;
+ goto loop2;
+
} else if (buf_flush_page_try(buf_pool, block)) {
fprintf(stderr,
"innodb_change_buffering_debug flush %u %u\n",
diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c
index 16a91358080..e8300107f3d 100644
--- a/storage/xtradb/buf/buf0lru.c
+++ b/storage/xtradb/buf/buf0lru.c
@@ -2531,6 +2531,14 @@ func_exit:
Dump the LRU page list to the specific file. */
#define LRU_DUMP_FILE "ib_lru_dump"
#define LRU_DUMP_TEMP_FILE "ib_lru_dump.tmp"
+#define LRU_OS_FILE_WRITE() \
+ os_file_write(LRU_DUMP_FILE, dump_file, buffer, \
+ (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, \
+ (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), \
+ buffer_size)
+#define LRU_DUMP_PAGE_COUNT 1 /* Specifies how many dump pages
+ should be filled for each hold
+ of the LRU_list_mutex. */
UNIV_INTERN
ibool
@@ -2541,23 +2549,30 @@ buf_LRU_file_dump(void)
ibool success;
byte* buffer_base = NULL;
byte* buffer = NULL;
+ const ulint buffer_size = LRU_DUMP_PAGE_COUNT * UNIV_PAGE_SIZE;
buf_page_t* bpage;
+ buf_page_t* first_bpage;
ulint buffers;
ulint offset;
- ibool ret = FALSE;
+ ulint pages_written;
ulint i;
+ ulint total_pages;
+
+ /* Sanity test to make sure page size is a multiple of
+ assumed dump record size */
+ ut_a(UNIV_PAGE_SIZE % 8 == 0);
for (i = 0; i < srv_n_data_files; i++) {
if (strstr(srv_data_file_names[i], LRU_DUMP_FILE) != NULL) {
fprintf(stderr,
" InnoDB: The name '%s' seems to be used for"
- " innodb_data_file_path. For safety, dumping of the LRU list"
- " is not being done.\n", LRU_DUMP_FILE);
+ " innodb_data_file_path. Dumping LRU list is"
+ " not done for safeness.\n", LRU_DUMP_FILE);
goto end;
}
}
- buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE);
+ buffer_base = ut_malloc(UNIV_PAGE_SIZE + buffer_size);
buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
if (!buffer) {
fprintf(stderr,
@@ -2577,18 +2592,28 @@ buf_LRU_file_dump(void)
}
buffers = offset = 0;
-
for (i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
mutex_enter(&buf_pool->LRU_list_mutex);
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ bpage = first_bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+ total_pages = UT_LIST_GET_LEN(buf_pool->LRU);
- while (bpage != NULL) {
- if (offset == 0) {
- memset(buffer, 0, UNIV_PAGE_SIZE);
+ pages_written = 0;
+ while (bpage != NULL && (pages_written++ < total_pages)) {
+
+ buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
+
+ if (next_bpage == first_bpage) {
+ /* Do not release list mutex here, it will be
+ released just outside this while loop */
+ fprintf(stderr,
+ "InnoDB: detected cycle in LRU for"
+ " buffer pool %lu, skipping to next"
+ " buffer pool.\n", i);
+ break;
}
mach_write_to_4(buffer + offset * 4, bpage->space);
@@ -2596,52 +2621,71 @@ buf_LRU_file_dump(void)
mach_write_to_4(buffer + offset * 4, bpage->offset);
offset++;
- if (offset == UNIV_PAGE_SIZE/4) {
+ ut_a(offset <= buffer_size);
+ if (offset == buffer_size/4) {
+ mutex_t *next_block_mutex = NULL;
+
if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
- success = 0;
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ success = FALSE;
fprintf(stderr,
" InnoDB: stopped dumping lru"
" pages because of server"
" shutdown.\n");
+ goto end;
+ }
+
+ /* While writing file, release buffer pool
+ mutex but keep the next page fixed so we
+ don't worry about our list iterator becoming
+ invalid */
+ if (next_bpage) {
+ next_block_mutex = buf_page_get_mutex(
+ next_bpage);
+
+ mutex_enter(next_block_mutex);
+ next_bpage->buf_fix_count++;
+ mutex_exit(next_block_mutex);
+ }
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ success = LRU_OS_FILE_WRITE();
+
+ /* Grab this here so that next_bpage can't
+ be purged when we drop the fix_count */
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ if (next_bpage) {
+ mutex_enter(next_block_mutex);
+ next_bpage->buf_fix_count--;
+ mutex_exit(next_block_mutex);
}
- success = os_file_write(LRU_DUMP_FILE, dump_file, buffer,
- (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
- (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
- UNIV_PAGE_SIZE);
+
if (!success) {
mutex_exit(&buf_pool->LRU_list_mutex);
fprintf(stderr,
- " InnoDB: cannot write page %lu of %s\n",
+ " InnoDB: cannot write page"
+ " %lu of %s\n",
buffers, LRU_DUMP_FILE);
goto end;
}
buffers++;
offset = 0;
- }
- bpage = UT_LIST_GET_PREV(LRU, bpage);
- }
+ bpage = next_bpage;
+ } else {
+ bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ }
+ } /* while(bpage ...) */
mutex_exit(&buf_pool->LRU_list_mutex);
- }
-
- if (offset == 0) {
- memset(buffer, 0, UNIV_PAGE_SIZE);
- }
+ } /* for(srv_buf_pool_instances ...) */
mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);
offset++;
mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);
offset++;
- success = os_file_write(LRU_DUMP_FILE, dump_file, buffer,
- (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
- (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
- UNIV_PAGE_SIZE);
- if (!success) {
- goto end;
- }
-
- ret = TRUE;
+ success = LRU_OS_FILE_WRITE();
end:
if (dump_file != (os_file_t) -1) {
if (success) {
@@ -2656,7 +2700,7 @@ end:
if (buffer_base)
ut_free(buffer_base);
- return(ret);
+ return(success);
}
typedef struct {
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index d37bccc8150..32ec2d9d858 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -358,7 +358,8 @@ static PSI_thread_info all_innodb_threads[] = {
{&srv_error_monitor_thread_key, "srv_error_monitor_thread", 0},
{&srv_monitor_thread_key, "srv_monitor_thread", 0},
{&srv_master_thread_key, "srv_master_thread", 0},
- {&srv_purge_thread_key, "srv_purge_thread", 0}
+ {&srv_purge_thread_key, "srv_purge_thread", 0},
+ {&srv_log_tracking_thread_key, "srv_redo_log_follow_thread", 0}
};
# endif /* UNIV_PFS_THREAD */
@@ -368,7 +369,8 @@ performance schema instrumented if "UNIV_PFS_IO" is defined */
static PSI_file_info all_innodb_files[] = {
{&innodb_file_data_key, "innodb_data_file", 0},
{&innodb_file_log_key, "innodb_log_file", 0},
- {&innodb_file_temp_key, "innodb_temp_file", 0}
+ {&innodb_file_temp_key, "innodb_temp_file", 0},
+ {&innodb_file_bmp_key, "innodb_bmp_file", 0}
};
# endif /* UNIV_PFS_IO */
#endif /* HAVE_PSI_INTERFACE */
@@ -12629,9 +12631,9 @@ static MYSQL_SYSVAR_BOOL(use_sys_stats_table, innobase_use_sys_stats_table,
"So you should use ANALYZE TABLE command intentionally.",
NULL, NULL, FALSE);
-#ifdef UNIV_DEBUG_never
-static MYSQL_SYSVAR_ULONG(sys_stats_root_page, innobase_sys_stats_root_page,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_ULONG(persistent_stats_root_page,
+ innobase_sys_stats_root_page, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Override the SYS_STATS root page id, 0 = no override (for testing only)",
NULL, NULL, 0, 0, ULONG_MAX, 0);
#endif
@@ -12834,6 +12836,18 @@ static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
"NULLS_UNEQUAL and NULLS_IGNORED",
NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib);
+static MYSQL_SYSVAR_BOOL(track_changed_pages, srv_track_changed_pages,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Track the redo log for changed pages and output a changed page bitmap",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONGLONG(changed_pages_limit, srv_changed_pages_limit,
+ PLUGIN_VAR_RQCMDARG,
+ "The maximum number of rows for "
+ "INFORMATION_SCHEMA.INNODB_CHANGED_PAGES table, "
+ "0 - unlimited",
+ NULL, NULL, 1000000, 0, ~0ULL, 0);
+
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
PLUGIN_VAR_RQCMDARG,
@@ -12998,7 +13012,7 @@ static MYSQL_SYSVAR_UINT(buffer_pool_restore_at_startup, srv_auto_lru_dump,
static MYSQL_SYSVAR_BOOL(blocking_buffer_pool_restore,
innobase_blocking_lru_restore,
- PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
"Block XtraDB startup process until buffer pool is full restored from a "
"dump file (if present). Disabled by default.",
NULL, NULL, FALSE);
@@ -13085,8 +13099,8 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(stats_auto_update),
MYSQL_SYSVAR(stats_update_need_lock),
MYSQL_SYSVAR(use_sys_stats_table),
-#ifdef UNIV_DEBUG_never /* disable this flag. --innodb-sys-stats becomes ambiguous */
- MYSQL_SYSVAR(sys_stats_root_page),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(persistent_stats_root_page),
#endif
MYSQL_SYSVAR(stats_sample_pages),
MYSQL_SYSVAR(adaptive_hash_index),
@@ -13118,6 +13132,8 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(use_sys_malloc),
MYSQL_SYSVAR(use_native_aio),
MYSQL_SYSVAR(change_buffering),
+ MYSQL_SYSVAR(track_changed_pages),
+ MYSQL_SYSVAR(changed_pages_limit),
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
MYSQL_SYSVAR(change_buffering_debug),
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
@@ -13177,10 +13193,10 @@ i_s_innodb_index_stats,
i_s_innodb_buffer_pool_pages,
i_s_innodb_buffer_pool_pages_index,
i_s_innodb_buffer_pool_pages_blob,
-i_s_innodb_admin_command
+i_s_innodb_admin_command,
+i_s_innodb_changed_pages
maria_declare_plugin_end;
-
/** @brief Initialize the default value of innodb_commit_concurrency.
Once InnoDB is running, the innodb_commit_concurrency must not change
diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc
index 7af0c88c73a..57a091ea80d 100644
--- a/storage/xtradb/handler/i_s.cc
+++ b/storage/xtradb/handler/i_s.cc
@@ -22,6 +22,14 @@ InnoDB INFORMATION SCHEMA tables interface to MySQL.
Created July 18, 2007 Vasil Dimov
*******************************************************/
+#ifndef MYSQL_SERVER
+#define MYSQL_SERVER /* For Item_* classes */
+#include <item.h>
+/* Prevent influence of this definition to other headers */
+#undef MYSQL_SERVER
+#else
+#include <mysql_priv.h>
+#endif //MYSQL_SERVER
#include <ctype.h> /*toupper*/
#include <mysqld_error.h>
@@ -45,6 +53,7 @@ extern "C" {
#include "dict0mem.h"
#include "dict0types.h"
#include "ha_prototypes.h" /* for innobase_convert_name() */
+#include "srv0srv.h" /* for srv_track_changed_pages */
#include "srv0start.h" /* for srv_was_started */
#include "trx0i_s.h"
#include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */
@@ -54,6 +63,7 @@ extern "C" {
#include "dict0dict.h" /* for dict_sys */
#include "buf0lru.h" /* for XTRA_LRU_[DUMP/RESTORE] */
#include "btr0btr.h" /* for btr_page_get_index_id */
+#include "log0online.h"
}
#define OK(expr) \
@@ -5266,3 +5276,288 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_undo_logs =
INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
};
+
+static ST_FIELD_INFO i_s_innodb_changed_pages_info[] = /* columns of INNODB_CHANGED_PAGES, addressed by index via table->field[] */
+{
+ {STRUCT_FLD(field_name, "space_id"), /* table->field[0] */
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "page_id"), /* table->field[1] */
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "start_lsn"), /* table->field[2] */
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "end_lsn"), /* table->field[3] */
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/***********************************************************************
+ Parses the condition and, if it matches the pattern below, lowers *max_lsn
+ to the smallest integer constant found as an upper bound on an LSN column.
+
+ A lower bound cannot be used to skip the beginning of the bitmap files,
+ but scanning of the bitmap files can stop once the upper bound is reached.
+
+ The most frequently used queries are expected to look like the following:
+
+ SELECT * FROM INNODB_CHANGED_PAGES WHERE START_LSN > num1 AND start_lsn < num2;
+
+ That is why the pattern is:
+
+ pattern: comp | and_comp;
+ comp: lsn < int_num | lsn <= int_num | int_num > lsn | int_num >= lsn;
+ lsn: start_lsn | end_lsn;
+ and_comp: some_expression AND some_expression | some_expression AND and_comp;
+ some_expression: comp | any_other_expression;
+
+ Suppose the condition is start_lsn < 100; this means we have to read all
+ blocks with start_lsn < 100, which is equivalent to reading all the blocks
+ with end_lsn <= 99, or just end_lsn < 100. That is why it is enough to find
+ the smallest upper LSN bound, no matter whether it applies to start or end
+ lsn, and compare it with the "start_lsn" field.
+
+ Example:
+
+ SELECT * FROM INNODB_CHANGED_PAGES
+ WHERE
+ start_lsn > 10 AND
+ end_lsn <= 1111 AND
+ 555 > end_lsn AND
+ page_id = 100;
+
+ max_lsn will be set to 555.
+*/
+static
+void
+limit_lsn_range_from_condition(
+/*===========================*/
+ TABLE* table, /*!<in: table */
+ COND* cond, /*!<in: condition */
+ ib_uint64_t* max_lsn) /*!<in/out: upper LSN bound; only
+ ever lowered here (must be initialized
+ with maximum available value) */
+{
+ if (cond->type() != Item::COND_ITEM &&
+ cond->type() != Item::FUNC_ITEM)
+ return;
+
+ switch (((Item_func*) cond)->functype())
+ {
+ case Item_func::COND_AND_FUNC:
+ {
+ List_iterator<Item> li(*((Item_cond*) cond)->
+ argument_list());
+ Item *item;
+ while ((item= li++))
+ limit_lsn_range_from_condition(table,
+ item,
+ max_lsn);
+ break;
+ }
+ case Item_func::LT_FUNC:
+ case Item_func::LE_FUNC:
+ case Item_func::GT_FUNC:
+ case Item_func::GE_FUNC:
+ {
+ Item *left;
+ Item *right;
+ Item_field *item_field;
+ ib_uint64_t tmp_result;
+
+ /*
+ a <= b is equivalent to b >= a, so "left" and
+ "right" are simply exchanged in the case of the
+ ">" or ">=" function
+ */
+ if (((Item_func*) cond)->functype() ==
+ Item_func::LT_FUNC ||
+ ((Item_func*) cond)->functype() ==
+ Item_func::LE_FUNC)
+ {
+ left = ((Item_func*) cond)->arguments()[0];
+ right = ((Item_func*) cond)->arguments()[1];
+ } else {
+ left = ((Item_func*) cond)->arguments()[1];
+ right = ((Item_func*) cond)->arguments()[0];
+ }
+
+ if (!left || !right)
+ return;
+ if (left->type() != Item::FIELD_ITEM)
+ return;
+ if (right->type() != Item::INT_ITEM)
+ return;
+
+ item_field = (Item_field*)left;
+
+ if (/* START_LSN */
+ table->field[2] != item_field->field &&
+ /* END_LSN */
+ table->field[3] != item_field->field)
+ {
+ return;
+ }
+
+ /* Check if the current field belongs to our table */
+ if (table != item_field->field->table)
+ return;
+
+ tmp_result = right->val_int();
+ if (tmp_result < *max_lsn)
+ *max_lsn = tmp_result;
+
+ break;
+ }
+ default:;
+ }
+
+}
+
+/***********************************************************************
+Fill the dynamic table information_schema.innodb_changed_pages.
+@return 0 on success or when page tracking is disabled, 1 on failure */
+static
+int
+i_s_innodb_changed_pages_fill(
+/*==========================*/
+ THD* thd, /*!<in: thread */
+ TABLE_LIST* tables, /*!<in/out: tables to fill */
+ COND* cond) /*!<in: condition, may be NULL */
+{
+ TABLE* table = (TABLE *) tables->table;
+ log_bitmap_iterator_t i;
+ ib_uint64_t output_rows_num = 0UL;
+ ib_uint64_t max_lsn = ~0ULL;
+
+ if (!srv_track_changed_pages) /* tracking is off: leave the table empty */
+ return 0;
+
+ if (!log_online_bitmap_iterator_init(&i))
+ return 1;
+
+ if (cond)
+ limit_lsn_range_from_condition(table, cond, &max_lsn);
+
+ while(log_online_bitmap_iterator_next(&i) &&
+ (!srv_changed_pages_limit ||
+ output_rows_num < srv_changed_pages_limit) &&
+ /*
+ There is no need to compare both start LSN and end LSN fields
+ with maximum value. It's enough to compare only start LSN.
+ Example:
+
+ max_lsn = 100
+ \\\\\\\\\\\\\\\\\\\\\\\\\|\\\\\\\\ - Query 1
+ I------I I-------I I-------------I I----I
+ ////////////////// | - Query 2
+ 1 2 3 4
+
+ Query 1:
+ SELECT * FROM INNODB_CHANGED_PAGES WHERE start_lsn < 100
+ will select 1,2,3 bitmaps
+ Query 2:
+ SELECT * FROM INNODB_CHANGED_PAGES WHERE end_lsn < 100
+ will select 1,2 bitmaps
+
+ The condition start_lsn <= 100 will be false after reading
+ 1,2,3 bitmaps which suits both cases.
+ */
+ LOG_BITMAP_ITERATOR_START_LSN(i) <= max_lsn)
+ {
+ if (!LOG_BITMAP_ITERATOR_PAGE_CHANGED(i))
+ continue;
+
+ /* SPACE_ID */
+ table->field[0]->store(
+ LOG_BITMAP_ITERATOR_SPACE_ID(i));
+ /* PAGE_ID */
+ table->field[1]->store(
+ LOG_BITMAP_ITERATOR_PAGE_NUM(i));
+ /* START_LSN */
+ table->field[2]->store(
+ LOG_BITMAP_ITERATOR_START_LSN(i));
+ /* END_LSN */
+ table->field[3]->store(
+ LOG_BITMAP_ITERATOR_END_LSN(i));
+
+ /*
+ I_S tables are in-memory tables. If the bitmap file is big, a
+ lot of memory can be needed to store the table. Memory usage
+ is reduced by storing only the rows that satisfy the
+ conditions of the WHERE clause, so the condition is evaluated
+ here against the field values stored above.
+
+ Conditions are thus checked twice: first here (during table
+ generation) and again during query execution. Maybe it
+ makes sense to use some flag in THD object to avoid double
+ checking.
+ */
+ if (cond && !cond->val_int())
+ continue;
+
+ if (schema_table_store_record(thd, table))
+ {
+ log_online_bitmap_iterator_release(&i);
+ return 1;
+ }
+
+ ++output_rows_num; /* counts stored rows only, for srv_changed_pages_limit */
+ }
+
+ log_online_bitmap_iterator_release(&i);
+ return 0;
+}
+
+static
+int
+i_s_innodb_changed_pages_init(
+/*==========================*/
+ void* p) /*!< in/out: ST_SCHEMA_TABLE to bind to the changed pages field list and fill callback */
+{
+ DBUG_ENTER("i_s_innodb_changed_pages_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_changed_pages_info; /* column definitions */
+ schema->fill_table = i_s_innodb_changed_pages_fill; /* row producer */
+
+ DBUG_RETURN(0); /* always returns 0 */
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_changed_pages = /* plugin descriptor for the INNODB_CHANGED_PAGES information schema table */
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_CHANGED_PAGES"),
+ STRUCT_FLD(author, "Percona"),
+ STRUCT_FLD(descr, "InnoDB CHANGED_PAGES table"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_changed_pages_init), /* sets fields_info and fill_table */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
+};
diff --git a/storage/xtradb/handler/i_s.h b/storage/xtradb/handler/i_s.h
index 7e9d47571cc..a8964356747 100644
--- a/storage/xtradb/handler/i_s.h
+++ b/storage/xtradb/handler/i_s.h
@@ -51,5 +51,6 @@ extern struct st_maria_plugin i_s_innodb_admin_command;
extern struct st_maria_plugin i_s_innodb_buffer_pool_pages;
extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_index;
extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_blob;
+extern struct st_maria_plugin i_s_innodb_changed_pages;
#endif /* i_s_h */
diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h
index c3672a65ed7..efaa758f27a 100644
--- a/storage/xtradb/include/buf0lru.h
+++ b/storage/xtradb/include/buf0lru.h
@@ -94,13 +94,12 @@ buf_LRU_insert_zip_clean(
Try to free a block. If bpage is a descriptor of a compressed-only
page, the descriptor object will be freed as well.
-NOTE: If this function returns TRUE, it will temporarily
-release buf_pool->mutex. Furthermore, the page frame will no longer be
-accessible via bpage.
+NOTE: This will temporarily release buf_pool_mutex. Furthermore, the
+page frame will no longer be accessible via bpage.
-The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and
-release these two mutexes after the call. No other
-buf_page_get_mutex() may be held when calling this function.
+The caller must hold buf_page_get_mutex(bpage) and release this mutex
+after the call. No other buf_page_get_mutex() may be held when
+calling this function.
@return TRUE if freed, FALSE otherwise. */
UNIV_INTERN
ibool
diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h
index 857ec0946c2..96c4b81695a 100644
--- a/storage/xtradb/include/log0log.h
+++ b/storage/xtradb/include/log0log.h
@@ -977,6 +977,11 @@ struct log_struct{
become signaled */
/* @} */
#endif /* UNIV_LOG_ARCHIVE */
+ ib_uint64_t tracked_lsn; /*!< log tracking has advanced to this
+ lsn. Field accessed atomically where
+ 64-bit atomic ops are supported,
+ protected by the log sys mutex
+ otherwise. */
};
/** Test if flush order mutex is owned. */
diff --git a/storage/xtradb/include/log0online.h b/storage/xtradb/include/log0online.h
new file mode 100644
index 00000000000..0e0ca169f6f
--- /dev/null
+++ b/storage/xtradb/include/log0online.h
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 2011-2012, Percona Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0online.h
+Online database log parsing for changed page tracking
+*******************************************************/
+
+#ifndef log0online_h
+#define log0online_h
+
+#include "univ.i"
+#include "os0file.h"
+
+/*********************************************************************//**
+Initializes the online log following subsystem. */
+UNIV_INTERN
+void
+log_online_read_init(void);
+/*===================*/
+
+/*********************************************************************//**
+Shuts down the online log following subsystem. */
+UNIV_INTERN
+void
+log_online_read_shutdown(void);
+/*=======================*/
+
+/*********************************************************************//**
+Reads and parses the redo log up to last checkpoint LSN to build the changed
+page bitmap which is then written to disk. */
+UNIV_INTERN
+void
+log_online_follow_redo_log(void);
+/*=========================*/
+
+/** The iterator through all bits of changed pages bitmap blocks */
+struct log_bitmap_iterator_struct
+{
+ char in_name[FN_REFLEN]; /*!< the file name for bitmap
+ input */
+ os_file_t in; /*!< the bitmap input file */
+ ib_uint64_t in_offset; /*!< offset in the bitmap input file; NOTE(review):
+ presumably the next read position - confirm */
+ ib_uint32_t bit_offset; /*!< bit offset inside of bitmap
+ block */
+ ib_uint64_t start_lsn; /*!< Start lsn of the block */
+ ib_uint64_t end_lsn; /*!< End lsn of the block */
+ ib_uint32_t space_id; /*!< Block space id */
+ ib_uint32_t first_page_id; /*!< First block page id; current page is first_page_id + bit_offset */
+ ibool changed; /*!< true if current page was changed */
+ byte* page; /*!< Bitmap block */
+};
+
+typedef struct log_bitmap_iterator_struct log_bitmap_iterator_t;
+
+#define LOG_BITMAP_ITERATOR_START_LSN(i) \
+ ((i).start_lsn)
+#define LOG_BITMAP_ITERATOR_END_LSN(i) \
+ ((i).end_lsn)
+#define LOG_BITMAP_ITERATOR_SPACE_ID(i) \
+ ((i).space_id)
+#define LOG_BITMAP_ITERATOR_PAGE_NUM(i) \
+ ((i).first_page_id + (i).bit_offset)
+#define LOG_BITMAP_ITERATOR_PAGE_CHANGED(i) \
+ ((i).changed)
+
+/*********************************************************************//**
+Initializes log bitmap iterator.
+@return TRUE if the iterator is initialized OK, FALSE otherwise. */
+UNIV_INTERN
+ibool
+log_online_bitmap_iterator_init(
+/*============================*/
+ log_bitmap_iterator_t *i); /*!<in/out: iterator */
+
+/*********************************************************************//**
+Releases log bitmap iterator. */
+UNIV_INTERN
+void
+log_online_bitmap_iterator_release(
+/*===============================*/
+ log_bitmap_iterator_t *i); /*!<in/out: iterator */
+
+/*********************************************************************//**
+Iterates through bits of saved bitmap blocks.
+Sequentially reads blocks from bitmap file(s) and iterates through
+their bits. Ignores blocks with wrong checksum.
+@return TRUE if iteration is successful, FALSE if all bits are iterated. */
+UNIV_INTERN
+ibool
+log_online_bitmap_iterator_next(
+/*============================*/
+ log_bitmap_iterator_t *i); /*!<in/out: iterator */
+
+#endif
diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h
index 15065267250..fdffd86e4c4 100644
--- a/storage/xtradb/include/log0recv.h
+++ b/storage/xtradb/include/log0recv.h
@@ -32,6 +32,28 @@ Created 9/20/1997 Heikki Tuuri
#include "hash0hash.h"
#include "log0log.h"
+/******************************************************//**
+Checks the 4-byte checksum to the trailer checksum field of a log
+block. We also accept a log block in the old format before
+InnoDB-3.23.52 where the checksum field contains the log block number.
+@return TRUE if ok, or if the log block may be in the format of InnoDB
+version predating 3.23.52 */
+UNIV_INTERN
+ibool
+log_block_checksum_is_ok_or_old_format(
+/*===================================*/
+ const byte* block); /*!< in: pointer to a log block */
+
+/*******************************************************//**
+Calculates the new value for lsn when more data is added to the log. */
+UNIV_INTERN
+ib_uint64_t
+recv_calc_lsn_on_data_add(
+/*======================*/
+ ib_uint64_t lsn, /*!< in: old lsn */
+ ib_uint64_t len); /*!< in: this many bytes of data is
+ added, log block headers not included */
+
#ifdef UNIV_HOTBACKUP
extern ibool recv_replay_file_ops;
@@ -182,6 +204,21 @@ UNIV_INTERN
void
recv_recovery_rollback_active(void);
/*===============================*/
+
+/*******************************************************************//**
+Tries to parse a single log record and returns its length.
+@return length of the record, or 0 if the record was not complete */
+UNIV_INTERN
+ulint
+recv_parse_log_rec(
+/*===============*/
+ byte* ptr, /*!< in: pointer to a buffer */
+ byte* end_ptr,/*!< in: pointer to the buffer end */
+ byte* type, /*!< out: type */
+ ulint* space, /*!< out: space id */
+ ulint* page_no,/*!< out: page number */
+ byte** body); /*!< out: log record body start */
+
/*******************************************************//**
Scans log from a buffer and stores new log data to the parsing buffer.
Parses and hashes the log records if new data found. Unless
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
index 5b1f9339845..4c795d93141 100644
--- a/storage/xtradb/include/os0file.h
+++ b/storage/xtradb/include/os0file.h
@@ -197,6 +197,7 @@ extern ulint srv_log_block_size;
extern mysql_pfs_key_t innodb_file_data_key;
extern mysql_pfs_key_t innodb_file_log_key;
extern mysql_pfs_key_t innodb_file_temp_key;
+extern mysql_pfs_key_t innodb_file_bmp_key;
/* Following four macros are instumentations to register
various file I/O operations with performance schema.
@@ -867,6 +868,14 @@ os_file_set_eof(
/*============*/
FILE* file); /*!< in: file to be truncated */
/***********************************************************************//**
+Truncates a file at the specified position.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof_at(
+ os_file_t file, /*!< in: handle to a file */
+ ib_uint64_t new_len);/*!< in: new file length */
+/***********************************************************************//**
NOTE! Use the corresponding macro os_file_flush(), not directly this function!
Flushes the write buffers of a given file to the disk.
@return TRUE if success */
diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h
index 6a99c60226b..887a40c64ea 100644
--- a/storage/xtradb/include/os0sync.h
+++ b/storage/xtradb/include/os0sync.h
@@ -265,7 +265,11 @@ Atomic compare-and-swap and increment for InnoDB. */
#if defined(HAVE_IB_GCC_ATOMIC_BUILTINS)
-#define HAVE_ATOMIC_BUILTINS
+# define HAVE_ATOMIC_BUILTINS
+
+# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_64
+# define HAVE_ATOMIC_BUILTINS_64
+# endif
/**********************************************************//**
Returns true if swapped, ptr is pointer to target, old_val is value to
@@ -304,6 +308,9 @@ amount of increment. */
# define os_atomic_increment_ulint(ptr, amount) \
os_atomic_increment(ptr, amount)
+# define os_atomic_increment_uint64(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+
/**********************************************************//**
Returns the old value of *ptr, atomically sets *ptr to new_val */
@@ -312,12 +319,13 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */
#elif defined(HAVE_IB_SOLARIS_ATOMICS)
-#define HAVE_ATOMIC_BUILTINS
+# define HAVE_ATOMIC_BUILTINS
+# define HAVE_ATOMIC_BUILTINS_64
/* If not compiling with GCC or GCC doesn't support the atomic
intrinsics and running on Solaris >= 10 use Solaris atomics */
-#include <atomic.h>
+# include <atomic.h>
/**********************************************************//**
Returns true if swapped, ptr is pointer to target, old_val is value to
@@ -357,6 +365,9 @@ amount of increment. */
# define os_atomic_increment_ulint(ptr, amount) \
atomic_add_long_nv(ptr, amount)
+# define os_atomic_increment_uint64(ptr, amount) \
+ atomic_add_64_nv(ptr, amount)
+
/**********************************************************//**
Returns the old value of *ptr, atomically sets *ptr to new_val */
@@ -365,7 +376,11 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */
#elif defined(HAVE_WINDOWS_ATOMICS)
-#define HAVE_ATOMIC_BUILTINS
+# define HAVE_ATOMIC_BUILTINS
+
+# ifndef _WIN32
+# define HAVE_ATOMIC_BUILTINS_64
+# endif
/* On Windows, use Windows atomics / interlocked */
# ifdef _WIN64
@@ -403,6 +418,11 @@ amount of increment. */
# define os_atomic_increment_ulint(ptr, amount) \
((ulint) (win_xchg_and_add(ptr, amount) + amount))
+# define os_atomic_increment_uint64(ptr, amount) \
+ ((ib_uint64_t) (InterlockedExchangeAdd64( \
+ (ib_int64_t*) ptr, \
+ (ib_int64_t) amount) + amount))
+
/**********************************************************//**
Returns the old value of *ptr, atomically sets *ptr to new_val.
InterlockedExchange() operates on LONG, and the LONG will be
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
index a40683e00f1..b8820f1b7c9 100644
--- a/storage/xtradb/include/srv0srv.h
+++ b/storage/xtradb/include/srv0srv.h
@@ -69,6 +69,14 @@ extern os_event_t srv_error_event;
/* This event is set at shutdown to wakeup threads from sleep */
extern os_event_t srv_shutdown_event;
+/* This event is set on checkpoint completion to wake the redo log parser
+thread */
+extern os_event_t srv_checkpoint_completed_event;
+
+/* This event is set on the online redo log following thread exit to signal
+that the (slow) shutdown may proceed */
+extern os_event_t srv_redo_log_thread_finished_event;
+
/* If the last data file is auto-extended, we add this many pages to it
at a time */
#define SRV_AUTO_EXTEND_INCREMENT \
@@ -136,6 +144,11 @@ extern char* srv_doublewrite_file;
extern ibool srv_recovery_stats;
+extern my_bool srv_track_changed_pages;
+
+extern
+ulonglong srv_changed_pages_limit;
+
extern ibool srv_auto_extend_last_data_file;
extern ulint srv_last_file_size_max;
extern char** srv_log_group_home_dirs;
@@ -402,6 +415,7 @@ extern mysql_pfs_key_t srv_error_monitor_thread_key;
extern mysql_pfs_key_t srv_monitor_thread_key;
extern mysql_pfs_key_t srv_master_thread_key;
extern mysql_pfs_key_t srv_purge_thread_key;
+extern mysql_pfs_key_t srv_log_tracking_thread_key;
/* This macro register the current thread and its key with performance
schema */
@@ -697,6 +711,15 @@ srv_LRU_dump_restore_thread(
void* arg); /*!< in: a dummy parameter required by
os_thread_create */
/******************************************************************//**
+A thread which follows the redo log and outputs the changed page bitmap.
+@return a dummy value */
+UNIV_INTERN
+os_thread_ret_t
+srv_redo_log_follow_thread(
+/*=======================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/******************************************************************//**
Outputs to a file the output of the InnoDB Monitor.
@return FALSE if not all information printed
due to failure to obtain necessary mutex */
diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i
index ce59a3f2741..9aee2a0c7f9 100644
--- a/storage/xtradb/include/univ.i
+++ b/storage/xtradb/include/univ.i
@@ -54,7 +54,7 @@ Created 1/20/1994 Heikki Tuuri
#define INNODB_VERSION_BUGFIX 8
#ifndef PERCONA_INNODB_VERSION
-#define PERCONA_INNODB_VERSION 28.1
+#define PERCONA_INNODB_VERSION 29.0
#endif
/* The following is the InnoDB version as shown in
diff --git a/storage/xtradb/include/ut0rbt.h b/storage/xtradb/include/ut0rbt.h
index e26b637ae13..cd9df1c1a3d 100644
--- a/storage/xtradb/include/ut0rbt.h
+++ b/storage/xtradb/include/ut0rbt.h
@@ -116,6 +116,10 @@ struct ib_rbt_bound_struct {
/* Compare a key with the node value (t is tree, k is key, n is node)*/
#define rbt_compare(t, k, n) (t->compare(k, n->value))
+/* Node size. FIXME: name might clash, but currently it does not, so for easier
+ maintenance do not rename it for now. */
+#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1)
+
/**********************************************************************//**
Free an instance of a red black tree */
UNIV_INTERN
@@ -187,6 +191,17 @@ rbt_add_node(
ib_rbt_bound_t* parent, /*!< in: parent */
const void* value); /*!< in: this value is copied
to the node */
+/****************************************************************//**
+Add a new caller-provided node to tree at the specified position.
+The node must have its key fields initialized correctly.
+@return added node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_preallocated_node(
+/*======================*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: parent */
+ ib_rbt_node_t* node); /*!< in: node */
/**********************************************************************//**
Return the left most data node in the tree
@return left most node */
@@ -273,6 +288,13 @@ void
rbt_clear(
/*======*/
ib_rbt_t* tree); /*!< in: rb tree */
+/****************************************************************//**
+Clear the tree without deleting and freeing its nodes. */
+UNIV_INTERN
+void
+rbt_reset(
+/*======*/
+ ib_rbt_t* tree); /*!< in: rb tree */
/**********************************************************************//**
Merge the node from dst into src. Return the number of nodes merged.
@return no. of recs merged */
diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c
index dcaf951a0ed..e3e023c0c5a 100644
--- a/storage/xtradb/log/log0log.c
+++ b/storage/xtradb/log/log0log.c
@@ -216,6 +216,54 @@ log_buf_pool_get_oldest_modification(void)
return(lsn);
}
+/****************************************************************//**
+Safely reads the log_sys->tracked_lsn value. Uses atomic operations
+if available (an atomic add of zero acts as the read), otherwise this
+field is protected with the log system mutex, which the caller must hold.
+The writer counterpart function is log_set_tracked_lsn() in log0online.c.
+
+@return log_sys->tracked_lsn value. */
+UNIV_INLINE
+ib_uint64_t
+log_get_tracked_lsn(void)
+{
+#ifdef HAVE_ATOMIC_BUILTINS_64
+	return os_atomic_increment_uint64(&log_sys->tracked_lsn, 0);
+#else
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	return log_sys->tracked_lsn;
+#endif
+}
+
+/****************************************************************//**
+Checks if the log groups have a big enough margin of free space so
+that a new log entry can be written without overwriting log data
+that has not yet been read by the changed page bitmap thread.
+@return TRUE if there is not enough free space. */
+static
+ibool
+log_check_tracking_margin(
+	ulint	lsn_advance)	/*!< in: an upper limit on how much log data we
+				plan to write. If zero, the margin will be
+				checked for the already-written log. */
+{
+	ib_uint64_t	tracked_lsn;
+	ib_uint64_t	tracked_lsn_age; /* 64-bit: must not wrap where ulint is 32-bit */
+
+	if (!srv_track_changed_pages) {
+		return FALSE;
+	}
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	tracked_lsn = log_get_tracked_lsn();
+	tracked_lsn_age = log_sys->lsn - tracked_lsn;
+
+	/* The overwrite would happen when log_sys->log_group_capacity is
+	exceeded, but we use max_checkpoint_age for an extra safety margin. */
+	return tracked_lsn_age + lsn_advance > log_sys->max_checkpoint_age;
+}
+
/************************************************************//**
Opens the log for log_write_low. The log must be closed with log_close and
released with log_release.
@@ -232,9 +280,7 @@ log_reserve_and_open(
ulint archived_lsn_age;
ulint dummy;
#endif /* UNIV_LOG_ARCHIVE */
-#ifdef UNIV_DEBUG
ulint count = 0;
-#endif /* UNIV_DEBUG */
ut_a(len < log->buf_size / 2);
loop:
@@ -262,6 +308,19 @@ loop:
goto loop;
}
+ if (log_check_tracking_margin(len_upper_limit) && (++count < 50)) {
+
+ /* This log write would violate the untracked LSN free space
+ margin. Limit this to 50 retries as there might be situations
+ where we have no choice but to proceed anyway, e.g. if the log
+ is about to overflow, log tracking or not. */
+ mutex_exit(&(log->mutex));
+
+ os_thread_sleep(10000);
+
+ goto loop;
+ }
+
#ifdef UNIV_LOG_ARCHIVE
if (log->archiving_state != LOG_ARCH_OFF) {
@@ -400,6 +459,8 @@ log_close(void)
ulint first_rec_group;
ib_uint64_t oldest_lsn;
ib_uint64_t lsn;
+ ib_uint64_t tracked_lsn;
+ ulint tracked_lsn_age;
log_t* log = log_sys;
ib_uint64_t checkpoint_age;
@@ -426,6 +487,19 @@ log_close(void)
log->check_flush_or_checkpoint = TRUE;
}
+ if (srv_track_changed_pages) {
+
+ tracked_lsn = log_get_tracked_lsn();
+ tracked_lsn_age = lsn - tracked_lsn;
+
+ if (tracked_lsn_age >= log->log_group_capacity) {
+
+ fprintf(stderr, " InnoDB: Error: the age of the "
+ "oldest untracked record exceeds the log "
+ "group capacity!\n");
+ }
+ }
+
checkpoint_age = lsn - log->last_checkpoint_lsn;
if (checkpoint_age >= log->log_group_capacity) {
@@ -893,6 +967,8 @@ log_init(void)
log_sys->archiving_on = os_event_create(NULL);
#endif /* UNIV_LOG_ARCHIVE */
+ log_sys->tracked_lsn = 0;
+
/*----------------------------*/
log_block_init(log_sys->buf, log_sys->lsn);
@@ -1742,6 +1818,12 @@ log_io_complete_checkpoint(void)
}
mutex_exit(&(log_sys->mutex));
+
+ /* Wake the redo log watching thread to parse the log up to this
+ checkpoint. */
+ if (srv_track_changed_pages) {
+ os_event_set(srv_checkpoint_completed_event);
+ }
}
/*******************************************************************//**
@@ -3169,6 +3251,15 @@ loop:
log_checkpoint_margin();
+ mutex_enter(&(log_sys->mutex));
+ if (log_check_tracking_margin(0)) {
+
+ mutex_exit(&(log_sys->mutex));
+ os_thread_sleep(10000);
+ goto loop;
+ }
+ mutex_exit(&(log_sys->mutex));
+
#ifdef UNIV_LOG_ARCHIVE
log_archive_margin();
#endif /* UNIV_LOG_ARCHIVE */
@@ -3197,6 +3288,7 @@ logs_empty_and_mark_files_at_shutdown(void)
/*=======================================*/
{
ib_uint64_t lsn;
+ ib_uint64_t tracked_lsn;
ulint arch_log_no;
ibool server_busy;
ulint count = 0;
@@ -3388,6 +3480,12 @@ loop:
}
srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+ /* Wake the log tracking thread which will then immediately
+ quit because of srv_shutdown_state value */
+ if (srv_track_changed_pages) {
+ os_event_set(srv_checkpoint_completed_event);
+ os_event_wait(srv_redo_log_thread_finished_event);
+ }
fil_close_all_files();
ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED);
return;
@@ -3397,9 +3495,12 @@ loop:
mutex_enter(&log_sys->mutex);
+ tracked_lsn = log_get_tracked_lsn();
+
lsn = log_sys->lsn;
if (lsn != log_sys->last_checkpoint_lsn
+ || (srv_track_changed_pages && (tracked_lsn != log_sys->last_checkpoint_lsn))
#ifdef UNIV_LOG_ARCHIVE
|| (srv_log_archive_on
&& lsn != log_sys->archived_lsn + LOG_BLOCK_HDR_SIZE)
@@ -3457,6 +3558,11 @@ loop:
srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+ /* Signal the log following thread to quit */
+ if (srv_track_changed_pages) {
+ os_event_set(srv_checkpoint_completed_event);
+ }
+
/* Make some checks that the server really is quiet */
ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED);
@@ -3477,6 +3583,10 @@ loop:
fil_flush_file_spaces(FIL_TABLESPACE);
+ if (srv_track_changed_pages) {
+ os_event_wait(srv_redo_log_thread_finished_event);
+ }
+
fil_close_all_files();
/* Make some checks that the server really is quiet */
@@ -3603,6 +3713,18 @@ log_print(
((log_sys->n_log_ios - log_sys->n_log_ios_old)
/ time_elapsed));
+ if (srv_track_changed_pages) {
+
+ /* The maximum tracked LSN age is equal to the maximum
+ checkpoint age */
+ fprintf(file,
+ "Log tracking enabled\n"
+ "Log tracked up to %llu\n"
+ "Max tracked LSN age %lu\n",
+ log_get_tracked_lsn(),
+ log_sys->max_checkpoint_age);
+ }
+
log_sys->n_log_ios_old = log_sys->n_log_ios;
log_sys->last_printout_time = current_time;
diff --git a/storage/xtradb/log/log0online.c b/storage/xtradb/log/log0online.c
new file mode 100644
index 00000000000..1d478c467e6
--- /dev/null
+++ b/storage/xtradb/log/log0online.c
@@ -0,0 +1,1085 @@
+/*****************************************************************************
+
+Copyright (c) 2011-2012 Percona Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0online.c
+Online database log parsing for changed page tracking
+
+*******************************************************/
+
+#include "log0online.h"
+
+#include "my_dbug.h"
+
+#include "log0recv.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "ut0rbt.h"
+
+enum { FOLLOW_SCAN_SIZE = 4 * (UNIV_PAGE_SIZE_MAX) };
+
+/** Log parsing and bitmap output data structure */
+struct log_bitmap_struct {
+ byte read_buf[FOLLOW_SCAN_SIZE];
+ /*!< log read buffer */
+ byte parse_buf[RECV_PARSING_BUF_SIZE];
+ /*!< log parse buffer */
+ byte* parse_buf_end; /*!< parse buffer position where the
+ next read log data should be copied to.
+ If the previous log records were fully
+ parsed, it points to the start,
+ otherwise points immediately past the
+ end of the incomplete log record. */
+ char* out_name; /*!< the file name for bitmap output */
+ os_file_t out; /*!< the bitmap output file */
+ ib_uint64_t out_offset; /*!< the next write position in the
+ bitmap output file */
+ ib_uint64_t start_lsn; /*!< the LSN of the next unparsed
+ record and the start of the next LSN
+ interval to be parsed. */
+ ib_uint64_t end_lsn; /*!< the end of the LSN interval to be
+ parsed, equal to the next checkpoint
+ LSN at the time of parse */
+ ib_uint64_t next_parse_lsn; /*!< the LSN of the next unparsed
+ record in the current parse */
+ ib_rbt_t* modified_pages; /*!< the current modified page set,
+ organized as the RB-tree with the keys
+ of (space, 4KB-block-start-page-id)
+ pairs */
+ ib_rbt_node_t* page_free_list; /*!< Singly-linked list of freed nodes
+ of modified_pages tree for later
+ reuse. Nodes are linked through
+ ib_rbt_node_t.left as this field has
+ both the correct type and the tree does
+ not mind its overwrite during
+ rbt_next() tree traversal. */
+};
+
+/* The log parsing and bitmap output struct instance */
+static struct log_bitmap_struct* log_bmp_sys;
+
+/* File name stem for modified page bitmaps */
+static const char* modified_page_stem = "ib_modified_log.";
+
+/* On server startup with empty database srv_start_lsn == 0, in
+which case the first LSN of actual log records will be this. */
+#define MIN_TRACKED_LSN ((LOG_START_LSN) + (LOG_BLOCK_HDR_SIZE))
+
+/* Tests if num bit of bitmap is set */
+#define IS_BIT_SET(bitmap, num) \
+ (*((bitmap) + ((num) >> 3)) & (1UL << ((num) & 7UL)))
+
+/** The bitmap file block size in bytes. All writes will be multiples of this.
+ */
+enum {
+ MODIFIED_PAGE_BLOCK_SIZE = 4096
+};
+
+
+/** Offsets in a file bitmap block */
+enum {
+ MODIFIED_PAGE_IS_LAST_BLOCK = 0,/* 1 if last block in the current
+ write, 0 otherwise. */
+ MODIFIED_PAGE_START_LSN = 4, /* The starting tracked LSN of this and
+ other blocks in the same write */
+ MODIFIED_PAGE_END_LSN = 12, /* The ending tracked LSN of this and
+ other blocks in the same write */
+ MODIFIED_PAGE_SPACE_ID = 20, /* The space ID of tracked pages in
+ this block */
+ MODIFIED_PAGE_1ST_PAGE_ID = 24, /* The page ID of the first tracked
+ page in this block */
+ MODIFIED_PAGE_BLOCK_UNUSED_1 = 28,/* Unused in order to align the start
+ of bitmap at 8 byte boundary */
+ MODIFIED_PAGE_BLOCK_BITMAP = 32,/* Start of the bitmap itself */
+ MODIFIED_PAGE_BLOCK_UNUSED_2 = MODIFIED_PAGE_BLOCK_SIZE - 8,
+ /* Unused in order to align the end of
+ bitmap at 8 byte boundary */
+ MODIFIED_PAGE_BLOCK_CHECKSUM = MODIFIED_PAGE_BLOCK_SIZE - 4
+ /* The checksum of the current block */
+};
+
+/** Length of the bitmap data in a block in bytes */
+enum { MODIFIED_PAGE_BLOCK_BITMAP_LEN
+ = MODIFIED_PAGE_BLOCK_UNUSED_2 - MODIFIED_PAGE_BLOCK_BITMAP };
+
+/** Length of the bitmap data in a block in page ids */
+enum { MODIFIED_PAGE_BLOCK_ID_COUNT = MODIFIED_PAGE_BLOCK_BITMAP_LEN * 8 };
+
+/****************************************************************//**
+Comparison function for the RB-tree keys, which are (space,
+block_start_page) pairs laid out as in a bitmap block. Keys are ordered
+by space id first and by block start page id second.
+@return -1 if p1 < p2, 0 if p1 == p2, 1 if p1 > p2
+*/
+static
+int
+log_online_compare_bmp_keys(
+/*========================*/
+	const void*	p1,	/*!<in: 1st key to compare */
+	const void*	p2)	/*!<in: 2nd key to compare */
+{
+	const byte*	lhs = (const byte*) p1;
+	const byte*	rhs = (const byte*) p2;
+	ulint		lhs_id = mach_read_from_4(lhs + MODIFIED_PAGE_SPACE_ID);
+	ulint		rhs_id = mach_read_from_4(rhs + MODIFIED_PAGE_SPACE_ID);
+
+	if (lhs_id == rhs_id) {
+		/* Same space id: the block start page id decides */
+		lhs_id = mach_read_from_4(lhs + MODIFIED_PAGE_1ST_PAGE_ID);
+		rhs_id = mach_read_from_4(rhs + MODIFIED_PAGE_1ST_PAGE_ID);
+	}
+
+	if (lhs_id < rhs_id) {
+		return -1;
+	}
+
+	return lhs_id > rhs_id ? 1 : 0;
+}
+
+/****************************************************************//**
+Mark a page as modified in the in-memory bitmap. If the tree does not
+yet hold the bitmap block covering the page, a zero-filled block is added
+first, taking its node from the free list when one is available and
+allocating a new one otherwise. */
+static
+void
+log_online_set_page_bit(
+/*====================*/
+	ulint	space,	/*!<in: log record space id */
+	ulint	page_no)/*!<in: log record page id */
+{
+	byte		key[MODIFIED_PAGE_BLOCK_SIZE];
+	ib_rbt_bound_t	pos;
+	byte*		block;
+	ulint		first_page;
+	ulint		byte_idx;
+	uint		bit_idx;
+
+	ut_a(space != ULINT_UNDEFINED);
+	ut_a(page_no != ULINT_UNDEFINED);
+
+	/* Split the page id into the first page id of its bitmap block, the
+	byte index inside that block's bitmap and the bit inside the byte */
+	first_page = page_no - page_no % MODIFIED_PAGE_BLOCK_ID_COUNT;
+	byte_idx = page_no % MODIFIED_PAGE_BLOCK_ID_COUNT / 8;
+	bit_idx = page_no % 8;
+
+	/* Only the key fields of the search block are filled in; the
+	comparison function reads nothing else */
+	mach_write_to_4(key + MODIFIED_PAGE_SPACE_ID, space);
+	mach_write_to_4(key + MODIFIED_PAGE_1ST_PAGE_ID, first_page);
+
+	if (rbt_search(log_bmp_sys->modified_pages, &pos, key) == 0) {
+		/* Search result 0: use the block at the returned position */
+		block = rbt_value(byte, pos.last);
+	} else {
+		ib_rbt_node_t*	node = log_bmp_sys->page_free_list;
+
+		if (node != NULL) {
+			/* Pop the head of the free list */
+			log_bmp_sys->page_free_list = node->left;
+		} else {
+			node = ut_malloc(SIZEOF_NODE(
+				log_bmp_sys->modified_pages));
+		}
+		memset(node, 0, SIZEOF_NODE(log_bmp_sys->modified_pages));
+
+		block = rbt_value(byte, node);
+		mach_write_to_4(block + MODIFIED_PAGE_SPACE_ID, space);
+		mach_write_to_4(block + MODIFIED_PAGE_1ST_PAGE_ID, first_page);
+
+		rbt_add_preallocated_node(log_bmp_sys->modified_pages,
+					  &pos, node);
+	}
+
+	block[MODIFIED_PAGE_BLOCK_BITMAP + byte_idx] |= (1U << bit_idx);
+}
+
+/****************************************************************//**
+Compute the checksum of a bitmap block over all of its bytes that precede
+the checksum field. Algorithm borrowed from log_block_calc_checksum.
+@return checksum */
+UNIV_INLINE
+ulint
+log_online_calc_checksum(
+/*=====================*/
+	const byte*	block)	/*!<in: bitmap block */
+{
+	ulint	acc = 1;
+	ulint	shift = 0;
+	ulint	pos;
+
+	for (pos = 0; pos < MODIFIED_PAGE_BLOCK_CHECKSUM; pos++) {
+		ulint	octet = block[pos];
+
+		acc = (acc & 0x7FFFFFFFUL) + octet + (octet << shift);
+
+		/* The shift amount cycles through 0..24 */
+		shift = (shift < 24) ? shift + 1 : 0;
+	}
+
+	return acc;
+}
+
+/****************************************************************//**
+Get the last fully tracked LSN from the bitmap file by reading
+backwards until a correct end page is found. Detects incomplete
+writes and corrupted data. Sets the start output position for the
+written bitmap data and truncates the file at that position.
+@return the last fully tracked LSN; 0 if no valid end page was found or
+the truncation failed; MIN_TRACKED_LSN if a block could not be read */
+static
+ib_uint64_t
+log_online_read_last_tracked_lsn(void)
+/*==================================*/
+{
+	byte		page[MODIFIED_PAGE_BLOCK_SIZE];
+	ib_uint64_t	read_offset	= log_bmp_sys->out_offset;
+	/* Initialize these to nonequal values so that file size == 0 case with
+	zero loop repetitions is handled correctly */
+	ulint		checksum	= 0;
+	ulint		actual_checksum	= !checksum;
+	ibool		is_last_page	= FALSE;
+	ib_uint64_t	result;
+
+	ut_ad(log_bmp_sys->out_offset % MODIFIED_PAGE_BLOCK_SIZE == 0);
+
+	/* Step back one block at a time until a block is found that both
+	passes the checksum test and is flagged as the last block of its
+	write. Corrupted blocks and the blocks of an unterminated write are
+	skipped, so that the truncation below discards them. */
+	while ((checksum != actual_checksum || !is_last_page)
+	       && read_offset > 0) {
+
+		ulint	offset_low;
+		ulint	offset_high;
+		ibool	success;
+
+		read_offset -= MODIFIED_PAGE_BLOCK_SIZE;
+		offset_high = (ulint)(read_offset >> 32);
+		offset_low = (ulint)(read_offset & 0xFFFFFFFF);
+
+		success = os_file_read(log_bmp_sys->out, page, offset_low,
+				       offset_high, MODIFIED_PAGE_BLOCK_SIZE);
+		if (!success) {
+
+			/* The following call prints an error message */
+			os_file_get_last_error(TRUE);
+			/* Here and below assume that bitmap file names do not
+			contain apostrophes, thus no need for
+			ut_print_filename(). */
+			fprintf(stderr, "InnoDB: Warning: failed reading "
+				"changed page bitmap file \'%s\'\n",
+				log_bmp_sys->out_name);
+			return MIN_TRACKED_LSN;
+		}
+
+		is_last_page
+			= mach_read_from_4(page + MODIFIED_PAGE_IS_LAST_BLOCK);
+		checksum = mach_read_from_4(page
+					    + MODIFIED_PAGE_BLOCK_CHECKSUM);
+		actual_checksum = log_online_calc_checksum(page);
+		if (checksum != actual_checksum) {
+
+			fprintf(stderr, "InnoDB: Warning: corruption "
+				"detected in \'%s\' at offset %llu\n",
+				log_bmp_sys->out_name, read_offset);
+		}
+	}
+
+	if (UNIV_LIKELY(checksum == actual_checksum && is_last_page)) {
+
+		/* New bitmap data goes right after the valid end page */
+		log_bmp_sys->out_offset = read_offset
+			+ MODIFIED_PAGE_BLOCK_SIZE;
+		result = mach_read_from_8(page + MODIFIED_PAGE_END_LSN);
+	}
+	else {
+		/* No valid end page in the file (or the file is empty) */
+		log_bmp_sys->out_offset = read_offset;
+		result = 0;
+	}
+
+	/* Truncate the output file to discard the corrupted bitmap data, if
+	any */
+	if (!os_file_set_eof_at(log_bmp_sys->out,
+				log_bmp_sys->out_offset)) {
+		fprintf(stderr, "InnoDB: Warning: failed truncating "
+			"changed page bitmap file \'%s\' to %llu bytes\n",
+			log_bmp_sys->out_name, log_bmp_sys->out_offset);
+		result = 0;
+	}
+	return result;
+}
+
+/****************************************************************//**
+Safely write the log_sys->tracked_lsn value. Uses atomic operations
+if available, otherwise this field is protected with the log system
+mutex, which this function acquires and releases itself. The reader
+counterpart function is log_get_tracked_lsn() in log0log.c. */
+UNIV_INLINE
+void
+log_set_tracked_lsn(
+/*================*/
+ ib_uint64_t tracked_lsn) /*!<in: new value */
+{
+#ifdef HAVE_ATOMIC_BUILTINS_64
+ /* Single writer, no data race here: the current value is fetched
+ with an atomic add of zero and then moved to the new value by
+ atomically adding the difference */
+ ib_uint64_t old_value
+ = os_atomic_increment_uint64(&log_sys->tracked_lsn, 0);
+ (void) os_atomic_increment_uint64(&log_sys->tracked_lsn,
+ tracked_lsn - old_value);
+#else
+ mutex_enter(&log_sys->mutex);
+ log_sys->tracked_lsn = tracked_lsn;
+ mutex_exit(&log_sys->mutex);
+#endif
+}
+
+/****************************************************************//**
+Diagnose a gap in tracked LSN range on server startup due to crash or
+very fast shutdown and try to close it by tracking the data
+immediately, if possible. If the gap is not smaller than the log group
+capacity, tracking restarts from tracking_start_lsn instead. */
+static
+void
+log_online_track_missing_on_startup(
+/*================================*/
+ ib_uint64_t last_tracked_lsn, /*!<in: last tracked LSN read
+ from the bitmap file */
+ ib_uint64_t tracking_start_lsn) /*!<in: last checkpoint LSN of
+ the current server startup */
+{
+ ut_ad(last_tracked_lsn != tracking_start_lsn);
+
+ fprintf(stderr, "InnoDB: last tracked LSN in \'%s\' is %llu, but "
+ "last checkpoint LSN is %llu. This might be due to a server "
+ "crash or a very fast shutdown. ", log_bmp_sys->out_name,
+ last_tracked_lsn, tracking_start_lsn);
+
+ /* last_tracked_lsn might be < MIN_TRACKED_LSN in the case of empty
+ bitmap file, handle this too. */
+ last_tracked_lsn = ut_max(last_tracked_lsn, MIN_TRACKED_LSN);
+
+ /* See if we can fully recover the missing interval */
+ if (log_sys->lsn - last_tracked_lsn < log_sys->log_group_capacity) {
+
+ fprintf(stderr,
+ "Reading the log to advance the last tracked LSN.\n");
+
+ log_bmp_sys->start_lsn = last_tracked_lsn;
+ log_set_tracked_lsn(log_bmp_sys->start_lsn);
+ log_online_follow_redo_log();
+ ut_ad(log_bmp_sys->end_lsn >= tracking_start_lsn);
+
+ fprintf(stderr,
+ "InnoDB: continuing tracking changed pages from LSN "
+ "%llu\n", log_bmp_sys->end_lsn);
+ }
+ else {
+ fprintf(stderr,
+ "The age of last tracked LSN exceeds log capacity, "
+ "tracking-based incremental backups will work only "
+ "from the higher LSN!\n");
+
+ log_bmp_sys->end_lsn = log_bmp_sys->start_lsn
+ = tracking_start_lsn;
+ log_set_tracked_lsn(log_bmp_sys->start_lsn);
+
+ fprintf(stderr,
+ "InnoDB: starting tracking changed pages from LSN "
+ "%llu\n", log_bmp_sys->end_lsn);
+ }
+}
+
+/*********************************************************************//**
+Initialize the online log following subsystem: allocate the tracking
+state, open the bitmap output file (creating it if it cannot be opened)
+and determine the LSN from which page changes will be tracked. Exits the
+process if the bitmap file can be neither opened nor created. */
+UNIV_INTERN
+void
+log_online_read_init()
+/*==================*/
+{
+ char buf[FN_REFLEN];
+ ibool success;
+ ib_uint64_t tracking_start_lsn
+ = ut_max(log_sys->last_checkpoint_lsn, MIN_TRACKED_LSN);
+
+ /* Assert (could be compile-time assert) that bitmap data start and end
+ in a bitmap block is 8-byte aligned */
+ ut_a(MODIFIED_PAGE_BLOCK_BITMAP % 8 == 0);
+ ut_a(MODIFIED_PAGE_BLOCK_BITMAP_LEN % 8 == 0);
+
+ log_bmp_sys = ut_malloc(sizeof(*log_bmp_sys));
+
+ ut_snprintf(buf, FN_REFLEN, "%s%s%d", srv_data_home,
+ modified_page_stem, 1);
+ log_bmp_sys->out_name = ut_malloc(strlen(buf) + 1);
+ ut_strcpy(log_bmp_sys->out_name, buf);
+
+ log_bmp_sys->modified_pages = rbt_create(MODIFIED_PAGE_BLOCK_SIZE,
+ log_online_compare_bmp_keys);
+ log_bmp_sys->page_free_list = NULL;
+
+ log_bmp_sys->out
+ = os_file_create_simple_no_error_handling
+ (innodb_file_bmp_key, log_bmp_sys->out_name, OS_FILE_OPEN,
+ OS_FILE_READ_WRITE, &success);
+
+ if (!success) {
+
+ /* New file, tracking from scratch */
+ log_bmp_sys->out
+ = os_file_create_simple_no_error_handling
+ (innodb_file_bmp_key, log_bmp_sys->out_name,
+ OS_FILE_CREATE, OS_FILE_READ_WRITE, &success);
+ if (!success) {
+
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+ fprintf(stderr,
+ "InnoDB: Error: Cannot create \'%s\'\n",
+ log_bmp_sys->out_name);
+ exit(1);
+ }
+
+ log_bmp_sys->out_offset = 0;
+ }
+ else {
+
+ /* Old file, read last tracked LSN and continue from there */
+ ulint size_low;
+ ulint size_high;
+ ib_uint64_t last_tracked_lsn;
+
+ success = os_file_get_size(log_bmp_sys->out, &size_low,
+ &size_high);
+ ut_a(success);
+
+ log_bmp_sys->out_offset
+ = ((ib_uint64_t)size_high << 32) | size_low;
+
+ if (log_bmp_sys->out_offset % MODIFIED_PAGE_BLOCK_SIZE != 0) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: truncated block detected "
+ "in \'%s\' at offset %llu\n",
+ log_bmp_sys->out_name,
+ log_bmp_sys->out_offset);
+ log_bmp_sys->out_offset -=
+ log_bmp_sys->out_offset
+ % MODIFIED_PAGE_BLOCK_SIZE;
+ }
+
+ last_tracked_lsn = log_online_read_last_tracked_lsn();
+
+ if (last_tracked_lsn < tracking_start_lsn) {
+
+ log_online_track_missing_on_startup(last_tracked_lsn,
+ tracking_start_lsn);
+ return;
+ }
+
+ if (last_tracked_lsn > tracking_start_lsn) {
+
+ fprintf(stderr, "InnoDB: last tracked LSN in \'%s\' "
+ "is %llu, but last checkpoint LSN is %llu. "
+ "The tracking-based incremental backups will "
+ "work only from the latter LSN!\n",
+ log_bmp_sys->out_name, last_tracked_lsn,
+ tracking_start_lsn);
+ }
+
+ }
+
+ fprintf(stderr, "InnoDB: starting tracking changed pages from "
+ "LSN %llu\n", tracking_start_lsn);
+ log_bmp_sys->start_lsn = tracking_start_lsn;
+ log_set_tracked_lsn(tracking_start_lsn);
+}
+
+/*********************************************************************//**
+Shut down the online log following subsystem: close the bitmap output
+file and release the modified page tree, the nodes kept on the free list,
+the output file name and the tracking state itself. */
+UNIV_INTERN
+void
+log_online_read_shutdown()
+/*======================*/
+{
+	ib_rbt_node_t*	node;
+	ib_rbt_node_t*	next_node;
+
+	os_file_close(log_bmp_sys->out);
+
+	rbt_free(log_bmp_sys->modified_pages);
+
+	/* Free list nodes are chained through their left pointers */
+	for (node = log_bmp_sys->page_free_list;
+	     node != NULL;
+	     node = next_node) {
+
+		next_node = node->left;
+		ut_free(node);
+	}
+
+	ut_free(log_bmp_sys->out_name);
+	ut_free(log_bmp_sys);
+}
+
+/*********************************************************************//**
+Tell whether a minilog record of the given type carries a (space; page)
+pair. Every type does, except MLOG_MULTI_REC_END and MLOG_DUMMY_RECORD.
+@return TRUE if the record has (space; page) in it */
+static
+ibool
+log_online_rec_has_page(
+/*====================*/
+	byte	type)	/*!<in: the minilog record type */
+{
+	if (type == MLOG_MULTI_REC_END || type == MLOG_DUMMY_RECORD) {
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*********************************************************************//**
+Tell whether the page field of a log record of the given type holds a real
+page id. It does not for records without a (space; page) pair, for file
+operation records and, in UNIV_LOG_LSN_DEBUG builds, for MLOG_LSN.
+@return TRUE if page field contains actual page id, FALSE otherwise */
+static
+ibool
+log_online_rec_page_means_page(
+/*===========================*/
+	byte	type)	/*!<in: log record type */
+{
+	if (!log_online_rec_has_page(type)) {
+		return FALSE;
+	}
+
+#ifdef UNIV_LOG_LSN_DEBUG
+	if (type == MLOG_LSN) {
+		return FALSE;
+	}
+#endif
+
+	/* File operation records */
+	if (type == MLOG_FILE_CREATE
+	    || type == MLOG_FILE_RENAME
+	    || type == MLOG_FILE_DELETE
+	    || type == MLOG_FILE_CREATE2) {
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*********************************************************************//**
+Parse the log data in the parse buffer for the (space, page) pairs and add
+them to the modified page set as necessary. Removes the fully-parsed records
+from the buffer. If an incomplete record is found, moves it to the beginning
+of the buffer so that it can be completed by the next read. Parsing also
+stops once next_parse_lsn reaches end_lsn. */
+static
+void
+log_online_parse_redo_log()
+/*=======================*/
+{
+ byte *ptr = log_bmp_sys->parse_buf;
+ byte *end = log_bmp_sys->parse_buf_end;
+
+ ulint len = 0;
+
+ while (ptr != end
+ && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) {
+
+ byte type;
+ ulint space;
+ ulint page_no;
+ byte* body;
+
+ /* recv_sys is not initialized, so on corrupt log we will
+ SIGSEGV. But the log of a live database should not be
+ corrupt. */
+ len = recv_parse_log_rec(ptr, end, &type, &space, &page_no,
+ &body);
+ if (len > 0) {
+
+ if (log_online_rec_page_means_page(type)
+ && (space != TRX_DOUBLEWRITE_SPACE)) {
+
+ ut_a(len >= 3);
+ log_online_set_page_bit(space, page_no);
+ }
+
+ ptr += len;
+ ut_ad(ptr <= end);
+ log_bmp_sys->next_parse_lsn
+ = recv_calc_lsn_on_data_add
+ (log_bmp_sys->next_parse_lsn, len);
+ }
+ else {
+
+ /* Incomplete log record. Shift it to the
+ beginning of the parse buffer and leave it to be
+ completed on the next read. Setting ptr to end
+ terminates the loop with len == 0. */
+ ut_memmove(log_bmp_sys->parse_buf, ptr, end - ptr);
+ log_bmp_sys->parse_buf_end
+ = log_bmp_sys->parse_buf + (end - ptr);
+ ptr = end;
+ }
+ }
+
+ if (len > 0) {
+
+ log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf;
+ }
+}
+
+/*********************************************************************//**
+Check the log block checksum. On a mismatch, print the checksum stored in
+the block and the freshly calculated one to stderr.
+@return TRUE if the log block checksum is OK, FALSE otherwise. */
+static
+ibool
+log_online_is_valid_log_seg(
+/*========================*/
+	const byte* log_block)	/*!< in: read log data */
+{
+	ibool	checksum_is_ok
+		= log_block_checksum_is_ok_or_old_format(log_block);
+
+	if (!checksum_is_ok) {
+
+		/* The two literals are concatenated by the compiler, so the
+		separator between them must be spelled out explicitly. */
+		fprintf(stderr,
+			"InnoDB: Error: log block checksum mismatch: "
+			"expected %lu, calculated checksum %lu\n",
+			(ulong) log_block_get_checksum(log_block),
+			(ulong) log_block_calc_checksum(log_block));
+	}
+
+	return checksum_is_ok;
+}
+
+/*********************************************************************//**
+Copy new log data to the parse buffer while skipping log block header,
+trailer and already parsed data.
+
+Copying starts at skip_len, or at LOG_BLOCK_HDR_SIZE when skip_len is zero.
+The trailer is excluded only for a completely filled block (data_len equal
+to OS_FILE_LOG_BLOCK_SIZE). Nothing is copied if the start offset lies past
+the end of the data. Asserts that the parse buffer contents never exceed
+RECV_PARSING_BUF_SIZE bytes. */
+static
+void
+log_online_add_to_parse_buf(
+/*========================*/
+	const byte*	log_block,	/*!< in: read log data */
+	ulint		data_len,	/*!< in: length of read log data */
+	ulint		skip_len)	/*!< in: how much of log data to
+					skip */
+{
+	ulint	start_offset = skip_len ? skip_len : LOG_BLOCK_HDR_SIZE;
+	ulint	end_offset
+		= (data_len == OS_FILE_LOG_BLOCK_SIZE)
+		? data_len - LOG_BLOCK_TRL_SIZE
+		: data_len;
+	ulint	actual_data_len = (end_offset >= start_offset)
+		? end_offset - start_offset : 0;
+
+	/* Verify the capacity before copying, so that an oversized append
+	trips the assertion instead of first writing past the buffer. */
+	ut_a((ulint) (log_bmp_sys->parse_buf_end - log_bmp_sys->parse_buf)
+	     + actual_data_len <= RECV_PARSING_BUF_SIZE);
+
+	ut_memcpy(log_bmp_sys->parse_buf_end, log_block + start_offset,
+		  actual_data_len);
+
+	log_bmp_sys->parse_buf_end += actual_data_len;
+}
+
+/*********************************************************************//**
+Process one log block: append its payload (without header, trailer and the
+already parsed prefix) to the parse buffer, then parse the buffer to update
+the modified page bitmap. */
+static
+void
+log_online_parse_redo_log_block(
+/*============================*/
+	const byte*	log_block,	/*!< in: read log data */
+	ulint		skip_already_parsed_len) /*!< in: how many bytes of
+					log data should be skipped as
+					they were parsed before */
+{
+	const ulint	data_len = log_block_get_data_len(log_block);
+
+	/* The data length is either below one block or a whole number of
+	blocks */
+	ut_ad(data_len < OS_FILE_LOG_BLOCK_SIZE
+	      || data_len % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+	log_online_add_to_parse_buf(log_block, data_len,
+				    skip_already_parsed_len);
+
+	log_online_parse_redo_log();
+}
+
+/*********************************************************************//**
+Read and parse one redo log chunk and updates the modified page bitmap.
+
+The range [block_start_lsn, block_end_lsn) is read into
+log_bmp_sys->read_buf while holding log_sys->mutex, and is then processed
+one OS_FILE_LOG_BLOCK_SIZE-sized block at a time. Processing stops at the end
+of the chunk, when log_bmp_sys->next_parse_lsn reaches log_bmp_sys->end_lsn,
+or at the first block that fails the checksum check; in the last case the
+rest of the chunk is silently left unparsed. */
+static
+void
+log_online_follow_log_seg(
+/*======================*/
+ log_group_t* group, /*!< in: the log group to use */
+ ib_uint64_t block_start_lsn, /*!< in: the LSN to read from */
+ ib_uint64_t block_end_lsn) /*!< in: the LSN to read to */
+{
+ /* Pointer to the current OS_FILE_LOG_BLOCK_SIZE-sized chunk of the
+ read log data to parse */
+ byte* log_block = log_bmp_sys->read_buf;
+ byte* log_block_end = log_bmp_sys->read_buf
+ + (block_end_lsn - block_start_lsn);
+
+ mutex_enter(&log_sys->mutex);
+ log_group_read_log_seg(LOG_RECOVER, log_bmp_sys->read_buf,
+ group, block_start_lsn, block_end_lsn);
+ mutex_exit(&log_sys->mutex);
+
+ while (log_block < log_block_end
+ && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) {
+
+ /* How many bytes of log data should we skip in the current log
+ block. Skipping is necessary because we round down the next
+ parse LSN thus it is possible to read the already-processed log
+ data many times */
+ ulint skip_already_parsed_len = 0;
+
+ if (!log_online_is_valid_log_seg(log_block)) {
+ break;
+ }
+
+ if ((block_start_lsn <= log_bmp_sys->next_parse_lsn)
+ && (block_start_lsn + OS_FILE_LOG_BLOCK_SIZE
+ > log_bmp_sys->next_parse_lsn)) {
+
+ /* The next parse LSN is inside the current block, skip
+ data preceding it. */
+ skip_already_parsed_len
+ = log_bmp_sys->next_parse_lsn
+ - block_start_lsn;
+ }
+ else {
+
+ /* If the next parse LSN is not inside the current
+ block, then the only option is that we have processed
+ ahead already. */
+ ut_a(block_start_lsn > log_bmp_sys->next_parse_lsn);
+ }
+
+ /* TODO: merge the copying to the parse buf code with
+ skip_already_len calculations */
+ log_online_parse_redo_log_block(log_block,
+ skip_already_parsed_len);
+
+ log_block += OS_FILE_LOG_BLOCK_SIZE;
+ block_start_lsn += OS_FILE_LOG_BLOCK_SIZE;
+ }
+
+ return;
+}
+
+/*********************************************************************//**
+Read and parse the redo log in a given group in FOLLOW_SCAN_SIZE-sized
+chunks and updates the modified page bitmap.
+
+Parsing state is reset on entry: log_bmp_sys->next_parse_lsn is set to
+log_bmp_sys->start_lsn and the parse buffer is emptied. Chunks are then read
+starting at contiguous_lsn until a chunk end reaches or passes
+log_bmp_sys->end_lsn. The function asserts that no partial log record is
+left in the parse buffer when it finishes. */
+static
+void
+log_online_follow_log_group(
+/*========================*/
+ log_group_t* group, /*!< in: the log group to use */
+ ib_uint64_t contiguous_lsn) /*!< in: the LSN of log block start
+ containing the log_parse_start_lsn */
+{
+ ib_uint64_t block_start_lsn = contiguous_lsn;
+ ib_uint64_t block_end_lsn;
+
+ log_bmp_sys->next_parse_lsn = log_bmp_sys->start_lsn;
+ log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf;
+
+ do {
+ block_end_lsn = block_start_lsn + FOLLOW_SCAN_SIZE;
+
+ log_online_follow_log_seg(group, block_start_lsn,
+ block_end_lsn);
+
+ /* Next parse LSN can become higher than the last read LSN
+ only in the case when the read LSN falls right on the block
+ boundary, in which case next parse lsn is bumped to the actual
+ data LSN on the next (not yet read) block. This assert is
+ slightly conservative. */
+ ut_a(log_bmp_sys->next_parse_lsn
+ <= block_end_lsn + LOG_BLOCK_HDR_SIZE
+ + LOG_BLOCK_TRL_SIZE);
+
+ block_start_lsn = block_end_lsn;
+ } while (block_end_lsn < log_bmp_sys->end_lsn);
+
+ /* Assert that the last read log record is a full one */
+ ut_a(log_bmp_sys->parse_buf_end == log_bmp_sys->parse_buf);
+}
+
+/*********************************************************************//**
+Write, flush one bitmap block to disk and advance the output position if
+successful. On a write or flush failure an error message is printed to
+stderr and log_bmp_sys->out_offset is left unchanged; the failure is not
+reported to the caller. */
+static
+void
+log_online_write_bitmap_page(
+/*=========================*/
+	const byte *block)	/*!< in: block to write */
+{
+	ibool	success;
+
+	/* The 64-bit file offset is passed as two halves: the least
+	significant 32 bits followed by the most significant 32 bits, the
+	latter obtained by shifting the offset right. */
+	success = os_file_write(log_bmp_sys->out_name, log_bmp_sys->out,
+				block,
+				(ulint)(log_bmp_sys->out_offset & 0xFFFFFFFF),
+				(ulint)(log_bmp_sys->out_offset >> 32),
+				MODIFIED_PAGE_BLOCK_SIZE);
+	if (UNIV_UNLIKELY(!success)) {
+
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+		fprintf(stderr, "InnoDB: Error: failed writing changed page "
+			"bitmap file \'%s\'\n", log_bmp_sys->out_name);
+		return;
+	}
+
+	success = os_file_flush(log_bmp_sys->out, FALSE);
+	if (UNIV_UNLIKELY(!success)) {
+
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+		fprintf(stderr, "InnoDB: Error: failed flushing "
+			"changed page bitmap file \'%s\'\n",
+			log_bmp_sys->out_name);
+		return;
+	}
+
+	/* Only a block that was both written and flushed moves the output
+	position forward. */
+	log_bmp_sys->out_offset += MODIFIED_PAGE_BLOCK_SIZE;
+}
+
+/*********************************************************************//**
+Append the current changed page bitmap to the bitmap file. Clears the
+bitmap tree and recycles its nodes to the free list.
+
+Every block in log_bmp_sys->modified_pages is stamped with the current
+start and end LSN and its checksum, then handed to
+log_online_write_bitmap_page(). The block held by the last tree node also
+gets the MODIFIED_PAGE_IS_LAST_BLOCK field set to 1. Write failures are not
+visible here because log_online_write_bitmap_page() returns nothing.
+
+Each visited node is pushed onto log_bmp_sys->page_free_list through its
+left pointer before rbt_next() is called on it.
+NOTE(review): this relies on rbt_next() not needing the left link of a node
+that has already been visited; confirm against ut0rbt.c. */
+static
+void
+log_online_write_bitmap()
+/*=====================*/
+{
+ ib_rbt_node_t *bmp_tree_node;
+ const ib_rbt_node_t *last_bmp_tree_node;
+
+ bmp_tree_node = (ib_rbt_node_t *)
+ rbt_first(log_bmp_sys->modified_pages);
+ last_bmp_tree_node = rbt_last(log_bmp_sys->modified_pages);
+
+ while (bmp_tree_node) {
+
+ byte *page = rbt_value(byte, bmp_tree_node);
+
+ if (bmp_tree_node == last_bmp_tree_node) {
+ mach_write_to_4(page + MODIFIED_PAGE_IS_LAST_BLOCK, 1);
+ }
+
+ mach_write_to_8(page + MODIFIED_PAGE_START_LSN,
+ log_bmp_sys->start_lsn);
+ mach_write_to_8(page + MODIFIED_PAGE_END_LSN,
+ log_bmp_sys->end_lsn);
+ mach_write_to_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM,
+ log_online_calc_checksum(page));
+
+ log_online_write_bitmap_page(page);
+
+ bmp_tree_node->left = log_bmp_sys->page_free_list;
+ log_bmp_sys->page_free_list = bmp_tree_node;
+
+ bmp_tree_node = (ib_rbt_node_t*)
+ rbt_next(log_bmp_sys->modified_pages, bmp_tree_node);
+ }
+
+ rbt_reset(log_bmp_sys->modified_pages);
+}
+
+/*********************************************************************//**
+Follow the redo log from the last tracked LSN up to the last checkpoint LSN,
+collect the changed pages of every log group into the bitmap, write the
+bitmap to disk and advance the tracked LSN. Does nothing if the checkpoint
+LSN equals the tracking start LSN. */
+UNIV_INTERN
+void
+log_online_follow_redo_log()
+/*========================*/
+{
+	log_group_t*	cur_group;
+	ib_uint64_t	aligned_start_lsn;
+
+	/* The last checkpoint LSN is the upper bound of this parsing run */
+	mutex_enter(&(log_sys->mutex));
+	log_bmp_sys->end_lsn = log_sys->last_checkpoint_lsn;
+	mutex_exit(&(log_sys->mutex));
+
+	if (log_bmp_sys->end_lsn == log_bmp_sys->start_lsn) {
+		return;
+	}
+
+	cur_group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	ut_a(cur_group);
+
+	/* Reading starts at the beginning of the log block that contains
+	the start LSN */
+	aligned_start_lsn = ut_uint64_align_down(log_bmp_sys->start_lsn,
+						 OS_FILE_LOG_BLOCK_SIZE);
+
+	for (; cur_group;
+	     cur_group = UT_LIST_GET_NEXT(log_groups, cur_group)) {
+
+		log_online_follow_log_group(cur_group, aligned_start_lsn);
+	}
+
+	/* A crash injection site that ensures last checkpoint LSN > last
+	tracked LSN, so that LSN tracking for this interval is tested. */
+	DBUG_EXECUTE_IF("crash_before_bitmap_write", DBUG_SUICIDE(););
+
+	log_online_write_bitmap();
+
+	log_bmp_sys->start_lsn = log_bmp_sys->end_lsn;
+	log_set_tracked_lsn(log_bmp_sys->start_lsn);
+}
+
+/*********************************************************************//**
+Prepare a log bitmap iterator for use: build the bitmap file name, open the
+file read-only and allocate the block buffer. If the file cannot be opened,
+an error is printed and no buffer is allocated.
+@return TRUE if the iterator is initialized OK, FALSE otherwise. */
+UNIV_INTERN
+ibool
+log_online_bitmap_iterator_init(
+/*============================*/
+	log_bitmap_iterator_t *i)	/*!<in/out: iterator */
+{
+	ibool	opened;
+
+	ut_a(i);
+
+	ut_snprintf(i->in_name, FN_REFLEN, "%s%s%d", srv_data_home,
+		    modified_page_stem, 1);
+
+	/* Start with the bit offset past the reasonable limit, so that the
+	first log_online_bitmap_iterator_next() call reads a block from the
+	file. */
+	i->bit_offset = MODIFIED_PAGE_BLOCK_BITMAP_LEN;
+	i->in_offset = 0;
+
+	i->in = os_file_create_simple_no_error_handling(
+		innodb_file_bmp_key, i->in_name, OS_FILE_OPEN,
+		OS_FILE_READ_ONLY, &opened);
+
+	if (opened) {
+		i->page = ut_malloc(MODIFIED_PAGE_BLOCK_SIZE);
+
+		i->changed = FALSE;
+		i->first_page_id = 0;
+		i->space_id = 0;
+		i->end_lsn = 0;
+		i->start_lsn = 0;
+
+		return TRUE;
+	}
+
+	/* The following call prints an error message */
+	os_file_get_last_error(TRUE);
+	fprintf(stderr, "InnoDB: Error: Cannot open \'%s\'\n", i->in_name);
+
+	return FALSE;
+}
+
+/*********************************************************************//**
+Release the resources held by a log bitmap iterator: close the bitmap file
+handle and free the block buffer. */
+UNIV_INTERN
+void
+log_online_bitmap_iterator_release(
+/*===============================*/
+	log_bitmap_iterator_t *i)	/*!<in/out: iterator */
+{
+	ut_a(i);
+
+	/* The file is closed first, then the block buffer is freed */
+	os_file_close(i->in);
+
+	ut_free(i->page);
+}
+
+/*********************************************************************//**
+Iterates through bits of saved bitmap blocks.
+Sequentially reads blocks from the bitmap file and iterates through
+their bits. Ignores blocks with wrong checksum.
+
+While the current block still has bits left, the call only advances
+i->bit_offset and refreshes i->changed. Otherwise the next block with a
+matching checksum is read from the file, the block header fields are copied
+to the iterator and iteration restarts at bit 0 of that block.
+@return TRUE if iteration is successful, FALSE if all bits are iterated or
+if the file size query or the block read fails. */
+UNIV_INTERN
+ibool
+log_online_bitmap_iterator_next(
+/*============================*/
+	log_bitmap_iterator_t *i)	/*!<in/out: iterator */
+{
+	ulint	offset_low;
+	ulint	offset_high;
+	ulint	size_low;
+	ulint	size_high;
+	ulint	checksum = 0;
+	/* Start with a deliberate mismatch so that the read loop below is
+	entered at least once */
+	ulint	actual_checksum = !checksum;
+
+	ibool	success;
+
+	ut_a(i);
+
+	/* Advance only while the incremented offset is still a valid bit
+	position. log_online_bitmap_iterator_init() uses
+	MODIFIED_PAGE_BLOCK_BITMAP_LEN itself as the out-of-range marker, so
+	that index must never be tested.
+	NOTE(review): this treats MODIFIED_PAGE_BLOCK_BITMAP_LEN as the
+	number of valid bit positions; confirm its unit at the definition. */
+	if (i->bit_offset + 1 < MODIFIED_PAGE_BLOCK_BITMAP_LEN)
+	{
+		++i->bit_offset;
+		i->changed =
+			IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP,
+				   i->bit_offset);
+		return TRUE;
+	}
+
+	/* Read blocks until one with a matching checksum is found */
+	while (checksum != actual_checksum)
+	{
+		success = os_file_get_size(i->in,
+					   &size_low,
+					   &size_high);
+		if (!success) {
+			os_file_get_last_error(TRUE);
+			fprintf(stderr,
+				"InnoDB: Warning: can't get size of "
+				"page bitmap file \'%s\'\n",
+				i->in_name);
+			return FALSE;
+		}
+
+		/* Nothing left to read in the file */
+		if (i->in_offset >=
+		    (ib_uint64_t)(size_low) +
+		    ((ib_uint64_t)(size_high) << 32))
+			return FALSE;
+
+		offset_high = (ulint)(i->in_offset >> 32);
+		offset_low = (ulint)(i->in_offset & 0xFFFFFFFF);
+
+		success = os_file_read(
+				i->in,
+				i->page,
+				offset_low,
+				offset_high,
+				MODIFIED_PAGE_BLOCK_SIZE);
+
+		if (!success) {
+			os_file_get_last_error(TRUE);
+			fprintf(stderr,
+				"InnoDB: Warning: failed reading "
+				"changed page bitmap file \'%s\'\n",
+				i->in_name);
+			return FALSE;
+		}
+
+		checksum = mach_read_from_4(
+				i->page + MODIFIED_PAGE_BLOCK_CHECKSUM);
+
+		actual_checksum = log_online_calc_checksum(i->page);
+
+		i->in_offset += MODIFIED_PAGE_BLOCK_SIZE;
+	}
+
+	i->start_lsn =
+		mach_read_from_8(i->page + MODIFIED_PAGE_START_LSN);
+	i->end_lsn =
+		mach_read_from_8(i->page + MODIFIED_PAGE_END_LSN);
+	i->space_id =
+		mach_read_from_4(i->page + MODIFIED_PAGE_SPACE_ID);
+	i->first_page_id =
+		mach_read_from_4(i->page + MODIFIED_PAGE_1ST_PAGE_ID);
+	i->bit_offset =
+		0;
+	i->changed =
+		IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP,
+			   i->bit_offset);
+
+	return TRUE;
+}
+
diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.c
index a554c576b6d..5ab8c14ae2e 100644
--- a/storage/xtradb/log/log0recv.c
+++ b/storage/xtradb/log/log0recv.c
@@ -857,7 +857,7 @@ block. We also accept a log block in the old format before
InnoDB-3.23.52 where the checksum field contains the log block number.
@return TRUE if ok, or if the log block may be in the format of InnoDB
version predating 3.23.52 */
-static
+UNIV_INTERN
ibool
log_block_checksum_is_ok_or_old_format(
/*===================================*/
@@ -2102,7 +2102,7 @@ skip_this_recv_addr:
/*******************************************************************//**
Tries to parse a single log record and returns its length.
@return length of the record, or 0 if the record was not complete */
-static
+UNIV_INTERN
ulint
recv_parse_log_rec(
/*===============*/
@@ -2173,7 +2173,7 @@ recv_parse_log_rec(
/*******************************************************//**
Calculates the new value for lsn when more data is added to the log. */
-static
+UNIV_INTERN
ib_uint64_t
recv_calc_lsn_on_data_add(
/*======================*/
@@ -3570,6 +3570,8 @@ recv_reset_logs(
log_sys->archived_lsn = log_sys->lsn;
#endif /* UNIV_LOG_ARCHIVE */
+ log_sys->tracked_lsn = log_sys->lsn;
+
log_block_init(log_sys->buf, log_sys->lsn);
log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c
index 061d556c6e7..8fa2cdb4a28 100644
--- a/storage/xtradb/os/os0file.c
+++ b/storage/xtradb/os/os0file.c
@@ -158,6 +158,7 @@ UNIV_INTERN ibool os_aio_print_debug = FALSE;
UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_bmp_key;
#endif /* UNIV_PFS_IO */
/** The asynchronous i/o array slot structure */
@@ -2147,6 +2148,25 @@ os_file_set_eof(
#endif /* __WIN__ */
}
+/***********************************************************************//**
+Truncates a file at the specified position.
+Uses _chsize_s() when __WIN__ is defined and ftruncate() otherwise; both
+branches report success when the underlying call returns zero.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof_at(
+ os_file_t file, /*!< in: handle to a file */
+ ib_uint64_t new_len)/*!< in: new file length */
+{
+#ifdef __WIN__
+ /* TODO: untested! */
+ return(!_chsize_s(file, new_len));
+#else
+ /* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */
+ return(!ftruncate(file, new_len));
+#endif
+}
+
+
#ifndef __WIN__
/***********************************************************************//**
Wrapper to fsync(2) that retries the call on some errors.
diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c
index 0a545a8247a..dff9fc07a7f 100644
--- a/storage/xtradb/srv/srv0srv.c
+++ b/storage/xtradb/srv/srv0srv.c
@@ -67,6 +67,7 @@ Created 10/8/1995 Heikki Tuuri
#include "mem0pool.h"
#include "sync0sync.h"
#include "que0que.h"
+#include "log0online.h"
#include "log0recv.h"
#include "pars0pars.h"
#include "usr0sess.h"
@@ -176,6 +177,10 @@ UNIV_INTERN char* srv_doublewrite_file = NULL;
UNIV_INTERN ibool srv_recovery_stats = FALSE;
+UNIV_INTERN my_bool srv_track_changed_pages = TRUE;
+
+UNIV_INTERN ulonglong srv_changed_pages_limit = 0;
+
/* if TRUE, then we auto-extend the last data file */
UNIV_INTERN ibool srv_auto_extend_last_data_file = FALSE;
/* if != 0, this tells the max size auto-extending may increase the
@@ -771,6 +776,10 @@ UNIV_INTERN os_event_t srv_lock_timeout_thread_event;
UNIV_INTERN os_event_t srv_shutdown_event;
+UNIV_INTERN os_event_t srv_checkpoint_completed_event;
+
+UNIV_INTERN os_event_t srv_redo_log_thread_finished_event;
+
UNIV_INTERN srv_sys_t* srv_sys = NULL;
/* padding to prevent other memory update hotspots from residing on
@@ -1110,6 +1119,9 @@ srv_init(void)
srv_lock_timeout_thread_event = os_event_create(NULL);
srv_shutdown_event = os_event_create(NULL);
+ srv_checkpoint_completed_event = os_event_create(NULL);
+ srv_redo_log_thread_finished_event = os_event_create(NULL);
+
for (i = 0; i < SRV_MASTER + 1; i++) {
srv_n_threads_active[i] = 0;
srv_n_threads[i] = 0;
@@ -3034,6 +3046,46 @@ srv_shutdown_print_master_pending(
}
}
+/******************************************************************//**
+A thread which follows the redo log and outputs the changed page bitmap.
+
+The thread sleeps on srv_checkpoint_completed_event, resets the event after
+each wakeup and calls log_online_follow_redo_log() as long as
+srv_shutdown_state is below SRV_SHUTDOWN_LAST_PHASE. Once that phase is
+reached it shuts the log tracking subsystem down with
+log_online_read_shutdown(), sets srv_redo_log_thread_finished_event and
+exits.
+@return a dummy value */
+os_thread_ret_t
+srv_redo_log_follow_thread(
+/*=======================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by
+ os_thread_create */
+{
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Redo log follower thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_log_tracking_thread_key);
+#endif
+
+ my_thread_init();
+
+ do {
+ os_event_wait(srv_checkpoint_completed_event);
+ os_event_reset(srv_checkpoint_completed_event);
+
+ if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) {
+ log_online_follow_redo_log();
+ }
+
+ } while (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE);
+
+ log_online_read_shutdown();
+ os_event_set(srv_redo_log_thread_finished_event);
+
+ my_thread_end();
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
/*******************************************************************//**
Tells the InnoDB server that there has been activity in the database
and wakes up the master thread if it is suspended (not sleeping). Used
diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c
index d1329f445aa..7c98f74909e 100644
--- a/storage/xtradb/srv/srv0start.c
+++ b/storage/xtradb/srv/srv0start.c
@@ -51,6 +51,7 @@ Created 2/16/1996 Heikki Tuuri
#include "rem0rec.h"
#include "mtr0mtr.h"
#include "log0log.h"
+#include "log0online.h"
#include "log0recv.h"
#include "page0page.h"
#include "page0cur.h"
@@ -121,9 +122,9 @@ UNIV_INTERN enum srv_shutdown_state srv_shutdown_state = SRV_SHUTDOWN_NONE;
static os_file_t files[1000];
/** io_handler_thread parameters for thread identification */
-static ulint n[SRV_MAX_N_IO_THREADS + 7];
+static ulint n[SRV_MAX_N_IO_THREADS + 8];
/** io_handler_thread identifiers */
-static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7];
+static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 8];
/** We use this mutex to test the return value of pthread_mutex_trylock
on successful locking. HP-UX does NOT return 0, though Linux et al do. */
@@ -145,6 +146,7 @@ UNIV_INTERN mysql_pfs_key_t srv_error_monitor_thread_key;
UNIV_INTERN mysql_pfs_key_t srv_monitor_thread_key;
UNIV_INTERN mysql_pfs_key_t srv_master_thread_key;
UNIV_INTERN mysql_pfs_key_t srv_purge_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_log_tracking_thread_key;
#endif /* UNIV_PFS_THREAD */
/*********************************************************************//**
@@ -2037,6 +2039,19 @@ innobase_start_or_create_for_mysql(void)
if (srv_auto_lru_dump && srv_blocking_lru_restore)
buf_LRU_file_restore();
+ if (srv_track_changed_pages) {
+
+ /* Initialize the log tracking subsystem here to block
+ server startup until it's completed due to the potential
+ need to re-read previous server run's log. */
+ log_online_read_init();
+
+ /* Create the thread that follows the redo log to output the
+ changed page bitmap */
+ os_thread_create(&srv_redo_log_follow_thread, NULL,
+ thread_ids + 6 + SRV_MAX_N_IO_THREADS);
+ }
+
srv_is_being_started = FALSE;
err = dict_create_or_check_foreign_constraint_tables();
diff --git a/storage/xtradb/ut/ut0rbt.c b/storage/xtradb/ut/ut0rbt.c
index 3d7cfa7636f..a5e9081b951 100644
--- a/storage/xtradb/ut/ut0rbt.c
+++ b/storage/xtradb/ut/ut0rbt.c
@@ -55,7 +55,6 @@ red-black properties:
#endif
#define ROOT(t) (t->root->left)
-#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1)
/**********************************************************************//**
Print out the sub-tree recursively. */
@@ -834,6 +833,21 @@ rbt_add_node(
node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
memcpy(node->value, value, tree->sizeof_value);
+ return(rbt_add_preallocated_node(tree, parent, node));
+}
+
+/****************************************************************//**
+Add a new caller-provided node to tree at the specified position.
+The node must have its key fields initialized correctly.
+@return added node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_preallocated_node(
+/*======================*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: parent */
+ ib_rbt_node_t* node) /*!< in: node */
+{
node->parent = node->left = node->right = tree->nil;
/* If tree is empty */
@@ -842,7 +856,7 @@ rbt_add_node(
}
/* Append the node, the hope here is that the caller knows
- what s/he is doing. */
+ what s/he is doing. */
rbt_tree_add_child(tree, parent, node);
rbt_balance_tree(tree, node);
@@ -854,6 +868,7 @@ rbt_add_node(
return(node);
}
+
/**********************************************************************//**
Find a matching node in the rb tree.
@return NULL if not found else the node where key was found */
@@ -1142,7 +1157,17 @@ rbt_clear(
ib_rbt_t* tree) /*!< in: rb tree */
{
rbt_free_node(ROOT(tree), tree->nil);
+ rbt_reset(tree);
+}
+/****************************************************************//**
+Clear the tree without deleting and freeing its nodes.
+Only the node count and the child links of the root are reset; the nodes
+that were in the tree are not touched and are not freed here. */
+UNIV_INTERN
+void
+rbt_reset(
+/*======*/
+ ib_rbt_t* tree) /*!< in: rb tree */
+{
tree->n_nodes = 0;
tree->root->left = tree->root->right = tree->nil;
}