diff options
Diffstat (limited to 'storage/xtradb/fil/fil0fil.c')
-rw-r--r-- | storage/xtradb/fil/fil0fil.c | 916 |
1 files changed, 707 insertions, 209 deletions
diff --git a/storage/xtradb/fil/fil0fil.c b/storage/xtradb/fil/fil0fil.c index a8520187013..e244985d096 100644 --- a/storage/xtradb/fil/fil0fil.c +++ b/storage/xtradb/fil/fil0fil.c @@ -46,6 +46,8 @@ Created 10/25/1995 Heikki Tuuri #include "row0mysql.h" #include "row0row.h" #include "que0que.h" +#include "btr0btr.h" +#include "btr0sea.h" #ifndef UNIV_HOTBACKUP # include "buf0lru.h" # include "ibuf0ibuf.h" @@ -127,6 +129,16 @@ UNIV_INTERN ulint fil_n_pending_tablespace_flushes = 0; /** The null file address */ UNIV_INTERN fil_addr_t fil_addr_null = {FIL_NULL, 0}; +#ifdef UNIV_PFS_MUTEX +/* Key to register fil_system_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t fil_system_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_PFS_RWLOCK +/* Key to register file space latch with performance schema */ +UNIV_INTERN mysql_pfs_key_t fil_space_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + /** File node of a tablespace or the log data space */ struct fil_node_struct { fil_space_t* space; /*!< backpointer to the space where this node @@ -242,6 +254,7 @@ the ib_logfiles form a 'space' and it is handled here */ struct fil_system_struct { #ifndef UNIV_HOTBACKUP mutex_t mutex; /*!< The mutex protecting the cache */ + mutex_t file_extend_mutex; #endif /* !UNIV_HOTBACKUP */ hash_table_t* spaces; /*!< The hash table of spaces in the system; they are hashed on the space @@ -296,6 +309,34 @@ struct fil_system_struct { initialized. */ static fil_system_t* fil_system = NULL; +#ifdef UNIV_DEBUG +/** Try fil_validate() every this many times */ +# define FIL_VALIDATE_SKIP 17 + +/******************************************************************//** +Checks the consistency of the tablespace cache some of the time. +@return TRUE if ok or the check was skipped */ +static +ibool +fil_validate_skip(void) +/*===================*/ +{ + /** The fil_validate() call skip counter. Use a signed type + because of the race condition below. */ + static int fil_validate_count = FIL_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly fil_validate() check + in debug builds. */ + if (--fil_validate_count > 0) { + return(TRUE); + } + + fil_validate_count = FIL_VALIDATE_SKIP; + return(fil_validate()); +} +#endif /* UNIV_DEBUG */ /********************************************************************//** NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! @@ -336,14 +377,15 @@ fil_get_space_id_for_table( /*******************************************************************//** Frees a space object from the tablespace memory cache. Closes the files in the chain but does not delete them. There must not be any pending i/o's or -flushes on the files. */ +flushes on the files. +@return TRUE on success */ static ibool fil_space_free( /*===========*/ - /* out: TRUE if success */ - ulint id, /* in: space id */ - ibool own_mutex);/* in: TRUE if own system->mutex */ + ulint id, /* in: space id */ + ibool x_latched); /* in: TRUE if caller has space->latch + in X mode */ /********************************************************************//** Reads data from a space to a buffer. Remember that the possible incomplete blocks at the end of file are ignored: they are not taken into account when @@ -617,7 +659,7 @@ fil_node_create( UT_LIST_ADD_LAST(chain, space->chain, node); - if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) { + if (id < SRV_EXTRA_SYS_SPACE_FIRST_ID && fil_system->max_assigned_id < id) { fil_system->max_assigned_id = id; } @@ -660,7 +702,8 @@ fil_node_open_file( async I/O! */ node->handle = os_file_create_simple_no_error_handling( - node->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); + innodb_file_data_key, node->name, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); if (!success) { /* The following call prints an error message */ os_file_get_last_error(TRUE); @@ -778,15 +821,21 @@ add_size: os_file_create() to fall back to the normal file I/O mode. */ if (space->purpose == FIL_LOG) { - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, OS_LOG_FILE, &ret); + node->handle = os_file_create(innodb_file_log_key, + node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_LOG_FILE, + &ret); } else if (node->is_raw_disk) { - node->handle = os_file_create(node->name, + node->handle = os_file_create(innodb_file_data_key, + node->name, OS_FILE_OPEN_RAW, - OS_FILE_AIO, OS_DATA_FILE, &ret); + OS_FILE_AIO, OS_DATA_FILE, + &ret); } else { - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, OS_DATA_FILE, &ret); + node->handle = os_file_create(innodb_file_data_key, + node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_DATA_FILE, + &ret); } ut_a(ret); @@ -815,9 +864,10 @@ fil_node_close_file( ut_ad(node && system); ut_ad(mutex_own(&(system->mutex))); ut_a(node->open); - ut_a(node->n_pending == 0); + ut_a(node->n_pending == 0 || node->space->is_being_deleted); ut_a(node->n_pending_flushes == 0); - ut_a(node->modification_counter == node->flush_counter); + ut_a(node->modification_counter == node->flush_counter + || srv_fast_shutdown == 2); ret = os_file_close(node->handle); ut_a(ret); @@ -828,7 +878,7 @@ fil_node_close_file( ut_a(system->n_open > 0); system->n_open--; - if (node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) { + if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) { ut_a(UT_LIST_GET_LEN(system->LRU) > 0); /* The node is in the LRU list, remove it */ @@ -1027,7 +1077,7 @@ fil_node_free( ut_ad(node && system && space); ut_ad(mutex_own(&(system->mutex))); ut_a(node->magic_n == FIL_NODE_MAGIC_N); - ut_a(node->n_pending == 0); + ut_a(node->n_pending == 0 || space->is_being_deleted); if (node->open) { /* We fool the assertion in fil_node_close_file() to think @@ -1130,6 +1180,7 @@ try_again: space = fil_space_get_by_name(name); if (UNIV_LIKELY_NULL(space)) { + ibool success; ulint namesake_id; ut_print_timestamp(stderr); @@ -1168,9 +1219,10 @@ try_again: namesake_id = space->id; - mutex_exit(&fil_system->mutex); + success = fil_space_free(namesake_id, FALSE); + ut_a(success); - fil_space_free(namesake_id, FALSE); + mutex_exit(&fil_system->mutex); goto try_again; } @@ -1205,6 +1257,7 @@ try_again: space->mark = FALSE; if (UNIV_LIKELY(purpose == FIL_TABLESPACE && !recv_recovery_on) + && UNIV_UNLIKELY(id < SRV_EXTRA_SYS_SPACE_FIRST_ID) && UNIV_UNLIKELY(id > fil_system->max_assigned_id)) { if (!fil_system->space_id_reuse_warned) { fil_system->space_id_reuse_warned = TRUE; @@ -1235,7 +1288,7 @@ try_again: UT_LIST_INIT(space->chain); space->magic_n = FIL_SPACE_MAGIC_N; - rw_lock_create(&space->latch, SYNC_FSP); + rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP); HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space); @@ -1290,7 +1343,7 @@ fil_assign_new_space_id( (ulong) SRV_LOG_SPACE_FIRST_ID); } - success = (id < SRV_LOG_SPACE_FIRST_ID); + success = (id < SRV_EXTRA_SYS_SPACE_FIRST_ID); if (success) { *space_id = fil_system->max_assigned_id = id; @@ -1323,15 +1376,14 @@ fil_space_free( /*===========*/ /* out: TRUE if success */ ulint id, /* in: space id */ - ibool own_mutex) /* in: TRUE if own system->mutex */ + ibool x_latched) /* in: TRUE if caller has space->latch + in X mode */ { fil_space_t* space; fil_space_t* namespace; fil_node_t* fil_node; - if (!own_mutex) { - mutex_enter(&fil_system->mutex); - } + ut_ad(mutex_own(&fil_system->mutex)); space = fil_space_get_by_id(id); @@ -1342,8 +1394,6 @@ fil_space_free( " from the cache but\n" "InnoDB: it is not there.\n", (ulong) id); - mutex_exit(&fil_system->mutex); - return(FALSE); } @@ -1378,8 +1428,8 @@ fil_space_free( ut_a(0 == UT_LIST_GET_LEN(space->chain)); - if (!own_mutex) { - mutex_exit(&fil_system->mutex); + if (x_latched) { + rw_lock_x_unlock(&space->latch); } rw_lock_free(&(space->latch)); @@ -1547,7 +1597,10 @@ fil_init( fil_system = mem_zalloc(sizeof(fil_system_t)); - mutex_create(&fil_system->mutex, SYNC_ANY_LATCH); + mutex_create(fil_system_mutex_key, + &fil_system->mutex, SYNC_ANY_LATCH); + mutex_create(fil_system_mutex_key, + &fil_system->file_extend_mutex, SYNC_OUTER_ANY_LATCH); fil_system->spaces = hash_create(hash_size); fil_system->name_hash = hash_create(hash_size); @@ -1626,25 +1679,27 @@ fil_close_all_files(void) /*=====================*/ { fil_space_t* space; - fil_node_t* node; mutex_enter(&fil_system->mutex); space = UT_LIST_GET_FIRST(fil_system->space_list); while (space != NULL) { + fil_node_t* node; fil_space_t* prev_space = space; - node = UT_LIST_GET_FIRST(space->chain); + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { - while (node != NULL) { if (node->open) { fil_node_close_file(node, fil_system); } - node = UT_LIST_GET_NEXT(chain, node); } + space = UT_LIST_GET_NEXT(space_list, space); - fil_space_free(prev_space->id, TRUE); + + fil_space_free(prev_space->id, FALSE); } mutex_exit(&fil_system->mutex); @@ -1666,6 +1721,10 @@ fil_set_max_space_id_if_bigger( ut_error; } + if (max_id >= SRV_EXTRA_SYS_SPACE_FIRST_ID) { + return; + } + mutex_enter(&fil_system->mutex); if (fil_system->max_assigned_id < max_id) { @@ -1684,6 +1743,7 @@ static ulint fil_write_lsn_and_arch_no_to_file( /*==============================*/ + ulint space_id, ulint sum_of_sizes, /*!< in: combined size of previous files in space, in database pages */ ib_uint64_t lsn, /*!< in: lsn to write */ @@ -1693,14 +1753,16 @@ fil_write_lsn_and_arch_no_to_file( byte* buf1; byte* buf; + ut_a(trx_sys_sys_space(space_id)); + buf1 = mem_alloc(2 * UNIV_PAGE_SIZE); buf = ut_align(buf1, UNIV_PAGE_SIZE); - fil_read(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); + fil_read(TRUE, space_id, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); - mach_write_ull(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); + mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); - fil_write(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); + fil_write(TRUE, space_id, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); mem_free(buf1); @@ -1736,7 +1798,7 @@ fil_write_flushed_lsn_to_data_files( always open. */ if (space->purpose == FIL_TABLESPACE - && space->id == 0) { + && trx_sys_sys_space(space->id)) { sum_of_sizes = 0; node = UT_LIST_GET_FIRST(space->chain); @@ -1744,7 +1806,7 @@ fil_write_flushed_lsn_to_data_files( mutex_exit(&fil_system->mutex); err = fil_write_lsn_and_arch_no_to_file( - sum_of_sizes, lsn, arch_log_no); + space->id, sum_of_sizes, lsn, arch_log_no); if (err != DB_SUCCESS) { return(err); @@ -1792,7 +1854,7 @@ fil_read_flushed_lsn_and_arch_log_no( os_file_read(data_file, buf, 0, 0, UNIV_PAGE_SIZE); - flushed_lsn = mach_read_ull(buf + FIL_PAGE_FILE_FLUSH_LSN); + flushed_lsn = mach_read_from_8(buf + FIL_PAGE_FILE_FLUSH_LSN); ut_free(buf2); @@ -2264,6 +2326,19 @@ try_again: path = mem_strdup(space->name); mutex_exit(&fil_system->mutex); + + /* Important: We rely on the data dictionary mutex to ensure + that a race is not possible here. It should serialize the tablespace + drop/free. We acquire an X latch only to avoid a race condition + when accessing the tablespace instance via: + + fsp_get_available_space_in_free_extents(). + + There our main motivation is to reduce the contention on the + dictionary mutex. */ + + rw_lock_x_lock(&space->latch); + #ifndef UNIV_HOTBACKUP /* Invalidate in the buffer pool all pages belonging to the tablespace. Since we have set space->is_being_deleted = TRUE, readahead @@ -2272,11 +2347,19 @@ try_again: completely and permanently. The flag is_being_deleted also prevents fil_flush() from being applied to this tablespace. */ + if (srv_lazy_drop_table) { + buf_LRU_mark_space_was_deleted(id); + } else { buf_LRU_invalidate_tablespace(id); + } #endif /* printf("Deleting tablespace %s id %lu\n", space->name, id); */ - success = fil_space_free(id, FALSE); + mutex_enter(&fil_system->mutex); + + success = fil_space_free(id, TRUE); + + mutex_exit(&fil_system->mutex); if (success) { success = os_file_delete(path); @@ -2284,6 +2367,8 @@ try_again: if (!success) { success = os_file_delete_if_exists(path); } + } else { + rw_lock_x_unlock(&space->latch); } if (success) { @@ -2311,6 +2396,31 @@ try_again: return(FALSE); } +/*******************************************************************//** +Returns TRUE if a single-table tablespace is being deleted. +@return TRUE if being deleted */ +UNIV_INTERN +ibool +fil_tablespace_is_being_deleted( +/*============================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ibool is_being_deleted; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space != NULL); + + is_being_deleted = space->is_being_deleted; + + mutex_exit(&fil_system->mutex); + + return(is_being_deleted); +} + #ifndef UNIV_HOTBACKUP /*******************************************************************//** Discards a single-table tablespace. The tablespace must be cached in the @@ -2519,7 +2629,7 @@ retry: os_thread_sleep(20000); - fil_flush(id); + fil_flush(id, TRUE); goto retry; @@ -2545,7 +2655,7 @@ retry: success = fil_rename_tablespace_in_mem(space, node, path); if (success) { - success = os_file_rename(old_path, path); + success = os_file_rename(innodb_file_data_key, old_path, path); if (!success) { /* We have to revert the changes we made @@ -2622,7 +2732,8 @@ fil_create_new_single_table_tablespace( path = fil_make_ibd_name(tablename, is_temp); - file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL, + file = os_file_create(innodb_file_data_key, path, + OS_FILE_CREATE, OS_FILE_NORMAL, OS_DATA_FILE, &ret); if (ret == FALSE) { ut_print_timestamp(stderr); @@ -2732,7 +2843,7 @@ error_exit2: goto error_exit; } - ret = os_file_flush(file); + ret = os_file_flush(file, TRUE); if (!ret) { fputs("InnoDB: Error: file flush of tablespace ", stderr); @@ -2810,7 +2921,8 @@ fil_reset_too_high_lsns( filepath = fil_make_ibd_name(name, FALSE); file = os_file_create_simple_no_error_handling( - filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + innodb_file_data_key, filepath, OS_FILE_OPEN, + OS_FILE_READ_WRITE, &success); if (!success) { /* The following call prints an error message */ os_file_get_last_error(TRUE); @@ -2841,7 +2953,7 @@ fil_reset_too_high_lsns( /* We have to read the file flush lsn from the header of the file */ - flush_lsn = mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN); + flush_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN); if (current_lsn >= flush_lsn) { /* Ok */ @@ -2889,7 +3001,7 @@ fil_reset_too_high_lsns( goto func_exit; } - if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) { + if (mach_read_from_8(page + FIL_PAGE_LSN) > current_lsn) { /* We have to reset the lsn */ if (zip_size) { @@ -2917,7 +3029,7 @@ fil_reset_too_high_lsns( } } - success = os_file_flush(file); + success = os_file_flush(file, TRUE); if (!success) { goto func_exit; @@ -2931,7 +3043,7 @@ fil_reset_too_high_lsns( goto func_exit; } - mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); + mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); success = os_file_write(filepath, file, page, 0, 0, zip_size ? zip_size : UNIV_PAGE_SIZE); @@ -2939,7 +3051,7 @@ fil_reset_too_high_lsns( goto func_exit; } - success = os_file_flush(file); + success = os_file_flush(file, TRUE); func_exit: os_file_close(file); ut_free(buf2); @@ -2949,6 +3061,97 @@ func_exit: } /********************************************************************//** +Checks if a page is corrupt. (for offline page) +*/ +static +ibool +fil_page_buf_page_is_corrupted_offline( +/*===================================*/ + const byte* page, /*!< in: a database page */ + ulint zip_size) /*!< in: size of compressed page; + 0 for uncompressed pages */ +{ + ulint checksum_field; + ulint old_checksum_field; + + if (!zip_size + && memcmp(page + FIL_PAGE_LSN + 4, + page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + return(TRUE); + } + + checksum_field = mach_read_from_4(page + + FIL_PAGE_SPACE_OR_CHKSUM); + + if (zip_size) { + return(checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != page_zip_calc_checksum(page, zip_size)); + } + + old_checksum_field = mach_read_from_4( + page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM); + + if (old_checksum_field != mach_read_from_4(page + + FIL_PAGE_LSN) + && old_checksum_field != BUF_NO_CHECKSUM_MAGIC + && old_checksum_field + != buf_calc_page_old_checksum(page)) { + return(TRUE); + } + + if (!srv_fast_checksum + && checksum_field != 0 + && checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != buf_calc_page_new_checksum(page)) { + return(TRUE); + } + + if (srv_fast_checksum + && checksum_field != 0 + && checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != buf_calc_page_new_checksum_32(page) + && checksum_field + != buf_calc_page_new_checksum(page)) { + return(TRUE); + } + + return(FALSE); +} + +/********************************************************************//** +*/ +static +void +fil_page_buf_page_store_checksum( +/*=============================*/ + byte* page, + ulint zip_size) +{ + if (!zip_size) { + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? (!srv_fast_checksum + ? buf_calc_page_new_checksum(page) + : buf_calc_page_new_checksum_32(page)) + : BUF_NO_CHECKSUM_MAGIC); + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + srv_use_checksums + ? buf_calc_page_old_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + } else { + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? page_zip_calc_checksum(page, zip_size) + : BUF_NO_CHECKSUM_MAGIC); + } +} + +/********************************************************************//** Tries to open a single-table tablespace and optionally checks the space id is right in it. If does not succeed, prints an error message to the .err log. This function is used to open a tablespace when we start up mysqld, and also in @@ -2994,7 +3197,8 @@ fil_open_single_table_tablespace( ut_a(!(flags & (~0UL << DICT_TF_BITS))); file = os_file_create_simple_no_error_handling( - filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + innodb_file_data_key, filepath, OS_FILE_OPEN, + OS_FILE_READ_WRITE, &success); if (!success) { /* The following call prints an error message */ os_file_get_last_error(TRUE); @@ -3041,14 +3245,14 @@ fil_open_single_table_tablespace( space_id = fsp_header_get_space_id(page); space_flags = fsp_header_get_flags(page); - if (srv_expand_import - && (space_id != id || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) { + if (srv_expand_import) { + ibool file_is_corrupt = FALSE; byte* buf3; byte* descr_page; ibool descr_is_corrupt = FALSE; - dulint old_id[31]; - dulint new_id[31]; + index_id_t old_id[31]; + index_id_t new_id[31]; ulint root_page[31]; ulint n_index; os_file_t info_file = (os_file_t) -1; @@ -3063,6 +3267,7 @@ fil_open_single_table_tablespace( fil_system_t* system; fil_node_t* node = NULL; fil_space_t* space; + ulint zip_size; buf3 = ut_malloc(2 * UNIV_PAGE_SIZE); descr_page = ut_align(buf3, UNIV_PAGE_SIZE); @@ -3080,30 +3285,26 @@ fil_open_single_table_tablespace( /* store as first descr page */ memcpy(descr_page, page, UNIV_PAGE_SIZE); + zip_size = dict_table_flags_to_zip_size(flags); + ut_a(zip_size == dict_table_flags_to_zip_size(space_flags)); + /* get free limit (page number) of the table space */ /* these should be same to the definition in fsp0fsp.c */ #define FSP_HEADER_OFFSET FIL_PAGE_DATA #define FSP_FREE_LIMIT 12 free_limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + page); - free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)UNIV_PAGE_SIZE; + free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)(zip_size ? zip_size : UNIV_PAGE_SIZE); /* overwrite fsp header */ fsp_header_init_fields(page, id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id); space_id = id; space_flags = flags; - if (mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn) - mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); - mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, - srv_use_checksums - ? (!srv_fast_checksum - ? buf_calc_page_new_checksum(page) - : buf_calc_page_new_checksum_32(page)) - : BUF_NO_CHECKSUM_MAGIC); - mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - srv_use_checksums - ? buf_calc_page_old_checksum(page) - : BUF_NO_CHECKSUM_MAGIC); + if (mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn) + mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); + + fil_page_buf_page_store_checksum(page, zip_size); + success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE); /* get file size */ @@ -3113,8 +3314,10 @@ fil_open_single_table_tablespace( if (size_bytes < free_limit_bytes) { free_limit_bytes = size_bytes; - fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath); - file_is_corrupt = TRUE; + if (size_bytes >= (lint)FSP_EXTENT_SIZE * (lint)(zip_size ? zip_size : UNIV_PAGE_SIZE)) { + fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath); + file_is_corrupt = TRUE; + } } /* get cruster index information */ @@ -3134,30 +3337,30 @@ fil_open_single_table_tablespace( info_file_path[len - 2] = 'x'; info_file_path[len - 1] = 'p'; - info_file = os_file_create_simple_no_error_handling( + info_file = os_file_create_simple_no_error_handling(innodb_file_data_key, info_file_path, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); if (!success) { - fprintf(stderr, "InnoDB: cannot open %s\n", info_file_path); + fprintf(stderr, "InnoDB: Cannot open the file: %s\n", info_file_path); file_is_corrupt = TRUE; goto skip_info; } success = os_file_read(info_file, page, 0, 0, UNIV_PAGE_SIZE); if (!success) { - fprintf(stderr, "InnoDB: cannot read %s\n", info_file_path); + fprintf(stderr, "InnoDB: Cannot read the file: %s\n", info_file_path); file_is_corrupt = TRUE; goto skip_info; } if (mach_read_from_4(page) != 0x78706f72UL || mach_read_from_4(page + 4) != 0x74696e66UL) { - fprintf(stderr, "InnoDB: %s seems not to be a correct .exp file\n", info_file_path); + fprintf(stderr, "InnoDB: %s seems to be an incorrect .exp file.\n", info_file_path); file_is_corrupt = TRUE; goto skip_info; } - fprintf(stderr, "InnoDB: import: extended import of %s is started.\n", name); + fprintf(stderr, "InnoDB: Import: The extended import of %s is being started.\n", name); n_index = mach_read_from_4(page + 8); - fprintf(stderr, "InnoDB: import: %lu indexes are detected.\n", (ulong)n_index); + fprintf(stderr, "InnoDB: Import: %lu indexes have been detected.\n", (ulong)n_index); for (i = 0; i < n_index; i++) { new_id[i] = dict_table_get_index_on_name(table, @@ -3175,75 +3378,41 @@ skip_info: size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); } */ - if (!(flags & DICT_TF_ZSSIZE_MASK)) { + + if (zip_size) { + fprintf(stderr, "InnoDB: Warning: importing compressed table is still EXPERIMENTAL, currently.\n"); + } + + { mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; ib_int64_t offset; - size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + size = (ulint) (size_bytes / (zip_size ? zip_size : UNIV_PAGE_SIZE)); /* over write space id of all pages */ rec_offs_init(offsets_); fprintf(stderr, "InnoDB: Progress in %%:"); - for (offset = 0; offset < free_limit_bytes; offset += UNIV_PAGE_SIZE) { - ulint checksum_field; - ulint old_checksum_field; + for (offset = 0; offset < free_limit_bytes; + offset += zip_size ? zip_size : UNIV_PAGE_SIZE) { ibool page_is_corrupt; success = os_file_read(file, page, (ulint)(offset & 0xFFFFFFFFUL), - (ulint)(offset >> 32), UNIV_PAGE_SIZE); + (ulint)(offset >> 32), + zip_size ? zip_size : UNIV_PAGE_SIZE); page_is_corrupt = FALSE; /* check consistency */ - if (memcmp(page + FIL_PAGE_LSN + 4, - page + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { - + if (fil_page_buf_page_is_corrupted_offline(page, zip_size)) { page_is_corrupt = TRUE; } if (mach_read_from_4(page + FIL_PAGE_OFFSET) - != offset / UNIV_PAGE_SIZE) { - - page_is_corrupt = TRUE; - } - - checksum_field = mach_read_from_4(page - + FIL_PAGE_SPACE_OR_CHKSUM); - - old_checksum_field = mach_read_from_4( - page + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN_OLD_CHKSUM); - - if (old_checksum_field != mach_read_from_4(page - + FIL_PAGE_LSN) - && old_checksum_field != BUF_NO_CHECKSUM_MAGIC - && old_checksum_field - != buf_calc_page_old_checksum(page)) { - - page_is_corrupt = TRUE; - } - - if (!srv_fast_checksum - && checksum_field != 0 - && checksum_field != BUF_NO_CHECKSUM_MAGIC - && checksum_field - != buf_calc_page_new_checksum(page)) { - - page_is_corrupt = TRUE; - } - - if (srv_fast_checksum - && checksum_field != 0 - && checksum_field != BUF_NO_CHECKSUM_MAGIC - && checksum_field - != buf_calc_page_new_checksum_32(page) - && checksum_field - != buf_calc_page_new_checksum(page)) { + != offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) { page_is_corrupt = TRUE; } @@ -3254,18 +3423,19 @@ skip_info: /* it should be overwritten already */ ut_a(!page_is_corrupt); - } else if (!((offset / UNIV_PAGE_SIZE) % UNIV_PAGE_SIZE)) { + } else if (!((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) + % (zip_size ? zip_size : UNIV_PAGE_SIZE))) { /* descr page (not header) */ if (page_is_corrupt) { file_is_corrupt = TRUE; descr_is_corrupt = TRUE; } else { - ut_a(fil_page_get_type(page) == FIL_PAGE_TYPE_XDES); + ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_XDES); descr_is_corrupt = FALSE; } /* store as descr page */ - memcpy(descr_page, page, UNIV_PAGE_SIZE); + memcpy(descr_page, page, (zip_size ? zip_size : UNIV_PAGE_SIZE)); } else if (descr_is_corrupt) { /* unknown state of the page */ @@ -3293,9 +3463,12 @@ skip_info: ulint bit_index; descr = descr_page + XDES_ARR_OFFSET - + XDES_SIZE * (ut_2pow_remainder((offset / UNIV_PAGE_SIZE), UNIV_PAGE_SIZE) / FSP_EXTENT_SIZE); + + XDES_SIZE * (ut_2pow_remainder( + (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)), + (zip_size ? zip_size : UNIV_PAGE_SIZE)) / FSP_EXTENT_SIZE); - index = XDES_FREE_BIT + XDES_BITS_PER_PAGE * ((offset / UNIV_PAGE_SIZE) % FSP_EXTENT_SIZE); + index = XDES_FREE_BIT + + XDES_BITS_PER_PAGE * ((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) % FSP_EXTENT_SIZE); byte_index = index / 8; bit_index = index % 8; @@ -3313,7 +3486,7 @@ skip_info: } if (page_is_corrupt) { - fprintf(stderr, " [errp:%lld]", offset / UNIV_PAGE_SIZE); + fprintf(stderr, " [errp:%lld]", offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)); /* cannot treat corrupt page */ goto skip_write; @@ -3322,8 +3495,14 @@ skip_info: if (mach_read_from_4(page + FIL_PAGE_OFFSET) || !offset) { mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id); - for (i = 0; (ulint) i < n_index; i++) { - if ((ulint) (offset / UNIV_PAGE_SIZE) == root_page[i]) { + for (i = 0; i < n_index; i++) { + if (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE) == root_page[i]) { + if (fil_page_get_type(page) != FIL_PAGE_INDEX) { + file_is_corrupt = TRUE; + fprintf(stderr, " [etyp:%lld]", + offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)); + goto skip_write; + } /* this is index root page */ mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + FSEG_HDR_SPACE, id); @@ -3334,10 +3513,17 @@ skip_info: } if (fil_page_get_type(page) == FIL_PAGE_INDEX) { - dulint tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID)); + index_id_t tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID)); + + for (i = 0; i < n_index; i++) { + if (old_id[i] == tmp) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]); + break; + } + } - if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 - && ut_dulint_cmp(old_id[0], tmp) == 0) { + if (!zip_size && mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 + && old_id[0] == tmp) { /* leaf page of cluster index, reset trx_id of records */ rec_t* rec; rec_t* supremum; @@ -3357,7 +3543,7 @@ skip_info: if (!offset) { offset = row_get_trx_id_offset(rec, index, offsets); } - trx_write_trx_id(rec + offset, ut_dulint_create(0, 1)); + trx_write_trx_id(rec + offset, 1); for (i = 0; i < n_fields; i++) { if (rec_offs_nth_extern(offsets, i)) { @@ -3375,44 +3561,34 @@ skip_info: rec = page_rec_get_next(rec); n_recs--; } - } - - for (i = 0; i < n_index; i++) { - if (ut_dulint_cmp(old_id[i], tmp) == 0) { - mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]); - break; - } + } else if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 + && old_id[0] != tmp) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 1); } } - if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) { - mach_write_ull(page + FIL_PAGE_LSN, current_lsn); - mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - current_lsn); + if (mach_read_from_8(page + FIL_PAGE_LSN) > current_lsn) { + mach_write_to_8(page + FIL_PAGE_LSN, current_lsn); + if (!zip_size) { + mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + current_lsn); + } } - mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, - srv_use_checksums - ? (!srv_fast_checksum - ? buf_calc_page_new_checksum(page) - : buf_calc_page_new_checksum_32(page)) - : BUF_NO_CHECKSUM_MAGIC); - mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - srv_use_checksums - ? buf_calc_page_old_checksum(page) - : BUF_NO_CHECKSUM_MAGIC); + fil_page_buf_page_store_checksum(page, zip_size); success = os_file_write(filepath, file, page, (ulint)(offset & 0xFFFFFFFFUL), - (ulint)(offset >> 32), UNIV_PAGE_SIZE); + (ulint)(offset >> 32), + zip_size ? zip_size : UNIV_PAGE_SIZE); } skip_write: if (free_limit_bytes - && ((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes) + && ((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes) != ((offset * 100) / free_limit_bytes)) { fprintf(stderr, " %lu", - (ulong)((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes)); + (ulong)((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes)); } } @@ -3422,7 +3598,7 @@ skip_write: index = dict_table_get_first_index(table); while (index) { for (i = 0; i < n_index; i++) { - if (ut_dulint_cmp(new_id[i], index->id) == 0) { + if (new_id[i] == index->id) { break; } } @@ -3439,7 +3615,7 @@ skip_write: info = pars_info_create(); - pars_info_add_dulint_literal(info, "indexid", new_id[i]); + pars_info_add_ull_literal(info, "indexid", new_id[i]); pars_info_add_int4_literal(info, "new_page", (lint) root_page[i]); error = que_eval_sql(info, @@ -3468,13 +3644,6 @@ skip_write: if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } - } else { - /* zip page? */ - size = (ulint) - (size_bytes - / dict_table_flags_to_zip_size(flags)); - fprintf(stderr, "InnoDB: import: table %s seems to be in newer format." - " It may not be able to treated for now.\n", name); } /* .exp file should be removed */ success = os_file_delete(info_file_path); @@ -3502,10 +3671,10 @@ skip_write: stderr); ut_print_filename(stderr, filepath); fprintf(stderr, " seems to be corrupt.\n" - "InnoDB: anyway, all not corrupt pages were tried to be converted to salvage.\n" + "InnoDB: An attempt to convert and salvage all corrupt pages was not made.\n" "InnoDB: ##### CAUTION #####\n" - "InnoDB: ## The .ibd must cause to crash InnoDB, though re-import would seem to be succeeded.\n" - "InnoDB: ## If you don't have knowledge about salvaging data from .ibd, you should not use the file.\n" + "InnoDB: ## The .ibd file may cause InnoDB to crash, even though its re-import seems to have succeeded.\n" + "InnoDB: ## If you don't know how to salvage data from a .ibd, you should not use the file.\n" "InnoDB: ###################\n"); success = FALSE; @@ -3556,6 +3725,271 @@ func_exit: os_file_close(file); mem_free(filepath); + if (srv_expand_import && dict_table_flags_to_zip_size(flags)) { + ulint page_no; + ulint zip_size; + ulint height; + ulint root_height = 0; + rec_t* node_ptr; + dict_table_t* table; + dict_index_t* index; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + mtr_t mtr; + + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + + zip_size = dict_table_flags_to_zip_size(flags); + + table = dict_table_get_low(name); + index = dict_table_get_first_index(table); + page_no = dict_index_get_page(index); + ut_a(page_no == 3); + + fprintf(stderr, "InnoDB: It is compressed .ibd file. need to convert additionaly on buffer pool.\n"); + + /* down to leaf */ + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NONE); + + height = ULINT_UNDEFINED; + + for (;;) { + block = buf_page_get(space_id, zip_size, page_no, + RW_NO_LATCH, &mtr); + page = buf_block_get_frame(block); + + block->check_index_page_at_flush = TRUE; + + if (height == ULINT_UNDEFINED) { + height = btr_page_get_level(page, &mtr); + root_height = height; + } + + if (height == 0) { + break; + } + + node_ptr = page_rec_get_next(page_get_infimum_rec(page)); + + height--; + + offsets = rec_get_offsets(node_ptr, index, offsets, ULINT_UNDEFINED, &heap); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + mtr_commit(&mtr); + + fprintf(stderr, "InnoDB: pages needs split are ..."); + + /* scan reaf pages */ + while (page_no != FIL_NULL) { + rec_t* rec; + rec_t* supremum; + ulint n_recs; + + mtr_start(&mtr); + + block = buf_page_get(space_id, zip_size, page_no, + RW_X_LATCH, &mtr); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + if (!page_zip) { + /*something wrong*/ + fprintf(stderr, "InnoDB: Something wrong with reading page %lu.\n", page_no); +convert_err_exit: + mtr_commit(&mtr); + mutex_enter(&fil_system->mutex); + fil_space_free(space_id, FALSE); + mutex_exit(&fil_system->mutex); + success = FALSE; + goto convert_exit; + } + + supremum = page_get_supremum_rec(page); + rec = page_rec_get_next(page_get_infimum_rec(page)); + n_recs = page_get_n_recs(page); + + /* illegal operation as InnoDB online system. so not logged */ + while (rec && rec != supremum && n_recs > 0) { + ulint n_fields; + ulint i; + ulint offset = index->trx_id_offset; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + n_fields = rec_offs_n_fields(offsets); + if (!offset) { + offset = row_get_trx_id_offset(rec, index, offsets); + } + trx_write_trx_id(rec + offset, 1); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint local_len; + byte* data; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id); + } + } + + rec = page_rec_get_next(rec); + n_recs--; + } + + /* dummy logged update for along with modified page path */ + if (index->id != btr_page_get_index_id(page)) { + /* this should be adjusted already */ + fprintf(stderr, "InnoDB: The page %lu seems to be converted wrong.\n", page_no); + goto convert_err_exit; + } + btr_page_set_index_id(page, page_zip, index->id, &mtr); + + /* confirm whether fits to the page size or not */ + if (!page_zip_compress(page_zip, page, index, &mtr) + && !btr_page_reorganize(block, index, &mtr)) { + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + ulint n_uniq; + + /* split page is needed */ + fprintf(stderr, " %lu", page_no); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + n_uniq = dict_index_get_n_unique_in_tree(index); + + if(page_get_n_recs(page) < 2) { + /* no way to make smaller */ + fprintf(stderr, "InnoDB: The page %lu cannot be store to the page size.\n", page_no); + goto convert_err_exit; + } + + if (UNIV_UNLIKELY(page_no == dict_index_get_page(index))) { + ulint new_page_no; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t page_cursor; + + /* it is root page, need to raise before split */ + + level = btr_page_get_level(page, &mtr); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, &mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + btr_page_create(new_block, new_page_zip, index, level, &mtr); + + btr_page_set_next(new_page, new_page_zip, FIL_NULL, &mtr); + btr_page_set_prev(new_page, new_page_zip, FIL_NULL, &mtr); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, index, &mtr); + btr_search_move_or_delete_hash_entries(new_block, block, index); + + rec = page_rec_get_next(page_get_infimum_rec(new_page)); + new_page_no = buf_block_get_page_no(new_block); + + node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap, + level); + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + btr_page_empty(block, page_zip, index, level + 1, &mtr); + + btr_page_set_next(page, page_zip, FIL_NULL, &mtr); + btr_page_set_prev(page, page_zip, FIL_NULL, &mtr); + + page_cur_set_before_first(block, &page_cursor); + + node_ptr_rec = page_cur_tuple_insert(&page_cursor, node_ptr, + index, 0, &mtr); + ut_a(node_ptr_rec); + + if (!btr_page_reorganize(block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no); + goto convert_err_exit; + } + + /* move to the raised page */ + page_no = new_page_no; + block = new_block; + page = new_page; + page_zip = new_page_zip; + + fprintf(stderr, "(raise_to:%lu)", page_no); + } + + split_rec = page_get_middle_rec(page); + + new_block = btr_page_alloc(index, page_no + 1, FSP_UP, + btr_page_get_level(page, &mtr), &mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + btr_page_create(new_block, new_page_zip, index, + btr_page_get_level(page, &mtr), &mtr); + + offsets = rec_get_offsets(split_rec, index, offsets, n_uniq, &heap); + + btr_attach_half_pages(index, block, + split_rec, new_block, FSP_UP, &mtr); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, index, &mtr); + page_delete_rec_list_start(split_rec - page + new_page, + new_block, index, &mtr); + btr_search_move_or_delete_hash_entries(new_block, block, index); + page_delete_rec_list_end(split_rec, block, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, &mtr); + + fprintf(stderr, "(new:%lu)", buf_block_get_page_no(new_block)); + + /* Are they needed? */ + if (!btr_page_reorganize(block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no); + goto convert_err_exit; + } + if (!btr_page_reorganize(new_block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", buf_block_get_page_no(new_block)); + goto convert_err_exit; + } + } + + page_no = btr_page_get_next(page, &mtr); + + mtr_commit(&mtr); + + if (heap) { + mem_heap_empty(heap); + } + } + + fprintf(stderr, "...done.\nInnoDB: waiting the flush batch of the additional conversion.\n"); + + /* should wait for the not-logged changes are all flushed */ + buf_flush_list(ULINT_MAX, mtr.end_lsn + 1); + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + fprintf(stderr, "InnoDB: done.\n"); +convert_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + return(success); } #endif /* !UNIV_HOTBACKUP */ @@ -3624,7 +4058,8 @@ fil_load_single_table_tablespace( # endif /* !UNIV_HOTBACKUP */ #endif file = os_file_create_simple_no_error_handling( - filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); + innodb_file_data_key, filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); if (!success) { /* The following call prints an error message */ os_file_get_last_error(TRUE); @@ -3782,7 +4217,7 @@ fil_load_single_table_tablespace( os_file_close(file); new_path = fil_make_ibbackup_old_name(filepath); - ut_a(os_file_rename(filepath, new_path)); + ut_a(os_file_rename(innodb_file_data_key, filepath, new_path)); ut_free(buf2); mem_free(filepath); @@ -3820,7 +4255,7 @@ fil_load_single_table_tablespace( mutex_exit(&fil_system->mutex); - ut_a(os_file_rename(filepath, new_path)); + ut_a(os_file_rename(innodb_file_data_key, filepath, new_path)); ut_free(buf2); mem_free(filepath); @@ -4294,6 +4729,10 @@ fil_extend_space_to_desired_size( ulint page_size; ibool success = TRUE; + /* file_extend_mutex is for http://bugs.mysql.com/56433 */ + /* to protect from the other fil_extend_space_to_desired_size() */ + /* during temprary releasing &fil_system->mutex */ + mutex_enter(&fil_system->file_extend_mutex); fil_mutex_enter_and_prepare_for_io(space_id); space = fil_space_get_by_id(space_id); @@ -4305,6 +4744,7 @@ fil_extend_space_to_desired_size( *actual_size = space->size; mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system->file_extend_mutex); return(TRUE); } @@ -4337,6 +4777,8 @@ fil_extend_space_to_desired_size( offset_low = ((start_page_no - file_start_page_no) % (4096 * ((1024 * 1024) / page_size))) * page_size; + + mutex_exit(&fil_system->mutex); #ifdef UNIV_HOTBACKUP success = os_file_write(node->name, node->handle, buf, offset_low, offset_high, @@ -4346,8 +4788,10 @@ fil_extend_space_to_desired_size( node->name, node->handle, buf, offset_low, offset_high, page_size * n_pages, - NULL, NULL, NULL); + NULL, NULL, space_id, NULL); #endif + mutex_enter(&fil_system->mutex); + if (success) { node->size += n_pages; space->size += n_pages; @@ -4393,8 +4837,9 @@ fil_extend_space_to_desired_size( printf("Extended %s to %lu, actual size %lu pages\n", space->name, size_after_extend, *actual_size); */ mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system->file_extend_mutex); - fil_flush(space_id); + fil_flush(space_id, TRUE); return(success); } @@ -4725,15 +5170,13 @@ _fil_io( //#if (1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE //# error "(1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE" //#endif - ut_ad(fil_validate()); + ut_ad(fil_validate_skip()); #ifndef UNIV_HOTBACKUP # ifndef UNIV_LOG_DEBUG /* ibuf bitmap pages must be read in the sync aio mode: */ ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE) || !ibuf_bitmap_page(zip_size, block_offset) || sync || is_log); - ut_ad(!ibuf_inside() || is_log || (type == OS_FILE_WRITE) - || ibuf_page(space_id, zip_size, block_offset, NULL)); # endif /* UNIV_LOG_DEBUG */ if (sync) { mode = OS_AIO_SYNC; @@ -4757,6 +5200,22 @@ _fil_io( srv_data_written+= len; } + /* if the table space was already deleted, space might not exist already. */ + if (message + && space_id < SRV_LOG_SPACE_FIRST_ID + && ((buf_page_t*)message)->space_was_being_deleted) { + + if (mode == OS_AIO_NORMAL) { + buf_page_io_complete(message); + return(DB_SUCCESS); /*fake*/ + } + if (type == OS_FILE_READ) { + return(DB_TABLESPACE_DELETED); + } else { + return(DB_SUCCESS); /*fake*/ + } + } + /* Reserve the fil_system mutex and make sure that we can open at least one file while holding it, if the file is not already open */ @@ -4858,14 +5317,14 @@ _fil_io( ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); - if (srv_pass_corrupt_table && space->is_corrupt) { + if (srv_pass_corrupt_table == 1 && space->is_corrupt) { /* should ignore i/o for the crashed space */ mutex_enter(&fil_system->mutex); fil_node_complete_io(node, fil_system, type); mutex_exit(&fil_system->mutex); if (mode == OS_AIO_NORMAL) { ut_a(space->purpose == FIL_TABLESPACE); - buf_page_io_complete(message, trx); + buf_page_io_complete(message); } if (type == OS_FILE_READ) { return(DB_TABLESPACE_DELETED); @@ -4873,7 +5332,19 @@ _fil_io( return(DB_SUCCESS); } } else { - ut_a(!space->is_corrupt); + if (srv_pass_corrupt_table > 1 && space->is_corrupt) { + /* should ignore write i/o for the crashed space */ + if (type == OS_FILE_WRITE) { + mutex_enter(&fil_system->mutex); + fil_node_complete_io(node, fil_system, type); + mutex_exit(&fil_system->mutex); + if (mode == OS_AIO_NORMAL) { + ut_a(space->purpose == FIL_TABLESPACE); + buf_page_io_complete(message); + } + return(DB_SUCCESS); + } + } #ifdef UNIV_HOTBACKUP /* In ibbackup do normal i/o, not aio */ if (type == OS_FILE_READ) { @@ -4886,10 +5357,24 @@ _fil_io( #else /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset_low, offset_high, len, node, message, trx); + offset_low, offset_high, len, node, message, space_id, trx); #endif } /**/ + /* if the table space was already deleted, space might not exist already. */ + if (message + && space_id < SRV_LOG_SPACE_FIRST_ID + && ((buf_page_t*)message)->space_was_being_deleted) { + + if (mode == OS_AIO_SYNC) { + if (type == OS_FILE_READ) { + return(DB_TABLESPACE_DELETED); + } else { + return(DB_SUCCESS); /*fake*/ + } + } + } + ut_a(ret); if (mode == OS_AIO_SYNC) { @@ -4902,7 +5387,7 @@ _fil_io( mutex_exit(&fil_system->mutex); - ut_ad(fil_validate()); + ut_ad(fil_validate_skip()); } return(DB_SUCCESS); @@ -4912,21 +5397,10 @@ _fil_io( Confirm whether the parameters are valid or not */ UNIV_INTERN ibool -fil_area_is_exist( +fil_is_exist( /*==============*/ ulint space_id, /*!< in: space id */ - ulint zip_size __attribute__((unused)), - /*!< in: compressed page size in bytes; - 0 for uncompressed pages */ - ulint block_offset, /*!< in: offset in number of blocks */ - ulint byte_offset __attribute__((unused)), - /*!< in: remainder of offset in bytes; in - aio this must be divisible by the OS block - size */ - ulint len __attribute__((unused))) - /*!< in: how many bytes to read or write; this - must not cross a file boundary; in aio this - must be a block size multiple */ + ulint block_offset) /*!< in: offset in number of blocks */ { fil_space_t* space; fil_node_t* node; @@ -5000,26 +5474,49 @@ fil_aio_wait( fil_node_t* fil_node; void* message; ulint type; + ulint space_id = 0; - ut_ad(fil_validate()); + ut_ad(fil_validate_skip()); - if (os_aio_use_native_aio) { + if (srv_use_native_aio) { srv_set_io_thread_op_info(segment, "native aio handle"); #ifdef WIN_ASYNC_IO ret = os_aio_windows_handle(segment, 0, &fil_node, - &message, &type); + &message, &type, &space_id); +#elif defined(LINUX_NATIVE_AIO) + ret = os_aio_linux_handle(segment, &fil_node, + &message, &type, &space_id); #else - ret = 0; /* Eliminate compiler warning */ ut_error; + ret = 0; /* Eliminate compiler warning */ #endif } else { srv_set_io_thread_op_info(segment, "simulated aio handle"); ret = os_aio_simulated_handle(segment, &fil_node, - &message, &type); + &message, &type, &space_id); + } + + /* if the table space was already deleted, fil_node might not exist already. */ + if (message + && space_id < SRV_LOG_SPACE_FIRST_ID + && ((buf_page_t*)message)->space_was_being_deleted) { + + /* intended not to be uncompress read page */ + ut_a(buf_page_get_io_fix(message) == BUF_IO_WRITE + || !buf_page_get_zip_size(message) + || buf_page_get_state(message) != BUF_BLOCK_FILE_PAGE); + + srv_set_io_thread_op_info(segment, "complete io for buf page"); + buf_page_io_complete(message); + return; } ut_a(ret); + if (UNIV_UNLIKELY(fil_node == NULL)) { + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + return; + } srv_set_io_thread_op_info(segment, "complete io for fil node"); @@ -5029,7 +5526,7 @@ fil_aio_wait( mutex_exit(&fil_system->mutex); - ut_ad(fil_validate()); + ut_ad(fil_validate_skip()); /* Do the i/o handling */ /* IMPORTANT: since i/o handling for reads will read also the insert @@ -5039,7 +5536,7 @@ fil_aio_wait( if (fil_node->space->purpose == FIL_TABLESPACE) { srv_set_io_thread_op_info(segment, "complete io for buf page"); - buf_page_io_complete(message, NULL); + buf_page_io_complete(message); } else { srv_set_io_thread_op_info(segment, "complete io for log"); log_io_complete(message); @@ -5054,8 +5551,9 @@ UNIV_INTERN void fil_flush( /*======*/ - ulint space_id) /*!< in: file space id (this can be a group of + ulint space_id, /*!< in: file space id (this can be a group of log files or a tablespace of the database) */ + ibool metadata) { fil_space_t* space; fil_node_t* node; @@ -5126,7 +5624,7 @@ retry: /* fprintf(stderr, "Flushing to file %s\n", node->name); */ - os_file_flush(file); + os_file_flush(file, metadata); mutex_enter(&fil_system->mutex); @@ -5209,7 +5707,7 @@ fil_flush_file_spaces( a non-existing space id. */ for (i = 0; i < n_space_ids; i++) { - fil_flush(space_ids[i]); + fil_flush(space_ids[i], TRUE); } mem_free(space_ids); @@ -5342,7 +5840,7 @@ fil_page_get_type( return(mach_read_from_2(page + FIL_PAGE_TYPE)); } -/******************************************************************** +/****************************************************************//** Initializes the tablespace memory cache. */ UNIV_INTERN void |