diff options
Diffstat (limited to 'storage/innobase')
214 files changed, 13723 insertions, 16429 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 4513a63049d..cbd280af223 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -113,7 +113,6 @@ SET(INNOBASE_SOURCES row/row0purge.cc row/row0row.cc row/row0sel.cc - row/row0trunc.cc row/row0uins.cc row/row0umod.cc row/row0undo.cc @@ -161,6 +160,8 @@ IF(NOT TARGET innobase) RETURN() ENDIF() +ADD_DEFINITIONS(${SSL_DEFINES}) + # A GCC bug causes crash when compiling these files on ARM64 with -O1+ # Compile them with -O0 as a workaround. IF(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" @@ -175,7 +176,6 @@ IF(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" mtr/mtr0mtr.cc row/row0merge.cc row/row0mysql.cc - row/row0trunc.cc srv/srv0srv.cc COMPILE_FLAGS "-O0" ) diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 736413d2473..a8e319dd321 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -44,6 +44,8 @@ Created 6/2/1994 Heikki Tuuri #include "dict0boot.h" #include "row0sel.h" /* row_search_max_autoinc() */ +Atomic_counter<uint32_t> btr_validate_index_running; + /**************************************************************//** Checks if the page in the cursor can be merged with given page. If necessary, re-organize the merge_page. @@ -57,15 +59,12 @@ btr_can_merge_with_page( buf_block_t** merge_block, /*!< out: the merge block */ mtr_t* mtr); /*!< in: mini-transaction */ -/**************************************************************//** -Report that an index page is corrupted. */ -void -btr_corruption_report( -/*==================*/ - const buf_block_t* block, /*!< in: corrupted block */ - const dict_index_t* index) /*!< in: index tree */ +/** Report that an index page is corrupted. 
+@param[in] buffer block +@param[in] index tree */ +void btr_corruption_report(const buf_block_t* block, const dict_index_t* index) { - ib::error() + ib::fatal() << "Flag mismatch in page " << block->page.id << " index " << index->name << " of table " << index->table->name; @@ -226,7 +225,7 @@ btr_root_block_get( buf_block_t* block = btr_block_get( page_id_t(index->table->space_id, index->page), - page_size_t(index->table->space->flags), mode, + index->table->space->zip_size(), mode, index, mtr); if (!block) { @@ -357,7 +356,7 @@ btr_root_adjust_on_import( page_zip_des_t* page_zip; dict_table_t* table = index->table; const page_id_t page_id(table->space_id, index->page); - const page_size_t page_size(table->space->flags); + const ulint zip_size = table->space->zip_size(); DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", return(DB_CORRUPTION);); @@ -366,7 +365,7 @@ btr_root_adjust_on_import( mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - block = btr_block_get(page_id, page_size, RW_X_LATCH, index, &mtr); + block = btr_block_get(page_id, zip_size, RW_X_LATCH, index, &mtr); page = buf_block_get_frame(block); page_zip = buf_block_get_page_zip(block); @@ -385,9 +384,21 @@ btr_root_adjust_on_import( } else { /* Check that the table flags and the tablespace flags match. */ - err = (dict_tf_to_fsp_flags(table->flags) - == table->space->flags) - ? 
DB_SUCCESS : DB_CORRUPTION; + ulint tf = dict_tf_to_fsp_flags(table->flags); + ulint sf = table->space->flags; + sf &= ~FSP_FLAGS_MEM_MASK; + tf &= ~FSP_FLAGS_MEM_MASK; + if (fil_space_t::is_flags_equal(tf, sf) + || fil_space_t::is_flags_equal(sf, tf)) { + mutex_enter(&fil_system.mutex); + table->space->flags = (table->space->flags + & ~FSP_FLAGS_MEM_MASK) + | (tf & FSP_FLAGS_MEM_MASK); + mutex_exit(&fil_system.mutex); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } } } else { err = DB_SUCCESS; @@ -427,7 +438,7 @@ btr_page_create( ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); if (page_zip) { - page_create_zip(block, index, level, 0, NULL, mtr); + page_create_zip(block, index, level, 0, mtr); } else { page_create(block, mtr, dict_table_is_comp(index->table), dict_index_is_spatial(index)); @@ -467,7 +478,7 @@ btr_page_alloc_for_ibuf( new_block = buf_page_get( page_id_t(index->table->space_id, node_addr.page), - page_size_t(index->table->space->flags), + index->table->space->zip_size(), RW_X_LATCH, mtr); new_page = buf_block_get_frame(new_block); @@ -750,21 +761,18 @@ void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, ? PAGE_HEADER + PAGE_BTR_SEG_LEAF : PAGE_HEADER + PAGE_BTR_SEG_TOP]; fseg_free_page(seg_header, - index->table->space, block->page.id.page_no(), mtr); + index->table->space, block->page.id.page_no(), + !block->page.flush_observer, mtr); /* The page was marked free in the allocation bitmap, but it should remain exclusively latched until mtr_t::commit() or until it is explicitly freed from the mini-transaction. */ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - if (srv_immediate_scrub_data_uncompressed) { - /* In MDEV-15528 this code must be removed and the - check in buf_flush_init_for_writing() re-enabled. We - should zero out the page after the redo log for this - mini-transaction has been durably written. The log - would include the 10.4 MLOG_INIT_FREE_PAGE record. 
*/ - fsp_init_file_page(index->table->space, block, mtr); - } + /* MDEV-15528 FIXME: Zero out the page after the redo log for + this mini-transaction has been durably written. + This must be done unconditionally if + srv_immediate_scrub_data_uncompressed is set. */ } /**************************************************************//** @@ -821,7 +829,7 @@ btr_node_ptr_get_child( return btr_block_get( page_id_t(index->table->space_id, btr_node_ptr_get_child_page_no(node_ptr, offsets)), - page_size_t(index->table->space->flags), + index->table->space->zip_size(), RW_SX_LATCH, index, mtr); } @@ -1012,7 +1020,7 @@ static void btr_free_root(buf_block_t* block, mtr_t* mtr, bool invalidate) /** Prepare to free a B-tree. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] index_id PAGE_INDEX_ID contents @param[in,out] mtr mini-transaction @return root block, to invoke btr_free_but_not_root() and btr_free_root() @@ -1021,7 +1029,7 @@ static MY_ATTRIBUTE((warn_unused_result)) buf_block_t* btr_free_root_check( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, index_id_t index_id, mtr_t* mtr) { @@ -1029,7 +1037,7 @@ btr_free_root_check( ut_ad(index_id != BTR_FREED_INDEX_ID); buf_block_t* block = buf_page_get( - page_id, page_size, RW_X_LATCH, mtr); + page_id, zip_size, RW_X_LATCH, mtr); if (block) { buf_block_dbg_add_level(block, SYNC_TREE_NODE); @@ -1050,21 +1058,18 @@ btr_free_root_check( /** Create the root node for a new index tree. 
@param[in] type type of the index -@param[in,out] space tablespace where created @param[in] index_id index id -@param[in] index index, or NULL when applying TRUNCATE -log record during recovery -@param[in] btr_redo_create_info used for applying TRUNCATE log -@param[in] mtr mini-transaction handle -record during recovery -@return page number of the created root, FIL_NULL if did not succeed */ +@param[in,out] space tablespace where created +@param[in] index index +@param[in,out] mtr mini-transaction +@return page number of the created root +@retval FIL_NULL if did not succeed */ ulint btr_create( ulint type, fil_space_t* space, index_id_t index_id, dict_index_t* index, - const btr_create_t* btr_redo_create_info, mtr_t* mtr) { buf_block_t* block; @@ -1079,7 +1084,7 @@ btr_create( (for an ibuf tree, not in the root, but on a separate ibuf header page) */ - if (type & DICT_IBUF) { + if (UNIV_UNLIKELY(type & DICT_IBUF)) { /* Allocate first the ibuf header page */ buf_block_t* ibuf_hdr_block = fseg_create( space, 0, @@ -1111,8 +1116,7 @@ btr_create( buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); - flst_init(block->frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, - mtr); + flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); } else { block = fseg_create(space, 0, PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr); @@ -1141,44 +1145,11 @@ btr_create( page_zip = buf_block_get_page_zip(block); if (page_zip) { - if (index != NULL) { - page = page_create_zip(block, index, 0, 0, NULL, mtr); - } else { - /* Create a compressed index page when applying - TRUNCATE log record during recovery */ - ut_ad(btr_redo_create_info != NULL); - - redo_page_compress_t page_comp_info; - - page_comp_info.type = type; - - page_comp_info.index_id = index_id; - - page_comp_info.n_fields = - btr_redo_create_info->n_fields; - - page_comp_info.field_len = - btr_redo_create_info->field_len; - - page_comp_info.fields = btr_redo_create_info->fields; - - page_comp_info.trx_id_pos = - 
btr_redo_create_info->trx_id_pos; - - page = page_create_zip(block, NULL, 0, 0, - &page_comp_info, mtr); - } + page = page_create_zip(block, index, 0, 0, mtr); } else { - if (index != NULL) { - page = page_create(block, mtr, - dict_table_is_comp(index->table), - dict_index_is_spatial(index)); - } else { - ut_ad(btr_redo_create_info != NULL); - page = page_create( - block, mtr, btr_redo_create_info->format_flags, - type == DICT_SPATIAL); - } + page = page_create(block, mtr, + dict_table_is_comp(index->table), + dict_index_is_spatial(index)); /* Set the level of the new index page */ btr_page_set_level(page, NULL, 0, mtr); } @@ -1187,21 +1158,32 @@ btr_create( btr_page_set_index_id(page, page_zip, index_id, mtr); /* Set the next node and previous node fields */ - btr_page_set_next(page, page_zip, FIL_NULL, mtr); - btr_page_set_prev(page, page_zip, FIL_NULL, mtr); - - /* We reset the free bits for the page to allow creation of several - trees in the same mtr, otherwise the latch on a bitmap page would - prevent it because of the latching order. + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); +#if MYSQL_VERSION_ID < 100500 + if (UNIV_LIKELY_NULL(page_zip)) { + /* Avoid tripping the ut_a() in mlog_parse_nbytes() + when crash-downgrading to an earlier MariaDB 10.4 version. */ + btr_page_set_next(page, page_zip, FIL_NULL, mtr); + btr_page_set_prev(page, page_zip, FIL_NULL, mtr); + } else { + mlog_memset(block, FIL_PAGE_PREV, 8, 0xff, mtr); + } +#else + mlog_memset(block, FIL_PAGE_PREV, 8, 0xff, mtr); + if (UNIV_LIKELY_NULL(page_zip)) { + memset(page_zip->data + FIL_PAGE_PREV, 0xff, 8); + } +#endif - index will be NULL if we are recreating the table during recovery - on behalf of TRUNCATE. + /* We reset the free bits for the page in a separate + mini-transaction to allow creation of several trees in the + same mtr, otherwise the latch on a bitmap page would prevent + it because of the latching order. 
Note: Insert Buffering is disabled for temporary tables given that most temporary tables are smaller in size and short-lived. */ - if (!(type & DICT_CLUSTERED) - && (index == NULL || !index->table->is_temporary())) { - + if (!(type & DICT_CLUSTERED) && !index->table->is_temporary()) { ibuf_reset_free_bits(block); } @@ -1282,18 +1264,18 @@ top_loop: /** Free a persistent index tree if it exists. @param[in] page_id root page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] index_id PAGE_INDEX_ID contents @param[in,out] mtr mini-transaction */ void btr_free_if_exists( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, index_id_t index_id, mtr_t* mtr) { buf_block_t* root = btr_free_root_check( - page_id, page_size, index_id, mtr); + page_id, zip_size, index_id, mtr); if (root == NULL) { return; @@ -1304,20 +1286,15 @@ btr_free_if_exists( btr_free_root(root, mtr, true); } -/** Free an index tree in a temporary tablespace or during TRUNCATE TABLE. -@param[in] page_id root page id -@param[in] page_size page size */ -void -btr_free( - const page_id_t page_id, - const page_size_t& page_size) +/** Free an index tree in a temporary tablespace. 
+@param[in] page_id root page id */ +void btr_free(const page_id_t page_id) { mtr_t mtr; mtr.start(); mtr.set_log_mode(MTR_LOG_NO_REDO); - buf_block_t* block = buf_page_get( - page_id, page_size, RW_X_LATCH, &mtr); + buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr); if (block) { btr_free_but_not_root(block, MTR_LOG_NO_REDO); @@ -1341,7 +1318,7 @@ btr_read_autoinc(dict_index_t* index) ib_uint64_t autoinc; if (buf_block_t* block = buf_page_get( page_id_t(index->table->space_id, index->page), - page_size_t(index->table->space->flags), + index->table->space->zip_size(), RW_S_LATCH, &mtr)) { autoinc = page_get_autoinc(block->frame); } else { @@ -1373,7 +1350,7 @@ btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) mtr.start(); buf_block_t* block = buf_page_get( page_id_t(index->table->space_id, index->page), - page_size_t(index->table->space->flags), + index->table->space->zip_size(), RW_S_LATCH, &mtr); ib_uint64_t autoinc = block ? page_get_autoinc(block->frame) : 0; @@ -1418,7 +1395,7 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset) fil_space_t* space = index->table->space; mtr.set_named_space(space); page_set_autoinc(buf_page_get(page_id_t(space->id, index->page), - page_size_t(space->flags), + space->zip_size(), RW_SX_LATCH, &mtr), index, autoinc, &mtr, reset); mtr.commit(); @@ -1545,7 +1522,7 @@ btr_page_reorganize_low( } if (page_zip - && !page_zip_compress(page_zip, page, index, z_level, NULL, mtr)) { + && !page_zip_compress(page_zip, page, index, z_level, mtr)) { /* Restore the old page and exit. 
*/ #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG @@ -1571,11 +1548,6 @@ btr_page_reorganize_low( goto func_exit; } - if (!recovery && !dict_table_is_locking_disabled(index->table)) { - /* Update the record lock bitmaps */ - lock_move_reorganize_page(block, temp_block); - } - data_size2 = page_get_data_size(page); max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1); @@ -1599,21 +1571,41 @@ btr_page_reorganize_low( ut_ad(cursor->rec == page_get_infimum_rec(page)); } -func_exit: #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - if (!recovery && block->page.id.page_no() == index->page - && fil_page_get_type(temp_page) == FIL_PAGE_TYPE_INSTANT) { - /* Preserve the PAGE_INSTANT information. */ - ut_ad(!page_zip); - ut_ad(index->is_instant()); - memcpy(FIL_PAGE_TYPE + page, FIL_PAGE_TYPE + temp_page, 2); - memcpy(PAGE_HEADER + PAGE_INSTANT + page, - PAGE_HEADER + PAGE_INSTANT + temp_page, 2); + if (!recovery) { + if (block->page.id.page_no() == index->page + && fil_page_get_type(temp_page) == FIL_PAGE_TYPE_INSTANT) { + /* Preserve the PAGE_INSTANT information. 
*/ + ut_ad(!page_zip); + ut_ad(index->is_instant()); + memcpy(FIL_PAGE_TYPE + page, + FIL_PAGE_TYPE + temp_page, 2); + memcpy(PAGE_HEADER + PAGE_INSTANT + page, + PAGE_HEADER + PAGE_INSTANT + temp_page, 2); + if (!index->table->instant) { + } else if (page_is_comp(page)) { + memcpy(PAGE_NEW_INFIMUM + page, + PAGE_NEW_INFIMUM + temp_page, 8); + memcpy(PAGE_NEW_SUPREMUM + page, + PAGE_NEW_SUPREMUM + temp_page, 8); + } else { + memcpy(PAGE_OLD_INFIMUM + page, + PAGE_OLD_INFIMUM + temp_page, 8); + memcpy(PAGE_OLD_SUPREMUM + page, + PAGE_OLD_SUPREMUM + temp_page, 8); + } + } + + if (!dict_table_is_locking_disabled(index->table)) { + /* Update the record lock bitmaps */ + lock_move_reorganize_page(block, temp_block); + } } +func_exit: buf_block_free(temp_block); /* Restore logging mode */ @@ -1659,6 +1651,14 @@ func_exit: mach_read_from_2(PAGE_HEADER + PAGE_INSTANT + page), MLOG_2BYTES, mtr); + if (!index->table->instant) { + } else if (page_is_comp(page)) { + mlog_log_string(PAGE_NEW_INFIMUM + page, 8, mtr); + mlog_log_string(PAGE_NEW_SUPREMUM + page, 8, mtr); + } else { + mlog_log_string(PAGE_OLD_INFIMUM + page, 8, mtr); + mlog_log_string(PAGE_OLD_SUPREMUM + page, 8, mtr); + } } return(success); @@ -1797,7 +1797,7 @@ btr_page_empty( : 0; if (page_zip) { - page_create_zip(block, index, level, autoinc, NULL, mtr); + page_create_zip(block, index, level, autoinc, mtr); } else { page_create(block, mtr, dict_table_is_comp(index->table), dict_index_is_spatial(index)); @@ -1809,6 +1809,65 @@ btr_page_empty( } } +/** Write instant ALTER TABLE metadata to a root page. 
+@param[in,out] root clustered index root page +@param[in] index clustered index with instant ALTER TABLE +@param[in,out] mtr mini-transaction */ +void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr) +{ + ut_ad(index.n_core_fields > 0); + ut_ad(index.n_core_fields < REC_MAX_N_FIELDS); + ut_ad(index.is_instant()); + ut_ad(fil_page_get_type(root->frame) == FIL_PAGE_TYPE_INSTANT + || fil_page_get_type(root->frame) == FIL_PAGE_INDEX); + ut_ad(!page_has_siblings(root->frame)); + ut_ad(root->page.id.page_no() == index.page); + + rec_t* infimum = page_get_infimum_rec(root->frame); + rec_t* supremum = page_get_supremum_rec(root->frame); + byte* page_type = root->frame + FIL_PAGE_TYPE; + uint16_t i = page_header_get_field(root->frame, PAGE_INSTANT); + + switch (mach_read_from_2(page_type)) { + case FIL_PAGE_TYPE_INSTANT: + ut_ad(page_get_instant(root->frame) == index.n_core_fields); + if (memcmp(infimum, "infimum", 8) + || memcmp(supremum, "supremum", 8)) { + ut_ad(index.table->instant); + ut_ad(!memcmp(infimum, field_ref_zero, 8)); + ut_ad(!memcmp(supremum, field_ref_zero, 7)); + /* The n_core_null_bytes only matters for + ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. 
*/ + ut_ad(supremum[7] == index.n_core_null_bytes + || !index.table->not_redundant()); + return; + } + break; + default: + ut_ad(!"wrong page type"); + /* fall through */ + case FIL_PAGE_INDEX: + ut_ad(!page_is_comp(root->frame) + || !page_get_instant(root->frame)); + ut_ad(!memcmp(infimum, "infimum", 8)); + ut_ad(!memcmp(supremum, "supremum", 8)); + mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT, + MLOG_2BYTES, mtr); + ut_ad(i <= PAGE_NO_DIRECTION); + i |= index.n_core_fields << 3; + mlog_write_ulint(PAGE_HEADER + PAGE_INSTANT + root->frame, i, + MLOG_2BYTES, mtr); + break; + } + + if (index.table->instant) { + mlog_memset(root, infimum - root->frame, 8, 0, mtr); + mlog_memset(root, supremum - root->frame, 7, 0, mtr); + mlog_write_ulint(&supremum[7], index.n_core_null_bytes, + MLOG_1BYTE, mtr); + } +} + /*************************************************************//** Makes tree one level higher by splitting the root, and inserts the tuple. It is assumed that mtr contains an x-latch on the tree. @@ -1892,8 +1951,23 @@ btr_root_raise_and_insert( btr_page_create(new_block, new_page_zip, index, level, mtr); /* Set the next node and previous node fields of new page */ - btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr); - btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr); + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); +#if MYSQL_VERSION_ID < 100500 + if (UNIV_LIKELY_NULL(new_page_zip)) { + /* Avoid tripping the ut_a() in mlog_parse_nbytes() + when crash-downgrading to an earlier MariaDB 10.4 version. 
*/ + btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr); + btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr); + } else { + mlog_memset(new_block, FIL_PAGE_PREV, 8, 0xff, mtr); + } +#else + mlog_memset(new_block, FIL_PAGE_PREV, 8, 0xff, mtr); + if (UNIV_LIKELY_NULL(new_page_zip)) { + memset(new_page_zip->data + FIL_PAGE_PREV, 0xff, 8); + } +#endif /* Copy the records from root to the new page one by one. */ @@ -1994,11 +2068,7 @@ btr_root_raise_and_insert( if (index->is_instant()) { ut_ad(!root_page_zip); - byte* page_type = root_block->frame + FIL_PAGE_TYPE; - ut_ad(mach_read_from_2(page_type) == FIL_PAGE_INDEX); - mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT, - MLOG_2BYTES, mtr); - page_set_instant(root_block->frame, index->n_core_fields, mtr); + btr_set_instant(root_block, *index, mtr); } ut_ad(!page_has_siblings(root)); @@ -2068,7 +2138,7 @@ rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor) || cursor->index->is_instant() || !(rec_get_info_bits(page_rec_get_next_const( page_get_infimum_rec(page)), - dict_table_is_comp(cursor->index->table)) + cursor->index->table->not_redundant()) & REC_INFO_MIN_REC_FLAG)); const rec_t* infimum = page_get_infimum_rec(page); @@ -2511,12 +2581,12 @@ btr_attach_half_pages( /* for consistency, both blocks should be locked, before change */ if (prev_page_no != FIL_NULL && direction == FSP_DOWN) { prev_block = btr_block_get( - page_id_t(space, prev_page_no), block->page.size, + page_id_t(space, prev_page_no), block->zip_size(), RW_X_LATCH, index, mtr); } if (next_page_no != FIL_NULL && direction != FSP_DOWN) { next_block = btr_block_get( - page_id_t(space, next_page_no), block->page.size, + page_id_t(space, next_page_no), block->zip_size(), RW_X_LATCH, index, mtr); } @@ -2666,7 +2736,7 @@ btr_insert_into_right_sibling( const ulint space = block->page.id.space(); next_block = btr_block_get( - page_id_t(space, next_page_no), block->page.size, + page_id_t(space, next_page_no), block->zip_size(), RW_X_LATCH, 
cursor->index, mtr); next_page = buf_block_get_frame(next_block); @@ -2692,7 +2762,7 @@ btr_insert_into_right_sibling( if (rec == NULL) { if (is_leaf - && next_block->page.size.is_compressed() + && next_block->page.zip.ssize && !dict_index_is_clust(cursor->index) && !cursor->index->table->is_temporary()) { /* Reset the IBUF_BITMAP_FREE bits, because @@ -2740,7 +2810,7 @@ btr_insert_into_right_sibling( /* Update the free bits of the B-tree page in the insert buffer bitmap. */ - if (next_block->page.size.is_compressed()) { + if (next_block->page.zip.ssize) { ibuf_update_free_bits_zip(next_block, mtr); } else { ibuf_update_free_bits_if_full( @@ -3153,16 +3223,16 @@ func_exit: return(rec); } -/** Removes a page from the level list of pages. +/** Remove a page from the level list of pages. @param[in] space space where removed -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] page page to remove @param[in] index index tree @param[in,out] mtr mini-transaction */ void btr_level_list_remove_func( ulint space, - const page_size_t& page_size, + ulint zip_size, page_t* page, dict_index_t* index, mtr_t* mtr) @@ -3181,7 +3251,7 @@ btr_level_list_remove_func( if (prev_page_no != FIL_NULL) { buf_block_t* prev_block = btr_block_get(page_id_t(space, prev_page_no), - page_size, RW_X_LATCH, index, mtr); + zip_size, RW_X_LATCH, index, mtr); page_t* prev_page = buf_block_get_frame(prev_block); @@ -3199,7 +3269,7 @@ btr_level_list_remove_func( if (next_page_no != FIL_NULL) { buf_block_t* next_block = btr_block_get( - page_id_t(space, next_page_no), page_size, + page_id_t(space, next_page_no), zip_size, RW_X_LATCH, index, mtr); page_t* next_page @@ -3404,12 +3474,7 @@ btr_lift_page_up( if (index->is_instant() && father_block->page.id.page_no() == root_page_no) { ut_ad(!father_page_zip); - byte* page_type = father_block->frame + FIL_PAGE_TYPE; - ut_ad(mach_read_from_2(page_type) == FIL_PAGE_INDEX); - mlog_write_ulint(page_type, 
FIL_PAGE_TYPE_INSTANT, - MLOG_2BYTES, mtr); - page_set_instant(father_block->frame, - index->n_core_fields, mtr); + btr_set_instant(father_block, *index, mtr); } page_level++; @@ -3546,7 +3611,7 @@ btr_compress( ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS); @@ -3704,7 +3769,7 @@ retry: /* Remove the page from the level list */ btr_level_list_remove(index->table->space_id, - page_size, page, index, mtr); + zip_size, page, index, mtr); if (dict_index_is_spatial(index)) { rec_t* my_rec = father_cursor.page_cur.rec; @@ -3834,7 +3899,7 @@ retry: /* Remove the page from the level list */ btr_level_list_remove(index->table->space_id, - page_size, page, index, mtr); + zip_size, page, index, mtr); ut_ad(btr_node_ptr_get_child_page_no( btr_cur_get_rec(&father_cursor), offsets) @@ -3942,7 +4007,7 @@ retry: committed mini-transaction, because in crash recovery, the free bits could momentarily be set too high. */ - if (page_size.is_compressed()) { + if (zip_size) { /* Because the free bits may be incremented and we cannot update the insert buffer bitmap in the same mini-transaction, the only safe @@ -4002,7 +4067,7 @@ func_exit: err_exit: /* We play it safe and reset the free bits. 
*/ - if (page_size.is_compressed() + if (zip_size && merge_page && page_is_leaf(merge_page) && !dict_index_is_clust(index)) { @@ -4085,15 +4150,42 @@ btr_discard_only_page_on_level( } #endif /* UNIV_BTR_DEBUG */ + mem_heap_t* heap = NULL; + const rec_t* rec = NULL; + rec_offs* offsets = NULL; + if (index->table->instant) { + const rec_t* r = page_rec_get_next(page_get_infimum_rec( + block->frame)); + ut_ad(rec_is_metadata(r, *index) == index->is_instant()); + if (rec_is_alter_metadata(r, *index)) { + heap = mem_heap_create(srv_page_size); + offsets = rec_get_offsets(r, index, NULL, true, + ULINT_UNDEFINED, &heap); + rec = rec_copy(mem_heap_alloc(heap, + rec_offs_size(offsets)), + r, offsets); + rec_offs_make_valid(rec, index, true, offsets); + } + } + btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); ut_ad(page_is_leaf(buf_block_get_frame(block))); /* btr_page_empty() is supposed to zero-initialize the field. */ ut_ad(!page_get_instant(block->frame)); if (index->is_primary()) { - /* Concurrent access is prevented by the root_block->lock - X-latch, so this should be safe. 
*/ - index->remove_instant(); + if (rec) { + DBUG_ASSERT(index->table->instant); + DBUG_ASSERT(rec_is_alter_metadata(rec, *index)); + btr_set_instant(block, *index, mtr); + rec = page_cur_insert_rec_low( + page_get_infimum_rec(block->frame), + index, rec, offsets, mtr); + ut_ad(rec); + mem_heap_free(heap); + } else if (index->is_instant()) { + index->clear_instant_add(); + } } else if (!index->table->is_temporary()) { /* We play it safe and reset the free bits for the root */ ibuf_reset_free_bits(block); @@ -4149,12 +4241,12 @@ btr_discard_page( left_page_no = btr_page_get_prev(buf_block_get_frame(block)); right_page_no = btr_page_get_next(buf_block_get_frame(block)); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); ut_d(bool parent_is_different = false); if (left_page_no != FIL_NULL) { merge_block = btr_block_get( page_id_t(index->table->space_id, left_page_no), - page_size, RW_X_LATCH, index, mtr); + zip_size, RW_X_LATCH, index, mtr); merge_page = buf_block_get_frame(merge_block); #ifdef UNIV_BTR_DEBUG @@ -4170,7 +4262,7 @@ btr_discard_page( } else if (right_page_no != FIL_NULL) { merge_block = btr_block_get( page_id_t(index->table->space_id, right_page_no), - page_size, RW_X_LATCH, index, mtr); + zip_size, RW_X_LATCH, index, mtr); merge_page = buf_block_get_frame(merge_block); #ifdef UNIV_BTR_DEBUG @@ -4212,7 +4304,7 @@ btr_discard_page( } /* Remove the page from the level list */ - btr_level_list_remove(index->table->space_id, page_size, + btr_level_list_remove(index->table->space_id, zip_size, page, index, mtr); #ifdef UNIV_ZIP_DEBUG @@ -4387,7 +4479,7 @@ btr_print_index( mtr_commit(&mtr); - ut_ad(btr_validate_index(index, 0, false)); + ut_ad(btr_validate_index(index, 0)); } #endif /* UNIV_BTR_PRINT */ @@ -4512,14 +4604,32 @@ btr_index_rec_validate( return(FALSE); } + const bool is_alter_metadata = page_is_leaf(page) + && !page_has_prev(page) + && index->is_primary() && index->table->instant + 
&& rec == page_rec_get_next_const(page_get_infimum_rec(page)); + + if (is_alter_metadata + && !rec_is_alter_metadata(rec, page_is_comp(page))) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "First record is not ALTER TABLE metadata"; + return FALSE; + } + if (!page_is_comp(page)) { const ulint n_rec_fields = rec_get_n_fields_old(rec); if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD && index->id == DICT_INDEXES_ID) { /* A record for older SYS_INDEXES table (missing merge_threshold column) is acceptable. */ + } else if (is_alter_metadata) { + if (n_rec_fields != ulint(index->n_fields) + 1) { + goto n_field_mismatch; + } } else if (n_rec_fields < index->n_core_fields || n_rec_fields > index->n_fields) { +n_field_mismatch: btr_index_rec_validate_report(page, rec, index); ib::error() << "Has " << rec_get_n_fields_old(rec) @@ -4538,15 +4648,28 @@ btr_index_rec_validate( offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page), ULINT_UNDEFINED, &heap); + const dict_field_t* field = index->fields; + ut_ad(rec_offs_n_fields(offsets) + == ulint(index->n_fields) + is_alter_metadata); - for (unsigned i = 0; i < index->n_fields; i++) { - dict_field_t* field = dict_index_get_nth_field(index, i); - ulint fixed_size = dict_col_get_fixed_size( - dict_field_get_col(field), - page_is_comp(page)); - + for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) { rec_get_nth_field_offs(offsets, i, &len); + ulint fixed_size; + + if (is_alter_metadata && i == index->first_user_field()) { + fixed_size = FIELD_REF_SIZE; + if (len != FIELD_REF_SIZE + || !rec_offs_nth_extern(offsets, i)) { + goto len_mismatch; + } + + continue; + } else { + fixed_size = dict_col_get_fixed_size( + field->col, page_is_comp(page)); + } + /* Note that if fixed_size != 0, it equals the length of a fixed-size column in the clustered index. We should adjust it here. @@ -4558,8 +4681,8 @@ btr_index_rec_validate( && (field->prefix_len ? 
len > field->prefix_len : (fixed_size && len != fixed_size))) { +len_mismatch: btr_index_rec_validate_report(page, rec, index); - ib::error error; error << "Field " << i << " len is " << len @@ -4577,6 +4700,8 @@ btr_index_rec_validate( } return(FALSE); } + + field++; } #ifdef VIRTUAL_INDEX_DEBUG @@ -4742,19 +4867,7 @@ btr_validate_level( page = buf_block_get_frame(block); fil_space_t* space = index->table->space; - const page_size_t table_page_size( - dict_table_page_size(index->table)); - const page_size_t space_page_size(space->flags); - - if (!table_page_size.equals_to(space_page_size)) { - - ib::warn() << "Flags mismatch: table=" << index->table->flags - << ", tablespace=" << space->flags; - - mtr_commit(&mtr); - - return(false); - } + const ulint zip_size = space->zip_size(); while (level != btr_page_get_level(page)) { const rec_t* node_ptr; @@ -4807,7 +4920,7 @@ btr_validate_level( block = btr_block_get( page_id_t(index->table->space_id, left_page_no), - table_page_size, + zip_size, RW_SX_LATCH, index, &mtr); page = buf_block_get_frame(block); left_page_no = btr_page_get_prev(page); @@ -4878,7 +4991,7 @@ loop: right_block = btr_block_get( page_id_t(index->table->space_id, right_page_no), - table_page_size, + zip_size, RW_SX_LATCH, index, &mtr); right_page = buf_block_get_frame(right_block); @@ -5054,13 +5167,13 @@ loop: btr_block_get( page_id_t(index->table->space_id, parent_right_page_no), - table_page_size, + zip_size, RW_SX_LATCH, index, &mtr); right_block = btr_block_get( page_id_t(index->table->space_id, right_page_no), - table_page_size, + zip_size, RW_SX_LATCH, index, &mtr); } @@ -5138,21 +5251,21 @@ node_ptr_fails: page_id_t( index->table->space_id, parent_right_page_no), - table_page_size, + zip_size, RW_SX_LATCH, index, &mtr); } } else if (parent_page_no != FIL_NULL) { btr_block_get( page_id_t(index->table->space_id, parent_page_no), - table_page_size, + zip_size, RW_SX_LATCH, index, &mtr); } } block = btr_block_get( 
page_id_t(index->table->space_id, right_page_no), - table_page_size, + zip_size, RW_SX_LATCH, index, &mtr); page = buf_block_get_frame(block); @@ -5166,57 +5279,16 @@ node_ptr_fails: } /**************************************************************//** -Do an index level validation of spaital index tree. -@return true if no error found */ -static -bool -btr_validate_spatial_index( -/*=======================*/ - dict_index_t* index, /*!< in: index */ - const trx_t* trx) /*!< in: transaction or NULL */ -{ - - mtr_t mtr; - bool ok = true; - - mtr.start(); - - mtr_x_lock_index(index, &mtr); - - page_t* root = btr_root_get(index, &mtr); - ulint n = btr_page_get_level(root); - -#ifdef UNIV_RTR_DEBUG - fprintf(stderr, "R-tree level is %lu\n", n); -#endif /* UNIV_RTR_DEBUG */ - - for (ulint i = 0; i <= n; ++i) { -#ifdef UNIV_RTR_DEBUG - fprintf(stderr, "Level %lu:\n", n - i); -#endif /* UNIV_RTR_DEBUG */ - - if (!btr_validate_level(index, trx, n - i, true)) { - ok = false; - break; - } - } - - mtr.commit(); - - return(ok); -} - -/**************************************************************//** Checks the consistency of an index tree. 
@return DB_SUCCESS if ok, error code if not */ dberr_t btr_validate_index( /*===============*/ dict_index_t* index, /*!< in: index */ - const trx_t* trx, /*!< in: transaction or NULL */ - bool lockout)/*!< in: true if X-latch index is intended */ + const trx_t* trx) /*!< in: transaction or NULL */ { dberr_t err = DB_SUCCESS; + bool lockout = dict_index_is_spatial(index); /* Full Text index are implemented by auxiliary tables, not the B-tree */ @@ -5224,13 +5296,6 @@ btr_validate_index( return(err); } - if (dict_index_is_spatial(index)) { - if(!btr_validate_spatial_index(index, trx)) { - err = DB_ERROR; - } - return(err); - } - mtr_t mtr; mtr_start(&mtr); @@ -5246,13 +5311,13 @@ btr_validate_index( page_t* root = btr_root_get(index, &mtr); if (!root) { - err = DB_CORRUPTION; mtr_commit(&mtr); - return err; + return DB_CORRUPTION; } ulint n = btr_page_get_level(root); + btr_validate_index_running++; for (ulint i = 0; i <= n; ++i) { if (!btr_validate_level(index, trx, n - i, lockout)) { @@ -5262,6 +5327,14 @@ btr_validate_index( } mtr_commit(&mtr); + /* In theory we need release barrier here, so that + btr_validate_index_running decrement is guaranteed to + happen after latches are released. + + Original code issued SEQ_CST on update and non-atomic + access on load. Which means it had broken synchronisation + as well. 
*/ + btr_validate_index_running--; return(err); } @@ -5298,9 +5371,9 @@ btr_can_merge_with_page( page = btr_cur_get_page(cursor); const page_id_t page_id(index->table->space_id, page_no); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); - mblock = btr_block_get(page_id, page_size, RW_X_LATCH, index, mtr); + mblock = btr_block_get(page_id, zip_size, RW_X_LATCH, index, mtr); mpage = buf_block_get_frame(mblock); n_recs = page_get_n_recs(page); @@ -5316,7 +5389,7 @@ btr_can_merge_with_page( /* If compression padding tells us that merging will result in too packed up page i.e.: which is likely to cause compression failure then don't merge the pages. */ - if (page_size.is_compressed() && page_is_leaf(mpage) + if (zip_size && page_is_leaf(mpage) && (page_get_data_size(mpage) + data_size >= dict_index_zip_pad_optimal_page_size(index))) { diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc index 6039a0b4216..5a0c069d218 100644 --- a/storage/innobase/btr/btr0bulk.cc +++ b/storage/innobase/btr/btr0bulk.cc @@ -94,7 +94,7 @@ PageBulk::init() if (new_page_zip) { page_create_zip(new_block, m_index, m_level, 0, - NULL, &m_mtr); + &m_mtr); memset(FIL_PAGE_PREV + new_page, 0xff, 8); page_zip_write_header(new_page_zip, FIL_PAGE_PREV + new_page, @@ -107,12 +107,12 @@ PageBulk::init() } else { ut_ad(!dict_index_is_spatial(m_index)); page_create(new_block, &m_mtr, - dict_table_is_comp(m_index->table), + m_index->table->not_redundant(), false); - mlog_write_ulint(FIL_PAGE_PREV + new_page, FIL_NULL, - MLOG_4BYTES, &m_mtr); - mlog_write_ulint(FIL_PAGE_NEXT + new_page, FIL_NULL, - MLOG_4BYTES, &m_mtr); + compile_time_assert(FIL_PAGE_NEXT + == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + mlog_memset(new_block, FIL_PAGE_PREV, 8, 0xff, &m_mtr); mlog_write_ulint(PAGE_HEADER + PAGE_LEVEL + new_page, m_level, MLOG_2BYTES, &m_mtr); mlog_write_ull(PAGE_HEADER + PAGE_INDEX_ID + 
new_page, @@ -121,7 +121,7 @@ PageBulk::init() } else { new_block = btr_block_get( page_id_t(m_index->table->space_id, m_page_no), - page_size_t(m_index->table->space->flags), + m_index->table->space->zip_size(), RW_X_LATCH, m_index, &m_mtr); new_page = buf_block_get_frame(new_block); @@ -374,7 +374,7 @@ PageBulk::compress() ut_ad(m_page_zip != NULL); return(page_zip_compress(m_page_zip, m_page, m_index, - page_zip_level, NULL, &m_mtr)); + page_zip_level, &m_mtr)); } /** Get node pointer @@ -589,8 +589,9 @@ PageBulk::needExt( const dtuple_t* tuple, ulint rec_size) { - return(page_zip_rec_needs_ext(rec_size, m_is_comp, - dtuple_get_n_fields(tuple), m_block->page.size)); + return page_zip_rec_needs_ext(rec_size, m_is_comp, + dtuple_get_n_fields(tuple), + m_block->zip_size()); } /** Store external record @@ -662,7 +663,7 @@ PageBulk::latch() __FILE__, __LINE__, &m_mtr)) { m_block = buf_page_get_gen(page_id_t(m_index->table->space_id, m_page_no), - univ_page_size, RW_X_LATCH, + 0, RW_X_LATCH, m_block, BUF_GET_IF_IN_POOL, __FILE__, __LINE__, &m_mtr, &m_err); @@ -1019,7 +1020,7 @@ BtrBulk::finish(dberr_t err) ut_ad(last_page_no != FIL_NULL); last_block = btr_block_get( page_id_t(m_index->table->space_id, last_page_no), - page_size_t(m_index->table->space->flags), + m_index->table->space->zip_size(), RW_X_LATCH, m_index, &mtr); first_rec = page_rec_get_next( page_get_infimum_rec(last_block->frame)); @@ -1048,6 +1049,6 @@ BtrBulk::finish(dberr_t err) ut_ad(!sync_check_iterate(dict_sync_check())); ut_ad(err != DB_SUCCESS - || btr_validate_index(m_index, NULL, false) == DB_SUCCESS); + || btr_validate_index(m_index, NULL) == DB_SUCCESS); return(err); } diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index e7650b20507..a51315a9c65 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -98,7 +98,7 @@ throughput clearly from about 100000. 
*/ #define BTR_CUR_FINE_HISTORY_LENGTH 100000 /** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */ -ulint btr_cur_n_non_sea; +Atomic_counter<ulint> btr_cur_n_non_sea; /** Old value of btr_cur_n_non_sea. Copied by srv_refresh_innodb_monitor_stats(). Referenced by srv_printf_innodb_monitor(). */ @@ -210,6 +210,7 @@ btr_rec_free_externally_stored_fields( /** Latches the leaf page or pages requested. @param[in] block leaf page where the search converged @param[in] page_id page id of the leaf +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] latch_mode BTR_SEARCH_LEAF, ... @param[in] cursor cursor @param[in] mtr mini-transaction @@ -218,7 +219,7 @@ btr_latch_leaves_t btr_cur_latch_leaves( buf_block_t* block, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint latch_mode, btr_cur_t* cursor, mtr_t* mtr) @@ -249,7 +250,7 @@ btr_cur_latch_leaves( mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH; latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); - get_block = btr_block_get(page_id, page_size, mode, + get_block = btr_block_get(page_id, zip_size, mode, cursor->index, mtr); latch_leaves.blocks[1] = get_block; #ifdef UNIV_BTR_DEBUG @@ -282,7 +283,7 @@ btr_cur_latch_leaves( latch_leaves.savepoints[0] = mtr_set_savepoint(mtr); get_block = btr_block_get( page_id_t(page_id.space(), left_page_no), - page_size, RW_X_LATCH, cursor->index, mtr); + zip_size, RW_X_LATCH, cursor->index, mtr); latch_leaves.blocks[0] = get_block; if (spatial) { @@ -298,7 +299,7 @@ btr_cur_latch_leaves( latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); get_block = btr_block_get( - page_id, page_size, RW_X_LATCH, cursor->index, mtr); + page_id, zip_size, RW_X_LATCH, cursor->index, mtr); latch_leaves.blocks[1] = get_block; #ifdef UNIV_BTR_DEBUG @@ -328,7 +329,7 @@ btr_cur_latch_leaves( latch_leaves.savepoints[2] = mtr_set_savepoint(mtr); get_block = btr_block_get( page_id_t(page_id.space(), right_page_no), - page_size, 
RW_X_LATCH, cursor->index, mtr); + zip_size, RW_X_LATCH, cursor->index, mtr); latch_leaves.blocks[2] = get_block; #ifdef UNIV_BTR_DEBUG ut_a(page_is_comp(get_block->frame) @@ -356,7 +357,7 @@ btr_cur_latch_leaves( latch_leaves.savepoints[0] = mtr_set_savepoint(mtr); get_block = btr_block_get( page_id_t(page_id.space(), left_page_no), - page_size, mode, cursor->index, mtr); + zip_size, mode, cursor->index, mtr); latch_leaves.blocks[0] = get_block; cursor->left_block = get_block; #ifdef UNIV_BTR_DEBUG @@ -368,7 +369,7 @@ btr_cur_latch_leaves( } latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); - get_block = btr_block_get(page_id, page_size, mode, + get_block = btr_block_get(page_id, zip_size, mode, cursor->index, mtr); latch_leaves.blocks[1] = get_block; #ifdef UNIV_BTR_DEBUG @@ -421,8 +422,12 @@ unreadable: } btr_cur_t cur; + /* Relax the assertion in rec_init_offsets(). */ + ut_ad(!index->in_instant_init); + ut_d(index->in_instant_init = true); dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF, &cur, 0, mtr); + ut_d(index->in_instant_init = false); if (err != DB_SUCCESS) { index->table->corrupted = true; return err; @@ -456,8 +461,8 @@ unreadable: return DB_CORRUPTION; } - if (info_bits != REC_INFO_MIN_REC_FLAG - || (comp && rec_get_status(rec) != REC_STATUS_COLUMNS_ADDED)) { + if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG + || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) { incompatible: ib::error() << "Table " << index->table->name << " contains unrecognizable instant ALTER metadata"; @@ -475,6 +480,117 @@ incompatible: concurrent operations on the table, including table eviction from the cache. */ + if (info_bits & REC_INFO_DELETED_FLAG) { + /* This metadata record includes a BLOB that identifies + any dropped or reordered columns. */ + ulint trx_id_offset = index->trx_id_offset; + /* If !index->trx_id_offset, the PRIMARY KEY contains + variable-length columns. 
For the metadata record, + variable-length columns should be written with zero + length. However, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of type + CHAR, we wrote more than zero bytes. That is why we + must determine the actual length of each PRIMARY KEY + column. The DB_TRX_ID will start right after any + PRIMARY KEY columns. */ + ut_ad(index->n_uniq); + + /* We cannot invoke rec_get_offsets() before + index->table->deserialise_columns(). Therefore, + we must duplicate some logic here. */ + if (trx_id_offset) { + } else if (index->table->not_redundant()) { + /* The PRIMARY KEY contains variable-length columns. + For the metadata record, variable-length columns are + always written with zero length. The DB_TRX_ID will + start right after any fixed-length columns. */ + + /* OK, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of + type CHAR, we wrote more than zero bytes. In + order to allow affected tables to be accessed, + it would be nice to determine the actual + length of each PRIMARY KEY column. However, to + be able to do that, we should determine the + size of the null-bit bitmap in the metadata + record. And we cannot know that before reading + the metadata BLOB, whose starting point we are + trying to find here. (Although the PRIMARY KEY + columns cannot be NULL, we would have to know + where the lengths of variable-length PRIMARY KEY + columns start.) + + So, unfortunately we cannot help users who + were affected by MDEV-21088 on a ROW_FORMAT=COMPACT + or ROW_FORMAT=DYNAMIC table. 
*/ + + for (uint i = index->n_uniq; i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else if (rec_get_1byte_offs_flag(rec)) { + trx_id_offset = rec_1_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK; + } else { + trx_id_offset = rec_2_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK; + } + + const byte* ptr = rec + trx_id_offset + + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) { + goto incompatible; + } + + uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4); + if (!len + || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) + != FIL_PAGE_DATA + || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + != space->id) { + goto incompatible; + } + + buf_block_t* block = buf_page_get( + page_id_t(space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB + || mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + != FIL_NULL + || mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + != len) { + goto incompatible; + } + + /* The unused part of the BLOB page should be zero-filled. */ + for (const byte* b = block->frame + + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len, + * const end = block->frame + srv_page_size + - BTR_EXTERN_LEN; + b < end; ) { + if (*b++) { + goto incompatible; + } + } + + if (index->table->deserialise_columns( + &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], + len)) { + goto incompatible; + } + + /* Proceed to initialize the default values of + any instantly added columns. 
*/ + } + mem_heap_t* heap = NULL; rec_offs* offsets = rec_get_offsets(rec, index, NULL, true, ULINT_UNDEFINED, &heap); @@ -488,7 +604,8 @@ inconsistent: record, it is also OK to perform READ UNCOMMITTED and then ignore any extra fields, provided that trx_sys.is_registered(DB_TRX_ID). */ - if (rec_offs_n_fields(offsets) > index->n_fields + if (rec_offs_n_fields(offsets) + > ulint(index->n_fields) + !!index->table->instant && !trx_sys.is_registered(current_trx(), row_get_rec_trx_id(rec, index, offsets))) { @@ -496,10 +613,11 @@ inconsistent: } for (unsigned i = index->n_core_fields; i < index->n_fields; i++) { - ulint len; - const byte* data = rec_get_nth_field(rec, offsets, i, &len); dict_col_t* col = index->fields[i].col; - ut_ad(!col->is_instant()); + const unsigned o = i + !!index->table->instant; + ulint len; + const byte* data = rec_get_nth_field(rec, offsets, o, &len); + ut_ad(!col->is_added()); ut_ad(!col->def_val.data); col->def_val.len = len; switch (len) { @@ -510,7 +628,7 @@ inconsistent: continue; } ut_ad(len != UNIV_SQL_DEFAULT); - if (!rec_offs_nth_extern(offsets, i)) { + if (!rec_offs_nth_extern(offsets, o)) { col->def_val.data = mem_heap_dup( index->table->heap, data, len); } else if (len < BTR_EXTERN_FIELD_REF_SIZE @@ -522,7 +640,7 @@ inconsistent: } else { col->def_val.data = btr_copy_externally_stored_field( &col->def_val.len, data, - dict_table_page_size(index->table), + cur.page_cur.block->zip_size(), len, index->table->heap); } } @@ -591,30 +709,49 @@ bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page) const uint16_t n = page_get_instant(page); - if (n < index->n_uniq + DATA_ROLL_PTR || n > index->n_fields) { + if (n < index->n_uniq + DATA_ROLL_PTR) { /* The PRIMARY KEY (or hidden DB_ROW_ID) and DB_TRX_ID,DB_ROLL_PTR columns must always be present - as 'core' fields. All fields, including those for - instantly added columns, must be present in the data - dictionary. */ + as 'core' fields. 
*/ return true; } - if (memcmp(page_get_infimum_rec(page), "infimum", 8) - || memcmp(page_get_supremum_rec(page), "supremum", 8)) { - /* In a later format, these fields in a FIL_PAGE_TYPE_INSTANT - root page could be repurposed for something else. */ + if (n > REC_MAX_N_FIELDS) { return true; } index->n_core_fields = n; - ut_ad(!index->is_dummy); - ut_d(index->is_dummy = true); - index->n_core_null_bytes = n == index->n_fields - ? UT_BITS_IN_BYTES(unsigned(index->n_nullable)) - : UT_BITS_IN_BYTES(index->get_n_nullable(n)); - ut_d(index->is_dummy = false); - return false; + + const rec_t* infimum = page_get_infimum_rec(page); + const rec_t* supremum = page_get_supremum_rec(page); + + if (!memcmp(infimum, "infimum", 8) + && !memcmp(supremum, "supremum", 8)) { + if (n > index->n_fields) { + /* All fields, including those for instantly + added columns, must be present in the + data dictionary. */ + return true; + } + + ut_ad(!index->is_dummy); + ut_d(index->is_dummy = true); + index->n_core_null_bytes = UT_BITS_IN_BYTES( + index->get_n_nullable(n)); + ut_d(index->is_dummy = false); + return false; + } + + if (memcmp(infimum, field_ref_zero, 8) + || memcmp(supremum, field_ref_zero, 7)) { + /* The infimum and supremum records must either contain + the original strings, or they must be filled with zero + bytes, except for the bytes that we have repurposed. */ + return true; + } + + index->n_core_null_bytes = supremum[7]; + return index->n_core_null_bytes > 128; } /** Optimistically latches the leaf page or pages requested. 
@@ -672,8 +809,7 @@ btr_cur_optimistic_latch_leaves( cursor->left_block = btr_block_get( page_id_t(cursor->index->table->space_id, left_page_no), - page_size_t(cursor->index->table->space - ->flags), + cursor->index->table->space->zip_size(), mode, cursor->index, mtr); } else { cursor->left_block = NULL; @@ -774,7 +910,7 @@ btr_cur_latch_for_root_leaf( @param[in] lock_intention lock intention for the tree operation @param[in] rec record (current node_ptr) @param[in] rec_size size of the record or max size of node_ptr -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] mtr mtr @return true if tree modification is needed */ static @@ -785,7 +921,7 @@ btr_cur_will_modify_tree( btr_intention_t lock_intention, const rec_t* rec, ulint rec_size, - const page_size_t& page_size, + ulint zip_size, mtr_t* mtr) { ut_ad(!page_is_leaf(page)); @@ -893,9 +1029,8 @@ btr_cur_will_modify_tree( This is based on the worst case, and we could invoke page_zip_available() on the block->page.zip. */ /* needs 2 records' space also for worst compress rate. */ - if (page_size.is_compressed() - && page_zip_empty_size(index->n_fields, - page_size.physical()) + if (zip_size + && page_zip_empty_size(index->n_fields, zip_size) <= rec_size * 2 + page_get_data_size(page) + page_dir_calc_reserved_space(n_recs + 2)) { return(true); @@ -1313,7 +1448,7 @@ btr_cur_search_to_nth_level_func( } # endif /* BTR_CUR_HASH_ADAPT */ #endif /* BTR_CUR_ADAPT */ - my_atomic_addlint(&btr_cur_n_non_sea, 1); + btr_cur_n_non_sea++; /* If the hash search did not succeed, do binary search down the tree */ @@ -1336,7 +1471,7 @@ btr_cur_search_to_nth_level_func( Free blocks and read IO bandwidth should be prior for them, when the history list is glowing huge. 
*/ if (lock_intention == BTR_INTENTION_DELETE - && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { x_latch_index: mtr_x_lock_index(index, mtr); @@ -1393,7 +1528,7 @@ x_latch_index: page_cursor = btr_cur_get_page_cur(cursor); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); /* Start with the root page. */ page_id_t page_id(index->table->space_id, index->page); @@ -1476,7 +1611,7 @@ search_loop: retry_page_get: ut_ad(n_blocks < BTR_MAX_LEVELS); tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); - block = buf_page_get_gen(page_id, page_size, rw_latch, guess, + block = buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, file, line, mtr, &err); tree_blocks[n_blocks] = block; @@ -1512,7 +1647,7 @@ retry_page_get: ut_ad(!dict_index_is_spatial(index)); if (ibuf_insert(IBUF_OP_INSERT, tuple, index, - page_id, page_size, cursor->thr)) { + page_id, zip_size, cursor->thr)) { cursor->flag = BTR_CUR_INSERT_TO_IBUF; @@ -1525,7 +1660,7 @@ retry_page_get: ut_ad(!dict_index_is_spatial(index)); if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, - index, page_id, page_size, + index, page_id, zip_size, cursor->thr)) { cursor->flag = BTR_CUR_DEL_MARK_IBUF; @@ -1545,7 +1680,7 @@ retry_page_get: /* The record cannot be purged yet. */ cursor->flag = BTR_CUR_DELETE_REF; } else if (ibuf_insert(IBUF_OP_DELETE, tuple, - index, page_id, page_size, + index, page_id, zip_size, cursor->thr)) { /* The purge was buffered. 
*/ @@ -1591,7 +1726,7 @@ retry_page_get: = mtr_set_savepoint(mtr); get_block = buf_page_get_gen( page_id_t(page_id.space(), left_page_no), - page_size, rw_latch, NULL, buf_mode, + zip_size, rw_latch, NULL, buf_mode, file, line, mtr, &err); prev_tree_blocks[prev_n_blocks] = get_block; prev_n_blocks++; @@ -1621,7 +1756,7 @@ retry_page_get: tree_blocks[n_blocks]); tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); - block = buf_page_get_gen(page_id, page_size, rw_latch, NULL, + block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL, buf_mode, file, line, mtr, &err); tree_blocks[n_blocks] = block; @@ -1688,17 +1823,11 @@ retry_page_get: if (dict_index_is_spatial(index)) { ut_ad(cursor->rtr_info); - node_seq_t seq_no = rtr_get_current_ssn_id(index); - /* If SSN in memory is not initialized, fetch it from root page */ - if (seq_no < 1) { - node_seq_t root_seq_no; - - root_seq_no = page_get_ssn_id(page); - my_atomic_store32_explicit( - &index->rtr_ssn, root_seq_no + 1, - MY_MEMORY_ORDER_RELAXED); + if (!rtr_get_current_ssn_id(index)) { + /* FIXME: do this in dict_load_table_one() */ + index->set_ssn(page_get_ssn_id(page) + 1); } /* Save the MBR */ @@ -1718,7 +1847,7 @@ retry_page_get: if (rw_latch == RW_NO_LATCH) { latch_leaves = btr_cur_latch_leaves( - block, page_id, page_size, latch_mode, + block, page_id, zip_size, latch_mode, cursor, mtr); } @@ -2082,7 +2211,7 @@ need_opposite_intention: && latch_mode == BTR_MODIFY_TREE && !btr_cur_will_modify_tree( index, page, lock_intention, node_ptr, - node_ptr_max_size, page_size, mtr) + node_ptr_max_size, zip_size, mtr) && !rtree_parent_modified) { ut_ad(upper_rw_latch == RW_X_LATCH); ut_ad(n_releases <= n_blocks); @@ -2280,12 +2409,12 @@ need_opposite_intention: if (latch_mode == BTR_CONT_MODIFY_TREE) { child_block = btr_block_get( - page_id, page_size, RW_X_LATCH, + page_id, zip_size, RW_X_LATCH, index, mtr); } else { ut_ad(latch_mode == BTR_CONT_SEARCH_TREE); child_block = btr_block_get( - page_id, page_size, 
RW_SX_LATCH, + page_id, zip_size, RW_SX_LATCH, index, mtr); } @@ -2340,9 +2469,10 @@ need_opposite_intention: ut_ad(index->is_instant()); /* This may be a search tuple for btr_pcur_restore_position(). */ - ut_ad(tuple->info_bits == REC_INFO_METADATA - || tuple->info_bits == REC_INFO_MIN_REC_FLAG); - } else if (rec_is_metadata(btr_cur_get_rec(cursor), index)) { + ut_ad(tuple->is_metadata() + || (tuple->is_metadata(tuple->info_bits + ^ REC_STATUS_INSTANT))); + } else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) { /* Only user records belong in the adaptive hash index. */ } else { @@ -2469,7 +2599,7 @@ btr_cur_open_at_index_side_func( Free blocks and read IO bandwidth should be prior for them, when the history list is glowing huge. */ if (lock_intention == BTR_INTENTION_DELETE - && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { mtr_x_lock_index(index, mtr); } else { @@ -2502,7 +2632,7 @@ btr_cur_open_at_index_side_func( cursor->index = index; page_id_t page_id(index->table->space_id, index->page); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); if (root_leaf_rw_latch == RW_X_LATCH) { node_ptr_max_size = btr_node_ptr_max_size(index); @@ -2525,7 +2655,7 @@ btr_cur_open_at_index_side_func( } tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); - block = buf_page_get_gen(page_id, page_size, rw_latch, NULL, + block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL, BUF_GET, file, line, mtr, &err); ut_ad((block != NULL) == (err == DB_SUCCESS)); tree_blocks[n_blocks] = block; @@ -2581,12 +2711,12 @@ btr_cur_open_at_index_side_func( if (height == level) { if (srv_read_only_mode) { btr_cur_latch_leaves( - block, page_id, page_size, + block, page_id, zip_size, latch_mode, cursor, mtr); } else if (height == 0) { if (rw_latch == RW_NO_LATCH) { btr_cur_latch_leaves( - block, page_id, page_size, + block, 
page_id, zip_size, latch_mode, cursor, mtr); } /* In versions <= 3.23.52 we had @@ -2717,7 +2847,7 @@ btr_cur_open_at_index_side_func( if (latch_mode == BTR_MODIFY_TREE && !btr_cur_will_modify_tree( cursor->index, page, lock_intention, node_ptr, - node_ptr_max_size, page_size, mtr)) { + node_ptr_max_size, zip_size, mtr)) { ut_ad(upper_rw_latch == RW_X_LATCH); ut_ad(n_releases <= n_blocks); @@ -2814,7 +2944,7 @@ btr_cur_open_at_rnd_pos_func( Free blocks and read IO bandwidth should be prior for them, when the history list is glowing huge. */ if (lock_intention == BTR_INTENTION_DELETE - && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { mtr_x_lock_index(index, mtr); } else { @@ -2859,7 +2989,7 @@ btr_cur_open_at_rnd_pos_func( cursor->index = index; page_id_t page_id(index->table->space_id, index->page); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); dberr_t err = DB_SUCCESS; if (root_leaf_rw_latch == RW_X_LATCH) { @@ -2883,7 +3013,7 @@ btr_cur_open_at_rnd_pos_func( } tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); - block = buf_page_get_gen(page_id, page_size, rw_latch, NULL, + block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL, BUF_GET, file, line, mtr, &err); tree_blocks[n_blocks] = block; @@ -2936,7 +3066,7 @@ btr_cur_open_at_rnd_pos_func( if (rw_latch == RW_NO_LATCH || srv_read_only_mode) { btr_cur_latch_leaves( - block, page_id, page_size, + block, page_id, zip_size, latch_mode, cursor, mtr); } @@ -3012,7 +3142,7 @@ btr_cur_open_at_rnd_pos_func( if (latch_mode == BTR_MODIFY_TREE && !btr_cur_will_modify_tree( cursor->index, page, lock_intention, node_ptr, - node_ptr_max_size, page_size, mtr)) { + node_ptr_max_size, zip_size, mtr)) { ut_ad(upper_rw_latch == RW_X_LATCH); ut_ad(n_releases <= n_blocks); @@ -3188,8 +3318,11 @@ btr_cur_ins_lock_and_undo( roll_ptr = roll_ptr_t(1) << 
ROLL_PTR_INSERT_FLAG_POS; if (!(flags & BTR_KEEP_SYS_FLAG)) { upd_sys: - row_upd_index_entry_sys_field(entry, index, - DATA_ROLL_PTR, roll_ptr); + dfield_t* r = dtuple_get_nth_field( + entry, index->db_roll_ptr()); + ut_ad(r->len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(static_cast<byte*>(r->data), + roll_ptr); } } else { err = trx_undo_report_row_operation(thr, index, entry, @@ -3221,12 +3354,12 @@ btr_cur_prefetch_siblings( if (left_page_no != FIL_NULL) { buf_read_page_background( page_id_t(block->page.id.space(), left_page_no), - block->page.size, false); + block->zip_size(), false); } if (right_page_no != FIL_NULL) { buf_read_page_background( page_id_t(block->page.id.space(), right_page_no), - block->page.size, false); + block->zip_size(), false); } if (left_page_no != FIL_NULL || right_page_no != FIL_NULL) { @@ -3293,23 +3426,27 @@ btr_cur_optimistic_insert( || (flags & BTR_CREATE_FLAG)); ut_ad(dtuple_check_typed(entry)); - const page_size_t& page_size = block->page.size; - #ifdef HAVE_valgrind_or_MSAN - if (page_size.is_compressed()) { - MEM_CHECK_DEFINED(page, page_size.logical()); - MEM_CHECK_DEFINED(block->page.zip.data, page_size.physical()); + if (block->page.zip.data) { + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size()); } #endif /* HAVE_valgrind_or_MSAN */ leaf = page_is_leaf(page); + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + ut_ad(leaf); + goto convert_big_rec; + } + /* Calculate the record size when entry is converted to a record */ rec_size = rec_get_converted_size(index, entry, n_ext); if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), - dtuple_get_n_fields(entry), page_size)) { - + dtuple_get_n_fields(entry), + block->zip_size())) { +convert_big_rec: /* The record is so big that we have to store some fields externally on separate database pages */ big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); @@ -3322,7 +3459,7 @@ btr_cur_optimistic_insert( rec_size = 
rec_get_converted_size(index, entry, n_ext); } - if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) { + if (block->page.zip.data && page_zip_is_too_big(index, entry)) { if (big_rec_vec != NULL) { dtuple_convert_back_big_rec(index, entry, big_rec_vec); } @@ -3333,7 +3470,7 @@ btr_cur_optimistic_insert( LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail); - if (leaf && page_size.is_compressed() + if (block->page.zip.data && leaf && (page_get_data_size(page) + rec_size >= dict_index_zip_pad_optimal_page_size(index))) { /* If compression padding tells us that insertion will @@ -3376,7 +3513,7 @@ fail_err: we have to split the page to reserve enough free space for future updates of records. */ - if (leaf && !page_size.is_compressed() && dict_index_is_clust(index) + if (leaf && !block->page.zip.data && dict_index_is_clust(index) && page_get_n_recs(page) >= 2 && dict_index_get_space_reserve() + rec_size > max_size && (btr_page_get_split_rec_to_right(cursor, &dummy) @@ -3439,7 +3576,7 @@ fail_err: } if (*rec) { - } else if (page_size.is_compressed()) { + } else if (block->page.zip.data) { ut_ad(!index->table->is_temporary()); /* Reset the IBUF_BITMAP_FREE bits, because page_cur_tuple_insert() will have attempted page @@ -3480,7 +3617,7 @@ fail_err: } else if (index->disable_ahi) { # endif } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { - ut_ad(entry->info_bits == REC_INFO_METADATA); + ut_ad(entry->is_metadata()); ut_ad(index->is_instant()); ut_ad(flags == BTR_NO_LOCKING_FLAG); } else { @@ -3515,7 +3652,7 @@ fail_err: committed mini-transaction, because in crash recovery, the free bits could momentarily be set too high. */ - if (page_size.is_compressed()) { + if (block->page.zip.data) { /* Update the bits in the same mini-transaction. 
*/ ibuf_update_free_bits_zip(block, mtr); } else { @@ -3613,9 +3750,14 @@ btr_cur_pessimistic_insert( } if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), - dict_table_is_comp(index->table), + index->table->not_redundant(), dtuple_get_n_fields(entry), - dict_table_page_size(index->table))) { + btr_cur_get_block(cursor)->zip_size()) + || UNIV_UNLIKELY(entry->is_alter_metadata() + && !dfield_is_ext( + dtuple_get_nth_field( + entry, + index->first_user_field())))) { /* The record is so big that we have to store some fields externally on separate database pages */ @@ -3685,7 +3827,7 @@ btr_cur_pessimistic_insert( if (index->disable_ahi); else # endif if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { - ut_ad(entry->info_bits == REC_INFO_METADATA); + ut_ad(entry->is_metadata()); ut_ad(index->is_instant()); ut_ad(flags & BTR_NO_LOCKING_FLAG); ut_ad(!(flags & BTR_CREATE_FLAG)); @@ -3770,6 +3912,50 @@ btr_cur_upd_lock_and_undo( cmpl_info, rec, offsets, roll_ptr)); } +/** Copy DB_TRX_ID,DB_ROLL_PTR to the redo log. +@param[in] index clustered index +@param[in] trx_id_t DB_TRX_ID +@param[in] roll_ptr DB_ROLL_PTR +@param[in,out] log_ptr redo log buffer +@return current end of the redo log buffer */ +static byte* +btr_cur_log_sys( + const dict_index_t* index, + trx_id_t trx_id, + roll_ptr_t roll_ptr, + byte* log_ptr) +{ + log_ptr += mach_write_compressed(log_ptr, index->db_trx_id()); + /* Yes, we are writing DB_ROLL_PTR,DB_TRX_ID in reverse order, + after emitting the position of DB_TRX_ID in the index. + This is how row_upd_write_sys_vals_to_log() + originally worked, and it is part of the redo log format. */ + trx_write_roll_ptr(log_ptr, roll_ptr); + log_ptr += DATA_ROLL_PTR_LEN; + log_ptr += mach_u64_write_compressed(log_ptr, trx_id); + + return log_ptr; +} + +/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry. 
+@param[in,out] entry clustered index entry +@param[in] index clustered index +@param[in] trx_id DB_TRX_ID +@param[in] roll_ptr DB_ROLL_PTR */ +static void btr_cur_write_sys( + dtuple_t* entry, + const dict_index_t* index, + trx_id_t trx_id, + roll_ptr_t roll_ptr) +{ + dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id()); + ut_ad(t->len == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(t->data), trx_id); + dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr()); + ut_ad(r->len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr); +} + /***********************************************************//** Writes a redo log record of updating a record in-place. */ void @@ -3809,8 +3995,7 @@ btr_cur_update_in_place_log( log_ptr++; if (dict_index_is_clust(index)) { - log_ptr = row_upd_write_sys_vals_to_log( - index, trx_id, roll_ptr, log_ptr, mtr); + log_ptr = btr_cur_log_sys(index, trx_id, roll_ptr, log_ptr); } else { /* Dummy system fields for a secondary index */ /* TRX_ID Position */ @@ -4163,6 +4348,72 @@ func_exit: return(err); } +/** Trim a metadata record during the rollback of instant ALTER TABLE. 
+@param[in] entry metadata tuple +@param[in] index primary key +@param[in] update update vector for the rollback */ +ATTRIBUTE_COLD +static void btr_cur_trim_alter_metadata(dtuple_t* entry, + const dict_index_t* index, + const upd_t* update) +{ + ut_ad(index->is_instant()); + ut_ad(update->is_alter_metadata()); + ut_ad(entry->is_alter_metadata()); + + ut_ad(update->fields[0].field_no == index->first_user_field()); + ut_ad(update->fields[0].new_val.ext); + ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE); + ut_ad(entry->n_fields - 1 == index->n_fields); + + const byte* ptr = static_cast<const byte*>( + update->fields[0].new_val.data); + ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN)); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + == index->table->space->id); + + ulint n_fields = update->fields[1].field_no; + ut_ad(n_fields <= index->n_fields); + if (n_fields != index->n_uniq) { + ut_ad(n_fields + >= index->n_core_fields); + entry->n_fields = n_fields; + return; + } + + /* This is based on dict_table_t::deserialise_columns() + and btr_cur_instant_init_low(). */ + mtr_t mtr; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id_t(index->table->space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB); + ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + == FIL_NULL); + ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4)); + n_fields = mach_read_from_4( + &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE]) + + index->first_user_field(); + /* Rollback should not increase the number of fields. 
*/ + ut_ad(n_fields <= index->n_fields); + ut_ad(n_fields + 1 <= entry->n_fields); + /* dict_index_t::clear_instant_alter() cannot be invoked while + rollback of an instant ALTER TABLE transaction is in progress + for an is_alter_metadata() record. */ + ut_ad(n_fields >= index->n_core_fields); + + mtr.commit(); + entry->n_fields = n_fields + 1; +} + /** Trim an update tuple due to instant ADD COLUMN, if needed. For normal records, the trailing instantly added fields that match the initial default values are omitted. @@ -4184,13 +4435,12 @@ btr_cur_trim( const que_thr_t* thr) { if (!index->is_instant()) { - } else if (UNIV_UNLIKELY(update->info_bits == REC_INFO_METADATA)) { + } else if (UNIV_UNLIKELY(update->is_metadata())) { /* We are either updating a metadata record - (instantly adding columns to a table where instant ADD was + (instant ALTER TABLE on a table where instant ALTER was already executed) or rolling back such an operation. */ ut_ad(!upd_get_nth_field(update, 0)->orig_len); - ut_ad(upd_get_nth_field(update, 0)->field_no - > index->n_core_fields); + ut_ad(entry->is_metadata()); if (thr->graph->trx->in_rollback) { /* This rollback can occur either as part of @@ -4207,6 +4457,13 @@ btr_cur_trim( first instantly added column logged by innobase_add_instant_try(). 
*/ ut_ad(update->n_fields > 2); + if (update->is_alter_metadata()) { + btr_cur_trim_alter_metadata( + entry, index, update); + return; + } + ut_ad(!entry->is_alter_metadata()); + ulint n_fields = upd_get_nth_field(update, 0) ->field_no; ut_ad(n_fields + 1 >= entry->n_fields); @@ -4291,9 +4548,7 @@ btr_cur_optimistic_update( || thr_get_trx(thr) == trx_roll_crash_recv_trx); #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - const bool is_metadata = update->info_bits == REC_INFO_METADATA; - - if (UNIV_LIKELY(!is_metadata) + if (UNIV_LIKELY(!update->is_metadata()) && !row_upd_changes_field_size_or_external(index, *offsets, update)) { @@ -4319,6 +4574,10 @@ any_extern: return(DB_OVERFLOW); } + if (rec_is_metadata(rec, *index) && index->table->instant) { + goto any_extern; + } + for (i = 0; i < upd_get_n_fields(update); i++) { if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { @@ -4361,7 +4620,7 @@ any_extern: if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page), dict_index_get_n_fields(index), - dict_table_page_size(index->table))) { + block->zip_size())) { goto any_extern; } @@ -4375,10 +4634,10 @@ any_extern: } /* We limit max record size to 16k even for 64k page size. 
*/ - if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE || - (!dict_table_is_comp(index->table) - && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) { - err = DB_OVERFLOW; + if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE || + (!dict_table_is_comp(index->table) + && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) { + err = DB_OVERFLOW; goto func_exit; } @@ -4451,8 +4710,8 @@ any_extern: lock_rec_store_on_page_infimum(block, rec); } - if (UNIV_UNLIKELY(is_metadata)) { - ut_ad(new_entry->info_bits == REC_INFO_METADATA); + if (UNIV_UNLIKELY(update->is_metadata())) { + ut_ad(new_entry->is_metadata()); ut_ad(index->is_instant()); /* This can be innobase_add_instant_try() performing a subsequent instant ADD COLUMN, or its rollback by @@ -4467,10 +4726,7 @@ any_extern: page_cur_move_to_prev(page_cursor); if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, - roll_ptr); - row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, - trx_id); + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); } /* There are no externally stored columns in new_entry */ @@ -4478,7 +4734,7 @@ any_extern: cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr); ut_a(rec); /* <- We calculated above the insert would fit */ - if (UNIV_UNLIKELY(is_metadata)) { + if (UNIV_UNLIKELY(update->is_metadata())) { /* We must empty the PAGE_FREE list, because if this was a rollback, the shortened metadata record would have too many fields, and we would be unable to @@ -4543,7 +4799,8 @@ btr_cur_pess_upd_restore_supremum( const page_id_t page_id(block->page.id.space(), prev_page_no); ut_ad(prev_page_no != FIL_NULL); - prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr); + prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(), + mtr); #ifdef UNIV_BTR_DEBUG ut_a(btr_page_get_next(prev_block->frame) == block->page.id.page_no()); #endif /* UNIV_BTR_DEBUG */ @@ -4667,8 +4924,24 @@ btr_cur_pessimistic_update( rec = 
btr_cur_get_rec(cursor); ut_ad(rec_offs_validate(rec, index, *offsets)); - dtuple_t* new_entry = row_rec_to_index_entry( - rec, index, *offsets, entry_heap); + dtuple_t* new_entry; + + const bool is_metadata = rec_is_metadata(rec, *index); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(update->is_metadata()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + new_entry = row_metadata_to_tuple( + rec, index, *offsets, entry_heap, + update->info_bits, !thr_get_trx(thr)->in_rollback); + ut_ad(new_entry->n_fields + == ulint(index->n_fields) + + update->is_alter_metadata()); + } else { + new_entry = row_rec_to_index_entry(rec, index, *offsets, + entry_heap); + } /* The page containing the clustered index record corresponding to new_entry is latched in mtr. If the @@ -4680,9 +4953,6 @@ btr_cur_pessimistic_update( entry_heap); btr_cur_trim(new_entry, index, update, thr); - const bool is_metadata = new_entry->info_bits - & REC_INFO_MIN_REC_FLAG; - /* We have to set appropriate extern storage bits in the new record to be inserted: we have to remember which fields were such */ @@ -4708,14 +4978,17 @@ btr_cur_pessimistic_update( index, rec, page_zip, *offsets, update, true, mtr); } - ulint n_ext = dtuple_get_n_ext(new_entry); + ulint n_ext = index->is_primary() ? 
dtuple_get_n_ext(new_entry) : 0; if (page_zip_rec_needs_ext( - rec_get_converted_size(index, new_entry, n_ext), - page_is_comp(page), - dict_index_get_n_fields(index), - block->page.size)) { - + rec_get_converted_size(index, new_entry, n_ext), + page_is_comp(page), + dict_index_get_n_fields(index), + block->zip_size()) + || (UNIV_UNLIKELY(update->is_alter_metadata()) + && !dfield_is_ext(dtuple_get_nth_field( + new_entry, + index->first_user_field())))) { big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext); if (UNIV_UNLIKELY(big_rec_vec == NULL)) { @@ -4764,10 +5037,7 @@ btr_cur_pessimistic_update( } if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, - roll_ptr); - row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, - trx_id); + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); } if (!page_zip) { @@ -4776,10 +5046,10 @@ btr_cur_pessimistic_update( } if (UNIV_UNLIKELY(is_metadata)) { - ut_ad(new_entry->info_bits == REC_INFO_METADATA); + ut_ad(new_entry->is_metadata()); ut_ad(index->is_instant()); /* This can be innobase_add_instant_try() performing a - subsequent instant ADD COLUMN, or its rollback by + subsequent instant ALTER TABLE, or its rollback by row_undo_mod_clust_low(). 
*/ ut_ad(flags & BTR_NO_LOCKING_FLAG); } else { @@ -4828,7 +5098,8 @@ btr_cur_pessimistic_update( btr_cur_get_block(cursor), rec, block); } - if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets)) + || rec_is_alter_metadata(rec, *index)) { /* The new inserted record owns its possible externally stored fields */ btr_cur_unmark_extern_fields( @@ -5038,8 +5309,7 @@ btr_cur_del_mark_set_clust_rec_log( *log_ptr++ = 0; *log_ptr++ = 1; - log_ptr = row_upd_write_sys_vals_to_log( - index, trx_id, roll_ptr, log_ptr, mtr); + log_ptr = btr_cur_log_sys(index, trx_id, roll_ptr, log_ptr); mach_write_to_2(log_ptr, page_offset(rec)); log_ptr += 2; @@ -5471,42 +5741,41 @@ btr_cur_optimistic_delete_func( if (UNIV_UNLIKELY(block->page.id.page_no() == cursor->index->page && page_get_n_recs(block->frame) == 1 + (cursor->index->is_instant() - && !rec_is_metadata(rec, cursor->index)))) { + && !rec_is_metadata(rec, *cursor->index)))) { /* The whole index (and table) becomes logically empty. Empty the whole page. That is, if we are deleting the only user record, also delete the metadata record - if one exists (it exists if and only if is_instant()). + if one exists for instant ADD COLUMN (not generic ALTER TABLE). If we are deleting the metadata record and the table becomes empty, clean up the whole page. */ dict_index_t* index = cursor->index; + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(block->frame)); ut_ad(!index->is_instant() - || rec_is_metadata( - page_rec_get_next_const( - page_get_infimum_rec(block->frame)), - index)); - if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec)) - & REC_INFO_MIN_REC_FLAG)) { - /* This should be rolling back instant ADD COLUMN. - If this is a recovered transaction, then - index->is_instant() will hold until the - insert into SYS_COLUMNS is rolled back. 
*/ - ut_ad(index->table->supports_instant()); - ut_ad(index->is_primary()); - } else { - lock_update_delete(block, rec); - } - btr_page_empty(block, buf_block_get_page_zip(block), - index, 0, mtr); - page_cur_set_after_last(block, btr_cur_get_page_cur(cursor)); - - if (index->is_primary()) { - /* Concurrent access is prevented by - root_block->lock X-latch, so this should be - safe. */ - index->remove_instant(); + || rec_is_metadata(first_rec, *index)); + const bool is_metadata = rec_is_metadata(rec, *index); + /* We can remove the metadata when rolling back an + instant ALTER TABLE operation, or when deleting the + last user record on the page such that only metadata for + instant ADD COLUMN (not generic ALTER TABLE) remains. */ + const bool empty_table = is_metadata + || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index)); + if (UNIV_LIKELY(empty_table)) { + if (UNIV_LIKELY(!is_metadata)) { + lock_update_delete(block, rec); + } + btr_page_empty(block, buf_block_get_page_zip(block), + index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! */ + index->clear_instant_alter(); + } + page_cur_set_after_last(block, + btr_cur_get_page_cur(cursor)); + return true; } - - return true; } offsets = rec_get_offsets(rec, cursor->index, offsets, true, @@ -5691,10 +5960,10 @@ btr_cur_pessimistic_delete( bool min_mark_next_rec = false; if (page_is_leaf(page)) { - const bool is_metadata = rec_get_info_bits( - rec, page_rec_is_comp(rec)) & REC_INFO_MIN_REC_FLAG; + const bool is_metadata = rec_is_metadata( + rec, page_rec_is_comp(rec)); if (UNIV_UNLIKELY(is_metadata)) { - /* This should be rolling back instant ADD COLUMN. + /* This should be rolling back instant ALTER TABLE. If this is a recovered transaction, then index->is_instant() will hold until the insert into SYS_COLUMNS is rolled back. 
*/ @@ -5710,30 +5979,34 @@ btr_cur_pessimistic_delete( goto discard_page; } } else if (page_get_n_recs(page) == 1 - + (index->is_instant() - && !rec_is_metadata(rec, index))) { + + (index->is_instant() && !is_metadata)) { /* The whole index (and table) becomes logically empty. Empty the whole page. That is, if we are deleting the only user record, also delete the metadata record - if one exists (it exists if and only if is_instant()). - If we are deleting the metadata record and the + if one exists for instant ADD COLUMN + (not generic ALTER TABLE). + If we are deleting the metadata record + (in the rollback of instant ALTER TABLE) and the table becomes empty, clean up the whole page. */ + + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); ut_ad(!index->is_instant() - || rec_is_metadata( - page_rec_get_next_const( - page_get_infimum_rec(page)), - index)); - btr_page_empty(block, page_zip, index, 0, mtr); - page_cur_set_after_last(block, - btr_cur_get_page_cur(cursor)); - if (index->is_primary()) { - /* Concurrent access is prevented by - index->lock and root_block->lock - X-latch, so this should be safe. */ - index->remove_instant(); + || rec_is_metadata(first_rec, *index)); + if (is_metadata || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index))) { + btr_page_empty(block, page_zip, index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! 
*/ + index->clear_instant_alter(); + } + page_cur_set_after_last( + block, + btr_cur_get_page_cur(cursor)); + ret = TRUE; + goto return_after_reservations; } - ret = TRUE; - goto return_after_reservations; } if (UNIV_LIKELY(!is_metadata)) { @@ -5835,7 +6108,7 @@ discard_page: || btr_cur_will_modify_tree( index, page, BTR_INTENTION_DELETE, rec, btr_node_ptr_max_size(index), - block->page.size, mtr); + block->zip_size(), mtr); page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr); @@ -6007,7 +6280,7 @@ btr_estimate_n_rows_in_range_on_level( const fil_space_t* space = index->table->space; page_id_t page_id(space->id, slot1->page_no); - const page_size_t page_size(space->flags); + const ulint zip_size = space->zip_size(); level = slot1->page_level; @@ -6024,7 +6297,7 @@ btr_estimate_n_rows_in_range_on_level( attempting to read a page that is no longer part of the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to silence a debug assertion about this. */ - block = buf_page_get_gen(page_id, page_size, RW_S_LATCH, + block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH, NULL, BUF_GET_POSSIBLY_FREED, __FILE__, __LINE__, &mtr, &err); @@ -7156,7 +7429,7 @@ struct btr_blob_log_check_t { if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) { m_pcur->btr_cur.page_cur.block = btr_block_get( page_id_t(index->table->space_id, page_no), - page_size_t(index->table->space->flags), + index->table->space->zip_size(), RW_X_LATCH, index, m_mtr); m_pcur->btr_cur.page_cur.rec = m_pcur->btr_cur.page_cur.block->frame @@ -7247,9 +7520,6 @@ btr_store_big_rec_extern_fields( ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); ut_a(dict_index_is_clust(index)); - ut_a(dict_table_page_size(index->table) - .equals_to(rec_block->page.size)); - btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block, &rec, op); page_zip = buf_block_get_page_zip(rec_block); @@ -7293,15 +7563,13 @@ btr_store_big_rec_extern_fields( } #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - const 
page_size_t page_size(dict_table_page_size(index->table)); - /* Space available in compressed page to carry blob data */ - const ulint payload_size_zip = page_size.physical() + const ulint payload_size_zip = rec_block->physical_size() - FIL_PAGE_DATA; /* Space available in uncompressed page to carry blob data */ - const ulint payload_size = page_size.physical() - - FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END; + const ulint payload_size = payload_size_zip + - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END); /* We have to create a file segment to the tablespace for each field and put the pointer to the field in rec */ @@ -7355,7 +7623,7 @@ btr_store_big_rec_extern_fields( mtr.set_flush_observer(btr_mtr->get_flush_observer()); buf_page_get(rec_block->page.id, - rec_block->page.size, RW_X_LATCH, &mtr); + rec_block->zip_size(), RW_X_LATCH, &mtr); if (prev_page_no == FIL_NULL) { hint_page_no = 1 + rec_page_no; @@ -7403,7 +7671,7 @@ btr_store_big_rec_extern_fields( prev_block = buf_page_get( page_id_t(space_id, prev_page_no), - rec_block->page.size, + rec_block->zip_size(), RW_X_LATCH, &mtr); buf_block_dbg_add_level(prev_block, @@ -7459,22 +7727,25 @@ btr_store_big_rec_extern_fields( ut_a(err == Z_STREAM_END || c_stream.avail_out == 0); - /* Write the "next BLOB page" pointer */ - mlog_write_ulint(page + FIL_PAGE_NEXT, - FIL_NULL, MLOG_4BYTES, &mtr); - /* Initialize the unused "prev page" pointer */ - mlog_write_ulint(page + FIL_PAGE_PREV, - FIL_NULL, MLOG_4BYTES, &mtr); - - /* Zero out the unused part of the page. 
*/ - memset(page + page_zip_get_size(page_zip) - - c_stream.avail_out, - 0, c_stream.avail_out); + compile_time_assert(FIL_PAGE_NEXT + == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + mlog_memset(block, FIL_PAGE_PREV, 8, 0xff, + &mtr); mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, page_zip_get_size(page_zip) - - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, + - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + - c_stream.avail_out, &mtr); + /* Zero out the unused part of the page. */ + if (c_stream.avail_out) { + mlog_memset(block, + page_zip_get_size(page_zip) + - c_stream.avail_out, + c_stream.avail_out, + 0, &mtr); + } /* Copy the page to compressed storage, because it will be flushed to disk from there. */ @@ -7735,10 +8006,9 @@ btr_free_externally_stored_field( ut_ad(space_id == index->table->space->id); ut_ad(space_id == index->table->space_id); - const page_size_t ext_page_size(dict_table_page_size(index->table)); - const page_size_t& rec_page_size(rec == NULL - ? univ_page_size - : ext_page_size); + const ulint ext_zip_size = index->table->space->zip_size(); + const ulint rec_zip_size = rec ? ext_zip_size : 0; + if (rec == NULL) { /* This is a call from row_purge_upd_exist_or_extern(). 
*/ ut_ad(!page_zip); @@ -7765,7 +8035,7 @@ btr_free_externally_stored_field( #ifdef UNIV_DEBUG rec_block = #endif /* UNIV_DEBUG */ - buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr); + buf_page_get(page_id, rec_zip_size, RW_X_LATCH, &mtr); buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK); page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); @@ -7791,13 +8061,13 @@ btr_free_externally_stored_field( } ext_block = buf_page_get( - page_id_t(space_id, page_no), ext_page_size, + page_id_t(space_id, page_no), ext_zip_size, RW_X_LATCH, &mtr); buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE); page = buf_block_get_frame(ext_block); - if (ext_page_size.is_compressed()) { + if (ext_zip_size) { /* Note that page_zip will be NULL in row_purge_upd_exist_or_extern(). */ switch (fil_page_get_type(page)) { @@ -7966,7 +8236,7 @@ btr_copy_blob_prefix( mtr_start(&mtr); block = buf_page_get(page_id_t(space_id, page_no), - univ_page_size, RW_S_LATCH, &mtr); + 0, RW_S_LATCH, &mtr); buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); page = buf_block_get_frame(block); @@ -8004,7 +8274,7 @@ by a lock or a page latch. 
@param[out] buf the externally stored part of the field, or a prefix of it @param[in] len length of buf, in bytes -@param[in] page_size compressed BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size @param[in] space_id space id of the BLOB pages @param[in] offset offset on the first BLOB page @return number of bytes written to buf */ @@ -8013,7 +8283,7 @@ ulint btr_copy_zblob_prefix( byte* buf, ulint len, - const page_size_t& page_size, + ulint zip_size, ulint space_id, ulint page_no, ulint offset) @@ -8033,7 +8303,8 @@ btr_copy_zblob_prefix( heap = mem_heap_create(40000); page_zip_set_alloc(&d_stream, heap); - ut_ad(page_size.is_compressed()); + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); ut_ad(space_id); err = inflateInit(&d_stream); @@ -8048,7 +8319,7 @@ btr_copy_zblob_prefix( is being held on the clustered index record, or, in row_merge_copy_blobs(), by an exclusive table lock. */ bpage = buf_page_get_zip(page_id_t(space_id, page_no), - page_size); + zip_size); if (UNIV_UNLIKELY(!bpage)) { ib::error() << "Cannot load compressed BLOB " @@ -8080,8 +8351,7 @@ btr_copy_zblob_prefix( } d_stream.next_in = bpage->zip.data + offset; - d_stream.avail_in = static_cast<uInt>(page_size.physical() - - offset); + d_stream.avail_in = uInt(zip_size - offset); err = inflate(&d_stream, Z_NO_FLUSH); switch (err) { @@ -8151,7 +8421,7 @@ by a lock or a page latch. 
@param[out] buf the externally stored part of the field, or a prefix of it @param[in] len length of buf, in bytes -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] space_id space id of the first BLOB page @param[in] page_no page number of the first BLOB page @param[in] offset offset on the first BLOB page @@ -8161,7 +8431,7 @@ ulint btr_copy_externally_stored_field_prefix_low( byte* buf, ulint len, - const page_size_t& page_size, + ulint zip_size, ulint space_id, ulint page_no, ulint offset) @@ -8170,11 +8440,10 @@ btr_copy_externally_stored_field_prefix_low( return(0); } - if (page_size.is_compressed()) { - return(btr_copy_zblob_prefix(buf, len, page_size, + if (zip_size) { + return(btr_copy_zblob_prefix(buf, len, zip_size, space_id, page_no, offset)); } else { - ut_ad(page_size.equals_to(univ_page_size)); return(btr_copy_blob_prefix(buf, len, space_id, page_no, offset)); } @@ -8184,7 +8453,7 @@ btr_copy_externally_stored_field_prefix_low( The clustered index record must be protected by a lock or a page latch. @param[out] buf the field, or a prefix of it @param[in] len length of buf, in bytes -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] data 'internally' stored part of the field containing also the reference to the external part; must be protected by a lock or a page latch @@ -8195,7 +8464,7 @@ ulint btr_copy_externally_stored_field_prefix( byte* buf, ulint len, - const page_size_t& page_size, + ulint zip_size, const byte* data, ulint local_len) { @@ -8234,7 +8503,7 @@ btr_copy_externally_stored_field_prefix( return(local_len + btr_copy_externally_stored_field_prefix_low(buf + local_len, len - local_len, - page_size, + zip_size, space_id, page_no, offset)); } @@ -8245,7 +8514,7 @@ The clustered index record must be protected by a lock or a page latch. 
@param[in] data 'internally' stored part of the field containing also the reference to the external part; must be protected by a lock or a page latch -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] local_len length of data @param[in,out] heap mem heap @return the whole field copied to heap */ @@ -8253,7 +8522,7 @@ byte* btr_copy_externally_stored_field( ulint* len, const byte* data, - const page_size_t& page_size, + ulint zip_size, ulint local_len, mem_heap_t* heap) { @@ -8284,7 +8553,7 @@ btr_copy_externally_stored_field( *len = local_len + btr_copy_externally_stored_field_prefix_low(buf + local_len, extern_len, - page_size, + zip_size, space_id, page_no, offset); @@ -8295,7 +8564,7 @@ btr_copy_externally_stored_field( @param[in] rec record in a clustered index; must be protected by a lock or a page latch @param[in] offset array returned by rec_get_offsets() -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] no field number @param[out] len length of the field @param[in,out] heap mem heap @@ -8304,7 +8573,7 @@ byte* btr_rec_copy_externally_stored_field( const rec_t* rec, const rec_offs* offsets, - const page_size_t& page_size, + ulint zip_size, ulint no, ulint* len, mem_heap_t* heap) @@ -8338,5 +8607,5 @@ btr_rec_copy_externally_stored_field( } return(btr_copy_externally_stored_field(len, data, - page_size, local_len, heap)); + zip_size, local_len, heap)); } diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc index 2a83e9c3e8e..a5c901b40e0 100644 --- a/storage/innobase/btr/btr0defragment.cc +++ b/storage/innobase/btr/btr0defragment.cc @@ -62,14 +62,14 @@ UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; /* Number of compression failures caused by defragmentation since server start. 
*/ -ulint btr_defragment_compression_failures = 0; +Atomic_counter<ulint> btr_defragment_compression_failures; /* Number of btr_defragment_n_pages calls that altered page but didn't manage to release any page. */ -ulint btr_defragment_failures = 0; +Atomic_counter<ulint> btr_defragment_failures; /* Total number of btr_defragment_n_pages calls that altered page. The difference between btr_defragment_count and btr_defragment_failures shows the amount of effort wasted. */ -ulint btr_defragment_count = 0; +Atomic_counter<ulint> btr_defragment_count; /******************************************************************//** Constructor for btr_defragment_item_t. */ @@ -165,7 +165,7 @@ btr_defragment_add_index( // Load index rood page. buf_block_t* block = btr_block_get( page_id_t(index->table->space_id, index->page), - page_size_t(index->table->space->flags), + index->table->space->zip_size(), RW_NO_LATCH, index, &mtr); page_t* page = NULL; @@ -375,7 +375,7 @@ btr_defragment_merge_pages( dict_index_t* index, /*!< in: index tree */ buf_block_t* from_block, /*!< in: origin of merge */ buf_block_t* to_block, /*!< in: destination of merge */ - const page_size_t page_size, /*!< in: page size of the block */ + ulint zip_size, /*!< in: ROW_FORMAT=COMPRESSED size */ ulint reserved_space, /*!< in: space reserved for future insert to avoid immediate page split */ ulint* max_data_size, /*!< in/out: max data size to @@ -403,7 +403,7 @@ btr_defragment_merge_pages( // Estimate how many records can be moved from the from_page to // the to_page. - if (page_size.is_compressed()) { + if (zip_size) { ulint page_diff = srv_page_size - *max_data_size; max_ins_size_to_use = (max_ins_size_to_use > page_diff) ? max_ins_size_to_use - page_diff : 0; @@ -447,8 +447,7 @@ btr_defragment_merge_pages( // n_recs_to_move number of records to to_page. We try to reduce // the targeted data size on the to_page by // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again. 
- my_atomic_addlint( - &btr_defragment_compression_failures, 1); + btr_defragment_compression_failures++; max_ins_size_to_use = move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE @@ -472,7 +471,7 @@ btr_defragment_merge_pages( // Set ibuf free bits if necessary. if (!dict_index_is_clust(index) && page_is_leaf(to_page)) { - if (page_size.is_compressed()) { + if (zip_size) { ibuf_reset_free_bits(to_block); } else { ibuf_update_free_bits_if_full( @@ -490,7 +489,7 @@ btr_defragment_merge_pages( btr_search_drop_page_hash_index(from_block); btr_level_list_remove( index->table->space_id, - page_size, from_page, index, mtr); + zip_size, from_page, index, mtr); btr_page_get_father(index, from_block, mtr, &parent); btr_cur_node_ptr_delete(&parent, mtr); /* btr_blob_dbg_remove(from_page, index, @@ -577,7 +576,7 @@ btr_defragment_n_pages( } first_page = buf_block_get_frame(block); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); /* 1. Load the pages and calculate the total data size. */ blocks[0] = block; @@ -593,7 +592,7 @@ btr_defragment_n_pages( } blocks[i] = btr_block_get(page_id_t(index->table->space_id, - page_no), page_size, + page_no), zip_size, RW_X_LATCH, index, mtr); } @@ -619,7 +618,7 @@ btr_defragment_n_pages( optimal_page_size = page_get_free_space_of_empty( page_is_comp(first_page)); // For compressed pages, we take compression failures into account. - if (page_size.is_compressed()) { + if (zip_size) { ulint size = 0; uint i = 0; // We estimate the optimal data size of the index use samples of @@ -662,7 +661,7 @@ btr_defragment_n_pages( // Start from the second page. 
for (uint i = 1; i < n_pages; i ++) { buf_block_t* new_block = btr_defragment_merge_pages( - index, blocks[i], current_block, page_size, + index, blocks[i], current_block, zip_size, reserved_space, &max_data_size, heap, mtr); if (new_block != current_block) { n_defragmented ++; @@ -671,11 +670,9 @@ btr_defragment_n_pages( } mem_heap_free(heap); n_defragmented ++; - my_atomic_addlint( - &btr_defragment_count, 1); + btr_defragment_count++; if (n_pages == n_defragmented) { - my_atomic_addlint( - &btr_defragment_failures, 1); + btr_defragment_failures++; } else { index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); } diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc index 4f06251d0bf..37444ee974d 100644 --- a/storage/innobase/btr/btr0pcur.cc +++ b/storage/innobase/btr/btr0pcur.cc @@ -150,17 +150,26 @@ before_first: rec = page_rec_get_prev(rec); ut_ad(!page_rec_is_infimum(rec)); - ut_ad(!rec_is_metadata(rec, index)); + if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) { + ut_ad(index->table->instant); + ut_ad(page_get_n_recs(block->frame) == 1); + ut_ad(page_is_leaf(block->frame)); + ut_ad(!page_has_prev(block->frame)); + cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; + return; + } cursor->rel_pos = BTR_PCUR_AFTER; } else if (page_rec_is_infimum_low(offs)) { rec = page_rec_get_next(rec); - if (rec_is_metadata(rec, index)) { + if (rec_is_metadata(rec, *index)) { ut_ad(!page_has_prev(block->frame)); + ut_d(const rec_t* p = rec); rec = page_rec_get_next(rec); if (page_rec_is_supremum(rec)) { - ut_ad(page_has_next(block->frame)); + ut_ad(page_has_next(block->frame) + || rec_is_alter_metadata(p, *index)); goto before_first; } } @@ -170,10 +179,25 @@ before_first: cursor->rel_pos = BTR_PCUR_ON; } - cursor->old_rec = dict_index_copy_rec_order_prefix( - index, rec, &cursor->old_n_fields, - &cursor->old_rec_buf, &cursor->buf_size); + if (index->is_ibuf()) { + ut_ad(!index->table->not_redundant()); + cursor->old_n_fields = 
rec_get_n_fields_old(rec); + } else if (page_rec_is_leaf(rec)) { + cursor->old_n_fields = dict_index_get_n_unique_in_tree(index); + } else if (index->is_spatial()) { + ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) + == DICT_INDEX_SPATIAL_NODEPTR_SIZE); + /* For R-tree, we have to compare + the child page numbers as well. */ + cursor->old_n_fields = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } else { + cursor->old_n_fields = dict_index_get_n_unique_in_tree(index); + } + cursor->old_rec = rec_copy_prefix_to_buf(rec, index, + cursor->old_n_fields, + &cursor->old_rec_buf, + &cursor->buf_size); cursor->block_when_stored = block; /* Function try to check if block is S/X latch. */ @@ -457,7 +481,7 @@ btr_pcur_move_to_next_page( next_block = btr_block_get( page_id_t(block->page.id.space(), next_page_no), - block->page.size, mode, + block->zip_size(), mode, btr_pcur_get_btr_cur(cursor)->index, mtr); if (UNIV_UNLIKELY(!next_block)) { diff --git a/storage/innobase/btr/btr0scrub.cc b/storage/innobase/btr/btr0scrub.cc index 6a550739121..5f89391d280 100644 --- a/storage/innobase/btr/btr0scrub.cc +++ b/storage/innobase/btr/btr0scrub.cc @@ -1,5 +1,5 @@ // Copyright (c) 2014, Google Inc. -// Copyright (c) 2017, MariaDB Corporation. +// Copyright (c) 2017, 2019, MariaDB Corporation. /**************************************************//** @file btr/btr0scrub.cc @@ -119,13 +119,13 @@ btr_scrub_lock_dict_func(ulint space_id, bool lock_to_close_table, time_t last = start; /* FIXME: this is not the proper way of doing things. The - dict_sys->mutex should not be held by any thread for longer + dict_sys.mutex should not be held by any thread for longer than a few microseconds. It must not be held during I/O, for example. So, what is the purpose for this busy-waiting? This function should be rewritten as part of MDEV-8139: Fix scrubbing tests. 
*/ - while (mutex_enter_nowait(&(dict_sys->mutex))) { + while (mutex_enter_nowait(&dict_sys.mutex)) { /* if we lock to close a table, we wait forever * if we don't lock to close a table, we check if space * is closing, and then instead give up @@ -157,7 +157,7 @@ btr_scrub_lock_dict_func(ulint space_id, bool lock_to_close_table, } } - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); return true; } @@ -204,10 +204,10 @@ btr_scrub_table_close_for_thread( /* If tablespace is not marked as stopping perform the actual close. */ if (!space->is_stopping()) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); /* perform the actual closing */ btr_scrub_table_close(scrub_data->current_table); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } space->release(); } @@ -431,10 +431,10 @@ btr_pessimistic_scrub( } /* read block variables */ - const ulint page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + const uint32_t page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); const uint32_t left_page_no = btr_page_get_prev(page); const uint32_t right_page_no = btr_page_get_next(page); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); /** * When splitting page, we need X-latches on left/right brothers @@ -449,16 +449,16 @@ btr_pessimistic_scrub( */ mtr->release_block_at_savepoint(scrub_data->savepoint, block); - buf_block_t* get_block __attribute__((unused)) = btr_block_get( + btr_block_get( page_id_t(index->table->space_id, left_page_no), - page_size, RW_X_LATCH, index, mtr); + zip_size, RW_X_LATCH, index, mtr); /** * Refetch block and re-initialize page */ block = btr_block_get( page_id_t(index->table->space_id, page_no), - page_size, RW_X_LATCH, index, mtr); + zip_size, RW_X_LATCH, index, mtr); page = buf_block_get_frame(block); @@ -470,9 +470,9 @@ btr_pessimistic_scrub( } if (right_page_no != FIL_NULL) { - buf_block_t* get_block __attribute__((unused))= 
btr_block_get( + btr_block_get( page_id_t(index->table->space_id, right_page_no), - page_size, RW_X_LATCH, index, mtr); + zip_size, RW_X_LATCH, index, mtr); } /* arguments to btr_page_split_and_insert */ @@ -837,13 +837,15 @@ btr_scrub_start_space( ulint space, /*!< in: space */ btr_scrub_t* scrub_data) /*!< in/out: scrub data */ { - bool found; scrub_data->space = space; scrub_data->current_table = NULL; scrub_data->current_index = NULL; - const page_size_t page_size = fil_space_get_page_size(space, &found); - - scrub_data->compressed = page_size.is_compressed(); + if (fil_space_t* s = fil_space_acquire_silent(space)) { + scrub_data->compressed = s->zip_size(); + s->release(); + } else { + scrub_data->compressed = 0; + } scrub_data->scrubbing = check_scrub_setting(scrub_data); return scrub_data->scrubbing; } diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc index 32f5ae672e8..c92659e9d71 100644 --- a/storage/innobase/btr/btr0sea.cc +++ b/storage/innobase/btr/btr0sea.cc @@ -289,18 +289,11 @@ void btr_search_sys_free() /** Set index->ref_count = 0 on all indexes of a table. @param[in,out] table table handler */ -static -void -btr_search_disable_ref_count( - dict_table_t* table) +static void btr_search_disable_ref_count(dict_table_t *table) { - dict_index_t* index; - - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - index->search_info->ref_count = 0; - } + for (dict_index_t *index= dict_table_get_first_index(table); index; + index= dict_table_get_next_index(index)) + index->search_info->ref_count= 0; } /** Lazily free detached metadata when removing the last reference. 
*/ @@ -383,12 +376,12 @@ void btr_search_disable() { dict_table_t* table; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); btr_search_x_lock_all(); if (!btr_search_enabled) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); btr_search_x_unlock_all(); return; } @@ -397,19 +390,19 @@ void btr_search_disable() /* Clear the index->search_info->ref_count of every index in the data dictionary cache. */ - for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); table; + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table; table = UT_LIST_GET_NEXT(table_LRU, table)) { btr_search_disable_ref_count(table); } - for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); table; + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); table; table = UT_LIST_GET_NEXT(table_LRU, table)) { btr_search_disable_ref_count(table); } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); /* Set all block->index = NULL. */ buf_pool_clear_hash_index(); @@ -1121,7 +1114,7 @@ retry: ut_ad(page_is_leaf(block->frame)); /* We must not dereference block->index here, because it could be freed - if (index->table->n_ref_count == 0 && !mutex_own(&dict_sys->mutex)). + if (index->table->n_ref_count == 0 && !mutex_own(&dict_sys.mutex)). Determine the ahi_slot based on the block contents. */ const index_id_t index_id @@ -1192,7 +1185,7 @@ retry: rec = page_get_infimum_rec(page); rec = page_rec_get_next_low(rec, page_is_comp(page)); - if (rec_is_metadata(rec, index)) { + if (rec_is_metadata(rec, *index)) { rec = page_rec_get_next_low(rec, page_is_comp(page)); } @@ -1294,7 +1287,7 @@ void btr_search_drop_page_hash_when_freed(const page_id_t page_id) are possibly holding, we cannot s-latch the page, but must (recursively) x-latch it, even though we are only reading. 
*/ - block = buf_page_get_gen(page_id, univ_page_size, RW_X_LATCH, NULL, + block = buf_page_get_gen(page_id, 0, RW_X_LATCH, NULL, BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, &mtr, &err); @@ -1312,7 +1305,7 @@ void btr_search_drop_page_hash_when_freed(const page_id_t page_id) be open, or we should be in the process of dropping the table (preventing eviction). */ ut_ad(index->table->get_ref_count() > 0 - || mutex_own(&dict_sys->mutex)); + || mutex_own(&dict_sys.mutex)); btr_search_drop_page_hash_index(block); } } @@ -1410,7 +1403,7 @@ btr_search_build_page_hash_index( rec = page_rec_get_next_const(page_get_infimum_rec(page)); - if (rec_is_metadata(rec, index)) { + if (rec_is_metadata(rec, *index)) { rec = page_rec_get_next_const(rec); if (!--n_recs) return; } @@ -1866,7 +1859,7 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch) bool locked = false; - if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, index)) { + if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, *index)) { offsets = rec_get_offsets( rec, index, offsets, true, btr_search_get_n_fields(n_fields, n_bytes), &heap); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index e2425ccb6cc..4aeb4bbd193 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -33,14 +33,11 @@ Created 11/5/1995 Heikki Tuuri #include "mtr0types.h" #include "mach0data.h" -#include "page0size.h" #include "buf0buf.h" +#include "buf0checksum.h" +#include "ut0crc32.h" #include <string.h> -#ifdef UNIV_NONINL -#include "buf0buf.ic" -#endif - #ifndef UNIV_INNOCHECKSUM #include "mem0mem.h" #include "btr0btr.h" @@ -61,19 +58,14 @@ Created 11/5/1995 Heikki Tuuri #include "log0recv.h" #include "srv0mon.h" #include "log0crypt.h" +#include "fil0pagecompress.h" +#include "fsp0pagecompress.h" #endif /* !UNIV_INNOCHECKSUM */ #include "page0zip.h" #include "sync0sync.h" #include "buf0dump.h" -#include <new> #include <map> #include <sstream> -#ifndef 
UNIV_INNOCHECKSUM -#include "fil0pagecompress.h" -#include "fsp0pagecompress.h" -#endif -#include "ut0byte.h" -#include <new> #ifdef UNIV_LINUX #include <stdlib.h> @@ -133,6 +125,17 @@ struct set_numa_interleave_t #include "snappy-c.h" #endif +#ifndef UNIV_INNOCHECKSUM +buf_pool_t::io_buf_t::~io_buf_t() +{ + for (buf_tmp_buffer_t* s = slots, *e = slots + n_slots; s != e; s++) { + aligned_free(s->crypt_buf); + aligned_free(s->comp_buf); + } + ut_free(slots); +} +#endif /* !UNIV_INNOCHECKSUM */ + /* IMPLEMENTATION OF THE BUFFER POOL ================================= @@ -405,16 +408,9 @@ on the io_type */ @return reserved buffer slot */ static buf_tmp_buffer_t* buf_pool_reserve_tmp_slot(buf_pool_t* buf_pool) { - for (ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) { - buf_tmp_buffer_t* slot = &buf_pool->tmp_arr->slots[i]; - if (slot->acquire()) { - return slot; - } - } - - /* We assume that free slot is found */ - ut_error; - return NULL; + buf_tmp_buffer_t* slot = buf_pool->io_buf.reserve(); + ut_a(slot); + return slot; } /** Reserve a buffer for encryption, decryption or decompression. 
@@ -468,7 +464,7 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) } /* read space & lsn */ - uint header_len = FIL_PAGE_DATA; + uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; /* Copy FIL page header, it is not encrypted */ memcpy(tmp_frame, src_frame, header_len); @@ -477,7 +473,7 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) const byte* src = src_frame + header_len; byte* dst = tmp_frame + header_len; uint srclen = uint(srv_page_size) - - header_len - FIL_PAGE_DATA_END; + - (header_len + FIL_PAGE_FCRC32_CHECKSUM); ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); if (!log_tmp_block_decrypt(src, srclen, dst, @@ -485,9 +481,9 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) return false; } - memcpy(tmp_frame + srv_page_size - FIL_PAGE_DATA_END, - src_frame + srv_page_size - FIL_PAGE_DATA_END, - FIL_PAGE_DATA_END); + memcpy(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); memcpy(src_frame, tmp_frame, srv_page_size); srv_stats.pages_decrypted.inc(); @@ -507,7 +503,8 @@ static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space) byte* dst_frame = bpage->zip.data ? bpage->zip.data : ((buf_block_t*) bpage)->frame; - bool page_compressed = fil_page_is_compressed(dst_frame); + bool page_compressed = space->is_compressed() + && buf_page_is_compressed(dst_frame, space->flags); buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); if (bpage->id.page_no() == 0) { @@ -536,39 +533,42 @@ static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space) also for pages first compressed and then encrypted. 
*/ buf_tmp_buffer_t* slot; + uint key_version = buf_page_get_key_version(dst_frame, space->flags); - if (page_compressed) { + if (page_compressed && !key_version) { /* the page we read is unencrypted */ /* Find free slot from temporary memory array */ decompress: + if (space->full_crc32() + && buf_page_is_corrupted(true, dst_frame, space->flags)) { + return false; + } + slot = buf_pool_reserve_tmp_slot(buf_pool); /* For decompression, use crypt_buf. */ buf_tmp_reserve_crypt_buf(slot); + decompress_with_slot: - ut_d(fil_page_type_validate(dst_frame)); + ut_d(fil_page_type_validate(space, dst_frame)); - ulint write_size = fil_page_decompress(slot->crypt_buf, - dst_frame); + ulint write_size = fil_page_decompress( + slot->crypt_buf, dst_frame, space->flags); slot->release(); - ut_ad(!write_size || fil_page_type_validate(dst_frame)); + ut_ad(!write_size || fil_page_type_validate(space, dst_frame)); ut_ad(space->pending_io()); return write_size != 0; } - if (space->crypt_data - && mach_read_from_4(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - + dst_frame)) { + if (key_version && space->crypt_data) { /* Verify encryption checksum before we even try to decrypt. 
*/ - if (!fil_space_verify_crypt_checksum(dst_frame, bpage->size)) { + if (!buf_page_verify_crypt_checksum(dst_frame, space->flags)) { decrypt_failed: ib::error() << "Encrypted page " << bpage->id << " in file " << space->chain.start->name << " looks corrupted; key_version=" - << mach_read_from_4( - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - + dst_frame); + << key_version; return false; } @@ -576,7 +576,7 @@ decrypt_failed: slot = buf_pool_reserve_tmp_slot(buf_pool); buf_tmp_reserve_crypt_buf(slot); - ut_d(fil_page_type_validate(dst_frame)); + ut_d(fil_page_type_validate(space, dst_frame)); /* decrypt using crypt_buf to dst_frame */ if (!fil_space_decrypt(space, slot->crypt_buf, dst_frame)) { @@ -584,9 +584,10 @@ decrypt_failed: goto decrypt_failed; } - ut_d(fil_page_type_validate(dst_frame)); + ut_d(fil_page_type_validate(space, dst_frame)); - if (fil_page_is_compressed_encrypted(dst_frame)) { + if ((space->full_crc32() && page_compressed) + || fil_page_is_compressed_encrypted(dst_frame)) { goto decompress_with_slot; } @@ -794,11 +795,7 @@ buf_page_is_checksum_valid_crc32( return false; } - return checksum_field1 == crc32 -#ifdef INNODB_BUG_ENDIAN_CRC32 - || checksum_field1 == buf_calc_page_crc32(read_buf, true) -#endif - ; + return checksum_field1 == crc32; } /** Checks if the page is in innodb checksum format. @@ -927,28 +924,44 @@ buf_page_is_checksum_valid_none( && checksum_field1 == BUF_NO_CHECKSUM_MAGIC); } -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Validate the CRC-32C checksum of a page. -@param[in] page buffer page (srv_page_size bytes) -@param[in] checksum CRC-32C checksum stored on page -@return computed checksum */ -static uint32_t buf_page_check_crc32(const byte* page, uint32_t checksum) +/** Checks whether the lsn present in the page is lesser than the +peek current lsn. +@param[in] check_lsn lsn to check +@param[in] read_buf page. 
*/ +static void buf_page_check_lsn(bool check_lsn, const byte* read_buf) { - uint32_t crc32 = buf_calc_page_crc32(page); +#ifndef UNIV_INNOCHECKSUM + if (check_lsn && recv_lsn_checks_on) { + lsn_t current_lsn; + const lsn_t page_lsn + = mach_read_from_8(read_buf + FIL_PAGE_LSN); - if (checksum != crc32) { - crc32 = buf_calc_page_crc32(page, true); - } + /* Since we are going to reset the page LSN during the import + phase it makes no sense to spam the log with error messages. */ + + if (log_peek_lsn(¤t_lsn) && current_lsn < page_lsn) { - return crc32; + const ulint space_id = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_ID); + const ulint page_no = mach_read_from_4( + read_buf + FIL_PAGE_OFFSET); + + ib::error() << "Page " << page_id_t(space_id, page_no) + << " log sequence number " << page_lsn + << " is in the future! Current system" + << " log sequence number " + << current_lsn << "."; + + ib::error() << "Your database may be corrupt or" + " you may have copied the InnoDB" + " tablespace but not the InnoDB" + " log files. " + << FORCE_RECOVERY_MSG; + + } + } +#endif /* !UNIV_INNOCHECKSUM */ } -#else /* INNODB_BUG_ENDIAN_CRC32 */ -/** Validate the CRC-32C checksum of a page. -@param[in] page buffer page (srv_page_size bytes) -@param[in] checksum CRC-32C checksum stored on page -@return computed checksum */ -# define buf_page_check_crc32(page, checksum) buf_calc_page_crc32(page) -#endif /* INNODB_BUG_ENDIAN_CRC32 */ /** Check if a buffer is all zeroes. @@ -963,29 +976,64 @@ bool buf_is_zeroes(span<const byte> buf) /** Check if a page is corrupt. 
@param[in] check_lsn whether the LSN should be checked @param[in] read_buf database page -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] space tablespace @return whether the page is corrupted */ bool buf_page_is_corrupted( bool check_lsn, const byte* read_buf, - const page_size_t& page_size, -#ifndef UNIV_INNOCHECKSUM - const fil_space_t* space) -#else - const void* space) -#endif + ulint fsp_flags) { - ut_ad(page_size.logical() == srv_page_size); #ifndef UNIV_INNOCHECKSUM DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); ); #endif + if (fil_space_t::full_crc32(fsp_flags)) { + bool compressed = false, corrupted = false; + const uint size = buf_page_full_crc32_size( + read_buf, &compressed, &corrupted); + if (corrupted) { + return true; + } + const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM); + uint crc32 = mach_read_from_4(end); + + if (!crc32 && size == srv_page_size + && buf_is_zeroes(span<const byte>(read_buf, size))) { + return false; + } + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 2) { + crc32++; + } + }); + + if (crc32 != ut_crc32(read_buf, + size - FIL_PAGE_FCRC32_CHECKSUM)) { + return true; + } + if (!compressed + && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION + + read_buf) + && memcmp(read_buf + (FIL_PAGE_LSN + 4), + end - (FIL_PAGE_FCRC32_END_LSN + - FIL_PAGE_FCRC32_CHECKSUM), 4)) { + return true; + } + + buf_page_check_lsn(check_lsn, read_buf); + return false; + } + size_t checksum_field1 = 0; size_t checksum_field2 = 0; uint32_t crc32 = 0; bool crc32_inited = false; - + bool crc32_chksum = false; + const ulint zip_size = fil_space_t::zip_size(fsp_flags); ulint page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE); /* We can trust page type if page compression is set on tablespace @@ -1000,16 +1048,15 @@ buf_page_is_corrupted( if ((page_type == FIL_PAGE_PAGE_COMPRESSED || page_type == 
FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) #ifndef UNIV_INNOCHECKSUM - && space && FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags) + && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags) #endif ) { return(false); } - if (!page_size.is_compressed() - && memcmp(read_buf + FIL_PAGE_LSN + 4, - read_buf + page_size.logical() - - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + if (!zip_size && memcmp(read_buf + FIL_PAGE_LSN + 4, + read_buf + srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { /* Stored log sequence numbers at the start and the end of page do not match */ @@ -1017,37 +1064,7 @@ buf_page_is_corrupted( return(true); } -#ifndef UNIV_INNOCHECKSUM - if (check_lsn && recv_lsn_checks_on) { - lsn_t current_lsn; - const lsn_t page_lsn - = mach_read_from_8(read_buf + FIL_PAGE_LSN); - - /* Since we are going to reset the page LSN during the import - phase it makes no sense to spam the log with error messages. */ - - if (log_peek_lsn(¤t_lsn) && current_lsn < page_lsn) { - - const ulint space_id = mach_read_from_4( - read_buf + FIL_PAGE_SPACE_ID); - const ulint page_no = mach_read_from_4( - read_buf + FIL_PAGE_OFFSET); - - ib::error() << "Page " << page_id_t(space_id, page_no) - << " log sequence number " << page_lsn - << " is in the future! Current system" - << " log sequence number " - << current_lsn << "."; - - ib::error() << "Your database may be corrupt or" - " you may have copied the InnoDB" - " tablespace but not the InnoDB" - " log files. 
" - << FORCE_RECOVERY_MSG; - - } - } -#endif /* !UNIV_INNOCHECKSUM */ + buf_page_check_lsn(check_lsn, read_buf); /* Check whether the checksum fields have correct values */ @@ -1058,23 +1075,26 @@ buf_page_is_corrupted( return(false); } - if (page_size.is_compressed()) { - return(!page_zip_verify_checksum(read_buf, - page_size.physical())); + if (zip_size) { + return !page_zip_verify_checksum(read_buf, zip_size); } checksum_field1 = mach_read_from_4( read_buf + FIL_PAGE_SPACE_OR_CHKSUM); checksum_field2 = mach_read_from_4( - read_buf + page_size.logical() - FIL_PAGE_END_LSN_OLD_CHKSUM); + read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM); compile_time_assert(!(FIL_PAGE_LSN % 8)); /* A page filled with NUL bytes is considered not corrupted. - The FIL_PAGE_FILE_FLUSH_LSN field may be written nonzero for - the first page of the system tablespace. - Ignore it for the system tablespace. */ + Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7), + the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero + for the first page of each file of the system tablespace. + We want to ignore it for the system tablespace, but because + we do not know the expected tablespace here, we ignore the + field for all data files, except for + innodb_checksum_algorithm=full_crc32 which we handled above. */ if (!checksum_field1 && !checksum_field2) { /* Checksum fields can have valid value as zero. 
If the page is not empty then do the checksum @@ -1082,8 +1102,7 @@ buf_page_is_corrupted( bool all_zeroes = true; for (size_t i = 0; i < srv_page_size; i++) { #ifndef UNIV_INNOCHECKSUM - if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - && (!space || !space->id)) { + if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) { i += 8; } #endif @@ -1099,6 +1118,7 @@ buf_page_is_corrupted( } switch (curr_algo) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: return !buf_page_is_checksum_valid_crc32( read_buf, checksum_field1, checksum_field2); @@ -1108,6 +1128,7 @@ buf_page_is_corrupted( case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: return !buf_page_is_checksum_valid_none( read_buf, checksum_field1, checksum_field2); + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: case SRV_CHECKSUM_ALGORITHM_CRC32: case SRV_CHECKSUM_ALGORITHM_INNODB: if (buf_page_is_checksum_valid_none(read_buf, @@ -1132,6 +1153,9 @@ buf_page_is_corrupted( return false; } + crc32_chksum = curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32 + || curr_algo == SRV_CHECKSUM_ALGORITHM_FULL_CRC32; + /* Very old versions of InnoDB only stored 8 byte lsn to the start and the end of the page. 
*/ @@ -1142,19 +1166,18 @@ buf_page_is_corrupted( != mach_read_from_4(read_buf + FIL_PAGE_LSN) && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) { - if (curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32) { + if (crc32_chksum) { + crc32 = buf_calc_page_crc32(read_buf); + crc32_inited = true; + DBUG_EXECUTE_IF( "page_intermittent_checksum_mismatch", { static int page_counter; if (page_counter++ == 2) { - checksum_field2++; + crc32++; } }); - crc32 = buf_page_check_crc32(read_buf, - checksum_field2); - crc32_inited = true; - if (checksum_field2 != crc32 && checksum_field2 != buf_calc_page_old_checksum(read_buf)) { @@ -1166,8 +1189,7 @@ buf_page_is_corrupted( if (checksum_field2 != buf_calc_page_old_checksum(read_buf)) { - crc32 = buf_page_check_crc32( - read_buf, checksum_field2); + crc32 = buf_calc_page_crc32(read_buf); crc32_inited = true; if (checksum_field2 != crc32) { @@ -1179,10 +1201,10 @@ buf_page_is_corrupted( if (checksum_field1 == 0 || checksum_field1 == BUF_NO_CHECKSUM_MAGIC) { - } else if (curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32) { + } else if (crc32_chksum) { + if (!crc32_inited) { - crc32 = buf_page_check_crc32( - read_buf, checksum_field2); + crc32 = buf_calc_page_crc32(read_buf); crc32_inited = true; } @@ -1198,8 +1220,7 @@ buf_page_is_corrupted( != buf_calc_page_new_checksum(read_buf)) { if (!crc32_inited) { - crc32 = buf_page_check_crc32( - read_buf, checksum_field2); + crc32 = buf_calc_page_crc32(read_buf); crc32_inited = true; } @@ -1253,10 +1274,10 @@ buf_madvise_do_dump() srv_log_buffer_size, MADV_DODUMP); } - /* mirrors recv_sys_init() */ - if (recv_sys->buf) + /* mirrors recv_sys_t::create() */ + if (recv_sys.buf) { - ret+= madvise(recv_sys->buf, recv_sys->len, MADV_DODUMP); + ret+= madvise(recv_sys.buf, recv_sys.len, MADV_DODUMP); } buf_pool_mutex_enter_all(); @@ -1280,22 +1301,21 @@ buf_madvise_do_dump() /** Dump a page to stderr. 
@param[in] read_buf database page -@param[in] page_size page size */ -UNIV_INTERN -void -buf_page_print(const byte* read_buf, const page_size_t& page_size) +@param[in] zip_size compressed page size, or 0 */ +void buf_page_print(const byte* read_buf, ulint zip_size) { dict_index_t* index; #ifndef UNIV_DEBUG + const ulint size = zip_size ? zip_size : srv_page_size; ib::info() << "Page dump in ascii and hex (" - << page_size.physical() << " bytes):"; + << size << " bytes):"; - ut_print_buf(stderr, read_buf, page_size.physical()); + ut_print_buf(stderr, read_buf, size); fputs("\nInnoDB: End of page dump\n", stderr); #endif - if (page_size.is_compressed()) { + if (zip_size) { /* Print compressed page. */ ib::info() << "Compressed page type (" << fil_page_get_type(read_buf) @@ -1307,27 +1327,21 @@ buf_page_print(const byte* read_buf, const page_size_t& page_size) SRV_CHECKSUM_ALGORITHM_CRC32) << " " << page_zip_calc_checksum( - read_buf, page_size.physical(), + read_buf, zip_size, SRV_CHECKSUM_ALGORITHM_CRC32) -#ifdef INNODB_BUG_ENDIAN_CRC32 - << "/" - << page_zip_calc_checksum( - read_buf, page_size.physical(), - SRV_CHECKSUM_ALGORITHM_CRC32, true) -#endif << ", " << buf_checksum_algorithm_name( SRV_CHECKSUM_ALGORITHM_INNODB) << " " << page_zip_calc_checksum( - read_buf, page_size.physical(), + read_buf, zip_size, SRV_CHECKSUM_ALGORITHM_INNODB) << ", " << buf_checksum_algorithm_name( SRV_CHECKSUM_ALGORITHM_NONE) << " " << page_zip_calc_checksum( - read_buf, page_size.physical(), + read_buf, zip_size, SRV_CHECKSUM_ALGORITHM_NONE) << "; page LSN " << mach_read_from_8(read_buf + FIL_PAGE_LSN) @@ -1340,10 +1354,6 @@ buf_page_print(const byte* read_buf, const page_size_t& page_size) } else { const uint32_t crc32 = buf_calc_page_crc32(read_buf); -#ifdef INNODB_BUG_ENDIAN_CRC32 - const uint32_t crc32_legacy = buf_calc_page_crc32(read_buf, - true); -#endif /* INNODB_BUG_ENDIAN_CRC32 */ ulint page_type = fil_page_get_type(read_buf); ib::info() << "Uncompressed page, stored 
checksum in field1 " @@ -1353,9 +1363,6 @@ buf_page_print(const byte* read_buf, const page_size_t& page_size) << buf_checksum_algorithm_name( SRV_CHECKSUM_ALGORITHM_CRC32) << " " << crc32 -#ifdef INNODB_BUG_ENDIAN_CRC32 - << "/" << crc32_legacy -#endif << ", " << buf_checksum_algorithm_name( SRV_CHECKSUM_ALGORITHM_INNODB) << " " @@ -1367,15 +1374,12 @@ buf_page_print(const byte* read_buf, const page_size_t& page_size) SRV_CHECKSUM_ALGORITHM_NONE) << " " << BUF_NO_CHECKSUM_MAGIC << ", stored checksum in field2 " - << mach_read_from_4(read_buf + page_size.logical() + << mach_read_from_4(read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM) << ", calculated checksums for field2: " << buf_checksum_algorithm_name( SRV_CHECKSUM_ALGORITHM_CRC32) << " " << crc32 -#ifdef INNODB_BUG_ENDIAN_CRC32 - << "/" << crc32_legacy -#endif << ", " << buf_checksum_algorithm_name( SRV_CHECKSUM_ALGORITHM_INNODB) << " " @@ -1389,7 +1393,7 @@ buf_page_print(const byte* read_buf, const page_size_t& page_size) << " " << mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) << ", low 4 bytes of LSN at page end " - << mach_read_from_4(read_buf + page_size.logical() + << mach_read_from_4(read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM + 4) << ", page number (if stored to page already) " << mach_read_from_4(read_buf + FIL_PAGE_OFFSET) @@ -1503,7 +1507,7 @@ pfs_register_buffer_block( : NULL; # ifdef UNIV_DEBUG - rwlock = &block->debug_latch; + rwlock = block->debug_latch; ut_a(!rwlock->pfs_psi); rwlock->pfs_psi = (PSI_server) ? 
PSI_server->init_rwlock(buf_block_debug_latch_key, @@ -1561,6 +1565,7 @@ buf_block_init( page_zip_des_init(&block->page.zip); mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, &block->mutex); + ut_d(block->debug_latch = (rw_lock_t *) ut_malloc_nokey(sizeof(rw_lock_t))); #if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration @@ -1572,7 +1577,7 @@ buf_block_init( rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING); - ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, &block->debug_latch, + ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, block->debug_latch, SYNC_LEVEL_VARYING)); #else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ @@ -1580,7 +1585,7 @@ buf_block_init( rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING); ut_d(rw_lock_create(buf_block_debug_latch_key, - &block->debug_latch, SYNC_LEVEL_VARYING)); + block->debug_latch, SYNC_LEVEL_VARYING)); #endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ @@ -1607,11 +1612,6 @@ buf_chunk_init( /* Round down to a multiple of page size, although it already should be. */ mem_size = ut_2pow_round<ulint>(mem_size, srv_page_size); - /* Reserve space for the block descriptors. 
*/ - mem_size += ut_2pow_round<ulint>((mem_size >> srv_page_size_shift) - * (sizeof *block) - + (srv_page_size - 1), - srv_page_size); DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return(NULL);); @@ -1790,7 +1790,7 @@ buf_chunk_not_freed( == block->page.newest_modification); ut_ad(block->page.oldest_modification == 0 || block->page.oldest_modification - == recv_sys->recovered_lsn + == recv_sys.recovered_lsn || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); ut_ad(block->page.buf_fix_count == 0); @@ -1839,6 +1839,16 @@ buf_pool_set_sizes(void) buf_pool_mutex_exit_all(); } +/** Free the synchronization objects of a buffer pool block descriptor +@param[in,out] block buffer pool block descriptor */ +static void buf_block_free_mutexes(buf_block_t* block) +{ + mutex_free(&block->mutex); + rw_lock_free(&block->lock); + ut_d(rw_lock_free(block->debug_latch)); + ut_d(ut_free(block->debug_latch)); +} + /********************************************************************//** Initialize a buffer pool instance. @return DB_SUCCESS if all goes well. 
*/ @@ -1902,11 +1912,7 @@ buf_pool_init_instance( buf_block_t* block = chunk->blocks; for (i = chunk->size; i--; block++) { - mutex_free(&block->mutex); - rw_lock_free(&block->lock); - - ut_d(rw_lock_free( - &block->debug_latch)); + buf_block_free_mutexes(block); } buf_pool->allocator.deallocate_large_dodump( @@ -1926,8 +1932,7 @@ buf_pool_init_instance( ut_min(BUF_READ_AHEAD_PAGES, ut_2_power_up(buf_pool->curr_size / BUF_READ_AHEAD_PORTION)); - buf_pool->curr_pool_size = buf_pool->curr_size - << srv_page_size_shift; + buf_pool->curr_pool_size = buf_pool_size; buf_pool->old_size = buf_pool->curr_size; buf_pool->n_chunks_new = buf_pool->n_chunks; @@ -1984,12 +1989,9 @@ buf_pool_init_instance( new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex); /* Initialize the temporal memory array and slots */ - buf_pool->tmp_arr = (buf_tmp_array_t *)ut_malloc_nokey(sizeof(buf_tmp_array_t)); - memset(buf_pool->tmp_arr, 0, sizeof(buf_tmp_array_t)); - ulint n_slots = (srv_n_read_io_threads + srv_n_write_io_threads) * (8 * OS_AIO_N_PENDING_IOS_PER_THREAD); - buf_pool->tmp_arr->n_slots = n_slots; - buf_pool->tmp_arr->slots = (buf_tmp_buffer_t*)ut_malloc_nokey(sizeof(buf_tmp_buffer_t) * n_slots); - memset(buf_pool->tmp_arr->slots, 0, (sizeof(buf_tmp_buffer_t) * n_slots)); + new(&buf_pool->io_buf) buf_pool_t::io_buf_t( + (srv_n_read_io_threads + srv_n_write_io_threads) + * (8 * OS_AIO_N_PENDING_IOS_PER_THREAD)); buf_pool_mutex_exit(buf_pool); @@ -2051,10 +2053,7 @@ buf_pool_free_instance( buf_block_t* block = chunk->blocks; for (ulint i = chunk->size; i--; block++) { - mutex_free(&block->mutex); - rw_lock_free(&block->lock); - - ut_d(rw_lock_free(&block->debug_latch)); + buf_block_free_mutexes(block); } buf_pool->allocator.deallocate_large_dodump( @@ -2070,26 +2069,7 @@ buf_pool_free_instance( hash_table_free(buf_pool->page_hash); hash_table_free(buf_pool->zip_hash); - /* Free all used temporary slots */ - if (buf_pool->tmp_arr) { - for(ulint i = 0; i < 
buf_pool->tmp_arr->n_slots; i++) { - buf_tmp_buffer_t* slot = &(buf_pool->tmp_arr->slots[i]); - if (slot && slot->crypt_buf) { - aligned_free(slot->crypt_buf); - slot->crypt_buf = NULL; - } - - if (slot && slot->comp_buf) { - aligned_free(slot->comp_buf); - slot->comp_buf = NULL; - } - } - - ut_free(buf_pool->tmp_arr->slots); - ut_free(buf_pool->tmp_arr); - buf_pool->tmp_arr = NULL; - } - + buf_pool->io_buf.~io_buf_t(); buf_pool->allocator.~ut_allocator(); } @@ -2715,12 +2695,12 @@ buf_pool_resize() ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0); ut_ad(buf_pool->flush_rbt == NULL); - buf_pool->curr_size = new_instance_size; - buf_pool->n_chunks_new = (new_instance_size << srv_page_size_shift) / srv_buf_pool_chunk_unit; + buf_pool->curr_size = buf_pool->n_chunks_new * buf_pool->chunks->size; + buf_pool_mutex_exit(buf_pool); } #ifdef BTR_CUR_HASH_ADAPT @@ -2910,11 +2890,7 @@ withdraw_retry: for (ulint j = chunk->size; j--; block++) { - mutex_free(&block->mutex); - rw_lock_free(&block->lock); - - ut_d(rw_lock_free( - &block->debug_latch)); + buf_block_free_mutexes(block); } buf_pool->allocator.deallocate_large_dodump( @@ -3056,7 +3032,7 @@ calc_buf_pool_size: ut_2_power_up(buf_pool->curr_size / BUF_READ_AHEAD_PORTION)); buf_pool->curr_pool_size - = buf_pool->curr_size << srv_page_size_shift; + = buf_pool->n_chunks * srv_buf_pool_chunk_unit; curr_size += buf_pool->curr_pool_size; buf_pool->old_size = buf_pool->curr_size; } @@ -3107,13 +3083,10 @@ calc_buf_pool_size: buf_resize_status("Resizing also other hash tables."); - /* normalize lock_sys */ srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift); lock_sys.resize(srv_lock_table_size); - - /* normalize dict_sys */ - dict_resize(); + dict_sys.resize(); ib::info() << "Resized hash tables at lock_sys," #ifdef BTR_CUR_HASH_ADAPT @@ -3425,7 +3398,7 @@ page_found: } /* Add to an existing watch. 
*/ - buf_block_fix(bpage); + bpage->fix(); return(NULL); } @@ -3565,7 +3538,7 @@ void buf_pool_watch_unset(const page_id_t page_id) increments buf_fix_count. */ bpage = buf_page_hash_get_low(buf_pool, page_id); - if (buf_block_unfix(bpage) == 0 + if (bpage->unfix() == 0 && buf_pool_watch_is_sentinel(buf_pool, bpage)) { buf_pool_watch_remove(buf_pool, bpage); } @@ -3732,12 +3705,9 @@ be implemented at a higher level. In other words, all possible accesses to a given page through this function must be protected by the same set of mutexes or latches. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size @return pointer to the block */ -buf_page_t* -buf_page_get_zip( - const page_id_t page_id, - const page_size_t& page_size) +buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size) { buf_page_t* bpage; BPageMutex* block_mutex; @@ -3746,6 +3716,8 @@ buf_page_get_zip( ibool must_read; buf_pool_t* buf_pool = buf_pool_get(page_id); + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); buf_pool->stat.n_page_gets++; for (;;) { @@ -3763,7 +3735,7 @@ lookup: /* Page not in buf_pool: needs to be read from file */ ut_ad(!hash_lock); - dberr_t err = buf_read_page(page_id, page_size); + dberr_t err = buf_read_page(page_id, zip_size); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { ib::error() << "Reading compressed page " << page_id @@ -3791,7 +3763,7 @@ err_exit: switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: - buf_block_fix(bpage); + bpage->fix(); block_mutex = &buf_pool->zip_mutex; goto got_block; case BUF_BLOCK_FILE_PAGE: @@ -3902,7 +3874,7 @@ buf_zip_decompress( && (!crypt_data->is_default_encryption() || srv_encrypt_tables); - ut_ad(block->page.size.is_compressed()); + ut_ad(block->zip_size()); ut_a(block->page.id.space() != 0); if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) { @@ -3914,12 +3886,6 @@ buf_zip_decompress( << ", crc32: " << 
page_zip_calc_checksum( frame, size, SRV_CHECKSUM_ALGORITHM_CRC32) -#ifdef INNODB_BUG_ENDIAN_CRC32 - << "/" - << page_zip_calc_checksum( - frame, size, SRV_CHECKSUM_ALGORITHM_CRC32, - true) -#endif << " innodb: " << page_zip_calc_checksum( frame, size, SRV_CHECKSUM_ALGORITHM_INNODB) @@ -3927,7 +3893,6 @@ buf_zip_decompress( << page_zip_calc_checksum( frame, size, SRV_CHECKSUM_ALGORITHM_NONE) << " (algorithm: " << srv_checksum_algorithm << ")"; - goto err_exit; } @@ -3954,7 +3919,7 @@ buf_zip_decompress( case FIL_PAGE_TYPE_ZBLOB: case FIL_PAGE_TYPE_ZBLOB2: /* Copy to uncompressed storage. */ - memcpy(block->frame, frame, block->page.size.physical()); + memcpy(block->frame, frame, block->zip_size()); if (space) { space->release_for_io(); } @@ -4257,6 +4222,7 @@ done: /** This is the low level function used to get access to a database page. @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, @@ -4264,11 +4230,12 @@ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH @param[in] file file name @param[in] line line where called @param[in] mtr mini-transaction +@param[out] err DB_SUCCESS or error code @return pointer to the block or NULL */ buf_block_t* buf_page_get_low( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint rw_latch, buf_block_t* guess, ulint mode, @@ -4316,16 +4283,15 @@ buf_page_get_low( case BUF_GET: case BUF_GET_IF_IN_POOL_OR_WATCH: case BUF_GET_POSSIBLY_FREED: - bool found; - const page_size_t& space_page_size - = fil_space_get_page_size(page_id.space(), &found); - ut_ad(found); - ut_ad(page_size.equals_to(space_page_size)); + fil_space_t* s = fil_space_acquire_for_io(page_id.space()); + ut_ad(s); + ut_ad(s->zip_size() == zip_size); + s->release_for_io(); } #endif /* UNIV_DEBUG */ ut_ad(!mtr || !ibuf_inside(mtr) - || 
ibuf_page_low(page_id, page_size, FALSE, file, line, NULL)); + || ibuf_page_low(page_id, zip_size, FALSE, file, line, NULL)); buf_pool->stat.n_page_gets++; hash_lock = buf_page_hash_lock_get(buf_pool, page_id); @@ -4398,10 +4364,10 @@ loop: = buf_page_get_mutex( &fix_block->page); mutex_enter(fix_mutex); - buf_block_fix(fix_block); + fix_block->fix(); mutex_exit(fix_mutex); } else { - buf_block_fix(fix_block); + fix_block->fix(); } /* Now safe to release page_hash mutex */ @@ -4434,10 +4400,10 @@ loop: corrupted, or if an encrypted page with a valid checksum cannot be decypted. */ - dberr_t local_err = buf_read_page(page_id, page_size); + dberr_t local_err = buf_read_page(page_id, zip_size); if (local_err == DB_SUCCESS) { - buf_read_ahead_random(page_id, page_size, + buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr)); retries = 0; @@ -4513,18 +4479,20 @@ loop: BPageMutex* fix_mutex = buf_page_get_mutex( &fix_block->page); mutex_enter(fix_mutex); - buf_block_fix(fix_block); + fix_block->fix(); mutex_exit(fix_mutex); } else { - buf_block_fix(fix_block); + fix_block->fix(); } /* Now safe to release page_hash mutex */ rw_lock_s_unlock(hash_lock); got_block: - switch (mode) { + default: + ut_ad(block->zip_size() == zip_size); + break; case BUF_GET_IF_IN_POOL: case BUF_PEEK_IF_IN_POOL: case BUF_EVICT_IF_IN_POOL: @@ -4539,7 +4507,7 @@ got_block: /* The page is being read to buffer pool, but we cannot wait around for the read to complete. */ - buf_block_unfix(fix_block); + fix_block->unfix(); return(NULL); } @@ -4555,7 +4523,7 @@ got_block: /* This suggests that the page is being flushed. Avoid returning reference to this page. Instead wait for the flush action to complete. 
*/ - buf_block_unfix(fix_block); + fix_block->unfix(); os_thread_sleep(WAIT_FOR_WRITE); goto loop; } @@ -4564,7 +4532,7 @@ got_block: evict_from_pool: ut_ad(!fix_block->page.oldest_modification); buf_pool_mutex_enter(buf_pool); - buf_block_unfix(fix_block); + fix_block->unfix(); if (!buf_LRU_free_page(&fix_block->page, true)) { ut_ad(0); @@ -4583,7 +4551,7 @@ evict_from_pool: adaptive hash index. There cannot be an adaptive hash index for a compressed-only page, so do not bother decompressing the page. */ - buf_block_unfix(fix_block); + fix_block->unfix(); return(NULL); } @@ -4597,7 +4565,7 @@ evict_from_pool: /* This condition often occurs when the buffer is not buffer-fixed, but I/O-fixed by buf_page_init_for_read(). */ - buf_block_unfix(fix_block); + fix_block->unfix(); /* The block is buffer-fixed or I/O-fixed. Try again later. */ @@ -4626,7 +4594,7 @@ evict_from_pool: /* Buffer-fixing prevents the page_hash from changing. */ ut_ad(bpage == buf_page_hash_get_low(buf_pool, page_id)); - buf_block_unfix(fix_block); + fix_block->unfix(); buf_page_mutex_enter(block); mutex_enter(&buf_pool->zip_mutex); @@ -4715,7 +4683,7 @@ evict_from_pool: buf_page_mutex_exit(fix_block); --buf_pool->n_pend_unzip; - buf_block_unfix(fix_block); + fix_block->unfix(); buf_pool_mutex_exit(buf_pool); rw_lock_x_unlock(&fix_block->lock); @@ -4728,7 +4696,7 @@ evict_from_pool: if (!access_time && !recv_no_ibuf_operations) { ibuf_merge_or_delete_for_page( - block, page_id, &page_size, TRUE); + block, block->page.id, zip_size, true); } buf_pool_mutex_enter(buf_pool); @@ -4774,7 +4742,7 @@ evict_from_pool: buf_pool_mutex_enter(buf_pool); - buf_block_unfix(fix_block); + fix_block->unfix(); /* Now we are only holding the buf_pool->mutex, not block->mutex or hash_lock. 
Blocks cannot be @@ -4833,7 +4801,7 @@ evict_from_pool: buf_page_mutex_exit(fix_block); - buf_block_fix(fix_block); + fix_block->fix(); /* Failed to evict the page; change it directly */ @@ -4851,7 +4819,7 @@ evict_from_pool: if (!fsp_is_system_temporary(page_id.space())) { ibool ret; ret = rw_lock_s_lock_nowait( - &fix_block->debug_latch, file, line); + fix_block->debug_latch, file, line); ut_a(ret); } #endif /* UNIV_DEBUG */ @@ -4894,12 +4862,11 @@ evict_from_pool: buf_wait_for_read(fix_block); if (fix_block->page.id != page_id) { - - buf_block_unfix(fix_block); + fix_block->unfix(); #ifdef UNIV_DEBUG if (!fsp_is_system_temporary(page_id.space())) { - rw_lock_s_unlock(&fix_block->debug_latch); + rw_lock_s_unlock(fix_block->debug_latch); } #endif /* UNIV_DEBUG */ @@ -4916,7 +4883,7 @@ evict_from_pool: /* In the case of a first access, try to apply linear read-ahead */ - buf_read_ahead_linear(page_id, page_size, ibuf_inside(mtr)); + buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr)); } ut_ad(!rw_lock_own_flagged(hash_lock, @@ -4928,6 +4895,7 @@ evict_from_pool: /** This is the general function used to get access to a database page. It does page initialization and applies the buffered redo logs. 
@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, @@ -4940,7 +4908,7 @@ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH buf_block_t* buf_page_get_gen( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint rw_latch, buf_block_t* guess, ulint mode, @@ -4949,15 +4917,15 @@ buf_page_get_gen( mtr_t* mtr, dberr_t* err) { - if (buf_block_t *block = recv_recovery_create_page(page_id)) + if (buf_block_t *block= recv_recovery_create_page(page_id)) { - buf_block_fix(block); - ut_ad(rw_lock_s_lock_nowait(&block->debug_latch, file, line)); + block->fix(); + ut_ad(rw_lock_s_lock_nowait(block->debug_latch, file, line)); block= buf_page_mtr_lock(block, rw_latch, mtr, file, line); return block; } - return buf_page_get_low(page_id, page_size, rw_latch, + return buf_page_get_low(page_id, zip_size, rw_latch, guess, mode, file, line, mtr, err); } @@ -5004,7 +4972,7 @@ buf_page_optimistic_get( buf_page_make_young_if_needed(&block->page); ut_ad(!ibuf_inside(mtr) - || ibuf_page(block->page.id, block->page.size, NULL)); + || ibuf_page(block->page.id, block->zip_size(), NULL)); mtr_memo_type_t fix_type; @@ -5058,7 +5026,7 @@ buf_page_optimistic_get( if (!access_time) { /* In the case of a first access, try to apply linear read-ahead */ - buf_read_ahead_linear(block->page.id, block->page.size, + buf_read_ahead_linear(block->page.id, block->zip_size(), ibuf_inside(mtr)); } @@ -5284,13 +5252,14 @@ buf_page_init_low( /** Inits a page to the buffer buf_pool. 
@param[in,out] buf_pool buffer pool @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] block block to init */ static void buf_page_init( buf_pool_t* buf_pool, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, buf_block_t* block) { buf_page_t* hash_page; @@ -5327,7 +5296,7 @@ buf_page_init( ut_a(buf_fix_count > 0); - my_atomic_add32((int32*) &block->page.buf_fix_count, buf_fix_count); + block->page.buf_fix_count += buf_fix_count; buf_pool_watch_remove(buf_pool, hash_page); } else { @@ -5340,14 +5309,11 @@ buf_page_init( ut_d(block->page.in_page_hash = TRUE); block->page.id = page_id; - block->page.size.copy_from(page_size); HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, page_id.fold(), &block->page); - if (page_size.is_compressed()) { - page_zip_set_size(&block->page.zip, page_size.physical()); - } + page_zip_set_size(&block->page.zip, zip_size); } /** Initialize a page for read to the buffer buf_pool. If the page is @@ -5361,6 +5327,7 @@ and the lock released later. @param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED @param[in] mode BUF_READ_IBUF_PAGES_ONLY, ... 
@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] unzip whether the uncompressed page is requested (for ROW_FORMAT=COMPRESSED) @return pointer to the block @@ -5370,7 +5337,7 @@ buf_page_init_for_read( dberr_t* err, ulint mode, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, bool unzip) { buf_block_t* block; @@ -5389,12 +5356,12 @@ buf_page_init_for_read( if (mode == BUF_READ_IBUF_PAGES_ONLY) { /* It is a read-ahead within an ibuf routine */ - ut_ad(!ibuf_bitmap_page(page_id, page_size)); + ut_ad(!ibuf_bitmap_page(page_id, zip_size)); ibuf_mtr_start(&mtr); - if (!recv_no_ibuf_operations && - !ibuf_page(page_id, page_size, &mtr)) { + if (!recv_no_ibuf_operations + && !ibuf_page(page_id, zip_size, &mtr)) { ibuf_mtr_commit(&mtr); @@ -5404,7 +5371,7 @@ buf_page_init_for_read( ut_ad(mode == BUF_READ_ANY_PAGE); } - if (page_size.is_compressed() && !unzip && !recv_recovery_is_on()) { + if (zip_size && !unzip && !recv_recovery_is_on()) { block = NULL; } else { block = buf_LRU_get_free_block(buf_pool); @@ -5439,7 +5406,7 @@ buf_page_init_for_read( ut_ad(buf_pool_from_bpage(bpage) == buf_pool); - buf_page_init(buf_pool, page_id, page_size, block); + buf_page_init(buf_pool, page_id, zip_size, block); /* Note: We are using the hash_lock for protection. This is safe because no other thread can lookup the block from the @@ -5463,7 +5430,7 @@ buf_page_init_for_read( rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); - if (page_size.is_compressed()) { + if (zip_size) { /* buf_pool->mutex may be released and reacquired by buf_buddy_alloc(). Thus, we must release block->mutex in order not to @@ -5473,8 +5440,7 @@ buf_page_init_for_read( been added to buf_pool->LRU and buf_pool->page_hash. 
*/ buf_page_mutex_exit(block); - data = buf_buddy_alloc(buf_pool, page_size.physical(), - &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru); buf_page_mutex_enter(block); block->page.zip.data = (page_zip_t*) data; @@ -5495,7 +5461,7 @@ buf_page_init_for_read( control block (bpage), in order to avoid the invocation of buf_buddy_relocate_block() on uninitialized data. */ - data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru); rw_lock_x_lock(hash_lock); @@ -5513,8 +5479,7 @@ buf_page_init_for_read( /* The block was added by some other thread. */ rw_lock_x_unlock(hash_lock); watch_page = NULL; - buf_buddy_free(buf_pool, data, - page_size.physical()); + buf_buddy_free(buf_pool, data, zip_size); bpage = NULL; goto func_exit; @@ -5527,11 +5492,9 @@ buf_page_init_for_read( bpage->buf_pool_index = buf_pool_index(buf_pool); page_zip_des_init(&bpage->zip); - page_zip_set_size(&bpage->zip, page_size.physical()); + page_zip_set_size(&bpage->zip, zip_size); bpage->zip.data = (page_zip_t*) data; - bpage->size.copy_from(page_size); - mutex_enter(&buf_pool->zip_mutex); buf_page_init_low(bpage); @@ -5557,7 +5520,7 @@ buf_page_init_for_read( ut_a(buf_fix_count > 0); - my_atomic_add32((int32*) &bpage->buf_fix_count, buf_fix_count); + bpage->buf_fix_count += buf_fix_count; ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page)); buf_pool_watch_remove(buf_pool, watch_page); @@ -5596,18 +5559,18 @@ func_exit: return(bpage); } -/** Initializes a page to the buffer buf_pool. The page is usually not read +/** Initialize a page in the buffer pool. The page is usually not read from a file even if it cannot be found in the buffer buf_pool. This is one of the functions which perform to a block a state transition NOT_USED => FILE_PAGE (the other is buf_page_get_gen). 
@param[in] page_id page id -@param[in] page_size page size -@param[in] mtr mini-transaction +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction @return pointer to the block, page bufferfixed */ buf_block_t* buf_page_create( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, mtr_t* mtr) { buf_frame_t* frame; @@ -5617,7 +5580,7 @@ buf_page_create( rw_lock_t* hash_lock; ut_ad(mtr->is_active()); - ut_ad(page_id.space() != 0 || !page_size.is_compressed()); + ut_ad(page_id.space() != 0 || !zip_size); free_block = buf_LRU_get_free_block(buf_pool); @@ -5646,13 +5609,13 @@ buf_page_create( #endif /* BTR_CUR_HASH_ADAPT */ if (!recv_recovery_is_on()) { - return buf_page_get_with_no_latch(page_id, page_size, + return buf_page_get_with_no_latch(page_id, zip_size, mtr); } - mutex_exit(&recv_sys->mutex); - block = buf_page_get_with_no_latch(page_id, page_size, mtr); - mutex_enter(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); + block = buf_page_get_with_no_latch(page_id, zip_size, mtr); + mutex_enter(&recv_sys.mutex); return block; } @@ -5665,7 +5628,7 @@ buf_page_create( buf_page_mutex_enter(block); - buf_page_init(buf_pool, page_id, page_size, block); + buf_page_init(buf_pool, page_id, zip_size, block); rw_lock_x_unlock(hash_lock); @@ -5675,7 +5638,7 @@ buf_page_create( buf_block_buf_fix_inc(block, __FILE__, __LINE__); buf_pool->stat.n_pages_created++; - if (page_size.is_compressed()) { + if (zip_size) { void* data; bool lru; @@ -5693,7 +5656,7 @@ buf_page_create( the reacquisition of buf_pool->mutex. We also must defer this operation until after the block descriptor has been added to buf_pool->LRU and buf_pool->page_hash. 
*/ - data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru); buf_page_mutex_enter(block); block->page.zip.data = (page_zip_t*) data; @@ -5720,7 +5683,7 @@ buf_page_create( /* Delete possible entries for the page from the insert buffer: such can exist if the page belonged to an index which was dropped */ if (!recv_recovery_is_on()) { - ibuf_merge_or_delete_for_page(NULL, page_id, &page_size, TRUE); + ibuf_merge_or_delete_for_page(NULL, page_id, zip_size, true); } frame = block->frame; @@ -5916,6 +5879,29 @@ buf_corrupt_page_release(buf_page_t* bpage, const fil_space_t* space) buf_pool_mutex_exit(buf_pool); } +/** Check if the encrypted page is corrupted for the full crc32 format. +@param[in] space_id page belongs to space id +@param[in] dst_frame page +@param[in] is_compressed compressed page +@return true if page is corrupted or false if it isn't */ +static bool buf_page_full_crc32_is_corrupted( + ulint space_id, + const byte* dst_frame, + bool is_compressed) +{ + if (!is_compressed + && memcmp(dst_frame + FIL_PAGE_LSN + 4, + dst_frame + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4)) { + return true; + } + + if (space_id != mach_read_from_4(dst_frame + FIL_PAGE_SPACE_ID)) { + return true; + } + + return false; +} + /** Check if page is maybe compressed, encrypted or both when we encounter corrupted page. Note that we can't be 100% sure if page is corrupted or decrypt/decompress just failed. @@ -5934,6 +5920,7 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space) byte* dst_frame = (bpage->zip.data) ? 
bpage->zip.data : ((buf_block_t*) bpage)->frame; dberr_t err = DB_SUCCESS; + uint key_version = buf_page_get_key_version(dst_frame, space->flags); /* In buf_decrypt_after_read we have either decrypted the page if page post encryption checksum matches and used key_id is found @@ -5941,15 +5928,23 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space) not decrypted and it could be either encrypted and corrupted or corrupted or good page. If we decrypted, there page could still be corrupted if used key does not match. */ - const bool seems_encrypted = mach_read_from_4( - dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) + const bool seems_encrypted = !space->full_crc32() && key_version && space->crypt_data && space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; + ut_ad(space->purpose != FIL_TYPE_TEMPORARY || space->full_crc32()); /* If traditional checksums match, we assume that page is not anymore encrypted. */ - if (buf_page_is_corrupted( - true, dst_frame, bpage->size, space)) { + if (space->full_crc32() + && !buf_is_zeroes(span<const byte>(dst_frame, + space->physical_size())) + && (key_version || space->is_compressed() + || space->purpose == FIL_TYPE_TEMPORARY)) { + if (buf_page_full_crc32_is_corrupted( + space->id, dst_frame, space->is_compressed())) { + err = DB_PAGE_CORRUPTED; + } + } else if (buf_page_is_corrupted(true, dst_frame, space->flags)) { err = DB_PAGE_CORRUPTED; } @@ -5964,8 +5959,7 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space) ib::info() << "However key management plugin or used key_version " - << mach_read_from_4(dst_frame - + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) + << key_version << " is not found or" " used encryption algorithm or method does not match."; @@ -6010,7 +6004,7 @@ buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict) io_type = buf_page_get_io_fix(bpage); ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); - ut_ad(bpage->size.is_compressed() == 
(bpage->zip.data != NULL)); + ut_ad(!!bpage->zip.ssize == (bpage->zip.data != NULL)); ut_ad(uncompressed || bpage->zip.data); if (io_type == BUF_IO_READ) { @@ -6034,10 +6028,10 @@ buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict) } if (bpage->zip.data && uncompressed) { - my_atomic_addlint(&buf_pool->n_pend_unzip, 1); + buf_pool->n_pend_unzip++; ibool ok = buf_zip_decompress((buf_block_t*) bpage, FALSE); - my_atomic_addlint(&buf_pool->n_pend_unzip, ulint(-1)); + buf_pool->n_pend_unzip--; if (!ok) { ib::info() << "Page " @@ -6064,13 +6058,26 @@ buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict) } else if (read_space_id == 0 && read_page_no == 0) { /* This is likely an uninitialized page. */ - } else if ((bpage->id.space() != TRX_SYS_SPACE + } else if (((!space->full_crc32() + || bpage->id.space() != TRX_SYS_SPACE) && bpage->id.space() != read_space_id) || bpage->id.page_no() != read_page_no) { - /* We did not compare space_id to read_space_id - in the system tablespace, because the field - was written as garbage before MySQL 4.1.1, - which did not support innodb_file_per_table. */ + /* We do not compare space_id to read_space_id + in the system tablespace unless space->full_crc32(), + because the field was written as garbage before + MySQL 4.1.1, which introduced support for + innodb_file_per_table. */ + + if (space->full_crc32() + && *reinterpret_cast<uint32_t*> + (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) + && space->crypt_data + && space->crypt_data->type + != CRYPT_SCHEME_UNENCRYPTED) { + ib::error() << "Cannot decrypt " << bpage->id; + err = DB_DECRYPTION_FAILED; + goto release_page; + } ib::error() << "Space id and page no stored in " "the page, read in are " @@ -6111,7 +6118,7 @@ database_corrupted: << ". 
You may have to recover from " << "a backup."; - buf_page_print(frame, bpage->size); + buf_page_print(frame, bpage->zip_size()); ib::info() << "It is also possible that your" @@ -6148,6 +6155,7 @@ database_corrupted: if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED) { +release_page: const page_id_t corrupt_page_id = bpage->id; buf_corrupt_page_release(bpage, space); @@ -6164,19 +6172,15 @@ database_corrupted: recv_recover_page(bpage); } - /* If space is being truncated then avoid ibuf operation. - During re-init we have already freed ibuf entries. */ if (uncompressed && !recv_no_ibuf_operations && (bpage->id.space() == 0 || !is_predefined_tablespace(bpage->id.space())) - && !srv_is_tablespace_truncated(bpage->id.space()) && fil_page_get_type(frame) == FIL_PAGE_INDEX && page_is_leaf(frame)) { - ibuf_merge_or_delete_for_page( - (buf_block_t*) bpage, bpage->id, - &bpage->size, TRUE); + reinterpret_cast<buf_block_t*>(bpage), + bpage->id, bpage->zip_size(), true); } space->release_for_io(); @@ -7270,6 +7274,21 @@ buf_all_freed(void) return(TRUE); } +/** Verify that post encryption checksum match with the calculated checksum. +This function should be called only if tablespace contains crypt data metadata. +@param[in] page page frame +@param[in] fsp_flags tablespace flags +@return true if true if page is encrypted and OK, false otherwise */ +bool buf_page_verify_crypt_checksum(const byte* page, ulint fsp_flags) +{ + if (!fil_space_t::full_crc32(fsp_flags)) { + return fil_space_verify_crypt_checksum( + page, fil_space_t::zip_size(fsp_flags)); + } + + return !buf_page_is_corrupted(true, page, fsp_flags); +} + /*********************************************************************//** Checks that there currently are no pending i/o-operations for the buffer pool. 
@@ -7354,28 +7373,21 @@ static byte* buf_tmp_page_encrypt( byte* src_frame, byte* dst_frame) { - uint header_len = FIL_PAGE_DATA; - /* FIL page header is not encrypted */ - memcpy(dst_frame, src_frame, header_len); - /* Calculate the start offset in a page */ - uint unencrypted_bytes = header_len + FIL_PAGE_DATA_END; - uint srclen = srv_page_size - unencrypted_bytes; - const byte* src = src_frame + header_len; - byte* dst = dst_frame + header_len; + uint srclen = srv_page_size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst = dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + memcpy(dst_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true)) { return NULL; } - memcpy(dst_frame + srv_page_size - FIL_PAGE_DATA_END, - src_frame + srv_page_size - FIL_PAGE_DATA_END, - FIL_PAGE_DATA_END); - - /* Handle post encryption checksum */ - mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4, - buf_calc_page_crc32(dst_frame)); + const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(dst_frame + payload, ut_crc32(dst_frame, payload)); srv_stats.pages_encrypted.inc(); srv_stats.n_temp_blocks_encrypted.inc(); @@ -7391,7 +7403,7 @@ a page is written to disk. 
(may be src_frame or an encrypted/compressed copy of it) */ UNIV_INTERN byte* -buf_page_encrypt_before_write( +buf_page_encrypt( fil_space_t* space, buf_page_t* bpage, byte* src_frame) @@ -7399,7 +7411,7 @@ buf_page_encrypt_before_write( ut_ad(space->id == bpage->id.space()); bpage->real_size = srv_page_size; - fil_page_type_validate(src_frame); + ut_d(fil_page_type_validate(space, src_frame)); switch (bpage->id.page_no()) { case 0: @@ -7427,18 +7439,23 @@ buf_page_encrypt_before_write( && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED && (!crypt_data->is_default_encryption() || srv_encrypt_tables); - - page_compressed = FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags); + page_compressed = space->is_compressed(); } if (!encrypted && !page_compressed) { /* No need to encrypt or page compress the page. Clear key-version & crypt-checksum. */ - memset(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + if (space->full_crc32()) { + memset(src_frame + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4); + } else { + memset(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, + 0, 8); + } + return src_frame; } - ut_ad(!bpage->size.is_compressed() || !page_compressed); + ut_ad(!bpage->zip_size() || !page_compressed); buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); /* Find free slot from temporary memory array */ buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool); @@ -7447,6 +7464,19 @@ buf_page_encrypt_before_write( buf_tmp_reserve_crypt_buf(slot); byte *dst_frame = slot->crypt_buf; + const bool full_crc32 = space->full_crc32(); + + if (full_crc32) { + /* Write LSN for the full crc32 checksum before + encryption. Because lsn is one of the input for encryption. 
*/ + mach_write_to_8(src_frame + FIL_PAGE_LSN, + bpage->newest_modification); + if (!page_compressed) { + mach_write_to_4( + src_frame + srv_page_size - FIL_PAGE_FCRC32_END_LSN, + (ulint) bpage->newest_modification); + } + } if (!page_compressed) { not_compressed: @@ -7466,26 +7496,38 @@ not_compressed: bpage->real_size = srv_page_size; slot->out_buf = dst_frame = tmp; - ut_d(fil_page_type_validate(tmp)); + ut_d(fil_page_type_validate(space, tmp)); } else { ut_ad(space->purpose != FIL_TYPE_TEMPORARY); /* First we compress the page content */ buf_tmp_reserve_compression_buf(slot); byte* tmp = slot->comp_buf; ulint out_len = fil_page_compress( - src_frame, tmp, - fsp_flags_get_page_compression_level(space->flags), + src_frame, tmp, space->flags, fil_space_get_block_size(space, bpage->id.page_no()), encrypted); + if (!out_len) { goto not_compressed; } bpage->real_size = out_len; + if (full_crc32) { + ut_d(bool compressed = false); + out_len = buf_page_full_crc32_size(tmp, +#ifdef UNIV_DEBUG + &compressed, +#else + NULL, +#endif + NULL); + ut_ad(compressed); + } + /* Workaround for MDEV-15527. 
*/ memset(tmp + out_len, 0 , srv_page_size - out_len); - ut_d(fil_page_type_validate(tmp)); + ut_d(fil_page_type_validate(space, tmp)); if (encrypted) { /* And then we encrypt the page content */ @@ -7496,10 +7538,17 @@ not_compressed: dst_frame); } + if (full_crc32) { + compile_time_assert(FIL_PAGE_FCRC32_CHECKSUM == 4); + mach_write_to_4(tmp + out_len - 4, + ut_crc32(tmp, out_len - 4)); + ut_ad(!buf_page_is_corrupted(true, tmp, space->flags)); + } + slot->out_buf = dst_frame = tmp; } - ut_d(fil_page_type_validate(dst_frame)); + ut_d(fil_page_type_validate(space, dst_frame)); // return dst_frame which will be written return dst_frame; @@ -7513,7 +7562,7 @@ bool buf_page_should_punch_hole( const buf_page_t* bpage) { - return (bpage->real_size != bpage->size.physical()); + return bpage->real_size != bpage->physical_size(); } /** @@ -7526,6 +7575,6 @@ buf_page_get_trim_length( const buf_page_t* bpage, ulint write_length) { - return (bpage->size.physical() - write_length); + return bpage->physical_size() - write_length; } #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc index 70ad5ed600b..2b2a74dd736 100644 --- a/storage/innobase/buf/buf0checksum.cc +++ b/storage/innobase/buf/buf0checksum.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,35 +39,6 @@ ha_innodb.cc:12251: error: cannot convert 'srv_checksum_algorithm_t*' to 'long unsigned int*' in initialization */ ulong srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB; -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Calculate the CRC32 checksum of a page. 
The value is stored to the page -when it is written to a file and also checked for a match when reading from -the file. Note that we must be careful to calculate the same value on all -architectures. -@param[in] page buffer page (srv_page_size bytes) -@param[in] bug_endian whether to use big endian byteorder -when converting byte strings to integers, for bug-compatibility with -big-endian architecture running MySQL 5.6, MariaDB 10.0 or MariaDB 10.1 -@return CRC-32C */ -uint32_t buf_calc_page_crc32(const byte* page, bool bug_endian) -{ - return bug_endian - ? ut_crc32_legacy_big_endian( - page + FIL_PAGE_OFFSET, - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - - FIL_PAGE_OFFSET) - ^ ut_crc32_legacy_big_endian(page + FIL_PAGE_DATA, - srv_page_size - - (FIL_PAGE_DATA - + FIL_PAGE_END_LSN_OLD_CHKSUM)) - : ut_crc32(page + FIL_PAGE_OFFSET, - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - - FIL_PAGE_OFFSET) - ^ ut_crc32(page + FIL_PAGE_DATA, - srv_page_size - - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM)); -} -#else /** Calculate the CRC32 checksum of a page. The value is stored to the page when it is written to a file and also checked for a match when reading from the file. Note that we must be careful to calculate the same value on all @@ -88,7 +59,6 @@ uint32_t buf_calc_page_crc32(const byte* page) srv_page_size - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM)); } -#endif /** Calculate a checksum which is stored to the page when it is written to a file. 
Note that we must be careful to calculate the same value on @@ -151,6 +121,10 @@ buf_checksum_algorithm_name(srv_checksum_algorithm_t algo) return("none"); case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: return("strict_none"); + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + return("full_crc32"); + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + return("strict_full_crc32"); } ut_error; diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 50994b90de9..9c1b305fb6a 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -87,7 +87,7 @@ buf_dblwr_get( buf_block_t* block; block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), - univ_page_size, RW_X_LATCH, mtr); + 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); @@ -368,7 +368,7 @@ buf_dblwr_init_or_load_pages( byte* doublewrite; byte* unaligned_read_buf; ibool reset_space_ids = FALSE; - recv_dblwr_t& recv_dblwr = recv_sys->dblwr; + recv_dblwr_t& recv_dblwr = recv_sys.dblwr; /* We do the file i/o past the buffer pool */ @@ -473,6 +473,7 @@ buf_dblwr_init_or_load_pages( page = buf; for (ulint i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { + if (reset_space_ids) { ulint source_page_no; @@ -532,7 +533,7 @@ buf_dblwr_process() ulint page_no_dblwr = 0; byte* read_buf; byte* unaligned_read_buf; - recv_dblwr_t& recv_dblwr = recv_sys->dblwr; + recv_dblwr_t& recv_dblwr = recv_sys.dblwr; if (!buf_dblwr) { return; @@ -565,12 +566,9 @@ buf_dblwr_process() if (page_no >= space->size) { - /* Do not report the warning if the tablespace - is scheduled for truncation or was truncated - and we have parsed an MLOG_TRUNCATE record. */ - if (!srv_is_tablespace_truncated(space_id) - && !srv_was_tablespace_truncated(space) - && !srv_is_undo_tablespace(space_id)) { + /* Do not report the warning for undo + tablespaces, because they can be truncated in place. 
*/ + if (!srv_is_undo_tablespace(space_id)) { ib::warn() << "A copy of page " << page_id << " in the doublewrite buffer slot " << page_no_dblwr @@ -579,13 +577,13 @@ buf_dblwr_process() continue; } - const page_size_t page_size(space->flags); - ut_ad(!buf_is_zeroes(span<const byte>(page, - page_size.physical()))); + const ulint physical_size = space->physical_size(); + const ulint zip_size = space->zip_size(); + ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size))); /* We want to ensure that for partial reads the unread portion of the page is NUL. */ - memset(read_buf, 0x0, page_size.physical()); + memset(read_buf, 0x0, physical_size); IORequest request; @@ -594,8 +592,8 @@ buf_dblwr_process() /* Read in the actual page from the file */ dberr_t err = fil_io( request, true, - page_id, page_size, - 0, page_size.physical(), read_buf, NULL); + page_id, zip_size, + 0, physical_size, read_buf, NULL); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { ib::warn() @@ -605,9 +603,10 @@ buf_dblwr_process() } const bool is_all_zero = buf_is_zeroes( - span<const byte>(read_buf, page_size.physical())); + span<const byte>(read_buf, physical_size)); const bool expect_encrypted = space->crypt_data && space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; + bool is_corrupted = false; if (is_all_zero) { /* We will check if the copy in the @@ -617,19 +616,22 @@ buf_dblwr_process() } else { /* Decompress the page before validating the checksum. */ - ulint decomp = fil_page_decompress(buf, read_buf); - if (!decomp || (decomp != srv_page_size - && page_size.is_compressed())) { + ulint decomp = fil_page_decompress(buf, read_buf, + space->flags); + if (!decomp || (zip_size && decomp != srv_page_size)) { goto bad; } - if (expect_encrypted && mach_read_from_4( - read_buf - + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) - ? 
fil_space_verify_crypt_checksum(read_buf, - page_size) - : !buf_page_is_corrupted(true, read_buf, - page_size, space)) { + if (expect_encrypted + && buf_page_get_key_version(read_buf, space->flags)) { + is_corrupted = !buf_page_verify_crypt_checksum( + read_buf, space->flags); + } else { + is_corrupted = buf_page_is_corrupted( + true, read_buf, space->flags); + } + + if (!is_corrupted) { /* The page is good; there is no need to consult the doublewrite buffer. */ continue; @@ -643,16 +645,21 @@ bad: << " from the doublewrite buffer."; } - ulint decomp = fil_page_decompress(buf, page); - if (!decomp || (decomp != srv_page_size - && page_size.is_compressed())) { + ulint decomp = fil_page_decompress(buf, page, space->flags); + if (!decomp || (zip_size && decomp != srv_page_size)) { continue; } - if (expect_encrypted && mach_read_from_4( - page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) - ? !fil_space_verify_crypt_checksum(page, page_size) - : buf_page_is_corrupted(true, page, page_size, space)) { + if (expect_encrypted + && buf_page_get_key_version(read_buf, space->flags)) { + is_corrupted = !buf_page_verify_crypt_checksum( + page, space->flags); + } else { + is_corrupted = buf_page_is_corrupted( + true, page, space->flags); + } + + if (is_corrupted) { /* Theoretically we could have another good copy for this page in the doublewrite buffer. If not, we will report a fatal error @@ -664,7 +671,7 @@ bad: if (page_no == 0) { /* Check the FSP_SPACE_FLAGS. 
*/ ulint flags = fsp_header_get_flags(page); - if (!fsp_flags_is_valid(flags, space_id) + if (!fil_space_t::is_valid_flags(flags, space_id) && fsp_flags_convert_from_101(flags) == ULINT_UNDEFINED) { ib::warn() << "Ignoring a doublewrite copy" @@ -681,8 +688,8 @@ bad: IORequest write_request(IORequest::WRITE); - fil_io(write_request, true, page_id, page_size, - 0, page_size.physical(), + fil_io(write_request, true, page_id, zip_size, + 0, physical_size, const_cast<byte*>(page), NULL); ib::info() << "Recovered page " << page_id @@ -786,40 +793,42 @@ buf_dblwr_update( } } -/********************************************************************//** -Check the LSN values on the page. */ -static -void -buf_dblwr_check_page_lsn( -/*=====================*/ - const page_t* page) /*!< in: page to check */ +#ifdef UNIV_DEBUG +/** Check the LSN values on the page. +@param[in] page page to check +@param[in] s tablespace */ +static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s) { - ibool page_compressed = (mach_read_from_2(page+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED); - uint key_version = mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); - /* Ignore page compressed or encrypted pages */ - if (page_compressed || key_version) { + if (s.is_compressed() + || buf_page_get_key_version(page, s.flags)) { return; } - if (memcmp(page + (FIL_PAGE_LSN + 4), - page + (srv_page_size - - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), - 4)) { - - const ulint lsn1 = mach_read_from_4( - page + FIL_PAGE_LSN + 4); - const ulint lsn2 = mach_read_from_4( - page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM - + 4); - - ib::error() << "The page to be written seems corrupt!" + const unsigned lsn1 = mach_read_from_4(page + FIL_PAGE_LSN + 4), + lsn2 = mach_read_from_4(page + srv_page_size + - (s.full_crc32() + ? 
FIL_PAGE_FCRC32_END_LSN + : FIL_PAGE_END_LSN_OLD_CHKSUM - 4)); + if (UNIV_UNLIKELY(lsn1 != lsn2)) { + ib::error() << "The page to be written to " + << s.chain.start->name << + " seems corrupt!" " The low 4 bytes of LSN fields do not match" " (" << lsn1 << " != " << lsn2 << ")!" " Noticed in the buffer pool."; } } +static void buf_dblwr_check_page_lsn(const buf_page_t& b, const byte* page) +{ + if (fil_space_t* space = fil_space_acquire_for_io(b.id.space())) { + buf_dblwr_check_page_lsn(page, *space); + space->release_for_io(); + } +} +#endif /* UNIV_DEBUG */ + /********************************************************************//** Asserts when a corrupt block is find during writing out data to the disk. */ @@ -829,7 +838,7 @@ buf_dblwr_assert_on_corrupt_block( /*==============================*/ const buf_block_t* block) /*!< in: block to check */ { - buf_page_print(block->frame, univ_page_size); + buf_page_print(block->frame); ib::fatal() << "Apparent corruption of an index page " << block->page.id @@ -919,14 +928,14 @@ buf_dblwr_write_block_to_datafile( void * frame = buf_page_get_frame(bpage); if (bpage->zip.data != NULL) { - ut_ad(bpage->size.is_compressed()); + ut_ad(bpage->zip_size()); - fil_io(request, sync, bpage->id, bpage->size, 0, - bpage->size.physical(), + fil_io(request, sync, bpage->id, bpage->zip_size(), 0, + bpage->zip_size(), (void*) frame, (void*) bpage); } else { - ut_ad(!bpage->size.is_compressed()); + ut_ad(!bpage->zip_size()); /* Our IO API is common for both reads and writes and is therefore geared towards a non-const parameter. 
*/ @@ -935,11 +944,10 @@ buf_dblwr_write_block_to_datafile( const_cast<buf_page_t*>(bpage)); ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - buf_dblwr_check_page_lsn(block->frame); - + ut_d(buf_dblwr_check_page_lsn(block->page, block->frame)); fil_io(request, - sync, bpage->id, bpage->size, 0, bpage->real_size, - frame, block); + sync, bpage->id, bpage->zip_size(), 0, bpage->real_size, + frame, block); } } @@ -1030,10 +1038,7 @@ try_again: /* Check that the actual page in the buffer pool is not corrupt and the LSN values are sane. */ buf_dblwr_check_block(block); - - /* Check that the page as written to the doublewrite - buffer has sane LSN values. */ - buf_dblwr_check_page_lsn(write_buf + len2); + ut_d(buf_dblwr_check_page_lsn(block->page, write_buf + len2)); } /* Write out the first block of the doublewrite buffer */ @@ -1041,7 +1046,7 @@ try_again: buf_dblwr->first_free) << srv_page_size_shift; fil_io(IORequestWrite, true, - page_id_t(TRX_SYS_SPACE, buf_dblwr->block1), univ_page_size, + page_id_t(TRX_SYS_SPACE, buf_dblwr->block1), 0, 0, len, (void*) write_buf, NULL); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { @@ -1057,7 +1062,7 @@ try_again: + (TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift); fil_io(IORequestWrite, true, - page_id_t(TRX_SYS_SPACE, buf_dblwr->block2), univ_page_size, + page_id_t(TRX_SYS_SPACE, buf_dblwr->block2), 0, 0, len, (void*) write_buf, NULL); flush: @@ -1143,18 +1148,15 @@ try_again: encryption and/or page compression */ void * frame = buf_page_get_frame(bpage); - if (bpage->size.is_compressed()) { - MEM_CHECK_DEFINED(bpage->zip.data, bpage->size.physical()); + if (auto zip_size = bpage->zip_size()) { + MEM_CHECK_DEFINED(bpage->zip.data, zip_size); /* Copy the compressed page and clear the rest. 
*/ - - memcpy(p, frame, bpage->size.physical()); - - memset(p + bpage->size.physical(), 0x0, - srv_page_size - bpage->size.physical()); + memcpy(p, frame, zip_size); + memset(p + zip_size, 0x0, srv_page_size - zip_size); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); - MEM_CHECK_DEFINED(frame, bpage->size.logical()); - memcpy(p, frame, bpage->size.logical()); + MEM_CHECK_DEFINED(frame, srv_page_size); + memcpy(p, frame, srv_page_size); } buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage; @@ -1216,8 +1218,8 @@ buf_dblwr_write_single_page( /* Check that the page as written to the doublewrite buffer has sane LSN values. */ if (!bpage->zip.data) { - buf_dblwr_check_page_lsn( - ((buf_block_t*) bpage)->frame); + ut_d(buf_dblwr_check_page_lsn( + *bpage, ((buf_block_t*) bpage)->frame)); } } @@ -1276,18 +1278,18 @@ retry: encryption and/or page compression */ void * frame = buf_page_get_frame(bpage); - if (bpage->size.is_compressed()) { + if (auto zip_size = bpage->zip_size()) { memcpy(buf_dblwr->write_buf + srv_page_size * i, - frame, bpage->size.physical()); + frame, zip_size); memset(buf_dblwr->write_buf + srv_page_size * i - + bpage->size.physical(), 0x0, - srv_page_size - bpage->size.physical()); + + zip_size, 0x0, + srv_page_size - zip_size); fil_io(IORequestWrite, true, page_id_t(TRX_SYS_SPACE, offset), - univ_page_size, + 0, 0, srv_page_size, (void *)(buf_dblwr->write_buf + srv_page_size * i), @@ -1298,7 +1300,7 @@ retry: fil_io(IORequestWrite, true, page_id_t(TRX_SYS_SPACE, offset), - univ_page_size, + 0, 0, srv_page_size, (void*) frame, diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index eb7d085ba57..234fb8ef5f3 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -671,7 +671,7 @@ buf_load() so all pages from a given tablespace are consecutive. 
*/ ulint cur_space_id = BUF_DUMP_SPACE(dump[0]); fil_space_t* space = fil_space_acquire_silent(cur_space_id); - page_size_t page_size(space ? space->flags : 0); + ulint zip_size = space ? space->zip_size() : 0; /* JAN: TODO: MySQL 5.7 PSI #ifdef HAVE_PSI_STAGE_INTERFACE @@ -702,9 +702,7 @@ buf_load() space = fil_space_acquire_silent(cur_space_id); if (space != NULL) { - const page_size_t cur_page_size( - space->flags); - page_size.copy_from(cur_page_size); + zip_size = space->zip_size(); } } @@ -719,7 +717,7 @@ buf_load() buf_read_page_background( page_id_t(this_space_id, BUF_DUMP_PAGE(dump[i])), - page_size, true); + zip_size, true); if (i % 64 == 63) { os_aio_simulated_wake_handler_threads(); @@ -821,7 +819,7 @@ DECLARE_THREAD(buf_dump_thread)(void*) if (srv_buffer_pool_load_at_startup) { #ifdef WITH_WSREP - if (!wsrep_recovery) { + if (!get_wsrep_recovery()) { #endif /* WITH_WSREP */ buf_load(); #ifdef WITH_WSREP @@ -855,7 +853,7 @@ DECLARE_THREAD(buf_dump_thread)(void*) "Dumping of buffer pool not started" " as load was incomplete"); #ifdef WITH_WSREP - } else if (wsrep_recovery) { + } else if (get_wsrep_recovery()) { #endif /* WITH_WSREP */ } else { buf_dump(FALSE/* do complete dump at shutdown */); diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index f948ecb483e..616885ed863 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -211,7 +211,7 @@ incr_flush_list_size_in_bytes( { ut_ad(buf_flush_list_mutex_own(buf_pool)); - buf_pool->stat.flush_list_bytes += block->page.size.physical(); + buf_pool->stat.flush_list_bytes += block->physical_size(); ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size); } @@ -427,120 +427,44 @@ buf_flush_insert_into_flush_list( ut_ad(buf_page_mutex_own(block)); buf_flush_list_mutex_enter(buf_pool); - - ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) - || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification - <= lsn)); - - /* If we are in 
the recovery then we need to update the flush - red-black tree as well. */ - if (buf_pool->flush_rbt != NULL) { - buf_flush_list_mutex_exit(buf_pool); - buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn); - return; - } - - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(!block->page.in_flush_list); - - ut_d(block->page.in_flush_list = TRUE); - block->page.oldest_modification = lsn; - - UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page); - - incr_flush_list_size_in_bytes(block, buf_pool); - - MEM_CHECK_DEFINED(block->page.size.is_compressed() - ? block->page.zip.data : block->frame, - block->page.size.physical()); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(buf_flush_validate_skip(buf_pool)); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - - buf_flush_list_mutex_exit(buf_pool); -} - -/********************************************************************//** -Inserts a modified block into the flush list in the right sorted position. -This function is used by recovery, because there the modifications do not -necessarily come in the order of lsn's. */ -void -buf_flush_insert_sorted_into_flush_list( -/*====================================*/ - buf_pool_t* buf_pool, /*!< in: buffer pool instance */ - buf_block_t* block, /*!< in/out: block which is modified */ - lsn_t lsn) /*!< in: oldest modification */ -{ - buf_page_t* prev_b; - buf_page_t* b; - - ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE); - ut_ad(!buf_pool_mutex_own(buf_pool)); - ut_ad(log_flush_order_mutex_own()); - ut_ad(buf_page_mutex_own(block)); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - - buf_flush_list_mutex_enter(buf_pool); - - /* The field in_LRU_list is protected by buf_pool->mutex, which - we are not holding. However, while a block is in the flush - list, it is dirty and cannot be discarded, not from the - page_hash or from the LRU list. 
At most, the uncompressed - page frame of a compressed block may be discarded or created - (copying the block->page to or from a buf_page_t that is - dynamically allocated from buf_buddy_alloc()). Because those - transitions hold block->mutex and the flush list mutex (via - buf_flush_relocate_on_flush_list()), there is no possibility - of a race condition in the assertions below. */ - ut_ad(block->page.in_LRU_list); - ut_ad(block->page.in_page_hash); - /* buf_buddy_block_register() will take a block in the - BUF_BLOCK_MEMORY state, not a file page. */ - ut_ad(!block->page.in_zip_hash); - ut_ad(!block->page.in_flush_list); ut_d(block->page.in_flush_list = TRUE); + ut_ad(!block->page.oldest_modification); block->page.oldest_modification = lsn; - - MEM_CHECK_DEFINED(block->page.size.is_compressed() + MEM_CHECK_DEFINED(block->page.zip.data ? block->page.zip.data : block->frame, - block->page.size.physical()); - - prev_b = NULL; - - /* For the most part when this function is called the flush_rbt - should not be NULL. In a very rare boundary case it is possible - that the flush_rbt has already been freed by the recovery thread - before the last page was hooked up in the flush_list by the - io-handler thread. In that case we'll just do a simple - linear search in the else block. */ - if (buf_pool->flush_rbt != NULL) { - - prev_b = buf_flush_insert_in_flush_rbt(&block->page); - - } else { - - b = UT_LIST_GET_FIRST(buf_pool->flush_list); - - while (b != NULL && b->oldest_modification - > block->page.oldest_modification) { + block->physical_size()); + incr_flush_list_size_in_bytes(block, buf_pool); - ut_ad(b->in_flush_list); - prev_b = b; - b = UT_LIST_GET_NEXT(list, b); + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE); + /* The field in_LRU_list is protected by buf_pool->mutex, which + we are not holding. 
However, while a block is in the flush + list, it is dirty and cannot be discarded, not from the + page_hash or from the LRU list. At most, the uncompressed + page frame of a compressed block may be discarded or created + (copying the block->page to or from a buf_page_t that is + dynamically allocated from buf_buddy_alloc()). Because those + transitions hold block->mutex and the flush list mutex (via + buf_flush_relocate_on_flush_list()), there is no possibility + of a race condition in the assertions below. */ + ut_ad(block->page.in_LRU_list); + ut_ad(block->page.in_page_hash); + /* buf_buddy_block_register() will take a block in the + BUF_BLOCK_MEMORY state, not a file page. */ + ut_ad(!block->page.in_zip_hash); + + if (buf_page_t* prev_b = + buf_flush_insert_in_flush_rbt(&block->page)) { + UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page); + goto func_exit; } } - if (prev_b == NULL) { - UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page); - } else { - UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page); - } - - incr_flush_list_size_in_bytes(block, buf_pool); - + UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page); +func_exit: #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(buf_flush_validate_low(buf_pool)); + ut_a(buf_flush_validate_skip(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ buf_flush_list_mutex_exit(buf_pool); @@ -662,7 +586,7 @@ buf_flush_remove( } /* If the flush_rbt is active then delete from there as well. */ - if (buf_pool->flush_rbt != NULL) { + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { buf_flush_delete_from_flush_rbt(bpage); } @@ -670,7 +594,7 @@ buf_flush_remove( because we assert on in_flush_list in comparison function. 
*/ ut_d(bpage->in_flush_list = FALSE); - buf_pool->stat.flush_list_bytes -= bpage->size.physical(); + buf_pool->stat.flush_list_bytes -= bpage->physical_size(); bpage->oldest_modification = 0; @@ -730,7 +654,7 @@ buf_flush_relocate_on_flush_list( /* If recovery is active we must swap the control blocks in the flush_rbt as well. */ - if (buf_pool->flush_rbt != NULL) { + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { buf_flush_delete_from_flush_rbt(bpage); prev_b = buf_flush_insert_in_flush_rbt(dpage); } @@ -797,9 +721,9 @@ void buf_flush_write_complete(buf_page_t* bpage, bool dblwr) /** Calculate the checksum of a page from compressed table and update the page. -@param[in,out] page page to update -@param[in] size compressed page size -@param[in] lsn LSN to stamp on the page */ +@param[in,out] page page to update +@param[in] size compressed page size +@param[in] lsn LSN to stamp on the page */ void buf_flush_update_zip_checksum( buf_frame_t* page, @@ -816,31 +740,50 @@ buf_flush_update_zip_checksum( mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); } +/** Assign the full crc32 checksum for non-compressed page. +@param[in,out] page page to be updated */ +void buf_flush_assign_full_crc32_checksum(byte* page) +{ + ut_d(bool compressed = false); + ut_d(bool corrupted = false); + ut_d(const uint size = buf_page_full_crc32_size(page, &compressed, + &corrupted)); + ut_ad(!compressed); + ut_ad(!corrupted); + ut_ad(size == uint(srv_page_size)); + const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(page + payload, ut_crc32(page, payload)); +} + /** Initialize a page for writing to the tablespace. 
-@param[in] block buffer block; NULL if bypassing the buffer pool -@param[in,out] page page frame -@param[in,out] page_zip_ compressed page, or NULL if uncompressed -@param[in] newest_lsn newest modification LSN to the page */ +@param[in] block buffer block; NULL if bypassing + the buffer pool +@param[in,out] page page frame +@param[in,out] page_zip_ compressed page, or NULL if + uncompressed +@param[in] newest_lsn newest modification LSN to the page +@param[in] use_full_checksum whether tablespace uses full checksum */ void buf_flush_init_for_writing( const buf_block_t* block, byte* page, void* page_zip_, - lsn_t newest_lsn) + lsn_t newest_lsn, + bool use_full_checksum) { + if (block != NULL && block->frame != page) { + /* If page is encrypted in full crc32 format then + checksum stored already as a part of fil_encrypt_buf() */ + ut_ad(use_full_checksum); + return; + } + ut_ad(block == NULL || block->frame == page); ut_ad(block == NULL || page_zip_ == NULL || &block->page.zip == page_zip_); ut_ad(!block || newest_lsn); ut_ad(page); -#if 0 /* MDEV-15528 TODO: reinstate this check */ - /* innodb_immediate_scrub_data_uncompressed=ON would cause - fsp_init_file_page() to be called on freed pages, and thus - cause them to be written as almost-all-zeroed. - In MDEV-15528 we should change that implement an option to - make freed pages appear all-zero, bypassing this code. 
*/ ut_ad(!newest_lsn || fil_page_get_type(page)); -#endif if (page_zip_) { page_zip_des_t* page_zip; @@ -885,8 +828,14 @@ buf_flush_init_for_writing( /* Write the newest modification lsn to the page header and trailer */ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); - mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, - newest_lsn); + if (use_full_checksum) { + mach_write_to_4(page + srv_page_size - FIL_PAGE_FCRC32_END_LSN, + static_cast<uint32_t>(newest_lsn)); + return buf_flush_assign_full_crc32_checksum(page); + } else { + mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + newest_lsn); + } if (block && srv_page_size == 16384) { /* The page type could be garbage in old files @@ -964,6 +913,8 @@ buf_flush_init_for_writing( be calculated after storing the new formula checksum. */ checksum = buf_calc_page_old_checksum(page); break; + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: case SRV_CHECKSUM_ALGORITHM_CRC32: case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: /* In other cases we write the same checksum to both fields. 
*/ @@ -1006,7 +957,10 @@ buf_flush_write_block_low( || space->purpose == FIL_TYPE_TABLESPACE); ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == (space == fil_system.temp_space)); + page_t* frame = NULL; + const bool full_crc32 = space->full_crc32(); + #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); ut_ad(!buf_pool_mutex_own(buf_pool)); @@ -1046,8 +1000,7 @@ buf_flush_write_block_low( break; case BUF_BLOCK_ZIP_DIRTY: frame = bpage->zip.data; - - buf_flush_update_zip_checksum(frame, bpage->size.physical(), + buf_flush_update_zip_checksum(frame, bpage->zip_size(), bpage->newest_modification); break; case BUF_BLOCK_FILE_PAGE: @@ -1056,15 +1009,23 @@ buf_flush_write_block_low( frame = ((buf_block_t*) bpage)->frame; } + byte* page = reinterpret_cast<const buf_block_t*>(bpage)->frame; + + if (full_crc32) { + page = buf_page_encrypt(space, bpage, page); + frame = page; + } + buf_flush_init_for_writing( - reinterpret_cast<const buf_block_t*>(bpage), - reinterpret_cast<const buf_block_t*>(bpage)->frame, + reinterpret_cast<const buf_block_t*>(bpage), page, bpage->zip.data ? 
&bpage->zip : NULL, - bpage->newest_modification); + bpage->newest_modification, full_crc32); break; } - frame = buf_page_encrypt_before_write(space, bpage, frame); + if (!full_crc32) { + frame = buf_page_encrypt(space, bpage, frame); + } ut_ad(space->purpose == FIL_TYPE_TABLESPACE || space->atomic_write_supported); @@ -1075,7 +1036,8 @@ buf_flush_write_block_low( /* TODO: pass the tablespace to fil_io() */ fil_io(request, - sync, bpage->id, bpage->size, 0, bpage->size.physical(), + sync, bpage->id, bpage->zip_size(), 0, + bpage->physical_size(), frame, bpage); } else { ut_ad(!srv_read_only_mode); @@ -1336,9 +1298,13 @@ buf_flush_try_neighbors( buf_pool_t* buf_pool = buf_pool_get(page_id); ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + fil_space_t* space = fil_space_acquire_for_io(page_id.space()); + if (!space) { + return 0; + } if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN - || srv_flush_neighbors == 0) { + || !srv_flush_neighbors || !space->is_rotational()) { /* If there is little space or neighbor flushing is not enabled then just flush the victim. */ low = page_id.page_no(); @@ -1393,9 +1359,8 @@ buf_flush_try_neighbors( } } - const ulint space_size = fil_space_get_size(page_id.space()); - if (high > space_size) { - high = space_size; + if (high > space->size) { + high = space->size; } DBUG_PRINT("ib_buf", ("flush %u:%u..%u", @@ -1472,6 +1437,8 @@ buf_flush_try_neighbors( buf_pool_mutex_exit(buf_pool); } + space->release_for_io(); + if (count > 1) { MONITOR_INC_VALUE_CUMULATIVE( MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, @@ -3056,7 +3023,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*) " See the man page of setpriority()."; } /* Signal that setpriority() has been attempted. 
*/ - os_event_set(recv_sys->flush_end); + os_event_set(recv_sys.flush_end); #endif /* UNIV_LINUX */ do { @@ -3064,13 +3031,13 @@ DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*) ulint n_flushed_lru = 0; ulint n_flushed_list = 0; - os_event_wait(recv_sys->flush_start); + os_event_wait(recv_sys.flush_start); if (!recv_writer_thread_active) { break; } - switch (recv_sys->flush_type) { + switch (recv_sys.flush_type) { case BUF_FLUSH_LRU: /* Flush pages from end of LRU if required */ pc_request(0, LSN_MAX); @@ -3091,8 +3058,8 @@ DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*) ut_ad(0); } - os_event_reset(recv_sys->flush_start); - os_event_set(recv_sys->flush_end); + os_event_reset(recv_sys.flush_start); + os_event_set(recv_sys.flush_end); } while (recv_writer_thread_active); os_event_wait(buf_flush_event); @@ -3577,7 +3544,7 @@ buf_flush_validate_low( /* If we are in recovery mode i.e.: flush_rbt != NULL then each block in the flush_list must also be present in the flush_rbt. 
*/ - if (buf_pool->flush_rbt != NULL) { + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { rnode = rbt_first(buf_pool->flush_rbt); } @@ -3598,7 +3565,7 @@ buf_flush_validate_low( || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); ut_a(om > 0); - if (buf_pool->flush_rbt != NULL) { + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { buf_page_t** prpage; ut_a(rnode != NULL); diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index c81975a1f2c..6f33a0399f9 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -168,7 +168,7 @@ incr_LRU_size_in_bytes( { ut_ad(buf_pool_mutex_own(buf_pool)); - buf_pool->stat.LRU_bytes += bpage->size.physical(); + buf_pool->stat.LRU_bytes += bpage->physical_size(); ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size); } @@ -1233,7 +1233,7 @@ buf_LRU_remove_block( UT_LIST_REMOVE(buf_pool->LRU, bpage); ut_d(bpage->in_LRU_list = FALSE); - buf_pool->stat.LRU_bytes -= bpage->size.physical(); + buf_pool->stat.LRU_bytes -= bpage->physical_size(); buf_unzip_LRU_remove_block_if_needed(bpage); @@ -1501,7 +1501,7 @@ func_exit: ? 
BUF_BLOCK_ZIP_DIRTY : BUF_BLOCK_ZIP_PAGE; - ut_ad(b->size.is_compressed()); + ut_ad(b->zip_size()); /* The fields in_page_hash and in_LRU_list of the to-be-freed block descriptor should have @@ -1580,10 +1580,6 @@ func_exit: page_zip_set_size(&bpage->zip, 0); - bpage->size.copy_from(page_size_t(bpage->size.logical(), - bpage->size.logical(), - false)); - mutex_exit(block_mutex); /* Prevent buf_page_get_gen() from @@ -1673,19 +1669,14 @@ buf_LRU_block_free_non_file_page( buf_page_mutex_exit(block); buf_pool_mutex_exit_forbid(buf_pool); - ut_ad(block->page.size.is_compressed()); + ut_ad(block->zip_size()); - buf_buddy_free(buf_pool, data, block->page.size.physical()); + buf_buddy_free(buf_pool, data, block->zip_size()); buf_pool_mutex_exit_allow(buf_pool); buf_page_mutex_enter(block); page_zip_set_size(&block->page.zip, 0); - - block->page.size.copy_from( - page_size_t(block->page.size.logical(), - block->page.size.logical(), - false)); } if (buf_pool->curr_size < buf_pool->old_size @@ -1756,7 +1747,7 @@ buf_LRU_block_remove_hashed( const page_t* page = ((buf_block_t*) bpage)->frame; ut_a(!zip || bpage->oldest_modification == 0); - ut_ad(bpage->size.is_compressed()); + ut_ad(bpage->zip_size()); switch (fil_page_get_type(page)) { case FIL_PAGE_TYPE_ALLOCATED: @@ -1771,7 +1762,7 @@ buf_LRU_block_remove_hashed( to the compressed page, which will be preserved. 
*/ memcpy(bpage->zip.data, page, - bpage->size.physical()); + bpage->zip_size()); } break; case FIL_PAGE_TYPE_ZBLOB: @@ -1788,14 +1779,13 @@ buf_LRU_block_remove_hashed( default: ib::error() << "The compressed page to be" " evicted seems corrupt:"; - ut_print_buf(stderr, page, - bpage->size.logical()); + ut_print_buf(stderr, page, srv_page_size); ib::error() << "Possibly older version of" " the page:"; ut_print_buf(stderr, bpage->zip.data, - bpage->size.physical()); + bpage->zip_size()); putc('\n', stderr); ut_error; } @@ -1805,10 +1795,7 @@ buf_LRU_block_remove_hashed( /* fall through */ case BUF_BLOCK_ZIP_PAGE: ut_a(bpage->oldest_modification == 0); - if (bpage->size.is_compressed()) { - MEM_CHECK_ADDRESSABLE(bpage->zip.data, - bpage->size.physical()); - } + MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size()); break; case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_DIRTY: @@ -1839,7 +1826,7 @@ buf_LRU_block_remove_hashed( ut_ad(!bpage->in_flush_list); ut_ad(!bpage->in_LRU_list); ut_a(bpage->zip.data); - ut_a(bpage->size.is_compressed()); + ut_a(bpage->zip.ssize); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG UT_LIST_REMOVE(buf_pool->zip_clean, bpage); @@ -1849,8 +1836,7 @@ buf_LRU_block_remove_hashed( rw_lock_x_unlock(hash_lock); buf_pool_mutex_exit_forbid(buf_pool); - buf_buddy_free(buf_pool, bpage->zip.data, - bpage->size.physical()); + buf_buddy_free(buf_pool, bpage->zip.data, bpage->zip_size()); buf_pool_mutex_exit_allow(buf_pool); buf_page_free_descriptor(bpage); @@ -1896,16 +1882,11 @@ buf_LRU_block_remove_hashed( ut_ad(!bpage->in_LRU_list); buf_pool_mutex_exit_forbid(buf_pool); - buf_buddy_free(buf_pool, data, bpage->size.physical()); + buf_buddy_free(buf_pool, data, bpage->zip_size()); buf_pool_mutex_exit_allow(buf_pool); page_zip_set_size(&bpage->zip, 0); - - bpage->size.copy_from( - page_size_t(bpage->size.logical(), - bpage->size.logical(), - false)); } return(true); @@ -1964,7 +1945,7 @@ void buf_LRU_free_one_page(buf_page_t* bpage, page_id_t 
old_page_id) rw_lock_x_lock(hash_lock); - while (buf_block_get_fix(bpage) > 0) { + while (bpage->buf_fix_count > 0) { /* Wait for other threads to release the fix count before releasing the bpage from LRU list. */ } @@ -2244,7 +2225,7 @@ buf_LRU_print_instance( if (bpage->buf_fix_count) { fprintf(stderr, "buffix count %u ", - bpage->buf_fix_count); + uint32_t(bpage->buf_fix_count)); } if (buf_page_get_io_fix(bpage)) { @@ -2269,7 +2250,7 @@ buf_LRU_print_instance( fprintf(stderr, "\ntype %u size " ULINTPF " index id " IB_ID_FMT "\n", fil_page_get_type(frame), - bpage->size.physical(), + bpage->zip_size(), btr_page_get_index_id(frame)); break; diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 7877087dd4a..33ccd6688ae 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -96,15 +96,14 @@ buffer buf_pool if it is not already there, in which case does nothing. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by an i/o-handler thread. 
-@param[out] err DB_SUCCESS, DB_TABLESPACE_DELETED or - DB_TABLESPACE_TRUNCATED if we are trying - to read from a non-existent tablespace, a - tablespace which is just now being dropped, - or a tablespace which is truncated +@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED + if we are trying + to read from a non-existent tablespace @param[in] sync true if synchronous aio is desired @param[in] type IO type, SIMULATED, IGNORE_MISSING @param[in] mode BUF_READ_IBUF_PAGES_ONLY, ..., @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] unzip true=request uncompressed page @param[in] ignore_missing_space true=ignore missing space when reading @return 1 if a read request was queued, 0 if the page already resided @@ -119,7 +118,7 @@ buf_read_page_low( ulint type, ulint mode, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, bool unzip, bool ignore_missing_space = false) { @@ -135,7 +134,7 @@ buf_read_page_low( return(0); } - if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) { + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) { /* Trx sys header is so low in the latching order that we play safe and do not leave the i/o-completion to an asynchronous @@ -150,7 +149,7 @@ buf_read_page_low( or is being dropped; if we succeed in initing the page in the buffer pool for read, then DISCARD cannot proceed until the read has completed */ - bpage = buf_page_init_for_read(err, mode, page_id, page_size, unzip); + bpage = buf_page_init_for_read(err, mode, page_id, zip_size, unzip); if (bpage == NULL) { @@ -158,7 +157,7 @@ buf_read_page_low( } DBUG_LOG("ib_buf", - "read page " << page_id << " size=" << page_size.physical() + "read page " << page_id << " zip_size=" << zip_size << " unzip=" << unzip << ',' << (sync ? 
"sync" : "async")); ut_ad(buf_page_in_file(bpage)); @@ -169,7 +168,7 @@ buf_read_page_low( void* dst; - if (page_size.is_compressed()) { + if (zip_size) { dst = bpage->zip.data; } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); @@ -180,7 +179,8 @@ buf_read_page_low( IORequest request(type | IORequest::READ); *err = fil_io( - request, sync, page_id, page_size, 0, page_size.physical(), + request, sync, page_id, zip_size, 0, + zip_size ? zip_size : srv_page_size, dst, bpage, ignore_missing_space); if (sync) { @@ -188,20 +188,8 @@ buf_read_page_low( } if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { - if (*err == DB_TABLESPACE_TRUNCATED) { - /* Remove the page which is outside the - truncated tablespace bounds when recovering - from a crash happened during a truncation */ - buf_read_page_handle_error(bpage); - if (recv_recovery_is_on()) { - mutex_enter(&recv_sys->mutex); - ut_ad(recv_sys->n_addrs > 0); - recv_sys->n_addrs--; - mutex_exit(&recv_sys->mutex); - } - return(0); - } else if (IORequest::ignore_missing(type) - || *err == DB_TABLESPACE_DELETED) { + if (IORequest::ignore_missing(type) + || *err == DB_TABLESPACE_DELETED) { buf_read_page_handle_error(bpage); return(0); } @@ -233,16 +221,13 @@ performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous i/o. @param[in] page_id page id of a page which the current thread wants to access -@param[in] page_size page size -@param[in] inside_ibuf TRUE if we are inside ibuf routine +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine @return number of page read requests issued; NOTE that if we read ibuf pages, it may happen that the page at the given page number does not get read even if we return a positive value! 
*/ ulint -buf_read_ahead_random( - const page_id_t page_id, - const page_size_t& page_size, - ibool inside_ibuf) +buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) { buf_pool_t* buf_pool = buf_pool_get(page_id); ulint recent_blocks = 0; @@ -264,7 +249,7 @@ buf_read_ahead_random( return(0); } - if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) { + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) { /* If it is an ibuf bitmap page or trx sys hdr, we do no read-ahead, as that could break the ibuf page access @@ -279,14 +264,14 @@ buf_read_ahead_random( high = (page_id.page_no() / buf_read_ahead_random_area + 1) * buf_read_ahead_random_area; - /* Remember the tablespace version before we ask the tablespace size - below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + /* If DISCARD + IMPORT changes the actual .ibd file meanwhile, we do not try to read outside the bounds of the tablespace! */ if (fil_space_t* space = fil_space_acquire(page_id.space())) { #ifdef UNIV_DEBUG if (srv_file_per_table) { ulint size = 0; + const ulint physical_size = space->physical_size(); for (const fil_node_t* node = UT_LIST_GET_FIRST(space->chain); @@ -294,7 +279,7 @@ buf_read_ahead_random( node = UT_LIST_GET_NEXT(chain, node)) { size += ulint(os_file_get_size(node->handle) - / page_size.physical()); + / physical_size); } ut_ad(size == space->size); @@ -347,12 +332,7 @@ buf_read_ahead_random( read_ahead: /* Read all the suitable blocks within the area */ - if (inside_ibuf) { - ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; - } else { - ibuf_mode = BUF_READ_ANY_PAGE; - } - + ibuf_mode = ibuf ? 
BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; count = 0; for (i = low; i < high; i++) { @@ -361,16 +341,15 @@ read_ahead: const page_id_t cur_page_id(page_id.space(), i); - if (!ibuf_bitmap_page(cur_page_id, page_size)) { + if (!ibuf_bitmap_page(cur_page_id, zip_size)) { count += buf_read_page_low( &err, false, IORequest::DO_NOT_WAKE, ibuf_mode, - cur_page_id, page_size, false); + cur_page_id, zip_size, false); switch (err) { case DB_SUCCESS: - case DB_TABLESPACE_TRUNCATED: case DB_ERROR: break; case DB_TABLESPACE_DELETED: @@ -412,16 +391,13 @@ buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @retval DB_SUCCESS if the page was read and is not corrupted, @retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ -dberr_t -buf_read_page( - const page_id_t page_id, - const page_size_t& page_size) +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) { ulint count; dberr_t err = DB_SUCCESS; @@ -434,7 +410,7 @@ buf_read_page( count = buf_read_page_low( &err, true, - 0, BUF_READ_ANY_PAGE, page_id, page_size, false); + 0, BUF_READ_ANY_PAGE, page_id, zip_size, false); srv_stats.buf_pool_reads.add(count); @@ -454,13 +430,10 @@ buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. 
@param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] sync true if synchronous aio is desired */ void -buf_read_page_background( - const page_id_t page_id, - const page_size_t& page_size, - bool sync) +buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync) { ulint count; dberr_t err; @@ -469,11 +442,10 @@ buf_read_page_background( &err, sync, IORequest::DO_NOT_WAKE | IORequest::IGNORE_MISSING, BUF_READ_ANY_PAGE, - page_id, page_size, false); + page_id, zip_size, false); switch (err) { case DB_SUCCESS: - case DB_TABLESPACE_TRUNCATED: case DB_ERROR: break; case DB_TABLESPACE_DELETED: @@ -525,14 +497,11 @@ NOTE 3: the calling thread must want access to the page given: this rule is set to prevent unintended read-aheads performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous io. @param[in] page_id page id; see NOTE 3 above -@param[in] page_size page size -@param[in] inside_ibuf TRUE if we are inside ibuf routine +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether if we are inside ibuf routine @return number of page read requests issued */ ulint -buf_read_ahead_linear( - const page_id_t page_id, - const page_size_t& page_size, - ibool inside_ibuf) +buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) { buf_pool_t* buf_pool = buf_pool_get(page_id); buf_page_t* bpage; @@ -571,7 +540,7 @@ buf_read_ahead_linear( return(0); } - if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) { + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) { /* If it is an ibuf bitmap page or trx sys hdr, we do no read-ahead, as that could break the ibuf page access @@ -732,9 +701,7 @@ buf_read_ahead_linear( /* If we got this far, read-ahead can be sensible: do it */ - ulint ibuf_mode; - - ibuf_mode = inside_ibuf ? 
BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; + ulint ibuf_mode = ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; /* Since Windows XP seems to schedule the i/o handler thread very eagerly, and consequently it does not wait for the @@ -748,15 +715,14 @@ buf_read_ahead_linear( const page_id_t cur_page_id(page_id.space(), i); - if (!ibuf_bitmap_page(cur_page_id, page_size)) { + if (!ibuf_bitmap_page(cur_page_id, zip_size)) { count += buf_read_page_low( &err, false, IORequest::DO_NOT_WAKE, - ibuf_mode, cur_page_id, page_size, false); + ibuf_mode, cur_page_id, zip_size, false); switch (err) { case DB_SUCCESS: - case DB_TABLESPACE_TRUNCATED: case DB_TABLESPACE_DELETED: case DB_ERROR: break; @@ -860,11 +826,11 @@ next: sync && (i + 1 == n_stored), 0, BUF_READ_ANY_PAGE, page_id, - page_size_t(space->flags), true); + space->zip_size(), + true, true /* ignore_missing_space */); switch(err) { case DB_SUCCESS: - case DB_TABLESPACE_TRUNCATED: case DB_ERROR: break; case DB_TABLESPACE_DELETED: @@ -916,7 +882,7 @@ buf_read_recv_pages( fil_space_open_if_needed(space); - const page_size_t page_size(space->flags); + const ulint zip_size = space->zip_size(); for (ulint i = 0; i < n_stored; i++) { buf_pool_t* buf_pool; @@ -953,13 +919,13 @@ buf_read_recv_pages( &err, true, 0, BUF_READ_ANY_PAGE, - cur_page_id, page_size, true); + cur_page_id, zip_size, true); } else { buf_read_page_low( &err, false, IORequest::DO_NOT_WAKE, BUF_READ_ANY_PAGE, - cur_page_id, page_size, true); + cur_page_id, zip_size, true); } if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) { diff --git a/storage/innobase/data/data0data.cc b/storage/innobase/data/data0data.cc index 49bb8715a51..2d100519752 100644 --- a/storage/innobase/data/data0data.cc +++ b/storage/innobase/data/data0data.cc @@ -58,7 +58,12 @@ void dtuple_t::trim(const dict_index_t& index) for (; i > index.n_core_fields; i--) { const dfield_t* dfield = dtuple_get_nth_field(this, i - 1); const dict_col_t* col = 
dict_index_get_nth_col(&index, i - 1); - ut_ad(col->is_instant()); + + if (col->is_dropped()) { + continue; + } + + ut_ad(col->is_added()); ulint len = dfield_get_len(dfield); if (len != col->def_val.len) { break; @@ -573,7 +578,6 @@ dtuple_convert_big_rec( mem_heap_t* heap; big_rec_t* vector; dfield_t* dfield; - dict_field_t* ifield; ulint size; ulint n_fields; ulint local_prefix_len; @@ -582,7 +586,14 @@ dtuple_convert_big_rec( return(NULL); } - const ulint local_len = index->table->get_overflow_field_local_len(); + if (!index->table->space) { + return NULL; + } + + ulint local_len = index->table->get_overflow_field_local_len(); + const auto zip_size = index->table->space->zip_size(); + + ut_ad(index->n_uniq > 0); ut_a(dtuple_check_typed_no_assert(entry)); @@ -605,24 +616,42 @@ dtuple_convert_big_rec( stored externally */ n_fields = 0; + ulint longest_i; + + const bool mblob = entry->is_alter_metadata(); + ut_ad(entry->n_fields >= index->first_user_field() + mblob); + ut_ad(entry->n_fields - mblob <= index->n_fields); + + if (mblob) { + longest_i = index->first_user_field(); + dfield = dtuple_get_nth_field(entry, longest_i); + local_len = BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(!dfield_is_ext(dfield)); + goto ext_write; + } + + if (!dict_table_has_atomic_blobs(index->table)) { + /* up to MySQL 5.1: store a 768-byte prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE + + DICT_ANTELOPE_MAX_INDEX_COL_LEN; + } else { + /* new-format table: do not store any BLOB prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE; + } while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, *n_ext), - dict_table_is_comp(index->table), + index->table->not_redundant(), dict_index_get_n_fields(index), - dict_table_page_size(index->table))) { - - ulint i; - ulint longest = 0; - ulint longest_i = ULINT_MAX; - byte* data; - - for (i = dict_index_get_n_unique_in_tree(index); - i < dtuple_get_n_fields(entry); i++) { + zip_size)) { + longest_i = 0; + for (ulint i = 
index->first_user_field(), longest = 0; + i + mblob < entry->n_fields; i++) { ulint savings; + dfield = dtuple_get_nth_field(entry, i + mblob); - dfield = dtuple_get_nth_field(entry, i); - ifield = dict_index_get_nth_field(index, i); + const dict_field_t* ifield = dict_index_get_nth_field( + index, i); /* Skip fixed-length, NULL, externally stored, or short columns */ @@ -664,7 +693,7 @@ skip_field: continue; } - if (!longest) { + if (!longest_i) { /* Cannot shorten more */ mem_heap_free(heap); @@ -677,9 +706,8 @@ skip_field: We store the first bytes locally to the record. Then we can calculate all ordering fields in all indexes from locally stored data. */ - dfield = dtuple_get_nth_field(entry, longest_i); - ifield = dict_index_get_nth_field(index, longest_i); +ext_write: local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE; vector->append( @@ -690,7 +718,8 @@ skip_field: + local_prefix_len)); /* Allocate the locally stored part of the column. */ - data = static_cast<byte*>(mem_heap_alloc(heap, local_len)); + byte* data = static_cast<byte*>( + mem_heap_alloc(heap, local_len)); /* Copy the local prefix. */ memcpy(data, dfield_get_data(dfield), local_prefix_len); diff --git a/storage/innobase/data/data0type.cc b/storage/innobase/data/data0type.cc index 896d1240340..7de4cc026d1 100644 --- a/storage/innobase/data/data0type.cc +++ b/storage/innobase/data/data0type.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,7 +24,8 @@ Data types Created 1/16/1996 Heikki Tuuri *******************************************************/ -#include "data0type.h" +#include "dict0mem.h" +#include "my_sys.h" /** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] = { @@ -79,67 +80,6 @@ dtype_get_at_most_n_mbchars( } /*********************************************************************//** -Checks if a data main type is a string type. Also a BLOB is considered a -string type. -@return TRUE if string type */ -ibool -dtype_is_string_type( -/*=================*/ - ulint mtype) /*!< in: InnoDB main data type code: DATA_CHAR, ... */ -{ - if (mtype <= DATA_BLOB - || mtype == DATA_MYSQL - || mtype == DATA_VARMYSQL) { - - return(TRUE); - } - - return(FALSE); -} - -/*********************************************************************//** -Checks if a type is a binary string type. Note that for tables created with -< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For -those DATA_BLOB columns this function currently returns FALSE. -@return TRUE if binary string type */ -ibool -dtype_is_binary_string_type( -/*========================*/ - ulint mtype, /*!< in: main data type */ - ulint prtype) /*!< in: precise type */ -{ - if ((mtype == DATA_FIXBINARY) - || (mtype == DATA_BINARY) - || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE))) { - - return(TRUE); - } - - return(FALSE); -} - -/*********************************************************************//** -Checks if a type is a non-binary string type. That is, dtype_is_string_type is -TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created -with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. 
-For those DATA_BLOB columns this function currently returns TRUE. -@return TRUE if non-binary string type */ -ibool -dtype_is_non_binary_string_type( -/*============================*/ - ulint mtype, /*!< in: main data type */ - ulint prtype) /*!< in: precise type */ -{ - if (dtype_is_string_type(mtype) == TRUE - && dtype_is_binary_string_type(mtype, prtype) == FALSE) { - - return(TRUE); - } - - return(FALSE); -} - -/*********************************************************************//** Validates a data type structure. @return TRUE if ok */ ibool diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc index 87a2fe3f03f..0055ca6ef17 100644 --- a/storage/innobase/dict/dict0boot.cc +++ b/storage/innobase/dict/dict0boot.cc @@ -47,7 +47,7 @@ dict_hdr_get( dict_hdr_t* header; block = buf_page_get(page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO), - univ_page_size, RW_X_LATCH, mtr); + 0, RW_X_LATCH, mtr); header = DICT_HDR + buf_block_get_frame(block); buf_block_dbg_add_level(block, SYNC_DICT_HEADER); @@ -64,52 +64,14 @@ dict_hdr_get_new_id( (not assigned if NULL) */ index_id_t* index_id, /*!< out: index id (not assigned if NULL) */ - ulint* space_id, /*!< out: space id + ulint* space_id) /*!< out: space id (not assigned if NULL) */ - const dict_table_t* table, /*!< in: table */ - bool disable_redo) /*!< in: if true and table - object is NULL - then disable-redo */ { dict_hdr_t* dict_hdr; ib_id_t id; mtr_t mtr; mtr_start(&mtr); - if (table) { - if (table->is_temporary()) { - mtr.set_log_mode(MTR_LOG_NO_REDO); - } - } else if (disable_redo) { - /* In non-read-only mode we need to ensure that space-id header - page is written to disk else if page is removed from buffer - cache and re-loaded it would assign temporary tablespace id - to another tablespace. - This is not a case with read-only mode as there is no new object - that is created except temporary tablespace. */ - mtr.set_log_mode(srv_read_only_mode - ? 
MTR_LOG_NONE : MTR_LOG_NO_REDO); - } - - /* Server started and let's say space-id = x - - table created with file-per-table - - space-id = x + 1 - - crash - Case 1: If it was redo logged then we know that it will be - restored to x + 1 - Case 2: if not redo-logged - Header will have the old space-id = x - This is OK because on restart there is no object with - space id = x + 1 - Case 3: - space-id = x (on start) - space-id = x+1 (temp-table allocation) - no redo logging - space-id = x+2 (non-temp-table allocation), this get's - redo logged. - If there is a crash there will be only 2 entries - x (original) and x+2 (new) and disk hdr will be updated - to reflect x + 2 entry. - We cannot allocate the same space id to different objects. */ dict_hdr = dict_hdr_get(&mtr); if (table_id) { @@ -148,9 +110,9 @@ dict_hdr_flush_row_id(void) row_id_t id; mtr_t mtr; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); - id = dict_sys->row_id; + id = dict_sys.row_id; mtr_start(&mtr); @@ -210,7 +172,7 @@ dict_hdr_create( /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, fil_system.sys_space, DICT_TABLES_ID, - dict_ind_redundant, NULL, mtr); + dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -221,7 +183,7 @@ dict_hdr_create( /*--------------------------*/ root_page_no = btr_create(DICT_UNIQUE, fil_system.sys_space, DICT_TABLE_IDS_ID, - dict_ind_redundant, NULL, mtr); + dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -232,7 +194,7 @@ dict_hdr_create( /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, fil_system.sys_space, DICT_COLUMNS_ID, - dict_ind_redundant, NULL, mtr); + dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -243,7 +205,7 @@ dict_hdr_create( /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, fil_system.sys_space, DICT_INDEXES_ID, - dict_ind_redundant, NULL, 
mtr); + dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -254,7 +216,7 @@ dict_hdr_create( /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, fil_system.sys_space, DICT_FIELDS_ID, - dict_ind_redundant, NULL, mtr); + dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -302,11 +264,11 @@ dict_boot(void) mtr_start(&mtr); /* Create the hash tables etc. */ - dict_init(); + dict_sys.create(); heap = mem_heap_create(450); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); /* Get the dictionary header */ dict_hdr = dict_hdr_get(&mtr); @@ -321,7 +283,7 @@ dict_boot(void) ..._MARGIN, it will immediately be updated to the disk-based header. */ - dict_sys->row_id = DICT_HDR_ROW_ID_WRITE_MARGIN + dict_sys.row_id = DICT_HDR_ROW_ID_WRITE_MARGIN + ut_uint64_align_up(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID), DICT_HDR_ROW_ID_WRITE_MARGIN); @@ -350,7 +312,7 @@ dict_boot(void) dict_table_add_system_columns(table, heap); table->add_to_cache(); - dict_sys->sys_tables = table; + dict_sys.sys_tables = table; mem_heap_empty(heap); index = dict_mem_index_create(table, "CLUST_IND", @@ -391,7 +353,7 @@ dict_boot(void) dict_table_add_system_columns(table, heap); table->add_to_cache(); - dict_sys->sys_columns = table; + dict_sys.sys_columns = table; mem_heap_empty(heap); index = dict_mem_index_create(table, "CLUST_IND", @@ -434,7 +396,7 @@ dict_boot(void) dict_table_get_nth_col(table, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD) ->def_val.len = UNIV_SQL_NULL; table->add_to_cache(); - dict_sys->sys_indexes = table; + dict_sys.sys_indexes = table; mem_heap_empty(heap); index = dict_mem_index_create(table, "CLUST_IND", @@ -463,7 +425,7 @@ dict_boot(void) dict_table_add_system_columns(table, heap); table->add_to_cache(); - dict_sys->sys_fields = table; + dict_sys.sys_fields = table; mem_heap_free(heap); index = dict_mem_index_create(table, "CLUST_IND", @@ -511,14 +473,14 @@ dict_boot(void) if (err == 
DB_SUCCESS) { /* Load definitions of other indexes on system tables */ - dict_load_sys_table(dict_sys->sys_tables); - dict_load_sys_table(dict_sys->sys_columns); - dict_load_sys_table(dict_sys->sys_indexes); - dict_load_sys_table(dict_sys->sys_fields); + dict_load_sys_table(dict_sys.sys_tables); + dict_load_sys_table(dict_sys.sys_columns); + dict_load_sys_table(dict_sys.sys_indexes); + dict_load_sys_table(dict_sys.sys_fields); } } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return(err); } diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc index fd45aee407d..28974971786 100644 --- a/storage/innobase/dict/dict0crea.cc +++ b/storage/innobase/dict/dict0crea.cc @@ -58,7 +58,6 @@ dict_create_sys_tables_tuple( which the memory for the built tuple is allocated */ { - dict_table_t* sys_tables; dtuple_t* entry; dfield_t* dfield; byte* ptr; @@ -69,11 +68,9 @@ dict_create_sys_tables_tuple( ut_ad(heap); ut_ad(table->n_cols >= DATA_N_SYS_COLS); - sys_tables = dict_sys->sys_tables; - entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS); - dict_table_copy_types(entry, sys_tables); + dict_table_copy_types(entry, dict_sys.sys_tables); /* 0: NAME -----------------------------*/ dfield = dtuple_get_nth_field( @@ -171,7 +168,6 @@ dict_create_sys_columns_tuple( which the memory for the built tuple is allocated */ { - dict_table_t* sys_columns; dtuple_t* entry; const dict_col_t* column; dfield_t* dfield; @@ -195,11 +191,9 @@ dict_create_sys_columns_tuple( ut_ad(!column->is_virtual()); } - sys_columns = dict_sys->sys_columns; - entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS); - dict_table_copy_types(entry, sys_columns); + dict_table_copy_types(entry, dict_sys.sys_columns); /* 0: TABLE_ID -----------------------*/ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID); @@ -290,7 +284,6 @@ dict_create_sys_virtual_tuple( ulint b_col_n, mem_heap_t* heap) { - dict_table_t* sys_virtual; dtuple_t* entry; const dict_col_t* 
base_column; dfield_t* dfield; @@ -303,12 +296,10 @@ dict_create_sys_virtual_tuple( dict_v_col_t* v_col = dict_table_get_nth_v_col(table, v_col_n); base_column = v_col->base_col[b_col_n]; - sys_virtual = dict_sys->sys_virtual; - entry = dtuple_create(heap, DICT_NUM_COLS__SYS_VIRTUAL + DATA_N_SYS_COLS); - dict_table_copy_types(entry, sys_virtual); + dict_table_copy_types(entry, dict_sys.sys_virtual); /* 0: TABLE_ID -----------------------*/ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__TABLE_ID); @@ -352,12 +343,14 @@ dict_build_table_def_step( que_thr_t* thr, /*!< in: query thread */ tab_node_t* node) /*!< in: table create node */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); dict_table_t* table = node->table; + trx_t* trx = thr_get_trx(thr); ut_ad(!table->is_temporary()); ut_ad(!table->space); ut_ad(table->space_id == ULINT_UNDEFINED); - dict_table_assign_new_id(table, thr_get_trx(thr)); + dict_hdr_get_new_id(&table->id, NULL, NULL); + trx->table_id = table->id; /* Always set this bit for all new created tables */ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); @@ -370,8 +363,6 @@ dict_build_table_def_step( ut_ad(DICT_TF_GET_ZIP_SSIZE(table->flags) == 0 || dict_table_has_atomic_blobs(table)); - trx_t* trx = thr_get_trx(thr); - ut_ad(trx->table_id); mtr_t mtr; trx_undo_t* undo = trx->rsegs.m_redo.undo; if (undo && !undo->table_id @@ -399,7 +390,7 @@ dict_build_table_def_step( } /* Get a new tablespace ID */ ulint space_id; - dict_hdr_get_new_id(NULL, NULL, &space_id, table, false); + dict_hdr_get_new_id(NULL, NULL, &space_id); DBUG_EXECUTE_IF( "ib_create_table_fail_out_of_space_ids", @@ -486,24 +477,21 @@ dict_create_sys_indexes_tuple( which the memory for the built tuple is allocated */ { - dict_table_t* sys_indexes; dtuple_t* entry; dfield_t* dfield; byte* ptr; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_ad(index); ut_ad(index->table->space || index->table->file_unreadable); 
ut_ad(!index->table->space || index->table->space->id == index->table->space_id); ut_ad(heap); - sys_indexes = dict_sys->sys_indexes; - entry = dtuple_create( heap, DICT_NUM_COLS__SYS_INDEXES + DATA_N_SYS_COLS); - dict_table_copy_types(entry, sys_indexes); + dict_table_copy_types(entry, dict_sys.sys_indexes); /* 0: TABLE_ID -----------------------*/ dfield = dtuple_get_nth_field( @@ -607,7 +595,6 @@ dict_create_sys_fields_tuple( which the memory for the built tuple is allocated */ { - dict_table_t* sys_fields; dtuple_t* entry; dict_field_t* field; dfield_t* dfield; @@ -627,11 +614,9 @@ dict_create_sys_fields_tuple( field = dict_index_get_nth_field(index, fld_no); - sys_fields = dict_sys->sys_fields; - entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS); - dict_table_copy_types(entry, sys_fields); + dict_table_copy_types(entry, dict_sys.sys_fields); /* 0: INDEX_ID -----------------------*/ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID); @@ -726,7 +711,7 @@ dict_build_index_def_step( dtuple_t* row; trx_t* trx; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); trx = thr_get_trx(thr); @@ -747,7 +732,7 @@ dict_build_index_def_step( ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) || dict_index_is_clust(index)); - dict_hdr_get_new_id(NULL, &index->id, NULL, table, false); + dict_hdr_get_new_id(NULL, &index->id, NULL); /* Inherit the space id from the table; we store all indexes of a table in the same tablespace */ @@ -777,7 +762,7 @@ dict_build_index_def( dict_index_t* index, /*!< in/out: index */ trx_t* trx) /*!< in/out: InnoDB transaction handle */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); if (trx->table_id == 0) { /* Record only the first table id. 
*/ @@ -787,7 +772,7 @@ dict_build_index_def( ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) || dict_index_is_clust(index)); - dict_hdr_get_new_id(NULL, &index->id, NULL, table, false); + dict_hdr_get_new_id(NULL, &index->id, NULL); /* Note that the index was created by this transaction. */ index->trx_id = trx->id; @@ -823,15 +808,12 @@ dict_create_index_tree_step( mtr_t mtr; btr_pcur_t pcur; dict_index_t* index; - dict_table_t* sys_indexes; dtuple_t* search_tuple; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); index = node->index; - sys_indexes = dict_sys->sys_indexes; - if (index->type == DICT_FTS) { /* FTS index does not need an index tree */ return(DB_SUCCESS); @@ -845,7 +827,7 @@ dict_create_index_tree_step( search_tuple = dict_create_search_tuple(node->ind_row, node->heap); - btr_pcur_open(UT_LIST_GET_FIRST(sys_indexes->indexes), + btr_pcur_open(UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes), search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF, &pcur, &mtr); @@ -861,7 +843,7 @@ dict_create_index_tree_step( node->page_no = btr_create( index->type, index->table->space, - index->id, index, NULL, &mtr); + index->id, index, &mtr); if (node->page_no == FIL_NULL) { err = DB_OUT_OF_FILE_SPACE; @@ -898,7 +880,7 @@ dict_create_index_tree_in_mem( { mtr_t mtr; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_ad(!(index->type & DICT_FTS)); mtr_start(&mtr); @@ -910,7 +892,7 @@ dict_create_index_tree_in_mem( ut_ad(!(index->table->flags2 & DICT_TF2_DISCARDED)); index->page = btr_create(index->type, index->table->space, - index->id, index, NULL, &mtr); + index->id, index, &mtr); mtr_commit(&mtr); index->trx_id = trx->id; @@ -922,16 +904,14 @@ dict_create_index_tree_in_mem( @param[in,out] rec SYS_INDEXES record @param[in,out] pcur persistent cursor on rec @param[in,out] trx dictionary transaction -@param[in,out] mtr mini-transaction -@return whether freeing the B-tree was attempted */ -bool dict_drop_index_tree(rec_t* rec, 
btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) +@param[in,out] mtr mini-transaction */ +void dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) { - const byte* ptr; - ulint len; - ulint root_page_no; + byte* ptr; + ulint len; - ut_ad(mutex_own(&dict_sys->mutex)); - ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); + ut_ad(mutex_own(&dict_sys.mutex)); + ut_a(!dict_table_is_comp(dict_sys.sys_indexes)); ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); @@ -939,15 +919,15 @@ bool dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) btr_pcur_store_position(pcur, mtr); - root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + const uint32_t root_page_no = mach_read_from_4(ptr); if (root_page_no == FIL_NULL) { /* The tree has already been freed */ - - return(false); + return; } - mlog_write_ulint(const_cast<byte*>(ptr), FIL_NULL, MLOG_4BYTES, mtr); + compile_time_assert(FIL_NULL == 0xffffffff); + mlog_memset(ptr, 4, 0xff, mtr); ptr = rec_get_nth_field_old( rec, DICT_FLD__SYS_INDEXES__SPACE, &len); @@ -960,7 +940,7 @@ bool dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) && trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE) { /* We are about to delete the entire .ibd file; do not bother to free pages inside it. */ - return false; + return; } ptr = rec_get_nth_field_old( @@ -968,109 +948,16 @@ bool dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) ut_ad(len == 8); - bool found; - const page_size_t page_size(fil_space_get_page_size(space_id, - &found)); - - if (!found) { - /* It is a single table tablespace and the .ibd file is - missing: do nothing */ - - return(false); - } - - /* If tablespace is scheduled for truncate, do not try to drop - the indexes in that tablespace. There is a truncate fixup action - which will take care of it. 
*/ - if (srv_is_tablespace_truncated(space_id)) { - return(false); - } - - btr_free_if_exists(page_id_t(space_id, root_page_no), page_size, - mach_read_from_8(ptr), mtr); - - return(true); -} - -/*******************************************************************//** -Recreate the index tree associated with a row in SYS_INDEXES table. -@return new root page number, or FIL_NULL on failure */ -ulint -dict_recreate_index_tree( -/*=====================*/ - const dict_table_t* - table, /*!< in/out: the table the index belongs to */ - btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to - record in the clustered index of - SYS_INDEXES table. The cursor may be - repositioned in this call. */ - mtr_t* mtr) /*!< in/out: mtr having the latch - on the record page. */ -{ - ut_ad(mutex_own(&dict_sys->mutex)); - ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); - ut_ad(!table->space || table->space->id == table->space_id); - - ulint len; - const rec_t* rec = btr_pcur_get_rec(pcur); - - const byte* ptr = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); - - ut_ad(len == 4); - - ut_ad(table->space_id == mach_read_from_4( - rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__SPACE, - &len))); - ut_ad(len == 4); - - if (!table->space) { - /* It is a single table tablespae and the .ibd file is - missing: do nothing. */ - - ib::warn() - << "Trying to TRUNCATE a missing .ibd file of table " - << table->name << "!"; - - return(FIL_NULL); - } - - ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__TYPE, &len); - ut_ad(len == 4); - ulint type = mach_read_from_4(ptr); - - ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__ID, &len); - ut_ad(len == 8); - index_id_t index_id = mach_read_from_8(ptr); - - /* We will need to commit the mini-transaction in order to avoid - deadlocks in the btr_create() call, because otherwise we would - be freeing and allocating pages in the same mini-transaction. 
*/ - btr_pcur_store_position(pcur, mtr); - mtr_commit(mtr); - - mtr_start(mtr); - mtr->set_named_space(table->space); - btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); - - /* Find the index corresponding to this SYS_INDEXES record. */ - for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); - index != NULL; - index = UT_LIST_GET_NEXT(indexes, index)) { - if (index->id == index_id) { - ulint root_page_no = (index->type & DICT_FTS) - ? FIL_NULL - : btr_create(type, table->space, - index_id, index, NULL, mtr); - index->page = unsigned(root_page_no); - return root_page_no; + if (fil_space_t* s = fil_space_acquire_silent(space_id)) { + /* Ensure that the tablespace file exists + in order to avoid a crash in buf_page_get_gen(). */ + if (s->size || fil_space_get_size(space_id)) { + btr_free_if_exists(page_id_t(space_id, root_page_no), + s->zip_size(), + mach_read_from_8(ptr), mtr); } + s->release(); } - - ib::error() << "Failed to create index with index id " << index_id - << " of table " << table->name; - - return(FIL_NULL); } /*********************************************************************//** @@ -1099,15 +986,15 @@ tab_create_graph_create( node->mode = mode; node->key_id = key_id; - node->tab_def = ins_node_create(INS_DIRECT, dict_sys->sys_tables, + node->tab_def = ins_node_create(INS_DIRECT, dict_sys.sys_tables, heap); node->tab_def->common.parent = node; - node->col_def = ins_node_create(INS_DIRECT, dict_sys->sys_columns, + node->col_def = ins_node_create(INS_DIRECT, dict_sys.sys_columns, heap); node->col_def->common.parent = node; - node->v_col_def = ins_node_create(INS_DIRECT, dict_sys->sys_virtual, + node->v_col_def = ins_node_create(INS_DIRECT, dict_sys.sys_virtual, heap); node->v_col_def->common.parent = node; @@ -1146,11 +1033,11 @@ ind_create_graph_create( node->heap = mem_heap_create(256); node->ind_def = ins_node_create(INS_DIRECT, - dict_sys->sys_indexes, heap); + dict_sys.sys_indexes, heap); node->ind_def->common.parent = node; 
node->field_def = ins_node_create(INS_DIRECT, - dict_sys->sys_fields, heap); + dict_sys.sys_fields, heap); node->field_def->common.parent = node; return(node); @@ -1169,7 +1056,7 @@ dict_create_table_step( trx_t* trx; ut_ad(thr); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); trx = thr_get_trx(thr); @@ -1250,7 +1137,8 @@ dict_create_table_step( ut_ad(node->col_no == v_col->v_pos); dict_build_v_col_def_step(node); - if (node->base_col_no < v_col->num_base - 1) { + if (node->base_col_no + < unsigned{v_col->num_base} - 1) { /* move on to next base column */ node->base_col_no++; } else { @@ -1311,7 +1199,7 @@ dict_create_index_step( trx_t* trx; ut_ad(thr); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); trx = thr_get_trx(thr); @@ -1472,7 +1360,7 @@ dict_check_if_system_table_exists( ut_a(srv_get_active_thread_type() == SRV_NONE); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); sys_table = dict_table_get_low(tablename); @@ -1490,7 +1378,7 @@ dict_check_if_system_table_exists( dict_table_prevent_eviction(sys_table); } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return(error); } @@ -1658,9 +1546,9 @@ dict_create_or_check_sys_virtual() "SYS_VIRTUAL", DICT_NUM_FIELDS__SYS_VIRTUAL + 1, 1); if (err == DB_SUCCESS) { - mutex_enter(&dict_sys->mutex); - dict_sys->sys_virtual = dict_table_get_low("SYS_VIRTUAL"); - mutex_exit(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); + dict_sys.sys_virtual = dict_table_get_low("SYS_VIRTUAL"); + mutex_exit(&dict_sys.mutex); return(DB_SUCCESS); } @@ -1733,9 +1621,9 @@ dict_create_or_check_sys_virtual() dberr_t sys_virtual_err = dict_check_if_system_table_exists( "SYS_VIRTUAL", DICT_NUM_FIELDS__SYS_VIRTUAL + 1, 1); ut_a(sys_virtual_err == DB_SUCCESS); - mutex_enter(&dict_sys->mutex); - dict_sys->sys_virtual = dict_table_get_low("SYS_VIRTUAL"); - mutex_exit(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); + dict_sys.sys_virtual = 
dict_table_get_low("SYS_VIRTUAL"); + mutex_exit(&dict_sys.mutex); return(err); } @@ -2132,7 +2020,7 @@ dict_create_add_foreigns_to_dictionary( dict_foreign_t* foreign; dberr_t error; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); if (NULL == dict_table_get_low("SYS_FOREIGN")) { @@ -2142,6 +2030,8 @@ dict_create_add_foreigns_to_dictionary( return(DB_ERROR); } + error = DB_SUCCESS; + for (dict_foreign_set::const_iterator it = local_fk_set.begin(); it != local_fk_set.end(); ++it) { @@ -2153,12 +2043,11 @@ dict_create_add_foreigns_to_dictionary( table->name.m_name, foreign, trx); if (error != DB_SUCCESS) { - - return(error); + break; } } - return(DB_SUCCESS); + return error; } /****************************************************************//** @@ -2346,15 +2235,3 @@ dict_replace_tablespace_in_dictionary( return(error); } - -/** Assign a new table ID and put it into the table cache and the transaction. -@param[in,out] table Table that needs an ID -@param[in,out] trx Transaction */ -void -dict_table_assign_new_id( - dict_table_t* table, - trx_t* trx) -{ - dict_hdr_get_new_id(&table->id, NULL, NULL, table, false); - trx->table_id = table->id; -} diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc index 7c6f5d75b5d..7e61e298ac6 100644 --- a/storage/innobase/dict/dict0defrag_bg.cc +++ b/storage/innobase/dict/dict0defrag_bg.cc @@ -151,7 +151,7 @@ dict_stats_defrag_pool_del( { ut_a((table && !index) || (!table && index)); ut_ad(!srv_read_only_mode); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); mutex_enter(&defrag_pool_mutex); @@ -193,7 +193,7 @@ dict_stats_process_entry_from_defrag_pool() dict_table_t* table; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); /* If the table is no longer cached, we've already lost the in memory stats so there's nothing really to write to disk. 
*/ @@ -208,11 +208,11 @@ dict_stats_process_entry_from_defrag_pool() if (table) { dict_table_close(table, TRUE, FALSE); } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return; } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); dict_stats_save_defrag_stats(index); dict_table_close(table, FALSE, FALSE); } @@ -243,8 +243,7 @@ dict_stats_save_defrag_summary( return DB_SUCCESS; } - rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); ret = dict_stats_save_index_stat(index, time(NULL), "n_pages_freed", index->stat_defrag_n_pages_freed, @@ -253,8 +252,7 @@ dict_stats_save_defrag_summary( " last defragmentation run.", NULL); - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); return (ret); } @@ -294,9 +292,7 @@ dict_stats_save_defrag_stats( return DB_SUCCESS; } - rw_lock_x_lock(&dict_operation_lock); - - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); ret = dict_stats_save_index_stat(index, now, "n_page_split", index->stat_defrag_n_page_split, NULL, @@ -326,8 +322,6 @@ dict_stats_save_defrag_stats( NULL); end: - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); - - return (ret); + dict_sys_unlock(); + return ret; } diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 93a86b29af0..07fcd4f57c1 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -60,7 +60,6 @@ extern uint ibuf_debug; #include "lock0lock.h" #include "mach0data.h" #include "mem0mem.h" -#include "os0once.h" #include "page0page.h" #include "page0zip.h" #include "pars0pars.h" @@ -80,17 +79,7 @@ extern uint ibuf_debug; #include <algorithm> /** the dictionary system */ -dict_sys_t* dict_sys = NULL; - -/** @brief the data dictionary rw-latch protecting dict_sys - -table create, drop, etc. 
reserve this in X-mode; implicit or -backround operations purge, rollback, foreign key checks reserve this -in S-mode; we cannot trust that MySQL protects implicit or background -operations a table drop since MySQL does not know of them; therefore -we need this; NOTE: a transaction which reserves this must keep book -on the mode in trx_t::dict_operation_lock_mode */ -rw_lock_t dict_operation_lock; +dict_sys_t dict_sys; /** Percentage of compression failures that are allowed in a single round */ @@ -173,22 +162,6 @@ static ibool dict_lru_validate(void); /*===================*/ -/**********************************************************************//** -Check if table is in the dictionary table LRU list. -@return TRUE if table found */ -static -ibool -dict_lru_find_table( -/*================*/ - const dict_table_t* find_table); /*!< in: table to find */ -/**********************************************************************//** -Check if a table exists in the dict table non-LRU list. -@return TRUE if table found */ -static -ibool -dict_non_lru_find_table( -/*====================*/ - const dict_table_t* find_table); /*!< in: table to find */ #endif /* UNIV_DEBUG */ /* Stream for storing detailed information about the latest foreign key @@ -247,158 +220,24 @@ dict_get_db_name_len( return ulint(s - name); } -/** Reserve the dictionary system mutex. */ -void -dict_mutex_enter_for_mysql_func(const char *file, unsigned line) -{ - mutex_enter_loc(&dict_sys->mutex, file, line); -} - -/********************************************************************//** -Releases the dictionary system mutex for MySQL. */ -void -dict_mutex_exit_for_mysql(void) -/*===========================*/ -{ - mutex_exit(&dict_sys->mutex); -} - -/** Allocate and init a dict_table_t's stats latch. -This function must not be called concurrently on the same table object. 
-@param[in,out] table_void table whose stats latch to create */ -static -void -dict_table_stats_latch_alloc( - void* table_void) -{ - dict_table_t* table = static_cast<dict_table_t*>(table_void); - - /* Note: rw_lock_create() will call the constructor */ - - table->stats_latch = static_cast<rw_lock_t*>( - ut_malloc_nokey(sizeof(rw_lock_t))); - - ut_a(table->stats_latch != NULL); - - rw_lock_create(dict_table_stats_key, table->stats_latch, - SYNC_INDEX_TREE); -} - -/** Deinit and free a dict_table_t's stats latch. -This function must not be called concurrently on the same table object. -@param[in,out] table table whose stats latch to free */ -static -void -dict_table_stats_latch_free( - dict_table_t* table) -{ - rw_lock_free(table->stats_latch); - ut_free(table->stats_latch); -} - -/** Create a dict_table_t's stats latch or delay for lazy creation. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose stats latch to create -@param[in] enabled if false then the latch is disabled -and dict_table_stats_lock()/unlock() become noop on this table. */ -void -dict_table_stats_latch_create( - dict_table_t* table, - bool enabled) -{ - if (!enabled) { - table->stats_latch = NULL; - table->stats_latch_created = os_once::DONE; - return; - } - - /* We create this lazily the first time it is used. */ - table->stats_latch = NULL; - table->stats_latch_created = os_once::NEVER_DONE; -} - -/** Destroy a dict_table_t's stats latch. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. 
-@param[in,out] table table whose stats latch to destroy */ -void -dict_table_stats_latch_destroy( - dict_table_t* table) -{ - if (table->stats_latch_created == os_once::DONE - && table->stats_latch != NULL) { - - dict_table_stats_latch_free(table); - } -} - -/** Lock the appropriate latch to protect a given table's statistics. -@param[in] table table whose stats to lock -@param[in] latch_mode RW_S_LATCH or RW_X_LATCH */ -void -dict_table_stats_lock( - dict_table_t* table, - ulint latch_mode) -{ - ut_ad(table != NULL); - ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); - - os_once::do_or_wait_for_done( - &table->stats_latch_created, - dict_table_stats_latch_alloc, table); - - if (table->stats_latch == NULL) { - /* This is a dummy table object that is private in the current - thread and is not shared between multiple threads, thus we - skip any locking. */ - return; - } - - switch (latch_mode) { - case RW_S_LATCH: - rw_lock_s_lock(table->stats_latch); - break; - case RW_X_LATCH: - rw_lock_x_lock(table->stats_latch); - break; - case RW_NO_LATCH: - /* fall through */ - default: - ut_error; - } -} - -/** Unlock the latch that has been locked by dict_table_stats_lock(). -@param[in] table table whose stats to unlock -@param[in] latch_mode RW_S_LATCH or RW_X_LATCH */ -void -dict_table_stats_unlock( - dict_table_t* table, - ulint latch_mode) +/** Open a persistent table. 
+@param[in] table_id persistent table identifier +@param[in] ignore_err errors to ignore +@param[in] cached_only whether to skip loading +@return persistent table +@retval NULL if not found */ +static dict_table_t* dict_table_open_on_id_low( + table_id_t table_id, + dict_err_ignore_t ignore_err, + bool cached_only) { - ut_ad(table != NULL); - ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + dict_table_t* table = dict_sys.get_table(table_id); - if (table->stats_latch == NULL) { - /* This is a dummy table object that is private in the current - thread and is not shared between multiple threads, thus we - skip any locking. */ - return; + if (!table && !cached_only) { + table = dict_load_table_on_id(table_id, ignore_err); } - switch (latch_mode) { - case RW_S_LATCH: - rw_lock_s_unlock(table->stats_latch); - break; - case RW_X_LATCH: - rw_lock_x_unlock(table->stats_latch); - break; - case RW_NO_LATCH: - /* fall through */ - default: - ut_error; - } + return table; } /**********************************************************************//** @@ -411,7 +250,7 @@ dict_table_try_drop_aborted( dict_table_t* table, /*!< in: table, or NULL if it needs to be looked up again */ table_id_t table_id, /*!< in: table identifier */ - int32 ref_count) /*!< in: expected table->n_ref_count */ + uint32_t ref_count) /*!< in: expected table->n_ref_count */ { trx_t* trx; @@ -444,7 +283,7 @@ dict_table_try_drop_aborted( /**********************************************************************//** When opening a table, try to drop any indexes after an aborted index creation. -Release the dict_sys->mutex. */ +Release the dict_sys.mutex. */ static void dict_table_try_drop_aborted_and_mutex_exit( @@ -464,11 +303,11 @@ dict_table_try_drop_aborted_and_mutex_exit( was aborted. 
*/ table_id_t table_id = table->id; - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); dict_table_try_drop_aborted(table, table_id, 1); } else { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } } @@ -484,10 +323,10 @@ dict_table_close( index creation */ { if (!dict_locked) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_a(table->get_ref_count() > 0); const bool last_handle = table->release(); @@ -506,14 +345,7 @@ dict_table_close( MONITOR_DEC(MONITOR_TABLE_REFERENCE); ut_ad(dict_lru_validate()); - -#ifdef UNIV_DEBUG - if (table->can_be_evicted) { - ut_ad(dict_lru_find_table(table)); - } else { - ut_ad(dict_non_lru_find_table(table)); - } -#endif /* UNIV_DEBUG */ + ut_ad(dict_sys.find(table)); if (!dict_locked) { table_id_t table_id = table->id; @@ -521,7 +353,7 @@ dict_table_close( && table->drop_aborted && dict_table_get_first_index(table); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); /* dict_table_try_drop_aborted() can generate undo logs. So it should be avoided after shutdown of background @@ -534,9 +366,9 @@ dict_table_close( /********************************************************************//** Closes the only open handle to a table and drops a table while assuring -that dict_sys->mutex is held the whole time. This assures that the table +that dict_sys.mutex is held the whole time. This assures that the table is not evicted after the close when the count of open handles goes to zero. -Because dict_sys->mutex is held, we do not need to call +Because dict_sys.mutex is held, we do not need to call dict_table_prevent_eviction(). 
*/ void dict_table_close_and_drop( @@ -546,8 +378,7 @@ dict_table_close_and_drop( { dberr_t err = DB_SUCCESS; - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); ut_ad(trx->dict_operation != TRX_DICT_OP_NONE); ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); @@ -734,61 +565,6 @@ dict_table_get_nth_v_col_mysql( return(dict_table_get_nth_v_col(table, i)); } -/** Allocate and init the autoinc latch of a given table. -This function must not be called concurrently on the same table object. -@param[in,out] table_void table whose autoinc latch to create */ -static -void -dict_table_autoinc_alloc( - void* table_void) -{ - dict_table_t* table = static_cast<dict_table_t*>(table_void); - table->autoinc_mutex = UT_NEW_NOKEY(ib_mutex_t()); - ut_a(table->autoinc_mutex != NULL); - mutex_create(LATCH_ID_AUTOINC, table->autoinc_mutex); -} - -/** Allocate and init the zip_pad_mutex of a given index. -This function must not be called concurrently on the same index object. -@param[in,out] index_void index whose zip_pad_mutex to create */ -static -void -dict_index_zip_pad_alloc( - void* index_void) -{ - dict_index_t* index = static_cast<dict_index_t*>(index_void); - index->zip_pad.mutex = UT_NEW_NOKEY(SysMutex()); - ut_a(index->zip_pad.mutex != NULL); - mutex_create(LATCH_ID_ZIP_PAD_MUTEX, index->zip_pad.mutex); -} - -/********************************************************************//** -Acquire the autoinc lock. */ -void -dict_table_autoinc_lock( -/*====================*/ - dict_table_t* table) /*!< in/out: table */ -{ - os_once::do_or_wait_for_done( - &table->autoinc_mutex_created, - dict_table_autoinc_alloc, table); - - mutex_enter(table->autoinc_mutex); -} - -/** Acquire the zip_pad_mutex latch. 
-@param[in,out] index the index whose zip_pad_mutex to acquire.*/ -static -void -dict_index_zip_pad_lock( - dict_index_t* index) -{ - os_once::do_or_wait_for_done( - &index->zip_pad.mutex_created, - dict_index_zip_pad_alloc, index); - - mutex_enter(index->zip_pad.mutex); -} /** Get all the FTS indexes on a table. @param[in] table table @@ -815,16 +591,6 @@ dict_table_get_all_fts_indexes( return(ib_vector_size(indexes)); } -/********************************************************************//** -Release the autoinc lock. */ -void -dict_table_autoinc_unlock( -/*======================*/ - dict_table_t* table) /*!< in/out: table */ -{ - mutex_exit(table->autoinc_mutex); -} - /** Looks for column n in an index. @param[in] index index @param[in] n column number @@ -882,47 +648,29 @@ dict_index_get_nth_col_or_prefix_pos( return(ULINT_UNDEFINED); } -/** Returns TRUE if the index contains a column or a prefix of that column. -@param[in] index index +/** Check if the index contains a column or a prefix of that column. @param[in] n column number @param[in] is_virtual whether it is a virtual col -@return TRUE if contains the column or its prefix */ -bool -dict_index_contains_col_or_prefix( - const dict_index_t* index, - ulint n, - bool is_virtual) +@return whether the index contains the column or its prefix */ +bool dict_index_t::contains_col_or_prefix(ulint n, bool is_virtual) const { - const dict_field_t* field; - const dict_col_t* col; - ulint pos; - ulint n_fields; + ut_ad(magic_n == DICT_INDEX_MAGIC_N); - ut_ad(index); - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - - if (dict_index_is_clust(index)) { + if (is_primary()) { return(!is_virtual); } - if (is_virtual) { - col = &dict_table_get_nth_v_col(index->table, n)->m_col; - } else { - col = dict_table_get_nth_col(index->table, n); - } - - n_fields = dict_index_get_n_fields(index); + const dict_col_t* col = is_virtual + ? 
&dict_table_get_nth_v_col(table, n)->m_col + : dict_table_get_nth_col(table, n); - for (pos = 0; pos < n_fields; pos++) { - field = dict_index_get_nth_field(index, pos); - - if (col == field->col) { - - return(true); + for (ulint pos = 0; pos < n_fields; pos++) { + if (col == fields[pos].col) { + return true; } } - return(false); + return false; } /********************************************************************//** @@ -991,10 +739,10 @@ dict_table_open_on_id( dict_table_t* table; if (!dict_locked) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); table = dict_table_open_on_id_low( table_id, @@ -1004,13 +752,7 @@ dict_table_open_on_id( table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); if (table != NULL) { - - if (table->can_be_evicted) { - dict_move_to_mru(table); - } - - table->acquire(); - + dict_sys.acquire(table); MONITOR_INC(MONITOR_TABLE_REFERENCE); } @@ -1070,56 +812,46 @@ dict_table_col_in_clustered_key( return(FALSE); } -/**********************************************************************//** -Inits the data dictionary module. */ -void -dict_init(void) -/*===========*/ +/** Initialise the data dictionary cache. 
*/ +void dict_sys_t::create() { - dict_sys = static_cast<dict_sys_t*>(ut_zalloc_nokey(sizeof(*dict_sys))); - - UT_LIST_INIT(dict_sys->table_LRU, &dict_table_t::table_LRU); - UT_LIST_INIT(dict_sys->table_non_LRU, &dict_table_t::table_LRU); + ut_ad(this == &dict_sys); + ut_ad(!is_initialised()); + m_initialised= true; + UT_LIST_INIT(table_LRU, &dict_table_t::table_LRU); + UT_LIST_INIT(table_non_LRU, &dict_table_t::table_LRU); - mutex_create(LATCH_ID_DICT_SYS, &dict_sys->mutex); + mutex_create(LATCH_ID_DICT_SYS, &mutex); - dict_sys->table_hash = hash_create( - buf_pool_get_curr_size() - / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE)); + const ulint hash_size = buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); - dict_sys->table_id_hash = hash_create( - buf_pool_get_curr_size() - / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE)); + table_hash= hash_create(hash_size); + table_id_hash= hash_create(hash_size); + temp_id_hash= hash_create(hash_size); - rw_lock_create(dict_operation_lock_key, - &dict_operation_lock, SYNC_DICT_OPERATION); + rw_lock_create(dict_operation_lock_key, &latch, SYNC_DICT_OPERATION); - if (!srv_read_only_mode) { - dict_foreign_err_file = os_file_create_tmpfile(); - ut_a(dict_foreign_err_file); - } + if (!srv_read_only_mode) + { + dict_foreign_err_file= os_file_create_tmpfile(); + ut_a(dict_foreign_err_file); + } - mutex_create(LATCH_ID_DICT_FOREIGN_ERR, &dict_foreign_err_mutex); + mutex_create(LATCH_ID_DICT_FOREIGN_ERR, &dict_foreign_err_mutex); } -/**********************************************************************//** -Move to the most recently used segment of the LRU list. */ -void -dict_move_to_mru( -/*=============*/ - dict_table_t* table) /*!< in: table to move to MRU */ +/** Acquire a reference to a cached table. 
*/ +inline void dict_sys_t::acquire(dict_table_t* table) { - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(dict_lru_validate()); - ut_ad(dict_lru_find_table(table)); - - ut_a(table->can_be_evicted); - - UT_LIST_REMOVE(dict_sys->table_LRU, table); - - UT_LIST_ADD_FIRST(dict_sys->table_LRU, table); - - ut_ad(dict_lru_validate()); + ut_ad(dict_sys.find(table)); + if (table->can_be_evicted) + { + UT_LIST_REMOVE(dict_sys.table_LRU, table); + UT_LIST_ADD_FIRST(dict_sys.table_LRU, table); + } + + table->acquire(); } /**********************************************************************//** @@ -1145,11 +877,11 @@ dict_table_open_on_name( DBUG_PRINT("dict_table_open_on_name", ("table: '%s'", table_name)); if (!dict_locked) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } ut_ad(table_name); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); table = dict_table_check_if_in_cache_low(table_name); @@ -1165,7 +897,7 @@ dict_table_open_on_name( if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) && !table->is_readable()) { /* Make life easy for drop table. */ - dict_table_prevent_eviction(table); + dict_sys.prevent_eviction(table); if (table->corrupted) { @@ -1173,31 +905,22 @@ dict_table_open_on_name( << " is corrupted. 
Please " "drop the table and recreate."; if (!dict_locked) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } DBUG_RETURN(NULL); } - if (table->can_be_evicted) { - dict_move_to_mru(table); - } - - table->acquire(); + dict_sys.acquire(table); if (!dict_locked) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } DBUG_RETURN(table); } - if (table->can_be_evicted) { - dict_move_to_mru(table); - } - - table->acquire(); - + dict_sys.acquire(table); MONITOR_INC(MONITOR_TABLE_REFERENCE); } @@ -1248,65 +971,64 @@ dict_table_add_system_columns( } /** Add the table definition to the data dictionary cache */ -void -dict_table_t::add_to_cache() +void dict_table_t::add_to_cache() { - ut_ad(dict_lru_validate()); - ut_ad(mutex_own(&dict_sys->mutex)); - cached = TRUE; - ulint fold = ut_fold_string(name.m_name); - ulint id_fold = ut_fold_ull(id); + dict_sys.add(this); +} + +/** Add a table definition to the data dictionary cache */ +inline void dict_sys_t::add(dict_table_t* table) +{ + ut_ad(!find(table)); + + ulint fold = ut_fold_string(table->name.m_name); + + mutex_create(LATCH_ID_AUTOINC, &table->autoinc_mutex); /* Look for a table with the same name: error if such exists */ { dict_table_t* table2; - HASH_SEARCH(name_hash, dict_sys->table_hash, fold, + HASH_SEARCH(name_hash, table_hash, fold, dict_table_t*, table2, ut_ad(table2->cached), - !strcmp(table2->name.m_name, name.m_name)); + !strcmp(table2->name.m_name, table->name.m_name)); ut_a(table2 == NULL); #ifdef UNIV_DEBUG /* Look for the same table pointer with a different name */ - HASH_SEARCH_ALL(name_hash, dict_sys->table_hash, + HASH_SEARCH_ALL(name_hash, table_hash, dict_table_t*, table2, ut_ad(table2->cached), - table2 == this); + table2 == table); ut_ad(table2 == NULL); #endif /* UNIV_DEBUG */ } + HASH_INSERT(dict_table_t, name_hash, table_hash, fold, table); /* Look for a table with the same id: error if such exists */ + hash_table_t* id_hash = table->is_temporary() + ? 
temp_id_hash : table_id_hash; + const ulint id_fold = ut_fold_ull(table->id); { dict_table_t* table2; - HASH_SEARCH(id_hash, dict_sys->table_id_hash, id_fold, + HASH_SEARCH(id_hash, id_hash, id_fold, dict_table_t*, table2, ut_ad(table2->cached), - table2->id == id); + table2->id == table->id); ut_a(table2 == NULL); #ifdef UNIV_DEBUG /* Look for the same table pointer with a different id */ - HASH_SEARCH_ALL(id_hash, dict_sys->table_id_hash, + HASH_SEARCH_ALL(id_hash, id_hash, dict_table_t*, table2, ut_ad(table2->cached), - table2 == this); + table2 == table); ut_ad(table2 == NULL); #endif /* UNIV_DEBUG */ - } - /* Add table to hash table of tables */ - HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, - this); - - /* Add table to hash table of tables based on table id */ - HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold, - this); - - if (can_be_evicted) { - UT_LIST_ADD_FIRST(dict_sys->table_LRU, this); - } else { - UT_LIST_ADD_FIRST(dict_sys->table_non_LRU, this); + HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table); } + UT_LIST_ADD_FIRST(table->can_be_evicted ? 
table_LRU : table_non_LRU, + table); ut_ad(dict_lru_validate()); } @@ -1319,9 +1041,7 @@ dict_table_can_be_evicted( /*======================*/ dict_table_t* table) /*!< in: table to test */ { - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - + ut_d(dict_sys.assert_locked()); ut_a(table->can_be_evicted); ut_a(table->foreign_set.empty()); ut_a(table->referenced_set.empty()); @@ -1395,7 +1115,7 @@ dict_index_t *dict_index_t::clone() const (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_sample_sizes)); index->stat_n_non_null_key_vals= static_cast<ib_uint64_t*> (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_non_null_key_vals)); - memset(&index->zip_pad, 0, sizeof index->zip_pad); + mutex_create(LATCH_ID_ZIP_PAD_MUTEX, &index->zip_pad.mutex); return index; } @@ -1439,11 +1159,10 @@ dict_make_room_in_cache( ut_a(pct_check > 0); ut_a(pct_check <= 100); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); ut_ad(dict_lru_validate()); - i = len = UT_LIST_GET_LEN(dict_sys->table_LRU); + i = len = UT_LIST_GET_LEN(dict_sys.table_LRU); if (len < max_tables) { return(0); @@ -1457,7 +1176,7 @@ dict_make_room_in_cache( /* Find a suitable candidate to evict from the cache. Don't scan the entire LRU list. Only scan pct_check list entries. */ - for (table = UT_LIST_GET_LAST(dict_sys->table_LRU); + for (table = UT_LIST_GET_LAST(dict_sys.table_LRU); table != NULL && i > check_up_to && (len - n_evicted) > max_tables; @@ -1469,7 +1188,7 @@ dict_make_room_in_cache( if (dict_table_can_be_evicted(table)) { ut_ad(!table->fts); - dict_table_remove_from_cache_low(table, TRUE); + dict_sys.remove(table, true); ++n_evicted; } @@ -1480,25 +1199,6 @@ dict_make_room_in_cache( return(n_evicted); } -/**********************************************************************//** -Move a table to the non-LRU list from the LRU list. 
*/ -void -dict_table_move_from_lru_to_non_lru( -/*================================*/ - dict_table_t* table) /*!< in: table to move from LRU to non-LRU */ -{ - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(dict_lru_find_table(table)); - - ut_a(table->can_be_evicted); - - UT_LIST_REMOVE(dict_sys->table_LRU, table); - - UT_LIST_ADD_LAST(dict_sys->table_non_LRU, table); - - table->can_be_evicted = FALSE; -} - /** Looks for an index with the given id given a table instance. @param[in] table table instance @param[in] id index id @@ -1534,14 +1234,11 @@ dict_index_find_on_id_low( /*======================*/ index_id_t id) /*!< in: index id */ { - dict_table_t* table; + if (!dict_sys.is_initialised()) return NULL; - /* This can happen if the system tablespace is the wrong page size */ - if (dict_sys == NULL) { - return(NULL); - } + dict_table_t* table; - for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table != NULL; table = UT_LIST_GET_NEXT(table_LRU, table)) { @@ -1552,7 +1249,7 @@ dict_index_find_on_id_low( } } - for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); table != NULL; table = UT_LIST_GET_NEXT(table_LRU, table)) { @@ -1605,7 +1302,7 @@ dict_table_rename_in_cache( char old_name[MAX_FULL_NAME_LEN + 1]; os_file_type_t ftype; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* store the old/current name to an automatic variable */ ut_a(strlen(table->name.m_name) < sizeof old_name); @@ -1615,7 +1312,7 @@ dict_table_rename_in_cache( /* Look for a table with the same name: error if such exists */ dict_table_t* table2; - HASH_SEARCH(name_hash, dict_sys->table_hash, fold, + HASH_SEARCH(name_hash, dict_sys.table_hash, fold, dict_table_t*, table2, ut_ad(table2->cached), (ut_strcmp(table2->name.m_name, new_name) == 0)); DBUG_EXECUTE_IF("dict_table_rename_in_cache_failure", @@ -1709,7 +1406,7 @@ dict_table_rename_in_cache( } /* 
Remove table from the hash tables of tables */ - HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash, + HASH_DELETE(dict_table_t, name_hash, dict_sys.table_hash, ut_fold_string(old_name), table); if (strlen(new_name) > strlen(table->name.m_name)) { @@ -1724,7 +1421,7 @@ dict_table_rename_in_cache( strcpy(table->name.m_name, new_name); /* Add table to hash table of tables */ - HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, + HASH_INSERT(dict_table_t, name_hash, dict_sys.table_hash, fold, table); if (!rename_also_foreigns) { @@ -1986,28 +1683,26 @@ dict_table_change_id_in_cache( dict_table_t* table, /*!< in/out: table object already in cache */ table_id_t new_id) /*!< in: new id to set */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!table->is_temporary()); /* Remove the table from the hash table of id's */ - HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash, + HASH_DELETE(dict_table_t, id_hash, dict_sys.table_id_hash, ut_fold_ull(table->id), table); table->id = new_id; /* Add the table back to the hash table */ - HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, + HASH_INSERT(dict_table_t, id_hash, dict_sys.table_id_hash, ut_fold_ull(table->id), table); } -/**********************************************************************//** -Removes a table object from the dictionary cache. */ -void -dict_table_remove_from_cache_low( -/*=============================*/ - dict_table_t* table, /*!< in, own: table */ - ibool lru_evict) /*!< in: TRUE if table being evicted - to make room in the table LRU list */ +/** Evict a table definition from the InnoDB data dictionary cache. 
+@param[in,out] table cached table definition to be evicted +@param[in] lru whether this is part of least-recently-used eviction +@param[in] keep whether to keep (not free) the object */ +void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep) { dict_foreign_t* foreign; dict_index_t* index; @@ -2015,7 +1710,7 @@ dict_table_remove_from_cache_low( ut_ad(dict_lru_validate()); ut_a(table->get_ref_count() == 0); ut_a(table->n_rec_locks == 0); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(find(table)); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); /* Remove the foreign constraints from the cache */ @@ -2039,37 +1734,33 @@ dict_table_remove_from_cache_low( index != NULL; index = UT_LIST_GET_LAST(table->indexes)) { - dict_index_remove_from_cache_low(table, index, lru_evict); + dict_index_remove_from_cache_low(table, index, lru); } /* Remove table from the hash tables of tables */ - HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash, + HASH_DELETE(dict_table_t, name_hash, table_hash, ut_fold_string(table->name.m_name), table); - HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash, - ut_fold_ull(table->id), table); + hash_table_t* id_hash = table->is_temporary() + ? temp_id_hash : table_id_hash; + const ulint id_fold = ut_fold_ull(table->id); + HASH_DELETE(dict_table_t, id_hash, id_hash, id_fold, table); /* Remove table from LRU or non-LRU list. */ if (table->can_be_evicted) { - ut_ad(dict_lru_find_table(table)); - UT_LIST_REMOVE(dict_sys->table_LRU, table); + UT_LIST_REMOVE(table_LRU, table); } else { - ut_ad(dict_non_lru_find_table(table)); - UT_LIST_REMOVE(dict_sys->table_non_LRU, table); + UT_LIST_REMOVE(table_non_LRU, table); } - ut_ad(dict_lru_validate()); - - if (lru_evict && table->drop_aborted) { + if (lru && table->drop_aborted) { /* When evicting the table definition, drop the orphan indexes from the data dictionary and free the index pages. 
*/ trx_t* trx = trx_create(); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - + ut_d(dict_sys.assert_locked()); /* Mimic row_mysql_lock_data_dictionary(). */ trx->dict_operation_lock_mode = RW_X_LATCH; @@ -2086,6 +1777,12 @@ dict_table_remove_from_cache_low( UT_DELETE(table->vc_templ); } + mutex_free(&table->autoinc_mutex); + + if (keep) { + return; + } + #ifdef BTR_CUR_HASH_ADAPT if (UNIV_UNLIKELY(UT_LIST_GET_LEN(table->freed_indexes) != 0)) { table->vc_templ = NULL; @@ -2097,16 +1794,6 @@ dict_table_remove_from_cache_low( dict_mem_table_free(table); } -/**********************************************************************//** -Removes a table object from the dictionary cache. */ -void -dict_table_remove_from_cache( -/*=========================*/ - dict_table_t* table) /*!< in, own: table */ -{ - dict_table_remove_from_cache_low(table, FALSE); -} - /****************************************************************//** If the given column name is reserved for InnoDB system columns, return TRUE. @@ -2132,44 +1819,6 @@ dict_col_name_is_reserved( return(FALSE); } -/** Clears the virtual column's index list before index is -being freed. -@param[in] index Index being freed */ -void dict_index_remove_from_v_col_list(dict_index_t* index) -{ - /* Index is not completely formed */ - if (!index->cached) { - return; - } - if (dict_index_has_virtual(index)) { - const dict_col_t* col; - const dict_v_col_t* vcol; - - for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { - col = dict_index_get_nth_col(index, i); - if (col && col->is_virtual()) { - vcol = reinterpret_cast<const dict_v_col_t*>( - col); - /* This could be NULL, when we do add - virtual column, add index together. 
We do not - need to track this virtual column's index */ - if (vcol->v_indexes == NULL) { - continue; - } - dict_v_idx_list::iterator it; - for (it = vcol->v_indexes->begin(); - it != vcol->v_indexes->end(); ++it) { - dict_v_idx_t v_index = *it; - if (v_index.index == index) { - vcol->v_indexes->erase(it); - break; - } - } - } - } - } -} - /** Adds an index to the dictionary cache, with possible indexing newly added column. @param[in,out] index index; NOTE! The index memory @@ -2187,7 +1836,7 @@ dict_index_add_to_cache( ulint n_ord; ulint i; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_ad(index->n_def == index->n_fields); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); ut_ad(!dict_index_is_online_ddl(index)); @@ -2252,6 +1901,8 @@ dict_index_add_to_cache( > field->col->max_prefix) { /* Set the max_prefix value based on the prefix_len. */ + ut_ad(field->col->is_binary() + || field->prefix_len % field->col->mbmaxlen == 0); field->col->max_prefix = field->prefix_len; } ut_ad(field->col->ord_part == 1); @@ -2317,7 +1968,7 @@ dict_index_remove_from_cache_low( ut_ad(table && index); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_ad(table->id); #ifdef BTR_CUR_HASH_ADAPT ut_ad(!index->freed()); @@ -2396,7 +2047,7 @@ dict_index_find_cols( const dict_table_t* table = index->table; ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); for (ulint i = 0; i < index->n_fields; i++) { ulint j; @@ -2494,18 +2145,9 @@ dict_index_add_col( if (col->is_virtual()) { dict_v_col_t* v_col = reinterpret_cast<dict_v_col_t*>(col); - - /* When v_col->v_indexes==NULL, - ha_innobase::commit_inplace_alter_table(commit=true) - will evict and reload the table definition, and - v_col->v_indexes will not be NULL for the new table. 
*/ - if (v_col->v_indexes != NULL) { - /* Register the index with the virtual column index - list */ - v_col->v_indexes->push_back( - dict_v_idx_t(index, index->n_def)); - } - + /* Register the index with the virtual column index list */ + v_col->n_v_indexes++; + v_col->v_indexes.push_front(dict_v_idx_t(index, index->n_def)); col_name = dict_table_get_v_col_name_mysql( table, dict_col_get_no(col)); } else { @@ -2673,7 +2315,7 @@ dict_index_build_internal_clust( ut_ad(dict_index_is_clust(index)); ut_ad(!dict_index_is_ibuf(index)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* Create a new index object with certainly enough fields */ new_index = dict_mem_index_create(index->table, index->name, @@ -2825,7 +2467,7 @@ dict_index_build_internal_non_clust( ut_ad(table && index); ut_ad(!dict_index_is_clust(index)); ut_ad(!dict_index_is_ibuf(index)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* The clustered index should be the first in the list of indexes */ clust_index = UT_LIST_GET_FIRST(table->indexes); @@ -2919,7 +2561,7 @@ dict_index_build_internal_fts( dict_index_t* new_index; ut_ad(index->type == DICT_FTS); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* Create a new index */ new_index = dict_mem_index_create(index->table, index->name, @@ -2988,7 +2630,7 @@ dict_foreign_remove_from_cache( /*===========================*/ dict_foreign_t* foreign) /*!< in, own: foreign constraint */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_a(foreign); if (foreign->referenced_table != NULL) { @@ -3013,7 +2655,7 @@ dict_foreign_find( dict_table_t* table, /*!< in: table object */ dict_foreign_t* foreign) /*!< in: foreign constraint */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_ad(dict_foreign_set_validate(table->foreign_set)); ut_ad(dict_foreign_set_validate(table->referenced_set)); @@ -3067,7 +2709,7 @@ 
dict_foreign_find_index( /*!< out: index where error happened */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); if (error) { *error = FK_INDEX_NOT_FOUND; @@ -3165,7 +2807,7 @@ dict_foreign_add_to_cache( DBUG_ENTER("dict_foreign_add_to_cache"); DBUG_PRINT("dict_foreign_add_to_cache", ("id: %s", foreign->id)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); for_table = dict_table_check_if_in_cache_low( foreign->foreign_table_name_lookup); @@ -3279,11 +2921,11 @@ dict_foreign_add_to_cache( list. Otherwise it will be evicted from the cache. */ if (ref_table != NULL) { - dict_table_prevent_eviction(ref_table); + dict_sys.prevent_eviction(ref_table); } if (for_table != NULL) { - dict_table_prevent_eviction(for_table); + dict_sys.prevent_eviction(for_table); } ut_ad(dict_lru_validate()); @@ -4073,7 +3715,7 @@ dict_create_foreign_constraints_low( char create_name[MAX_TABLE_NAME_LEN + 1]; ut_ad(!srv_read_only_mode); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); table = dict_table_get_low(name); /* First check if we are actually doing an ALTER TABLE, and in that @@ -4986,7 +4628,7 @@ dict_foreign_parse_drop_constraints( ptr = str; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); loop: ptr = dict_scan_to(ptr, "DROP"); @@ -5088,14 +4730,14 @@ syntax_error: /**********************************************************************//** Returns an index object if it is found in the dictionary cache. -Assumes that dict_sys->mutex is already being held. +Assumes that dict_sys.mutex is already being held. 
@return index, NULL if not found */ dict_index_t* dict_index_get_if_in_cache_low( /*===========================*/ index_id_t index_id) /*!< in: index id */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); return(dict_index_find_on_id_low(index_id)); } @@ -5111,15 +4753,15 @@ dict_index_get_if_in_cache( { dict_index_t* index; - if (dict_sys == NULL) { + if (!dict_sys.is_initialised()) { return(NULL); } - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); index = dict_index_get_if_in_cache_low(index_id); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return(index); } @@ -5210,46 +4852,6 @@ dict_index_build_node_ptr( return(tuple); } -/**********************************************************************//** -Copies an initial segment of a physical record, long enough to specify an -index entry uniquely. -@return pointer to the prefix record */ -rec_t* -dict_index_copy_rec_order_prefix( -/*=============================*/ - const dict_index_t* index, /*!< in: index */ - const rec_t* rec, /*!< in: record for which to - copy prefix */ - ulint* n_fields,/*!< out: number of fields copied */ - byte** buf, /*!< in/out: memory buffer for the - copied prefix, or NULL */ - ulint* buf_size)/*!< in/out: buffer size */ -{ - ulint n; - - UNIV_PREFETCH_R(rec); - - if (dict_index_is_ibuf(index)) { - ut_ad(!dict_table_is_comp(index->table)); - n = rec_get_n_fields_old(rec); - } else { - if (page_rec_is_leaf(rec)) { - n = dict_index_get_n_unique_in_tree(index); - } else if (dict_index_is_spatial(index)) { - ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) - == DICT_INDEX_SPATIAL_NODEPTR_SIZE); - /* For R-tree, we have to compare - the child page numbers as well. */ - n = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; - } else { - n = dict_index_get_n_unique_in_tree(index); - } - } - - *n_fields = n; - return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size)); -} - /** Convert a physical record into a search tuple. 
@param[in] rec index record (not necessarily in an index page) @param[in] index index @@ -5442,7 +5044,7 @@ dict_print_info_on_foreign_keys( dict_foreign_t* foreign; std::string str; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); for (dict_foreign_set::iterator it = table->foreign_set.begin(); it != table->foreign_set.end(); @@ -5509,12 +5111,12 @@ dict_print_info_on_foreign_keys( } } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return str; } /** Given a space_id of a file-per-table tablespace, search the -dict_sys->table_LRU list and return the dict_table_t* pointer for it. +dict_sys.table_LRU list and return the dict_table_t* pointer for it. @param space tablespace @return table if found, NULL if not */ static @@ -5527,13 +5129,13 @@ dict_find_single_table_by_space(const fil_space_t* space) ut_ad(space->id > 0); - if (dict_sys == NULL) { + if (!dict_sys.is_initialised()) { /* This could happen when it's in redo processing. */ return(NULL); } - table = UT_LIST_GET_FIRST(dict_sys->table_LRU); - num_item = UT_LIST_GET_LEN(dict_sys->table_LRU); + table = UT_LIST_GET_FIRST(dict_sys.table_LRU); + num_item = UT_LIST_GET_LEN(dict_sys.table_LRU); /* This function intentionally does not acquire mutex as it is used by error handling code in deep call stack as last means to avoid @@ -5607,9 +5209,9 @@ dict_set_corrupted( row_mysql_lock_data_dictionary(trx); } - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(!dict_table_is_comp(dict_sys->sys_tables)); - ut_ad(!dict_table_is_comp(dict_sys->sys_indexes)); + ut_ad(mutex_own(&dict_sys.mutex)); + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); ut_ad(!sync_check_iterate(dict_sync_check())); /* Mark the table as corrupted only if the clustered index @@ -5637,7 +5239,7 @@ dict_set_corrupted( mtr_start(&mtr); index->type |= DICT_CORRUPT; - sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes); + sys_index = 
UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); /* Find the index row in SYS_INDEXES */ tuple = dtuple_create(heap, 2); @@ -5697,9 +5299,9 @@ dict_set_corrupted_index_cache_only( { ut_ad(index != NULL); ut_ad(index->table != NULL); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(!dict_table_is_comp(dict_sys->sys_tables)); - ut_ad(!dict_table_is_comp(dict_sys->sys_indexes)); + ut_ad(mutex_own(&dict_sys.mutex)); + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); /* Mark the table as corrupted only if the clustered index is corrupted */ @@ -5727,11 +5329,10 @@ dict_index_set_merge_threshold( btr_cur_t cursor; ut_ad(index != NULL); - ut_ad(!dict_table_is_comp(dict_sys->sys_tables)); - ut_ad(!dict_table_is_comp(dict_sys->sys_indexes)); + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); - rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&(dict_sys->mutex)); + dict_sys_lock(); heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) + sizeof(que_fork_t) + sizeof(upd_node_t) @@ -5739,7 +5340,7 @@ dict_index_set_merge_threshold( mtr_start(&mtr); - sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes); + sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); /* Find the index row in SYS_INDEXES */ tuple = dtuple_create(heap, 2); @@ -5779,8 +5380,7 @@ dict_index_set_merge_threshold( mtr_commit(&mtr); mem_heap_free(heap); - mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); } #ifdef UNIV_DEBUG @@ -5811,14 +5411,14 @@ void dict_set_merge_threshold_all_debug( uint merge_threshold_all) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); dict_set_merge_threshold_list_debug( - &dict_sys->table_LRU, merge_threshold_all); + &dict_sys.table_LRU, merge_threshold_all); dict_set_merge_threshold_list_debug( - &dict_sys->table_non_LRU, merge_threshold_all); + &dict_sys.table_non_LRU, merge_threshold_all); 
- mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } #endif /* UNIV_DEBUG */ @@ -5964,7 +5564,7 @@ dict_table_check_for_dup_indexes( const dict_index_t* index1; const dict_index_t* index2; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* The primary index _must_ exist */ ut_a(UT_LIST_GET_LEN(table->indexes) > 0); @@ -6041,7 +5641,7 @@ dict_table_schema_check( dict_table_t* table; ulint i; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); table = dict_table_get_low(req_schema->table_name); @@ -6285,107 +5885,96 @@ dict_fs2utf8( } } -/** Resize the hash tables besed on the current buffer pool size. */ -void -dict_resize() +/** Resize the hash tables based on the current buffer pool size. */ +void dict_sys_t::resize() { - dict_table_t* table; - - mutex_enter(&dict_sys->mutex); - - /* all table entries are in table_LRU and table_non_LRU lists */ - hash_table_free(dict_sys->table_hash); - hash_table_free(dict_sys->table_id_hash); - - dict_sys->table_hash = hash_create( - buf_pool_get_curr_size() - / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE)); - - dict_sys->table_id_hash = hash_create( - buf_pool_get_curr_size() - / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE)); - - for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); table; - table = UT_LIST_GET_NEXT(table_LRU, table)) { - ulint fold = ut_fold_string(table->name.m_name); - ulint id_fold = ut_fold_ull(table->id); - - HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, - fold, table); - - HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, - id_fold, table); - } - - for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); table; - table = UT_LIST_GET_NEXT(table_LRU, table)) { - ulint fold = ut_fold_string(table->name.m_name); - ulint id_fold = ut_fold_ull(table->id); - - HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, - fold, table); - - HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, - id_fold, table); - } - - 
mutex_exit(&dict_sys->mutex); + ut_ad(this == &dict_sys); + ut_ad(is_initialised()); + mutex_enter(&mutex); + + /* all table entries are in table_LRU and table_non_LRU lists */ + hash_table_free(table_hash); + hash_table_free(table_id_hash); + hash_table_free(temp_id_hash); + + const ulint hash_size = buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); + table_hash = hash_create(hash_size); + table_id_hash = hash_create(hash_size); + temp_id_hash = hash_create(hash_size); + + for (dict_table_t* table= UT_LIST_GET_FIRST(table_LRU); table; + table= UT_LIST_GET_NEXT(table_LRU, table)) + { + ut_ad(!table->is_temporary()); + ulint fold= ut_fold_string(table->name.m_name); + ulint id_fold= ut_fold_ull(table->id); + + HASH_INSERT(dict_table_t, name_hash, table_hash, fold, table); + HASH_INSERT(dict_table_t, id_hash, table_id_hash, id_fold, table); + } + + for (dict_table_t* table = UT_LIST_GET_FIRST(table_non_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + ulint fold = ut_fold_string(table->name.m_name); + ulint id_fold = ut_fold_ull(table->id); + + HASH_INSERT(dict_table_t, name_hash, table_hash, fold, table); + + hash_table_t* id_hash = table->is_temporary() + ? temp_id_hash : table_id_hash; + + HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table); + } + + mutex_exit(&mutex); } -/**********************************************************************//** -Closes the data dictionary module. */ -void -dict_close(void) -/*============*/ +/** Close the data dictionary cache on shutdown. */ +void dict_sys_t::close() { - if (dict_sys == NULL) { - /* This should only happen if a failure occurred - during redo log processing. */ - return; - } - - /* Acquire only because it's a pre-condition. */ - mutex_enter(&dict_sys->mutex); - - /* Free the hash elements. We don't remove them from the table - because we are going to destroy the table anyway. 
*/ - for (ulint i = 0; i < hash_get_n_cells(dict_sys->table_id_hash); i++) { - dict_table_t* table; + ut_ad(this == &dict_sys); + if (!is_initialised()) return; - table = static_cast<dict_table_t*>( - HASH_GET_FIRST(dict_sys->table_hash, i)); + mutex_enter(&mutex); - while (table) { - dict_table_t* prev_table = table; + /* Free the hash elements. We don't remove them from the table + because we are going to destroy the table anyway. */ + for (ulint i = 0; i < hash_get_n_cells(table_hash); i++) + { + dict_table_t* table = static_cast<dict_table_t*>(HASH_GET_FIRST(table_hash, + i)); - table = static_cast<dict_table_t*>( - HASH_GET_NEXT(name_hash, prev_table)); - ut_ad(prev_table->magic_n == DICT_TABLE_MAGIC_N); - dict_table_remove_from_cache(prev_table); - } - } + while (table) + { + dict_table_t* prev_table = table; + table = static_cast<dict_table_t*>(HASH_GET_NEXT(name_hash, prev_table)); + dict_sys.remove(prev_table); + } + } - hash_table_free(dict_sys->table_hash); + hash_table_free(table_hash); - /* The elements are the same instance as in dict_sys->table_hash, - therefore we don't delete the individual elements. */ - hash_table_free(dict_sys->table_id_hash); + /* table_id_hash contains the same elements as in table_hash, + therefore we don't delete the individual elements. */ + hash_table_free(table_id_hash); - mutex_exit(&dict_sys->mutex); - mutex_free(&dict_sys->mutex); + /* No temporary tables should exist at this point. 
*/ + hash_table_free(temp_id_hash); - rw_lock_free(&dict_operation_lock); + mutex_exit(&mutex); + mutex_free(&mutex); + rw_lock_free(&latch); - mutex_free(&dict_foreign_err_mutex); + mutex_free(&dict_foreign_err_mutex); - if (dict_foreign_err_file) { - fclose(dict_foreign_err_file); - dict_foreign_err_file = NULL; - } + if (dict_foreign_err_file) + { + fclose(dict_foreign_err_file); + dict_foreign_err_file = NULL; + } - ut_free(dict_sys); - - dict_sys = NULL; + m_initialised= false; } #ifdef UNIV_DEBUG @@ -6399,16 +5988,16 @@ dict_lru_validate(void) { dict_table_t* table; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); - for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table != NULL; table = UT_LIST_GET_NEXT(table_LRU, table)) { ut_a(table->can_be_evicted); } - for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); table != NULL; table = UT_LIST_GET_NEXT(table_LRU, table)) { @@ -6417,62 +6006,6 @@ dict_lru_validate(void) return(TRUE); } - -/**********************************************************************//** -Check if a table exists in the dict table LRU list. -@return TRUE if table found in LRU list */ -static -ibool -dict_lru_find_table( -/*================*/ - const dict_table_t* find_table) /*!< in: table to find */ -{ - dict_table_t* table; - - ut_ad(find_table != NULL); - ut_ad(mutex_own(&dict_sys->mutex)); - - for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); - table != NULL; - table = UT_LIST_GET_NEXT(table_LRU, table)) { - - ut_a(table->can_be_evicted); - - if (table == find_table) { - return(TRUE); - } - } - - return(FALSE); -} - -/**********************************************************************//** -Check if a table exists in the dict table non-LRU list. 
-@return TRUE if table found in non-LRU list */ -static -ibool -dict_non_lru_find_table( -/*====================*/ - const dict_table_t* find_table) /*!< in: table to find */ -{ - dict_table_t* table; - - ut_ad(find_table != NULL); - ut_ad(mutex_own(&dict_sys->mutex)); - - for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); - table != NULL; - table = UT_LIST_GET_NEXT(table_LRU, table)) { - - ut_a(!table->can_be_evicted); - - if (table == find_table) { - return(TRUE); - } - } - - return(FALSE); -} #endif /* UNIV_DEBUG */ /*********************************************************************//** Check an index to see whether its first fields are the columns in the array, @@ -6596,6 +6129,7 @@ dict_index_zip_pad_update( ulint fail_pct; ut_ad(info); + ut_ad(info->pad % ZIP_PAD_INCR == 0); total = info->success + info->failure; @@ -6620,17 +6154,13 @@ dict_index_zip_pad_update( if (fail_pct > zip_threshold) { /* Compression failures are more then user defined threshold. Increase the pad size to reduce chances of - compression failures. */ - ut_ad(info->pad % ZIP_PAD_INCR == 0); + compression failures. - /* Only do increment if it won't increase padding + Only do increment if it won't increase padding beyond max pad size. */ if (info->pad + ZIP_PAD_INCR < (srv_page_size * zip_pad_max) / 100) { - /* Use atomics even though we have the mutex. - This is to ensure that we are able to read - info->pad atomically. */ - my_atomic_addlint(&info->pad, ZIP_PAD_INCR); + info->pad.fetch_add(ZIP_PAD_INCR); MONITOR_INC(MONITOR_PAD_INCREMENTS); } @@ -6647,12 +6177,7 @@ dict_index_zip_pad_update( padding. */ if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT && info->pad > 0) { - - ut_ad(info->pad % ZIP_PAD_INCR == 0); - /* Use atomics even though we have the mutex. - This is to ensure that we are able to read - info->pad atomically. 
*/ - my_atomic_addlint(&info->pad, ulint(-ZIP_PAD_INCR)); + info->pad.fetch_sub(ZIP_PAD_INCR); info->n_rounds = 0; @@ -6675,10 +6200,10 @@ dict_index_zip_success( return; } - dict_index_zip_pad_lock(index); + mutex_enter(&index->zip_pad.mutex); ++index->zip_pad.success; dict_index_zip_pad_update(&index->zip_pad, zip_threshold); - dict_index_zip_pad_unlock(index); + mutex_exit(&index->zip_pad.mutex); } /*********************************************************************//** @@ -6695,10 +6220,10 @@ dict_index_zip_failure( return; } - dict_index_zip_pad_lock(index); + mutex_enter(&index->zip_pad.mutex); ++index->zip_pad.failure; dict_index_zip_pad_update(&index->zip_pad, zip_threshold); - dict_index_zip_pad_unlock(index); + mutex_exit(&index->zip_pad.mutex); } /*********************************************************************//** @@ -6719,7 +6244,7 @@ dict_index_zip_pad_optimal_page_size( return(srv_page_size); } - pad = my_atomic_loadlint(&index->zip_pad.pad); + pad = index->zip_pad.pad; ut_ad(pad < srv_page_size); sz = srv_page_size - pad; @@ -6762,8 +6287,8 @@ ulint dict_sys_get_size() { /* No mutex; this is a very crude approximation anyway */ - ulint size = UT_LIST_GET_LEN(dict_sys->table_LRU) - + UT_LIST_GET_LEN(dict_sys->table_non_LRU); + ulint size = UT_LIST_GET_LEN(dict_sys.table_LRU) + + UT_LIST_GET_LEN(dict_sys.table_non_LRU); size *= sizeof(dict_table_t) + sizeof(dict_index_t) * 2 + (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10 @@ -6785,8 +6310,7 @@ dict_space_is_empty( mtr_t mtr; bool found = false; - rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); mtr_start(&mtr); for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES); @@ -6807,8 +6331,7 @@ dict_space_is_empty( } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); return(!found); } @@ -6826,8 +6349,7 @@ dict_space_get_id( ulint name_len = strlen(name); ulint id = ULINT_UNDEFINED; - 
rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); mtr_start(&mtr); for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES); @@ -6856,68 +6378,7 @@ dict_space_get_id( } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); return(id); } - -/** Determine the extent size (in pages) for the given table -@param[in] table the table whose extent size is being - calculated. -@return extent size in pages (256, 128 or 64) */ -ulint -dict_table_extent_size( - const dict_table_t* table) -{ - const ulint mb_1 = 1024 * 1024; - const ulint mb_2 = 2 * mb_1; - const ulint mb_4 = 4 * mb_1; - - page_size_t page_size = dict_table_page_size(table); - ulint pages_in_extent = FSP_EXTENT_SIZE; - - if (page_size.is_compressed()) { - - ulint disk_page_size = page_size.physical(); - - switch (disk_page_size) { - case 1024: - pages_in_extent = mb_1/1024; - break; - case 2048: - pages_in_extent = mb_1/2048; - break; - case 4096: - pages_in_extent = mb_1/4096; - break; - case 8192: - pages_in_extent = mb_1/8192; - break; - case 16384: - pages_in_extent = mb_1/16384; - break; - case 32768: - pages_in_extent = mb_2/32768; - break; - case 65536: - pages_in_extent = mb_4/65536; - break; - default: - ut_ad(0); - } - } - - return(pages_in_extent); -} - -size_t -dict_table_t::get_overflow_field_local_len() const -{ - if (dict_table_has_atomic_blobs(this)) { - /* new-format table: do not store any BLOB prefix locally */ - return BTR_EXTERN_FIELD_REF_SIZE; - } - /* up to MySQL 5.1: store a 768-byte prefix locally */ - return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN; -} diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc index 2214e398c5f..18d67ac2f0b 100644 --- a/storage/innobase/dict/dict0load.cc +++ b/storage/innobase/dict/dict0load.cc @@ -220,7 +220,7 @@ dict_get_first_table_name_in_db( ulint len; mtr_t mtr; - ut_ad(mutex_own(&dict_sys->mutex)); 
+ ut_ad(mutex_own(&dict_sys.mutex)); heap = mem_heap_create(1000); @@ -421,6 +421,8 @@ dict_process_sys_indexes_rec( const char* err_msg; byte* buf; + ut_d(index->is_dummy = true); + ut_d(index->in_instant_init = false); buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); /* Parse the record, and get "dict_index_t" struct filled */ @@ -801,7 +803,7 @@ dict_get_first_path( char* filepath = NULL; mem_heap_t* heap = mem_heap_create(1024); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); mtr_start(&mtr); @@ -879,8 +881,7 @@ dict_update_filepath( dberr_t err = DB_SUCCESS; trx_t* trx; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); trx = trx_create(); trx->op_info = "update filepath"; @@ -947,8 +948,7 @@ dict_replace_tablespace_and_filepath( DBUG_EXECUTE_IF("innodb_fail_to_update_tablespace_dict", return(DB_INTERRUPTED);); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); ut_ad(filepath); trx = trx_create(); @@ -982,7 +982,7 @@ dict_sys_tables_rec_check( const byte* field; ulint len; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); if (rec_get_deleted_flag(rec, 0)) { return("delete-marked record in SYS_TABLES"); @@ -1345,8 +1345,7 @@ static ulint dict_check_sys_tables() DBUG_ENTER("dict_check_sys_tables"); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); mtr_start(&mtr); @@ -1483,15 +1482,13 @@ void dict_check_tablespaces_and_store_max_id() DBUG_ENTER("dict_check_tablespaces_and_store_max_id"); - rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); /* Initialize the max space_id from sys header */ - mtr_start(&mtr); - ulint max_space_id = mtr_read_ulint( - dict_hdr_get(&mtr) + DICT_HDR_MAX_SPACE_ID, - MLOG_4BYTES, &mtr); - mtr_commit(&mtr); + mtr.start(); 
+ ulint max_space_id = mach_read_from_4(DICT_HDR_MAX_SPACE_ID + + dict_hdr_get(&mtr)); + mtr.commit(); fil_set_max_space_id_if_bigger(max_space_id); @@ -1501,8 +1498,7 @@ void dict_check_tablespaces_and_store_max_id() max_space_id = dict_check_sys_tables(); fil_set_max_space_id_if_bigger(max_space_id); - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); DBUG_VOID_RETURN; } @@ -1785,7 +1781,7 @@ dict_load_columns( mtr_t mtr; ulint n_skipped = 0; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); mtr_start(&mtr); @@ -1902,7 +1898,7 @@ dict_load_virtual_one_col( mtr_t mtr; ulint skipped = 0; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); if (v_col->num_base == 0) { return; @@ -1941,7 +1937,7 @@ dict_load_virtual_one_col( btr_pcur_open_on_user_rec(sys_virtual_index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); - for (i = 0; i < v_col->num_base + skipped; i++) { + for (i = 0; i < unsigned{v_col->num_base} + skipped; i++) { const char* err_msg; ulint pos; @@ -2135,7 +2131,7 @@ dict_load_fields( mtr_t mtr; dberr_t error; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); mtr_start(&mtr); @@ -2366,7 +2362,7 @@ dict_load_indexes( mtr_t mtr; dberr_t error = DB_SUCCESS; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); mtr_start(&mtr); @@ -2560,7 +2556,7 @@ corrupted: goto corrupted; } else if (dict_is_sys_table(table->id) && (dict_index_is_clust(index) - || ((table == dict_sys->sys_tables) + || ((table == dict_sys.sys_tables) && !strcmp("ID_IND", index->name)))) { /* The index was created in memory already at booting @@ -2660,7 +2656,7 @@ dict_save_data_dir_path( dict_table_t* table, /*!< in/out: table */ const char* filepath) /*!< in: filepath of tablespace */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_a(DICT_TF_HAS_DATA_DIR(table->flags)); ut_a(!table->data_dir_path); @@ -2687,7 +2683,7 @@ 
dict_save_data_dir_path( /** Make sure the data_dir_path is saved in dict_table_t if DATA DIRECTORY was used. Try to read it from the fil_system first, then from SYS_DATAFILES. @param[in] table Table object -@param[in] dict_mutex_own true if dict_sys->mutex is owned already */ +@param[in] dict_mutex_own true if dict_sys.mutex is owned already */ void dict_get_and_save_data_dir_path( dict_table_t* table, @@ -2744,7 +2740,7 @@ dict_table_t* dict_load_table(const char* name, dict_err_ignore_t ignore_err) DBUG_ENTER("dict_load_table"); DBUG_PRINT("dict_load_table", ("loading table: '%s'", name)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); result = dict_table_check_if_in_cache_low(name); @@ -2825,7 +2821,7 @@ dict_load_tablespace( } /* Try to open the tablespace. We set the 2nd param (fix_dict) to - false because we do not have an x-lock on dict_operation_lock */ + false because we do not have an x-lock on dict_sys.latch */ table->space = fil_ibd_open( true, false, FIL_TYPE_TABLESPACE, table->space_id, dict_tf_to_fsp_flags(table->flags), @@ -2878,7 +2874,7 @@ dict_load_table_one( DBUG_ENTER("dict_load_table_one"); DBUG_PRINT("dict_load_table_one", ("table: %s", name.m_name)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); heap = mem_heap_create(32000); @@ -2977,7 +2973,7 @@ err_exit: << " failed, the table has" " corrupted clustered indexes. Turn on" " 'innodb_force_load_corrupted' to drop it"; - dict_table_remove_from_cache(table); + dict_sys.remove(table); table = NULL; goto func_exit; } else { @@ -2999,8 +2995,7 @@ corrupted: dict_table_get_first_index(table)->page); mtr.start(); buf_block_t* block = buf_page_get( - page_id, - dict_table_page_size(table), + page_id, table->space->zip_size(), RW_S_LATCH, &mtr); const bool corrupted = !block || page_get_space_id(block->frame) @@ -3045,7 +3040,7 @@ corrupted: " foreign key indexes. 
Turn off" " 'foreign_key_checks' and try again."; - dict_table_remove_from_cache(table); + dict_sys.remove(table); table = NULL; } else { dict_mem_table_fill_foreign_vcol_set(table); @@ -3061,7 +3056,7 @@ corrupted: if (!srv_force_recovery || !index || !index->is_primary()) { - dict_table_remove_from_cache(table); + dict_sys.remove(table); table = NULL; } else if (index->is_corrupted() && table->is_readable()) { @@ -3094,7 +3089,7 @@ func_exit: } else if (table->can_be_evicted) { /* fts_optimize_thread is not started yet. So make the table as non-evictable from cache. */ - dict_table_move_from_lru_to_non_lru(table); + dict_sys.prevent_eviction(table); } } @@ -3126,7 +3121,7 @@ dict_load_table_on_id( dict_table_t* table; mtr_t mtr; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); table = NULL; @@ -3137,7 +3132,7 @@ dict_load_table_on_id( mtr_start(&mtr); /*---------------------------------------------------*/ /* Get the secondary index based on ID for table SYS_TABLES */ - sys_tables = dict_sys->sys_tables; + sys_tables = dict_sys.sys_tables; sys_table_ids = dict_table_get_next_index( dict_table_get_first_index(sys_tables)); ut_ad(!dict_table_is_comp(sys_tables)); @@ -3211,7 +3206,7 @@ dict_load_sys_table( { mem_heap_t* heap; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); heap = mem_heap_create(1000); @@ -3248,7 +3243,7 @@ dict_load_foreign_cols( mtr_t mtr; size_t id_len; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); id_len = strlen(foreign->id); @@ -3395,7 +3390,7 @@ dict_load_foreign( DBUG_PRINT("dict_load_foreign", ("id: '%s', check_recursive: %d", id, check_recursive)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); id_len = strlen(id); @@ -3571,7 +3566,7 @@ dict_load_foreigns( DBUG_ENTER("dict_load_foreigns"); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); sys_foreign = dict_table_get_low("SYS_FOREIGN"); @@ -3730,7 +3725,7 
@@ dict_load_table_id_on_index_id( bool found = false; mtr_t mtr; - ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(mutex_own(&dict_sys.mutex)); /* NOTE that the operation of this function is protected by the dictionary mutex, and therefore no deadlocks can occur diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc index f380feff7ef..bbc5535668c 100644 --- a/storage/innobase/dict/dict0mem.cc +++ b/storage/innobase/dict/dict0mem.cc @@ -37,6 +37,7 @@ Created 1/8/1996 Heikki Tuuri #include "lock0lock.h" #include "sync0sync.h" #include "row0row.h" +#include "sql_string.h" #include <iostream> #define DICT_HEAP_SIZE 100 /*!< initial memory heap size when @@ -115,6 +116,14 @@ operator<<( return(s << ut_get_name(NULL, table_name.m_name)); } +bool dict_col_t::same_encoding(uint16_t a, uint16_t b) +{ + if (const CHARSET_INFO *acs= get_charset(a, MYF(MY_WME))) + if (const CHARSET_INFO *bcs= get_charset(b, MYF(MY_WME))) + return Charset(acs).same_encoding(bcs); + return false; +} + /**********************************************************************//** Creates a table memory object. @return own: table object */ @@ -170,17 +179,13 @@ dict_mem_table_create( mem_heap_alloc(heap, table->n_cols * sizeof(dict_col_t))); table->v_cols = static_cast<dict_v_col_t*>( mem_heap_alloc(heap, n_v_cols * sizeof(*table->v_cols))); - - /* true means that the stats latch will be enabled - - dict_table_stats_lock() will not be noop. 
*/ - dict_table_stats_latch_create(table, true); + for (ulint i = n_v_cols; i--; ) { + new (&table->v_cols[i]) dict_v_col_t(); + } table->autoinc_lock = static_cast<ib_lock_t*>( mem_heap_alloc(heap, lock_get_size())); - /* lazy creation of table autoinc latch */ - dict_table_autoinc_create_lazy(table); - /* If the table has an FTS index or we are in the process of building one, create the table->fts */ if (dict_table_has_fts_index(table) @@ -188,13 +193,14 @@ dict_mem_table_create( || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { table->fts = fts_create(table); table->fts->cache = fts_cache_create(table); - } else { - table->fts = NULL; } new(&table->foreign_set) dict_foreign_set(); new(&table->referenced_set) dict_foreign_set(); + rw_lock_create(dict_table_stats_key, &table->stats_latch, + SYNC_INDEX_TREE); + return(table); } @@ -223,9 +229,7 @@ dict_mem_table_free( } } - dict_table_autoinc_destroy(table); dict_mem_table_free_foreign_vcol_set(table); - dict_table_stats_latch_destroy(table); table->foreign_set.~dict_foreign_set(); table->referenced_set.~dict_foreign_set(); @@ -236,15 +240,12 @@ dict_mem_table_free( /* Clean up virtual index info structures that are registered with virtual columns */ for (ulint i = 0; i < table->n_v_def; i++) { - dict_v_col_t* vcol - = dict_table_get_nth_v_col(table, i); - - UT_DELETE(vcol->v_indexes); + dict_table_get_nth_v_col(table, i)->~dict_v_col_t(); } - if (table->s_cols != NULL) { - UT_DELETE(table->s_cols); - } + UT_DELETE(table->s_cols); + + rw_lock_free(&table->stats_latch); mem_heap_free(table->heap); } @@ -423,7 +424,8 @@ dict_mem_table_add_v_col( v_col->num_base = num_base; /* Initialize the index list for virtual columns */ - v_col->v_indexes = UT_NEW_NOKEY(dict_v_idx_list()); + ut_ad(v_col->v_indexes.empty()); + v_col->n_v_indexes = 0; return(v_col); } @@ -457,7 +459,7 @@ dict_mem_table_add_s_col( } s_col.num_base = num_base; - table->s_cols->push_back(s_col); + table->s_cols->push_front(s_col); } 
/**********************************************************************//** @@ -533,6 +535,14 @@ dict_mem_table_col_rename_low( = dict_index_get_nth_field( index, i); + ut_ad(!field->name + == field->col->is_dropped()); + if (!field->name) { + /* dropped columns lack a name */ + ut_ad(index->is_instant()); + continue; + } + /* if is_virtual and that in field->col does not match, continue */ if ((!is_virtual) != @@ -750,6 +760,7 @@ dict_mem_fill_column_struct( column->mbmaxlen = mbmaxlen; column->def_val.data = NULL; column->def_val.len = UNIV_SQL_DEFAULT; + ut_ad(!column->is_dropped()); } /**********************************************************************//** @@ -778,16 +789,14 @@ dict_mem_index_create( dict_mem_fill_index_struct(index, heap, index_name, type, n_fields); - dict_index_zip_pad_mutex_create_lazy(index); + mutex_create(LATCH_ID_ZIP_PAD_MUTEX, &index->zip_pad.mutex); if (type & DICT_SPATIAL) { - index->rtr_track = static_cast<rtr_info_track_t*>( - mem_heap_alloc( - heap, - sizeof(*index->rtr_track))); + index->rtr_track = new + (mem_heap_alloc(heap, sizeof *index->rtr_track)) + rtr_info_track_t(); mutex_create(LATCH_ID_RTR_ACTIVE_MUTEX, &index->rtr_track->rtr_active_mutex); - index->rtr_track->rtr_active = UT_NEW_NOKEY(rtr_info_active()); } return(index); @@ -895,11 +904,7 @@ dict_mem_fill_vcol_has_index( continue; } - dict_v_idx_list::iterator it; - for (it = v_col->v_indexes->begin(); - it != v_col->v_indexes->end(); ++it) { - dict_v_idx_t v_idx = *it; - + for (const auto& v_idx : v_col->v_indexes) { if (v_idx.index != index) { continue; } @@ -972,7 +977,7 @@ dict_mem_fill_vcol_set_for_base_col( continue; } - for (ulint j = 0; j < v_col->num_base; j++) { + for (ulint j = 0; j < unsigned{v_col->num_base}; j++) { if (strcmp(col_name, dict_table_get_col_name( table, v_col->base_col[j]->ind)) == 0) { @@ -1093,24 +1098,18 @@ dict_mem_index_free( ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - dict_index_zip_pad_mutex_destroy(index); + 
mutex_free(&index->zip_pad.mutex); if (dict_index_is_spatial(index)) { - rtr_info_active::iterator it; - rtr_info_t* rtr_info; - - for (it = index->rtr_track->rtr_active->begin(); - it != index->rtr_track->rtr_active->end(); ++it) { - rtr_info = *it; - + for (auto& rtr_info : index->rtr_track->rtr_active) { rtr_info->index = NULL; } mutex_destroy(&index->rtr_track->rtr_active_mutex); - UT_DELETE(index->rtr_track->rtr_active); + index->rtr_track->~rtr_info_track_t(); } - dict_index_remove_from_v_col_list(index); + index->detach_columns(); mem_heap_free(index->heap); } @@ -1197,293 +1196,122 @@ operator<< (std::ostream& out, const dict_foreign_set& fk_set) return(out); } -/** Adjust clustered index metadata for instant ADD COLUMN. -@param[in] clustered index definition after instant ADD COLUMN */ -inline void dict_index_t::instant_add_field(const dict_index_t& instant) +/** Reconstruct the clustered index fields. */ +inline void dict_index_t::reconstruct_fields() { DBUG_ASSERT(is_primary()); - DBUG_ASSERT(instant.is_primary()); - DBUG_ASSERT(!instant.is_instant()); - DBUG_ASSERT(n_def == n_fields); - DBUG_ASSERT(instant.n_def == instant.n_fields); - - DBUG_ASSERT(type == instant.type); - DBUG_ASSERT(trx_id_offset == instant.trx_id_offset); - DBUG_ASSERT(n_user_defined_cols == instant.n_user_defined_cols); - DBUG_ASSERT(n_uniq == instant.n_uniq); - DBUG_ASSERT(instant.n_fields > n_fields); - DBUG_ASSERT(instant.n_def > n_def); - DBUG_ASSERT(instant.n_nullable >= n_nullable); - DBUG_ASSERT(instant.n_core_fields >= n_core_fields); - DBUG_ASSERT(instant.n_core_null_bytes >= n_core_null_bytes); - - n_fields = instant.n_fields; - n_def = instant.n_def; - n_nullable = instant.n_nullable; - fields = static_cast<dict_field_t*>( - mem_heap_dup(heap, instant.fields, n_fields * sizeof *fields)); - - ut_d(unsigned n_null = 0); - - for (unsigned i = 0; i < n_fields; i++) { - DBUG_ASSERT(fields[i].same(instant.fields[i])); - const dict_col_t* icol = instant.fields[i].col; - 
DBUG_ASSERT(!icol->is_virtual()); - dict_col_t* col = fields[i].col = &table->cols[ - icol - instant.table->cols]; - fields[i].name = col->name(*table); - ut_d(n_null += col->is_nullable()); - } - ut_ad(n_null == n_nullable); -} + n_fields += table->instant->n_dropped; + n_def += table->instant->n_dropped; -/** Adjust metadata for instant ADD COLUMN. -@param[in] table table definition after instant ADD COLUMN */ -void dict_table_t::instant_add_column(const dict_table_t& table) -{ - DBUG_ASSERT(!table.cached); - DBUG_ASSERT(table.n_def == table.n_cols); - DBUG_ASSERT(table.n_t_def == table.n_t_cols); - DBUG_ASSERT(n_def == n_cols); - DBUG_ASSERT(n_t_def == n_t_cols); - DBUG_ASSERT(table.n_cols > n_cols); - ut_ad(mutex_own(&dict_sys->mutex)); - - const char* end = table.col_names; - for (unsigned i = table.n_cols; i--; ) end += strlen(end) + 1; - - col_names = static_cast<char*>( - mem_heap_dup(heap, table.col_names, - ulint(end - table.col_names))); - const dict_col_t* const old_cols = cols; - const dict_col_t* const old_cols_end = cols + n_cols; - cols = static_cast<dict_col_t*>(mem_heap_dup(heap, table.cols, - table.n_cols - * sizeof *cols)); - - /* Preserve the default values of previously instantly - added columns. */ - for (unsigned i = unsigned(n_cols) - DATA_N_SYS_COLS; i--; ) { - cols[i].def_val = old_cols[i].def_val; - } + const unsigned n_first = first_user_field(); - /* Copy the new default values to this->heap. 
*/ - for (unsigned i = n_cols; i < table.n_cols; i++) { - dict_col_t& c = cols[i - DATA_N_SYS_COLS]; - DBUG_ASSERT(c.is_instant()); - if (c.def_val.len == 0) { - c.def_val.data = field_ref_zero; - } else if (const void*& d = c.def_val.data) { - d = mem_heap_dup(heap, d, c.def_val.len); + dict_field_t* tfields = static_cast<dict_field_t*>( + mem_heap_zalloc(heap, n_fields * sizeof *fields)); + + memcpy(tfields, fields, n_first * sizeof *fields); + + n_nullable = 0; + ulint n_core_null = 0; + const bool comp = dict_table_is_comp(table); + const auto* field_map_it = table->instant->field_map; + for (unsigned i = n_first, j = 0; i < n_fields; ) { + dict_field_t& f = tfields[i++]; + auto c = *field_map_it++; + if (c.is_dropped()) { + f.col = &table->instant->dropped[j++]; + DBUG_ASSERT(f.col->is_dropped()); + f.fixed_len = dict_col_get_fixed_size(f.col, comp); } else { - DBUG_ASSERT(c.def_val.len == UNIV_SQL_NULL); + DBUG_ASSERT(!c.is_not_null()); + const auto old = std::find_if( + fields + n_first, fields + n_fields, + [c](const dict_field_t& o) + { return o.col->ind == c.ind(); }); + ut_ad(old >= &fields[n_first]); + ut_ad(old < &fields[n_fields]); + DBUG_ASSERT(!old->prefix_len); + DBUG_ASSERT(old->col == &table->cols[c.ind()]); + f = *old; } - } - const unsigned old_n_cols = n_cols; - const unsigned n_add = unsigned(table.n_cols - n_cols); - - n_t_def += n_add; - n_t_cols += n_add; - n_cols = table.n_cols; - n_def = n_cols; - - for (unsigned i = n_v_def; i--; ) { - const dict_v_col_t& v = v_cols[i]; - for (ulint n = v.num_base; n--; ) { - dict_col_t*& base = v.base_col[n]; - if (!base->is_virtual()) { - DBUG_ASSERT(base >= old_cols); - size_t n = size_t(base - old_cols); - DBUG_ASSERT(n + DATA_N_SYS_COLS < old_n_cols); - base = &cols[n]; - } + f.col->clear_instant(); + if (f.col->is_nullable()) { + n_nullable++; + n_core_null += i <= n_core_fields; } } - dict_index_t* index = dict_table_get_first_index(this); - - 
index->instant_add_field(*dict_table_get_first_index(&table)); - - while ((index = dict_table_get_next_index(index)) != NULL) { - for (unsigned i = 0; i < index->n_fields; i++) { - dict_field_t& field = index->fields[i]; - if (field.col < old_cols - || field.col >= old_cols_end) { - DBUG_ASSERT(field.col->is_virtual()); - } else { - /* Secondary indexes may contain user - columns and DB_ROW_ID (if there is - GEN_CLUST_INDEX instead of PRIMARY KEY), - but not DB_TRX_ID,DB_ROLL_PTR. */ - DBUG_ASSERT(field.col >= old_cols); - size_t n = size_t(field.col - old_cols); - DBUG_ASSERT(n + DATA_N_SYS_COLS <= old_n_cols); - if (n + DATA_N_SYS_COLS >= old_n_cols) { - /* Replace DB_ROW_ID */ - n += n_add; - } - field.col = &cols[n]; - DBUG_ASSERT(!field.col->is_virtual()); - field.name = field.col->name(*this); - } - } - } + fields = tfields; + n_core_null_bytes = UT_BITS_IN_BYTES(n_core_null); } -/** Roll back instant_add_column(). -@param[in] old_n_cols original n_cols -@param[in] old_cols original cols -@param[in] old_col_names original col_names */ -void -dict_table_t::rollback_instant( - unsigned old_n_cols, - dict_col_t* old_cols, - const char* old_col_names) +/** Reconstruct dropped or reordered columns. 
+@param[in] metadata data from serialise_columns() +@param[in] len length of the metadata, in bytes +@return whether parsing the metadata failed */ +bool dict_table_t::deserialise_columns(const byte* metadata, ulint len) { - ut_ad(mutex_own(&dict_sys->mutex)); - dict_index_t* index = indexes.start; - /* index->is_instant() does not necessarily hold here, because - the table may have been emptied */ - DBUG_ASSERT(old_n_cols >= DATA_N_SYS_COLS); - DBUG_ASSERT(n_cols >= old_n_cols); - DBUG_ASSERT(n_cols == n_def); - DBUG_ASSERT(index->n_def == index->n_fields); - - const unsigned n_remove = n_cols - old_n_cols; - - for (unsigned i = index->n_fields - n_remove; i < index->n_fields; - i++) { - if (index->fields[i].col->is_nullable()) { - index->n_nullable--; - } - } + DBUG_ASSERT(!instant); - index->n_fields -= n_remove; - index->n_def = index->n_fields; - if (index->n_core_fields > index->n_fields) { - index->n_core_fields = index->n_fields; - index->n_core_null_bytes - = UT_BITS_IN_BYTES(unsigned(index->n_nullable)); - } + unsigned num_non_pk_fields = mach_read_from_4(metadata); + metadata += 4; - const dict_col_t* const new_cols = cols; - const dict_col_t* const new_cols_end = cols + n_cols; - - cols = old_cols; - col_names = old_col_names; - n_cols = old_n_cols; - n_def = old_n_cols; - n_t_def -= n_remove; - n_t_cols -= n_remove; - - for (unsigned i = n_v_def; i--; ) { - const dict_v_col_t& v = v_cols[i]; - for (ulint n = v.num_base; n--; ) { - dict_col_t*& base = v.base_col[n]; - if (!base->is_virtual()) { - base = &cols[base - new_cols]; - } - } + if (num_non_pk_fields >= REC_MAX_N_FIELDS - 3) { + return true; } - do { - for (unsigned i = 0; i < index->n_fields; i++) { - dict_field_t& field = index->fields[i]; - if (field.col < new_cols - || field.col >= new_cols_end) { - DBUG_ASSERT(field.col->is_virtual()); - } else { - DBUG_ASSERT(field.col >= new_cols); - size_t n = size_t(field.col - new_cols); - DBUG_ASSERT(n <= n_cols); - if (n + DATA_N_SYS_COLS >= n_cols) 
{ - n -= n_remove; - } - field.col = &cols[n]; - DBUG_ASSERT(!field.col->is_virtual()); - field.name = field.col->name(*this); - } - } - } while ((index = dict_table_get_next_index(index)) != NULL); -} + dict_index_t* index = UT_LIST_GET_FIRST(indexes); -/** Trim the instantly added columns when an insert into SYS_COLUMNS -is rolled back during ALTER TABLE or recovery. -@param[in] n number of surviving non-system columns */ -void dict_table_t::rollback_instant(unsigned n) -{ - ut_ad(mutex_own(&dict_sys->mutex)); - dict_index_t* index = indexes.start; - DBUG_ASSERT(index->is_instant()); - DBUG_ASSERT(index->n_def == index->n_fields); - DBUG_ASSERT(n_cols == n_def); - DBUG_ASSERT(n >= index->n_uniq); - DBUG_ASSERT(n_cols > n + DATA_N_SYS_COLS); - const unsigned n_remove = n_cols - n - DATA_N_SYS_COLS; - - char* names = const_cast<char*>(dict_table_get_col_name(this, n)); - const char* sys = names; - for (unsigned i = n_remove; i--; ) { - sys += strlen(sys) + 1; + if (num_non_pk_fields < unsigned(index->n_fields) + - index->first_user_field()) { + return true; } - static const char system[] = "DB_ROW_ID\0DB_TRX_ID\0DB_ROLL_PTR"; - DBUG_ASSERT(!memcmp(sys, system, sizeof system)); - for (unsigned i = index->n_fields - n_remove; i < index->n_fields; - i++) { - if (index->fields[i].col->is_nullable()) { - index->n_nullable--; + + field_map_element_t* field_map = static_cast<field_map_element_t*>( + mem_heap_alloc(heap, + num_non_pk_fields * sizeof *field_map)); + + unsigned n_dropped_cols = 0; + + for (unsigned i = 0; i < num_non_pk_fields; i++) { + auto c = field_map[i] = mach_read_from_2(metadata); + metadata += 2; + + if (field_map[i].is_dropped()) { + if (c.ind() > DICT_MAX_FIXED_COL_LEN + 1) { + return true; + } + n_dropped_cols++; + } else if (c >= n_cols) { + return true; } } - index->n_fields -= n_remove; - index->n_def = index->n_fields; - memmove(names, sys, sizeof system); - memmove(cols + n, cols + n_cols - DATA_N_SYS_COLS, - DATA_N_SYS_COLS * sizeof *cols); 
- n_cols -= n_remove; - n_def = n_cols; - n_t_cols -= n_remove; - n_t_def -= n_remove; - - for (unsigned i = DATA_N_SYS_COLS; i--; ) { - cols[n_cols - i].ind--; - } - if (dict_index_is_auto_gen_clust(index)) { - DBUG_ASSERT(index->n_uniq == 1); - dict_field_t* field = index->fields; - field->name = sys; - field->col = dict_table_get_sys_col(this, DATA_ROW_ID); - field++; - field->name = sys + sizeof "DB_ROW_ID"; - field->col = dict_table_get_sys_col(this, DATA_TRX_ID); - field++; - field->name = sys + sizeof "DB_ROW_ID\0DB_TRX_ID"; - field->col = dict_table_get_sys_col(this, DATA_ROLL_PTR); - - /* Replace the DB_ROW_ID column in secondary indexes. */ - while ((index = dict_table_get_next_index(index)) != NULL) { - field = &index->fields[index->n_fields - 1]; - DBUG_ASSERT(field->col->mtype == DATA_SYS); - DBUG_ASSERT(field->col->prtype - == DATA_NOT_NULL + DATA_TRX_ID); - field->col--; - field->name = sys; + dict_col_t* dropped_cols = static_cast<dict_col_t*>(mem_heap_zalloc( + heap, n_dropped_cols * sizeof(dict_col_t))); + instant = new (mem_heap_alloc(heap, sizeof *instant)) dict_instant_t(); + instant->n_dropped = n_dropped_cols; + instant->dropped = dropped_cols; + instant->field_map = field_map; + + dict_col_t* col = dropped_cols; + for (unsigned i = 0; i < num_non_pk_fields; i++) { + if (field_map[i].is_dropped()) { + auto fixed_len = field_map[i].ind(); + DBUG_ASSERT(fixed_len <= DICT_MAX_FIXED_COL_LEN + 1); + (col++)->set_dropped(field_map[i].is_not_null(), + fixed_len == 1, + fixed_len > 1 ? 
fixed_len - 1 + : 0); } - - return; } + DBUG_ASSERT(col == &dropped_cols[n_dropped_cols]); - dict_field_t* field = &index->fields[index->n_uniq]; - field->name = sys + sizeof "DB_ROW_ID"; - field->col = dict_table_get_sys_col(this, DATA_TRX_ID); - field++; - field->name = sys + sizeof "DB_ROW_ID\0DB_TRX_ID"; - field->col = dict_table_get_sys_col(this, DATA_ROLL_PTR); + UT_LIST_GET_FIRST(indexes)->reconstruct_fields(); + return false; } - /** Check if record in clustered index is historical row. @param[in] rec clustered row @param[in] offsets offsets diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 488ff6ae915..3a4730f0d2c 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -33,6 +33,7 @@ Created Jan 06, 2010 Vasil Dimov #include "pars0pars.h" #include <mysql_com.h> #include "btr0btr.h" +#include "sync0sync.h" #include <algorithm> #include <map> @@ -167,7 +168,7 @@ bool dict_stats_persistent_storage_check( /*================================*/ bool caller_has_dict_sys_mutex) /*!< in: true if the caller - owns dict_sys->mutex */ + owns dict_sys.mutex */ { /* definition for the table TABLE_STATS_NAME */ dict_col_meta_t table_stats_columns[] = { @@ -235,10 +236,10 @@ dict_stats_persistent_storage_check( dberr_t ret; if (!caller_has_dict_sys_mutex) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* first check table_stats */ ret = dict_table_schema_check(&table_stats_schema, errstr, @@ -250,7 +251,7 @@ dict_stats_persistent_storage_check( } if (!caller_has_dict_sys_mutex) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } if (ret != DB_SUCCESS && ret != DB_STATS_DO_NOT_EXIST) { @@ -283,8 +284,7 @@ dict_stats_exec_sql( dberr_t err; bool trx_started = false; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + 
ut_d(dict_sys.assert_locked()); if (!dict_stats_persistent_storage_check(true)) { pars_info_free(pinfo); @@ -418,11 +418,6 @@ dict_stats_table_clone_create( t->corrupted = table->corrupted; - /* This private object "t" is not shared with other threads, so - we do not need the stats_latch (thus we pass false below). The - dict_table_stats_lock()/unlock() routines will do nothing. */ - dict_table_stats_latch_create(t, false); - UT_LIST_INIT(t->indexes, &dict_index_t::indexes); #ifdef BTR_CUR_HASH_ADAPT UT_LIST_INIT(t->freed_indexes, &dict_index_t::indexes); @@ -488,6 +483,8 @@ dict_stats_table_clone_create( ut_d(t->magic_n = DICT_TABLE_MAGIC_N); + rw_lock_create(dict_table_stats_key, &t->stats_latch, SYNC_INDEX_TREE); + return(t); } @@ -500,15 +497,13 @@ dict_stats_table_clone_free( /*========================*/ dict_table_t* t) /*!< in: dummy table object to free */ { - dict_table_stats_latch_destroy(t); + rw_lock_free(&t->stats_latch); mem_heap_free(t->heap); } /*********************************************************************//** Write all zeros (or 1 where it makes sense) into an index -statistics members. The resulting stats correspond to an empty index. -The caller must own index's table stats latch in X mode -(dict_table_stats_lock(table, RW_X_LATCH)) */ +statistics members. The resulting stats correspond to an empty index. 
*/ static void dict_stats_empty_index( @@ -519,6 +514,7 @@ dict_stats_empty_index( { ut_ad(!(index->type & DICT_FTS)); ut_ad(!dict_index_is_ibuf(index)); + ut_ad(rw_lock_own(&index->table->stats_latch, RW_LOCK_X)); ulint n_uniq = index->n_uniq; @@ -550,7 +546,7 @@ dict_stats_empty_table( { /* Zero the stats members */ - dict_table_stats_lock(table, RW_X_LATCH); + rw_lock_x_lock(&table->stats_latch); table->stat_n_rows = 0; table->stat_clustered_index_size = 1; @@ -576,7 +572,7 @@ dict_stats_empty_table( table->stat_initialized = TRUE; - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); } /*********************************************************************//** @@ -790,9 +786,9 @@ dict_table_t* dict_stats_snapshot_create( dict_table_t* table) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); - dict_table_stats_lock(table, RW_S_LATCH); + rw_lock_s_lock(&table->stats_latch); dict_stats_assert_initialized(table); @@ -807,9 +803,9 @@ dict_stats_snapshot_create( t->stats_sample_pages = table->stats_sample_pages; t->stats_bg_flag = table->stats_bg_flag; - dict_table_stats_unlock(table, RW_S_LATCH); + rw_lock_s_unlock(&table->stats_latch); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return(t); } @@ -1494,7 +1490,7 @@ dict_stats_analyze_index_below_cur( page_id_t page_id(index->table->space_id, btr_node_ptr_get_child_page_no( rec, offsets_rec)); - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); /* assume no external pages by default - in case we quit from this function without analyzing any leaf pages */ @@ -1507,7 +1503,7 @@ dict_stats_analyze_index_below_cur( dberr_t err = DB_SUCCESS; - block = buf_page_get_gen(page_id, page_size, RW_S_LATCH, + block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH, NULL /* no guessed block */, BUF_GET, __FILE__, __LINE__, &mtr, &err); @@ -2201,7 +2197,7 @@ dict_stats_update_persistent( 
DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name); - dict_table_stats_lock(table, RW_X_LATCH); + rw_lock_x_lock(&table->stats_latch); /* analyze the clustered index first */ @@ -2212,7 +2208,7 @@ dict_stats_update_persistent( || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) { /* Table definition is corrupt */ - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); dict_stats_empty_table(table, true); return(DB_CORRUPTION); @@ -2264,7 +2260,7 @@ dict_stats_update_persistent( dict_stats_assert_initialized(table); - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); return(DB_SUCCESS); } @@ -2298,8 +2294,7 @@ dict_stats_save_index_stat( char table_utf8[MAX_TABLE_UTF8_LEN]; ut_ad(!trx || trx->internal || trx->mysql_thd); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); dict_fs2utf8(index->table->name.m_name, db_utf8, sizeof(db_utf8), table_utf8, sizeof(table_utf8)); @@ -2432,8 +2427,7 @@ dict_stats_save( table_utf8, sizeof(table_utf8)); const time_t now = time(NULL); - rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); pinfo = pars_info_create(); @@ -2471,13 +2465,10 @@ dict_stats_save( if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { ib::error() << "Cannot save table statistics for table " << table->name << ": " << ret; - - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); - +func_exit: + dict_sys_unlock(); dict_stats_snapshot_free(table); - - return(ret); + return ret; } trx_t* trx = trx_create(); @@ -2578,13 +2569,7 @@ dict_stats_save( end: trx_free(trx); - - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); - - dict_stats_snapshot_free(table); - - return(ret); + goto func_exit; } /*********************************************************************//** @@ -2951,7 +2936,7 @@ dict_stats_fetch_from_ps( char db_utf8[MAX_DB_UTF8_LEN]; 
char table_utf8[MAX_TABLE_UTF8_LEN]; - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); /* Initialize all stats to dummy values before fetching because if the persistent storage contains incomplete stats (e.g. missing stats @@ -3087,16 +3072,16 @@ dict_stats_update_for_index( { DBUG_ENTER("dict_stats_update_for_index"); - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); if (dict_stats_is_persistent_enabled(index->table)) { if (dict_stats_persistent_storage_check(false)) { - dict_table_stats_lock(index->table, RW_X_LATCH); + rw_lock_x_lock(&index->table->stats_latch); dict_stats_analyze_index(index); index->table->stat_sum_of_other_index_sizes += index->stat_index_size; - dict_table_stats_unlock(index->table, RW_X_LATCH); + rw_lock_x_unlock(&index->table->stats_latch); dict_stats_save(index->table, &index->id); DBUG_VOID_RETURN; } @@ -3117,9 +3102,9 @@ dict_stats_update_for_index( } } - dict_table_stats_lock(index->table, RW_X_LATCH); + rw_lock_x_lock(&index->table->stats_latch); dict_stats_update_transient_for_index(index); - dict_table_stats_unlock(index->table, RW_X_LATCH); + rw_lock_x_unlock(&index->table->stats_latch); DBUG_VOID_RETURN; } @@ -3138,7 +3123,7 @@ dict_stats_update( the persistent statistics storage */ { - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); if (!table->is_readable()) { return (dict_stats_report_error(table)); @@ -3273,7 +3258,7 @@ dict_stats_update( switch (err) { case DB_SUCCESS: - dict_table_stats_lock(table, RW_X_LATCH); + rw_lock_x_lock(&table->stats_latch); /* Pass reset_ignored_indexes=true as parameter to dict_stats_copy. 
This will cause statictics @@ -3282,7 +3267,7 @@ dict_stats_update( dict_stats_assert_initialized(table); - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); dict_stats_table_clone_free(t); @@ -3337,11 +3322,11 @@ dict_stats_update( transient: - dict_table_stats_lock(table, RW_X_LATCH); + rw_lock_x_lock(&table->stats_latch); dict_stats_update_transient(table); - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); return(DB_SUCCESS); } @@ -3371,7 +3356,7 @@ dict_stats_drop_index( pars_info_t* pinfo; dberr_t ret; - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); /* skip indexes whose table names do not contain a database name e.g. if we are dropping an index from SYS_TABLES */ @@ -3391,8 +3376,7 @@ dict_stats_drop_index( pars_info_add_str_literal(pinfo, "index_name", iname); - rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); ret = dict_stats_exec_sql( pinfo, @@ -3404,8 +3388,7 @@ dict_stats_drop_index( "index_name = :index_name;\n" "END;\n", NULL); - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); if (ret == DB_STATS_DO_NOT_EXIST) { ret = DB_SUCCESS; @@ -3453,8 +3436,7 @@ dict_stats_delete_from_table_stats( pars_info_t* pinfo; dberr_t ret; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); pinfo = pars_info_create(); @@ -3489,8 +3471,7 @@ dict_stats_delete_from_index_stats( pars_info_t* pinfo; dberr_t ret; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); pinfo = pars_info_create(); @@ -3526,8 +3507,7 @@ dict_stats_drop_table( char table_utf8[MAX_TABLE_UTF8_LEN]; dberr_t ret; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); /* skip tables that do not contain a 
database name e.g. if we are dropping SYS_TABLES */ @@ -3602,8 +3582,7 @@ dict_stats_rename_table_in_table_stats( pars_info_t* pinfo; dberr_t ret; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); pinfo = pars_info_create(); @@ -3646,8 +3625,7 @@ dict_stats_rename_table_in_index_stats( pars_info_t* pinfo; dberr_t ret; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); pinfo = pars_info_create(); @@ -3690,9 +3668,6 @@ dict_stats_rename_table( char new_table_utf8[MAX_TABLE_UTF8_LEN]; dberr_t ret; - ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(!mutex_own(&dict_sys->mutex)); - /* skip innodb_table_stats and innodb_index_stats themselves */ if (strcmp(old_name, TABLE_STATS_NAME) == 0 || strcmp(old_name, INDEX_STATS_NAME) == 0 @@ -3708,8 +3683,7 @@ dict_stats_rename_table( dict_fs2utf8(new_name, new_db_utf8, sizeof(new_db_utf8), new_table_utf8, sizeof(new_table_utf8)); - rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); ulint n_attempts = 0; do { @@ -3729,11 +3703,9 @@ dict_stats_rename_table( } if (ret != DB_SUCCESS) { - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); os_thread_sleep(200000 /* 0.2 sec */); - rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); } } while ((ret == DB_DEADLOCK || ret == DB_DUPLICATE_KEY @@ -3761,8 +3733,7 @@ dict_stats_rename_table( TABLE_STATS_NAME_PRINT, new_db_utf8, new_table_utf8, old_db_utf8, old_table_utf8); - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); return(ret); } /* else */ @@ -3785,19 +3756,16 @@ dict_stats_rename_table( } if (ret != DB_SUCCESS) { - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); os_thread_sleep(200000 /* 0.2 sec */); - 
rw_lock_x_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + dict_sys_lock(); } } while ((ret == DB_DEADLOCK || ret == DB_DUPLICATE_KEY || ret == DB_LOCK_WAIT_TIMEOUT) && n_attempts < 5); - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); if (ret != DB_SUCCESS) { snprintf(errstr, errstr_sz, @@ -3825,6 +3793,60 @@ dict_stats_rename_table( return(ret); } +/*********************************************************************//** +Renames an index in InnoDB persistent stats storage. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned +if the persistent stats do not exist. */ +dberr_t +dict_stats_rename_index( +/*====================*/ + const dict_table_t* table, /*!< in: table whose index + is renamed */ + const char* old_index_name, /*!< in: old index name */ + const char* new_index_name) /*!< in: new index name */ +{ + dict_sys_lock(); + + if (!dict_stats_persistent_storage_check(true)) { + dict_sys_unlock(); + return(DB_STATS_DO_NOT_EXIST); + } + + char dbname_utf8[MAX_DB_UTF8_LEN]; + char tablename_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name.m_name, dbname_utf8, sizeof(dbname_utf8), + tablename_utf8, sizeof(tablename_utf8)); + + pars_info_t* pinfo; + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "dbname_utf8", dbname_utf8); + pars_info_add_str_literal(pinfo, "tablename_utf8", tablename_utf8); + pars_info_add_str_literal(pinfo, "new_index_name", new_index_name); + pars_info_add_str_literal(pinfo, "old_index_name", old_index_name); + + dberr_t ret; + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE RENAME_INDEX_IN_INDEX_STATS () IS\n" + "BEGIN\n" + "UPDATE \"" INDEX_STATS_NAME "\" SET\n" + "index_name = :new_index_name\n" + "WHERE\n" + "database_name = :dbname_utf8 AND\n" + "table_name = :tablename_utf8 AND\n" + "index_name = :old_index_name;\n" + "END;\n", NULL); + + dict_sys_unlock(); + + 
return(ret); +} + /* tests @{ */ #ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS @@ -3870,7 +3892,7 @@ test_dict_table_schema_check() /* prevent any data dictionary modifications while we are checking the tables' structure */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); /* check that a valid table is reported as valid */ schema.n_cols = 7; @@ -3946,7 +3968,7 @@ test_dict_table_schema_check() test_dict_table_schema_check_end: - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } /* @} */ diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc index 126de450ca5..2d358f2c9e3 100644 --- a/storage/innobase/dict/dict0stats_bg.cc +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -157,7 +157,7 @@ void dict_stats_update_if_needed_func(dict_table_t *table) #endif { ut_ad(table->stat_initialized); - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); ulonglong counter = table->stat_modified_counter++; ulonglong n_rows = dict_table_get_n_rows(table); @@ -181,7 +181,7 @@ void dict_stats_update_if_needed_func(dict_table_t *table) lock waits to be enqueued at head of waiting queue. */ if (trx.is_wsrep() - && !wsrep_thd_is_applier(trx.mysql_thd) + && !wsrep_thd_is_applying(trx.mysql_thd) && wsrep_thd_is_BF(trx.mysql_thd, 0)) { WSREP_DEBUG("Avoiding background statistics" " calculation for table %s.", @@ -250,7 +250,7 @@ dict_stats_recalc_pool_del( const dict_table_t* table) /*!< in: table to remove */ { ut_ad(!srv_read_only_mode); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); mutex_enter(&recalc_pool_mutex); @@ -278,7 +278,7 @@ and restore the lock before it exits. The background stats thread is guaranteed not to start using the specified table after this function returns and before the caller unlocks the data dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag -under dict_sys->mutex. */ +under dict_sys.mutex. 
*/ void dict_stats_wait_bg_to_stop_using_table( /*===================================*/ @@ -309,11 +309,11 @@ dict_stats_thread_init() any level would do here) 2) from dict_stats_update_if_needed() and released without latching anything else in between. We know - that dict_sys->mutex (SYNC_DICT) is not acquired when + that dict_sys.mutex (SYNC_DICT) is not acquired when dict_stats_update_if_needed() is called and it may be acquired inside that function (thus a level <=SYNC_DICT would do). - 3) from row_drop_table_for_mysql() after dict_sys->mutex (SYNC_DICT) - and dict_operation_lock (SYNC_DICT_OPERATION) have been locked + 3) from row_drop_table_for_mysql() after dict_sys.mutex (SYNC_DICT) + and dict_sys.latch (SYNC_DICT_OPERATION) have been locked (thus a level <SYNC_DICT && <SYNC_DICT_OPERATION would do) So we choose SYNC_STATS_AUTO_RECALC to be about below SYNC_DICT. */ @@ -370,14 +370,14 @@ dict_stats_process_entry_from_recalc_pool() dict_table_t* table; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); table = dict_table_open_on_id(table_id, TRUE, DICT_TABLE_OP_NORMAL); if (table == NULL) { /* table does not exist, must have been DROPped after its id was enqueued */ - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return; } @@ -385,13 +385,13 @@ dict_stats_process_entry_from_recalc_pool() if (!fil_table_accessible(table)) { dict_table_close(table, TRUE, FALSE); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return; } table->stats_bg_flag |= BG_STAT_IN_PROGRESS; - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); /* time() could be expensive, the current function is called once every time a table has been changed more than 10% and @@ -414,13 +414,13 @@ dict_stats_process_entry_from_recalc_pool() dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT); } - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); table->stats_bg_flag = BG_STAT_NONE; dict_table_close(table, TRUE, FALSE); - 
mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } #ifdef UNIV_DEBUG diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc index 282b66936a8..7c342ca0637 100644 --- a/storage/innobase/fil/fil0crypt.cc +++ b/storage/innobase/fil/fil0crypt.cc @@ -23,16 +23,16 @@ Created Jonas Oreland Google Modified Jan Lindström jan.lindstrom@mariadb.com *******************************************************/ -#include "fil0fil.h" +#include "fil0crypt.h" #include "mtr0types.h" #include "mach0data.h" -#include "page0size.h" #include "page0zip.h" -#ifndef UNIV_INNOCHECKSUM -#include "fil0crypt.h" +#include "buf0checksum.h" +#ifdef UNIV_INNOCHECKSUM +# include "buf0buf.h" +#else #include "srv0srv.h" #include "srv0start.h" -#include "log0recv.h" #include "mtr0mtr.h" #include "mtr0log.h" #include "ut0ut.h" @@ -274,16 +274,14 @@ fil_space_merge_crypt_data( } /** Initialize encryption parameters from a tablespace header page. -@param[in] page_size page size of the tablespace +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] page first page of the tablespace @return crypt data from page 0 @retval NULL if not present or not valid */ -UNIV_INTERN -fil_space_crypt_t* -fil_space_read_crypt_data(const page_size_t& page_size, const byte* page) +fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page) { const ulint offset = FSP_HEADER_OFFSET - + fsp_header_get_encryption_offset(page_size); + + fsp_header_get_encryption_offset(zip_size); if (memcmp(page + offset, CRYPT_MAGIC, MAGIC_SZ) != 0) { /* Crypt data is not stored. 
*/ @@ -362,7 +360,8 @@ fil_space_crypt_t::fill_page0( { const uint len = sizeof(iv); const ulint offset = FSP_HEADER_OFFSET - + fsp_header_get_encryption_offset(page_size_t(flags)); + + fsp_header_get_encryption_offset( + fil_space_t::zip_size(flags)); page0_offset = offset; memcpy(page + offset, CRYPT_MAGIC, MAGIC_SZ); @@ -393,7 +392,7 @@ fil_space_crypt_t::write_page0( ut_ad(this == space->crypt_data); const uint len = sizeof(iv); const ulint offset = FSP_HEADER_OFFSET - + fsp_header_get_encryption_offset(page_size_t(space->flags)); + + fsp_header_get_encryption_offset(space->zip_size()); page0_offset = offset; /* @@ -532,29 +531,27 @@ fil_parse_write_crypt_data( return ptr; } -/** Encrypt a buffer. -@param[in,out] crypt_data Crypt data -@param[in] space space_id -@param[in] offset Page offset -@param[in] lsn Log sequence number -@param[in] src_frame Page to encrypt -@param[in] page_size Page size -@param[in,out] dst_frame Output buffer +/** Encrypt a buffer for non full checksum. +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED + page size, or 0 +@param[in,out] dst_frame Output buffer @return encrypted buffer or NULL */ -UNIV_INTERN -byte* -fil_encrypt_buf( +static byte* fil_encrypt_buf_for_non_full_checksum( fil_space_crypt_t* crypt_data, ulint space, ulint offset, lsn_t lsn, const byte* src_frame, - const page_size_t& page_size, + ulint zip_size, byte* dst_frame) { - uint size = uint(page_size.physical()); + uint size = uint(zip_size ? 
zip_size : srv_page_size); uint key_version = fil_crypt_get_latest_key_version(crypt_data); - ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID); ulint orig_page_type = mach_read_from_2(src_frame+FIL_PAGE_TYPE); @@ -562,21 +559,21 @@ fil_encrypt_buf( uint header_len = FIL_PAGE_DATA; if (page_compressed) { - header_len += (FIL_PAGE_COMPRESSED_SIZE + FIL_PAGE_COMPRESSION_METHOD_SIZE); + header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN; } /* FIL page header is not encrypted */ memcpy(dst_frame, src_frame, header_len); - - /* Store key version */ - mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, key_version); + mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, + key_version); /* Calculate the start offset in a page */ - uint unencrypted_bytes = header_len + FIL_PAGE_DATA_END; - uint srclen = size - unencrypted_bytes; - const byte* src = src_frame + header_len; - byte* dst = dst_frame + header_len; - uint32 dstlen = 0; + uint unencrypted_bytes = header_len + FIL_PAGE_DATA_END; + uint srclen = size - unencrypted_bytes; + const byte* src = src_frame + header_len; + byte* dst = dst_frame + header_len; + uint32 dstlen = 0; + ib_uint32_t checksum = 0; if (page_compressed) { srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA); @@ -594,30 +591,137 @@ fil_encrypt_buf( to sector boundary is written. 
*/ if (!page_compressed) { /* FIL page trailer is also not encrypted */ - memcpy(dst_frame + page_size.physical() - FIL_PAGE_DATA_END, - src_frame + page_size.physical() - FIL_PAGE_DATA_END, + memcpy(dst_frame + size - FIL_PAGE_DATA_END, + src_frame + size - FIL_PAGE_DATA_END, FIL_PAGE_DATA_END); } else { /* Clean up rest of buffer */ memset(dst_frame+header_len+srclen, 0, - page_size.physical() - (header_len + srclen)); + size - (header_len + srclen)); } - /* handle post encryption checksum */ - ib_uint32_t checksum = 0; + checksum = fil_crypt_calculate_checksum(zip_size, dst_frame); + + /* store the post-encryption checksum after the key-version */ + mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4, + checksum); + + ut_ad(fil_space_verify_crypt_checksum(dst_frame, zip_size)); + + srv_stats.pages_encrypted.inc(); - checksum = fil_crypt_calculate_checksum(page_size, dst_frame); + return dst_frame; +} - // store the post-encryption checksum after the key-version - mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4, checksum); +/** Encrypt a buffer for full checksum format. 
+@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ +static byte* fil_encrypt_buf_for_full_crc32( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + lsn_t lsn, + const byte* src_frame, + byte* dst_frame) +{ + uint key_version = fil_crypt_get_latest_key_version(crypt_data); + ut_d(bool corrupted = false); + const uint size = buf_page_full_crc32_size(src_frame, NULL, +#ifdef UNIV_DEBUG + &corrupted +#else + NULL +#endif + ); + ut_ad(!corrupted); + uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst = dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + uint dstlen = 0; + + ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID); - ut_ad(fil_space_verify_crypt_checksum(dst_frame, page_size)); + /* Till FIL_PAGE_LSN, page is not encrypted */ + memcpy(dst_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + /* Write key version to the page. */ + mach_write_to_4(dst_frame + FIL_PAGE_FCRC32_KEY_VERSION, key_version); + + int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen, + crypt_data, key_version, + uint(space), uint(offset), lsn); + ut_a(rc == MY_AES_OK); + ut_a(dstlen == srclen); + + const ulint payload = size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(dst_frame + payload, ut_crc32(dst_frame, payload)); + /* Clean the rest of the buffer. FIXME: Punch holes when writing! */ + memset(dst_frame + (payload + 4), 0, srv_page_size - (payload + 4)); srv_stats.pages_encrypted.inc(); return dst_frame; } +/** Encrypt a buffer. 
+@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED + page size, or 0 +@param[in,out] dst_frame Output buffer +@param[in] use_full_checksum full crc32 algo is used +@return encrypted buffer or NULL */ +UNIV_INTERN +byte* +fil_encrypt_buf( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + lsn_t lsn, + const byte* src_frame, + ulint zip_size, + byte* dst_frame, + bool use_full_checksum) +{ + if (use_full_checksum) { + return fil_encrypt_buf_for_full_crc32( + crypt_data, space, offset, + lsn, src_frame, dst_frame); + } + + return fil_encrypt_buf_for_non_full_checksum( + crypt_data, space, offset, lsn, + src_frame, zip_size, dst_frame); +} + +/** Check whether these page types are allowed to encrypt. +@param[in] space tablespace object +@param[in] src_frame source page +@return true if it is valid page type */ +static bool fil_space_encrypt_valid_page_type( + const fil_space_t* space, + byte* src_frame) +{ + switch (mach_read_from_2(src_frame+FIL_PAGE_TYPE)) { + case FIL_PAGE_RTREE: + return space->full_crc32(); + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + return false; + } + + return true; +} + /****************************************************************** Encrypt a page @@ -636,12 +740,7 @@ fil_space_encrypt( byte* src_frame, byte* dst_frame) { - switch (mach_read_from_2(src_frame+FIL_PAGE_TYPE)) { - case FIL_PAGE_TYPE_FSP_HDR: - case FIL_PAGE_TYPE_XDES: - case FIL_PAGE_RTREE: - /* File space header, extent descriptor or spatial index - are not encrypted. 
*/ + if (!fil_space_encrypt_valid_page_type(space, src_frame)) { return src_frame; } @@ -650,32 +749,104 @@ fil_space_encrypt( } ut_ad(space->pending_io()); + return fil_encrypt_buf(space->crypt_data, space->id, offset, lsn, - src_frame, page_size_t(space->flags), - dst_frame); + src_frame, space->zip_size(), + dst_frame, space->full_crc32()); } -/** Decrypt a page. +/** Decrypt a page for full checksum format. +@param[in] space space id @param[in] crypt_data crypt_data @param[in] tmp_frame Temporary buffer -@param[in] page_size Page size @param[in,out] src_frame Page to decrypt @param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED @return true if page decrypted, false if not.*/ -UNIV_INTERN -bool -fil_space_decrypt( +static bool fil_space_decrypt_full_crc32( + ulint space, fil_space_crypt_t* crypt_data, byte* tmp_frame, - const page_size_t& page_size, + byte* src_frame, + dberr_t* err) +{ + uint key_version = mach_read_from_4( + src_frame + FIL_PAGE_FCRC32_KEY_VERSION); + lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN); + uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); + *err = DB_SUCCESS; + + if (key_version == ENCRYPTION_KEY_NOT_ENCRYPTED) { + return false; + } + + ut_ad(crypt_data); + ut_ad(crypt_data->is_encrypted()); + + memcpy(tmp_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + /* Calculate the offset where decryption starts */ + const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst = tmp_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + uint dstlen = 0; + bool corrupted = false; + uint size = buf_page_full_crc32_size(src_frame, NULL, &corrupted); + if (UNIV_UNLIKELY(corrupted)) { +fail: + *err = DB_DECRYPTION_FAILED; + return false; + } + + uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + + int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen, + crypt_data, key_version, + (uint) space, offset, lsn); + + if (rc != MY_AES_OK || dstlen != srclen) { 
+ if (rc == -1) { + goto fail; + } + + ib::fatal() << "Unable to decrypt data-block " + << " src: " << src << "srclen: " + << srclen << " buf: " << dst << "buflen: " + << dstlen << " return-code: " << rc + << " Can't continue!"; + } + + /* Copy only checksum part in the trailer */ + memcpy(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); + + srv_stats.pages_decrypted.inc(); + + return true; /* page was decrypted */ +} + +/** Decrypt a page for non full checksum format. +@param[in] crypt_data crypt_data +@param[in] tmp_frame Temporary buffer +@param[in] physical_size page size +@param[in,out] src_frame Page to decrypt +@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED +@return true if page decrypted, false if not.*/ +static bool fil_space_decrypt_for_non_full_checksum( + fil_space_crypt_t* crypt_data, + byte* tmp_frame, + ulint physical_size, byte* src_frame, dberr_t* err) { ulint page_type = mach_read_from_2(src_frame+FIL_PAGE_TYPE); - uint key_version = mach_read_from_4(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); - bool page_compressed = (page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED); + uint key_version = mach_read_from_4( + src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + bool page_compressed = (page_type + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED); uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); - uint space = mach_read_from_4(src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + uint space = mach_read_from_4( + src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); ib_uint64_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN); *err = DB_SUCCESS; @@ -690,7 +861,7 @@ fil_space_decrypt( uint header_len = FIL_PAGE_DATA; if (page_compressed) { - header_len += (FIL_PAGE_COMPRESSED_SIZE + FIL_PAGE_COMPRESSION_METHOD_SIZE); + header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN; } /* Copy FIL page header, it is not encrypted */ @@ -700,8 +871,7 @@ fil_space_decrypt( 
const byte* src = src_frame + header_len; byte* dst = tmp_frame + header_len; uint32 dstlen = 0; - uint srclen = uint(page_size.physical()) - - header_len - FIL_PAGE_DATA_END; + uint srclen = uint(physical_size) - header_len - FIL_PAGE_DATA_END; if (page_compressed) { srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA); @@ -733,8 +903,8 @@ fil_space_decrypt( to sector boundary is written. */ if (!page_compressed) { /* Copy FIL trailer */ - memcpy(tmp_frame + page_size.physical() - FIL_PAGE_DATA_END, - src_frame + page_size.physical() - FIL_PAGE_DATA_END, + memcpy(tmp_frame + physical_size - FIL_PAGE_DATA_END, + src_frame + physical_size - FIL_PAGE_DATA_END, FIL_PAGE_DATA_END); } @@ -743,6 +913,36 @@ fil_space_decrypt( return true; /* page was decrypted */ } +/** Decrypt a page. +@param[in] space_id tablespace id +@param[in] crypt_data crypt_data +@param[in] tmp_frame Temporary buffer +@param[in] physical_size page size +@param[in] fsp_flags Tablespace flags +@param[in,out] src_frame Page to decrypt +@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED +@return true if page decrypted, false if not.*/ +UNIV_INTERN +bool +fil_space_decrypt( + ulint space_id, + fil_space_crypt_t* crypt_data, + byte* tmp_frame, + ulint physical_size, + ulint fsp_flags, + byte* src_frame, + dberr_t* err) +{ + if (fil_space_t::full_crc32(fsp_flags)) { + return fil_space_decrypt_full_crc32( + space_id, crypt_data, tmp_frame, src_frame, err); + } + + return fil_space_decrypt_for_non_full_checksum(crypt_data, tmp_frame, + physical_size, src_frame, + err); +} + /** Decrypt a page. 
@param[in] space Tablespace @@ -759,19 +959,21 @@ fil_space_decrypt( { dberr_t err = DB_SUCCESS; byte* res = NULL; - const page_size_t page_size(space->flags); + const ulint physical_size = space->physical_size(); ut_ad(space->crypt_data != NULL && space->crypt_data->is_encrypted()); ut_ad(space->pending_io()); - bool encrypted = fil_space_decrypt(space->crypt_data, tmp_frame, - page_size, src_frame, &err); + bool encrypted = fil_space_decrypt(space->id, space->crypt_data, + tmp_frame, physical_size, + space->flags, + src_frame, &err); if (err == DB_SUCCESS) { if (encrypted) { /* Copy the decrypted page back to page buffer, not really any other options. */ - memcpy(src_frame, tmp_frame, page_size.physical()); + memcpy(src_frame, tmp_frame, physical_size); } res = src_frame; @@ -780,21 +982,18 @@ fil_space_decrypt( return res; } -/****************************************************************** +/** Calculate post encryption checksum -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] dst_frame Block where checksum is calculated @return page checksum not needed. */ -UNIV_INTERN uint32_t -fil_crypt_calculate_checksum( - const page_size_t& page_size, - const byte* dst_frame) +fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame) { /* For encrypted tables we use only crc32 and strict_crc32 */ - return page_size.is_compressed() - ? page_zip_calc_checksum(dst_frame, page_size.physical(), + return zip_size + ? 
page_zip_calc_checksum(dst_frame, zip_size, SRV_CHECKSUM_ALGORITHM_CRC32) : buf_calc_page_crc32(dst_frame); } @@ -906,15 +1105,15 @@ fil_crypt_read_crypt_data(fil_space_t* space) return; } - const page_size_t page_size(space->flags); + const ulint zip_size = space->zip_size(); mtr_t mtr; mtr.start(); if (buf_block_t* block = buf_page_get(page_id_t(space->id, 0), - page_size, RW_S_LATCH, &mtr)) { + zip_size, RW_S_LATCH, &mtr)) { mutex_enter(&fil_system.mutex); if (!space->crypt_data) { space->crypt_data = fil_space_read_crypt_data( - page_size, block->frame); + zip_size, block->frame); } mutex_exit(&fil_system.mutex); } @@ -985,7 +1184,7 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space) /* 2 - get page 0 */ dberr_t err = DB_SUCCESS; buf_block_t* block = buf_page_get_gen( - page_id_t(space->id, 0), page_size_t(space->flags), + page_id_t(space->id, 0), space->zip_size(), RW_X_LATCH, NULL, BUF_GET, __FILE__, __LINE__, &mtr, &err); @@ -1571,7 +1770,7 @@ fil_crypt_get_page_throttle_func( unsigned line) { fil_space_t* space = state->space; - const page_size_t page_size = page_size_t(space->flags); + const ulint zip_size = space->zip_size(); const page_id_t page_id(space->id, offset); ut_ad(space->referenced()); @@ -1582,7 +1781,7 @@ fil_crypt_get_page_throttle_func( } dberr_t err = DB_SUCCESS; - buf_block_t* block = buf_page_get_gen(page_id, page_size, RW_X_LATCH, + buf_block_t* block = buf_page_get_gen(page_id, zip_size, RW_X_LATCH, NULL, BUF_PEEK_IF_IN_POOL, file, line, mtr, &err); @@ -1599,7 +1798,7 @@ fil_crypt_get_page_throttle_func( state->crypt_stat.pages_read_from_disk++; const ulonglong start = my_interval_timer(); - block = buf_page_get_gen(page_id, page_size, + block = buf_page_get_gen(page_id, zip_size, RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, file, line, mtr, &err); @@ -1732,7 +1931,7 @@ fil_crypt_rotate_page( int needs_scrubbing = BTR_SCRUB_SKIP_PAGE; lsn_t block_lsn = block->page.newest_modification; byte* frame = 
buf_block_get_frame(block); - uint kv = mach_read_from_4(frame+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + uint kv = buf_page_get_key_version(frame, space->flags); if (space->is_stopping()) { /* The tablespace is closing (in DROP TABLE or @@ -1976,7 +2175,7 @@ fil_crypt_flush_space( dberr_t err; if (buf_block_t* block = buf_page_get_gen( - page_id_t(space->id, 0), page_size_t(space->flags), + page_id_t(space->id, 0), space->zip_size(), RW_X_LATCH, NULL, BUF_GET, __FILE__, __LINE__, &mtr, &err)) { mtr.set_named_space(space); @@ -2554,10 +2753,9 @@ calculated checksum as if it does page could be valid unencrypted, encrypted, or corrupted. @param[in,out] page page frame (checksum is temporarily modified) -@param[in] page_size page size -@return whether the encrypted page is OK */ -bool -fil_space_verify_crypt_checksum(const byte* page, const page_size_t& page_size) +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return true if page is encrypted AND OK, false otherwise */ +bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size) { ut_ad(mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)); @@ -2577,24 +2775,14 @@ fil_space_verify_crypt_checksum(const byte* page, const page_size_t& page_size) page is not corrupted. 
*/ switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: - if (page_size.is_compressed()) { + if (zip_size) { return checksum == page_zip_calc_checksum( - page, page_size.physical(), - SRV_CHECKSUM_ALGORITHM_CRC32) -#ifdef INNODB_BUG_ENDIAN_CRC32 - || checksum == page_zip_calc_checksum( - page, page_size.physical(), - SRV_CHECKSUM_ALGORITHM_CRC32, true) -#endif - ; + page, zip_size, SRV_CHECKSUM_ALGORITHM_CRC32); } - return checksum == buf_calc_page_crc32(page) -#ifdef INNODB_BUG_ENDIAN_CRC32 - || checksum == buf_calc_page_crc32(page, true) -#endif - ; + return checksum == buf_calc_page_crc32(page); case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: /* Starting with MariaDB 10.1.25, 10.2.7, 10.3.1, due to MDEV-12114, fil_crypt_calculate_checksum() @@ -2609,27 +2797,20 @@ fil_space_verify_crypt_checksum(const byte* page, const page_size_t& page_size) Due to this, we must treat "strict_innodb" as "innodb". 
*/ case SRV_CHECKSUM_ALGORITHM_INNODB: case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: if (checksum == BUF_NO_CHECKSUM_MAGIC) { return true; } - if (page_size.is_compressed()) { + if (zip_size) { return checksum == page_zip_calc_checksum( - page, page_size.physical(), + page, zip_size, SRV_CHECKSUM_ALGORITHM_CRC32) -#ifdef INNODB_BUG_ENDIAN_CRC32 - || checksum == page_zip_calc_checksum( - page, page_size.physical(), - SRV_CHECKSUM_ALGORITHM_CRC32, true) -#endif || checksum == page_zip_calc_checksum( - page, page_size.physical(), + page, zip_size, SRV_CHECKSUM_ALGORITHM_INNODB); } return checksum == buf_calc_page_crc32(page) -#ifdef INNODB_BUG_ENDIAN_CRC32 - || checksum == buf_calc_page_crc32(page, true) -#endif || checksum == buf_calc_page_new_checksum(page); } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 8928e4af5dc..97d554f0b73 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -42,7 +42,6 @@ Created 10/25/1995 Heikki Tuuri #include "os0file.h" #include "page0zip.h" #include "row0mysql.h" -#include "row0trunc.h" #include "srv0start.h" #include "trx0purge.h" #include "buf0lru.h" @@ -51,6 +50,11 @@ Created 10/25/1995 Heikki Tuuri #include "sync0sync.h" #include "buf0flu.h" #include "os0api.h" +#ifdef UNIV_LINUX +# include <sys/types.h> +# include <sys/sysmacros.h> +# include <dirent.h> +#endif /** Tries to close a file in the LRU list. The caller must hold the fil_sys mutex. @@ -166,9 +170,6 @@ ulint fil_n_pending_log_flushes = 0; /** Number of pending tablespace flushes */ ulint fil_n_pending_tablespace_flushes = 0; -/** The null file address */ -const fil_addr_t fil_addr_null = {FIL_NULL, 0}; - /** The tablespace memory cache. This variable is NULL before the module is initialized. */ fil_system_t fil_system; @@ -208,19 +209,11 @@ fil_validate_skip(void) /*===================*/ { /** The fil_validate() call skip counter. 
*/ - static int fil_validate_count = FIL_VALIDATE_SKIP; + static Atomic_counter<uint32_t> fil_validate_count; /* We want to reduce the call frequency of the costly fil_validate() check in debug builds. */ - int count = my_atomic_add32_explicit(&fil_validate_count, -1, - MY_MEMORY_ORDER_RELAXED); - if (count > 0) { - return(true); - } - - my_atomic_store32_explicit(&fil_validate_count, FIL_VALIDATE_SKIP, - MY_MEMORY_ORDER_RELAXED); - return(fil_validate()); + return (fil_validate_count++ % FIL_VALIDATE_SKIP) || fil_validate(); } #endif /* UNIV_DEBUG */ @@ -273,7 +266,7 @@ fil_node_complete_io(fil_node_t* node, const IORequest& type); blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] byte_offset remainder of offset in bytes; in aio this must be divisible by the OS block size @param[in] len how many bytes to read; this must not cross a @@ -286,12 +279,12 @@ UNIV_INLINE dberr_t fil_read( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint byte_offset, ulint len, void* buf) { - return(fil_io(IORequestRead, true, page_id, page_size, + return(fil_io(IORequestRead, true, page_id, zip_size, byte_offset, len, buf, NULL)); } @@ -299,7 +292,7 @@ fil_read( blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. 
@param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] byte_offset remainder of offset in bytes; in aio this must be divisible by the OS block size @param[in] len how many bytes to write; this must not cross @@ -312,14 +305,14 @@ UNIV_INLINE dberr_t fil_write( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint byte_offset, ulint len, void* buf) { ut_ad(!srv_read_only_mode); - return(fil_io(IORequestWrite, true, page_id, page_size, + return(fil_io(IORequestWrite, true, page_id, zip_size, byte_offset, len, buf, NULL)); } @@ -393,20 +386,6 @@ fil_space_get_latch( return(&(space->latch)); } -/** Note that the tablespace has been imported. -Initially, purpose=FIL_TYPE_IMPORT so that no redo log is -written while the space ID is being updated in each page. */ -void fil_space_t::set_imported() -{ - ut_ad(purpose == FIL_TYPE_IMPORT); - const fil_node_t* node = UT_LIST_GET_FIRST(chain); - atomic_write_supported = node->atomic_write - && srv_use_atomic_writes - && my_test_if_atomic_write(node->handle, - int(page_size_t(flags).physical())); - purpose = FIL_TYPE_TABLESPACE; -} - /**********************************************************************//** Checks if all the file nodes in a space are flushed. @return true if all are flushed */ @@ -432,6 +411,42 @@ fil_space_is_flushed( return(true); } +/** Validate the compression algorithm for full crc32 format. 
+@param[in] space tablespace object +@return whether the compression algorithm support */ +static bool fil_comp_algo_validate(const fil_space_t* space) +{ + if (!space->full_crc32()) { + return true; + } + + DBUG_EXECUTE_IF("fil_comp_algo_validate_fail", + return false;); + + ulint comp_algo = space->get_compression_algo(); + switch (comp_algo) { + case PAGE_UNCOMPRESSED: + case PAGE_ZLIB_ALGORITHM: +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: +#endif /* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: +#endif /* HAVE_LZO */ +#ifdef HAVE_LZMA + case PAGE_LZMA_ALGORITHM: +#endif /* HAVE_LZMA */ +#ifdef HAVE_BZIP2 + case PAGE_BZIP2_ALGORITHM: +#endif /* HAVE_BZIP2 */ +#ifdef HAVE_SNAPPY + case PAGE_SNAPPY_ALGORITHM: +#endif /* HAVE_SNAPPY */ + return true; + } + + return false; +} /** Append a file to the chain of files of a space. @param[in] name file name of a file that is not open @@ -483,103 +498,6 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle, return node; } -/** Read the first page of a data file. 
-@param[in] first whether this is the very first read -@return whether the page was found valid */ -bool fil_node_t::read_page0(bool first) -{ - ut_ad(mutex_own(&fil_system.mutex)); - ut_a(space->purpose != FIL_TYPE_LOG); - const page_size_t page_size(space->flags); - const ulint psize = page_size.physical(); - - os_offset_t size_bytes = os_file_get_size(handle); - ut_a(size_bytes != (os_offset_t) -1); - const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize; - - if (size_bytes < min_size) { - ib::error() << "The size of the file " << name - << " is only " << size_bytes - << " bytes, should be at least " << min_size; - return false; - } - - byte* buf2 = static_cast<byte*>(ut_malloc_nokey(2 * psize)); - - /* Align the memory for file i/o if we might have O_DIRECT set */ - byte* page = static_cast<byte*>(ut_align(buf2, psize)); - IORequest request(IORequest::READ); - if (os_file_read(request, handle, page, 0, psize) != DB_SUCCESS) { - ib::error() << "Unable to read first page of file " << name; - ut_free(buf2); - return false; - } - srv_stats.page0_read.add(1); - const ulint space_id = fsp_header_get_space_id(page); - ulint flags = fsp_header_get_flags(page); - const ulint size = fsp_header_get_field(page, FSP_SIZE); - const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT); - const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE - + page); - /* Try to read crypt_data from page 0 if it is not yet read. 
*/ - if (!space->crypt_data) { - space->crypt_data = fil_space_read_crypt_data(page_size, page); - } - ut_free(buf2); - - if (!fsp_flags_is_valid(flags, space->id)) { - ulint cflags = fsp_flags_convert_from_101(flags); - if (cflags == ULINT_UNDEFINED - || (cflags ^ space->flags) & ~FSP_FLAGS_MEM_MASK) { - ib::error() - << "Expected tablespace flags " - << ib::hex(space->flags) - << " but found " << ib::hex(flags) - << " in the file " << name; - return false; - } - - flags = cflags; - } - - if (UNIV_UNLIKELY(space_id != space->id)) { - ib::error() << "Expected tablespace id " << space->id - << " but found " << space_id - << " in the file " << name; - return false; - } - - if (first) { - ut_ad(space->id != TRX_SYS_SPACE); - - /* Truncate the size to a multiple of extent size. */ - ulint mask = psize * FSP_EXTENT_SIZE - 1; - - if (size_bytes <= mask) { - /* .ibd files start smaller than an - extent size. Do not truncate valid data. */ - } else { - size_bytes &= ~os_offset_t(mask); - } - - this->size = ulint(size_bytes / psize); - space->size += this->size; - } else if (space->id != TRX_SYS_SPACE || space->size_in_header) { - /* If this is not the first-time open, do nothing. - For the system tablespace, we always get invoked as - first=false, so we detect the true first-time-open based - on size_in_header and proceed to initiailze the data. */ - return true; - } - - ut_ad(space->free_limit == 0 || space->free_limit == free_limit); - ut_ad(space->free_len == 0 || space->free_len == free_len); - space->size_in_header = size; - space->free_limit = free_limit; - space->free_len = free_len; - return true; -} - /** Open a file node of a tablespace. 
@param[in,out] node File node @return false if the file can't be opened, otherwise true */ @@ -601,8 +519,7 @@ static bool fil_node_open_file(fil_node_t* node) if (first_time_open || (space->purpose == FIL_TYPE_TABLESPACE && node == UT_LIST_GET_FIRST(space->chain) - && srv_startup_is_before_trx_rollback_phase - && !undo::Truncate::was_tablespace_truncated(space->id))) { + && srv_startup_is_before_trx_rollback_phase)) { /* We do not know the size of the file yet. First we open the file in the normal mode, no async I/O here, for simplicity. Then do some checks, and close the @@ -633,10 +550,16 @@ retry: } if (!node->read_page0(first_time_open)) { +fail: os_file_close(node->handle); node->handle = OS_FILE_CLOSED; return false; } + + if (first_time_open && !fil_comp_algo_validate(space)) { + goto fail; + } + } else if (space->purpose == FIL_TYPE_LOG) { node->handle = os_file_create( innodb_log_file_key, node->name, OS_FILE_OPEN, @@ -650,30 +573,6 @@ retry: OS_FILE_AIO, OS_DATA_FILE, read_only_mode, &success); } - if (space->purpose != FIL_TYPE_LOG) { - /* - For the temporary tablespace and during the - non-redo-logged adjustments in - IMPORT TABLESPACE, we do not care about - the atomicity of writes. 
- - Atomic writes is supported if the file can be used - with atomic_writes (not log file), O_DIRECT is - used (tested in ha_innodb.cc) and the file is - device and file system that supports atomic writes - for the given block size - */ - space->atomic_write_supported - = space->purpose == FIL_TYPE_TEMPORARY - || space->purpose == FIL_TYPE_IMPORT - || (node->atomic_write - && srv_use_atomic_writes - && my_test_if_atomic_write( - node->handle, - int(page_size_t(space->flags) - .physical()))); - } - ut_a(success); ut_a(node->is_open()); @@ -937,14 +836,7 @@ fil_space_extend_must_retry( ulint last_page_no = space->size; const ulint file_start_page_no = last_page_no - node->size; - /* Determine correct file block size */ - if (node->block_size == 0) { - node->block_size = os_file_get_block_size( - node->handle, node->name); - } - - const page_size_t pageSize(space->flags); - const ulint page_size = pageSize.physical(); + const ulint page_size = space->physical_size(); /* fil_read_first_page() expects srv_page_size bytes. 
fil_node_open_file() expects at least 4 * srv_page_size bytes.*/ @@ -1004,7 +896,6 @@ fil_space_extend_must_retry( srv_tmp_space.set_last_file_size(pages_in_MiB); return(false); } - } /*******************************************************************//** @@ -1323,7 +1214,7 @@ fil_space_create( fil_space_t* space; ut_ad(fil_system.is_initialised()); - ut_ad(fsp_flags_is_valid(flags & ~FSP_FLAGS_MEM_MASK, id)); + ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id)); ut_ad(purpose == FIL_TYPE_LOG || srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0); @@ -1405,8 +1296,8 @@ fil_space_create( to do */ if (purpose == FIL_TYPE_TABLESPACE && !srv_fil_crypt_rotate_key_age && fil_crypt_threads_event && - (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF || - srv_encrypt_tables)) { + (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF + || srv_encrypt_tables)) { /* Key rotation is not enabled, need to inform background encryption threads. */ fil_system.rotation_list.push_back(*space); @@ -1660,28 +1551,6 @@ void fil_space_t::close() mutex_exit(&fil_system.mutex); } -/** Returns the page size of the space and whether it is compressed or not. -The tablespace must be cached in the memory cache. 
-@param[in] id space id -@param[out] found true if tablespace was found -@return page size */ -const page_size_t -fil_space_get_page_size( - ulint id, - bool* found) -{ - const ulint flags = fil_space_get_flags(id); - - if (flags == ULINT_UNDEFINED) { - *found = false; - return(univ_page_size); - } - - *found = true; - - return(page_size_t(flags)); -} - void fil_system_t::create(ulint hash_size) { ut_ad(this == &fil_system); @@ -1702,6 +1571,66 @@ void fil_system_t::create(ulint hash_size) spaces = hash_create(hash_size); fil_space_crypt_init(); +#ifdef UNIV_LINUX + ssd.clear(); + char fn[sizeof(dirent::d_name) + + sizeof "/sys/block/" "/queue/rotational"]; + const size_t sizeof_fnp = (sizeof fn) - sizeof "/sys/block"; + memcpy(fn, "/sys/block/", sizeof "/sys/block"); + char* fnp = &fn[sizeof "/sys/block"]; + + std::set<std::string> ssd_devices; + if (DIR* d = opendir("/sys/block")) { + while (struct dirent* e = readdir(d)) { + if (e->d_name[0] == '.') { + continue; + } + snprintf(fnp, sizeof_fnp, "%s/queue/rotational", + e->d_name); + int f = open(fn, O_RDONLY); + if (f == -1) { + continue; + } + char b[sizeof "4294967295:4294967295\n"]; + ssize_t l = read(f, b, sizeof b); + ::close(f); + if (l != 2 || memcmp("0\n", b, 2)) { + continue; + } + snprintf(fnp, sizeof_fnp, "%s/dev", e->d_name); + f = open(fn, O_RDONLY); + if (f == -1) { + continue; + } + l = read(f, b, sizeof b); + ::close(f); + if (l <= 0 || b[l - 1] != '\n') { + continue; + } + b[l - 1] = '\0'; + char* end = b; + unsigned long dev_major = strtoul(b, &end, 10); + if (b == end || *end != ':' + || dev_major != unsigned(dev_major)) { + continue; + } + char* c = end + 1; + unsigned long dev_minor = strtoul(c, &end, 10); + if (c == end || *end + || dev_minor != unsigned(dev_minor)) { + continue; + } + ssd.push_back(makedev(unsigned(dev_major), + unsigned(dev_minor))); + } + closedir(d); + } + /* fil_system_t::is_ssd() assumes the following */ + ut_ad(makedev(0, 8) == 8); + ut_ad(makedev(0, 4) == 4); + 
ut_ad(makedev(0, 2) == 2); + ut_ad(makedev(0, 1) == 1); +#endif } void fil_system_t::close() @@ -1919,13 +1848,19 @@ fil_write_flushed_lsn( const page_id_t page_id(TRX_SYS_SPACE, 0); - err = fil_read(page_id, univ_page_size, 0, srv_page_size, - buf); + err = fil_read(page_id, 0, 0, srv_page_size, buf); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, lsn); - err = fil_write(page_id, univ_page_size, 0, - srv_page_size, buf); + + ulint fsp_flags = mach_read_from_4( + buf + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS); + + if (fil_space_t::full_crc32(fsp_flags)) { + buf_flush_assign_full_crc32_checksum(buf); + } + + err = fil_write(page_id, 0, 0, srv_page_size, buf); fil_flush_file_spaces(FIL_TYPE_TABLESPACE); } @@ -1939,10 +1874,8 @@ for concurrency control. @param[in] id tablespace ID @param[in] silent whether to silently ignore missing tablespaces @return the tablespace -@retval NULL if missing or being deleted or truncated */ -UNIV_INTERN -fil_space_t* -fil_space_acquire_low(ulint id, bool silent) +@retval NULL if missing or being deleted */ +fil_space_t* fil_space_acquire_low(ulint id, bool silent) { fil_space_t* space; @@ -2042,7 +1975,7 @@ fil_op_write_log( ulint len; ut_ad(first_page_no == 0 || type == MLOG_FILE_CREATE2); - ut_ad(fsp_flags_is_valid(flags, space_id)); + ut_ad(fil_space_t::is_valid_flags(flags, space_id)); /* fil_name_parse() requires that there be at least one path separator and that the file path end with ".ibd". */ @@ -2269,9 +2202,7 @@ enum fil_operation_t { @param[in] space tablespace @param[in] count number of attempts so far @return 0 if no operations else count + 1. 
*/ -static -ulint -fil_check_pending_ops(const fil_space_t* space, ulint count) +static ulint fil_check_pending_ops(const fil_space_t* space, ulint count) { ut_ad(mutex_own(&fil_system.mutex)); @@ -2279,10 +2210,10 @@ fil_check_pending_ops(const fil_space_t* space, ulint count) return 0; } - if (ulint n_pending_ops = my_atomic_loadlint(&space->n_pending_ops)) { + if (ulint n_pending_ops = space->n_pending_ops) { if (count > 5000) { - ib::warn() << "Trying to close/delete/truncate" + ib::warn() << "Trying to delete" " tablespace '" << space->name << "' but there are " << n_pending_ops << " pending operations on it."; @@ -2329,7 +2260,7 @@ fil_check_pending_io( ut_a(!(*node)->being_extended); if (count > 1000) { - ib::warn() << "Trying to delete/close/truncate" + ib::warn() << "Trying to delete" " tablespace '" << space->name << "' but there are " << space->n_pending_flushes @@ -2959,6 +2890,9 @@ skip_second_rename: return(success); } +/* FIXME: remove this! */ +IF_WIN(, bool os_is_sparse_file_supported(os_file_t fh)); + /** Create a tablespace file. @param[in] space_id Tablespace ID @param[in] name Tablespace name in dbname/tablename format. @@ -2987,14 +2921,12 @@ fil_ibd_create( byte* page; bool success; bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0; - fil_space_t* space = NULL; - fil_space_crypt_t *crypt_data = NULL; ut_ad(!is_system_tablespace(space_id)); ut_ad(!srv_read_only_mode); ut_a(space_id < SRV_LOG_SPACE_FIRST_ID); ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); - ut_a(fsp_flags_is_valid(flags & ~FSP_FLAGS_MEM_MASK, space_id)); + ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, space_id)); /* Create the subdirectories in the path, if they are not there already. 
*/ @@ -3034,6 +2966,7 @@ fil_ibd_create( } const bool is_compressed = FSP_FLAGS_HAS_PAGE_COMPRESSION(flags); + bool punch_hole = is_compressed; #ifdef _WIN32 if (is_compressed) { @@ -3051,9 +2984,8 @@ err_exit: return NULL; } - bool punch_hole = os_is_sparse_file_supported(file); - - ulint block_size = os_file_get_block_size(file, path); + /* FIXME: remove this */ + IF_WIN(, punch_hole = punch_hole && os_is_sparse_file_supported(file)); /* We have to write the space id to the file immediately and flush the file to disk. This is because in crash recovery we must be aware what @@ -3070,16 +3002,21 @@ err_exit: memset(page, '\0', srv_page_size); - flags |= FSP_FLAGS_PAGE_SSIZE(); + if (fil_space_t::full_crc32(flags)) { + flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE(); + } else { + flags |= FSP_FLAGS_PAGE_SSIZE(); + } + fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); /* Create crypt data if the tablespace is either encrypted or user has requested it to remain unencrypted. */ - if (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF || - srv_encrypt_tables) { - crypt_data = fil_space_create_crypt_data(mode, key_id); - } + fil_space_crypt_t *crypt_data = (mode != FIL_ENCRYPTION_DEFAULT + || srv_encrypt_tables) + ? 
fil_space_create_crypt_data(mode, key_id) + : NULL; if (crypt_data) { /* Write crypt data information in page0 while creating @@ -3087,18 +3024,9 @@ err_exit: crypt_data->fill_page0(flags, page); } - const page_size_t page_size(flags); - IORequest request(IORequest::WRITE); - - if (!page_size.is_compressed()) { - - buf_flush_init_for_writing(NULL, page, NULL, 0); - - *err = os_file_write( - request, path, file, page, 0, page_size.physical()); - } else { + if (ulint zip_size = fil_space_t::zip_size(flags)) { page_zip_des_t page_zip; - page_zip_set_size(&page_zip, page_size.physical()); + page_zip_set_size(&page_zip, zip_size); page_zip.data = page + srv_page_size; #ifdef UNIV_DEBUG page_zip.m_start = @@ -3106,11 +3034,16 @@ err_exit: page_zip.m_end = page_zip.m_nonempty = page_zip.n_blobs = 0; - buf_flush_init_for_writing(NULL, page, &page_zip, 0); + buf_flush_init_for_writing(NULL, page, &page_zip, 0, false); *err = os_file_write( - request, path, file, page_zip.data, 0, - page_size.physical()); + IORequestWrite, path, file, page_zip.data, 0, zip_size); + } else { + buf_flush_init_for_writing(NULL, page, NULL, 0, + fil_space_t::full_crc32(flags)); + + *err = os_file_write( + IORequestWrite, path, file, page, 0, srv_page_size); } ut_free(buf2); @@ -3138,25 +3071,26 @@ err_exit: } } - space = fil_space_create(name, space_id, flags, FIL_TYPE_TABLESPACE, - crypt_data, mode); + fil_space_t* space = fil_space_create(name, space_id, flags, + FIL_TYPE_TABLESPACE, + crypt_data, mode); if (!space) { free(crypt_data); *err = DB_ERROR; } else { - fil_node_t* file = space->add(path, OS_FILE_CLOSED, size, + space->punch_hole = punch_hole; + /* FIXME: Keep the file open! 
*/ + fil_node_t* node = space->add(path, OS_FILE_CLOSED, size, false, true); mtr_t mtr; mtr.start(); fil_op_write_log( - MLOG_FILE_CREATE2, space_id, 0, file->name, + MLOG_FILE_CREATE2, space_id, 0, node->name, NULL, space->flags & ~FSP_FLAGS_MEM_MASK, &mtr); - fil_name_write(space, 0, file, &mtr); + fil_name_write(space, 0, node, &mtr); mtr.commit(); - file->block_size = block_size; - space->punch_hole = punch_hole; - + node->find_metadata(file); *err = DB_SUCCESS; } @@ -3248,11 +3182,12 @@ fil_ibd_open( ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; - ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + if (fix_dict) { + ut_d(dict_sys.assert_locked()); + ut_ad(!srv_read_only_mode); + ut_ad(srv_log_file_size != 0); + } - ut_ad(!fix_dict || mutex_own(&dict_sys->mutex)); - ut_ad(!fix_dict || !srv_read_only_mode); - ut_ad(!fix_dict || srv_log_file_size != 0); ut_ad(fil_type_is_data(purpose)); /* Table flags can be ULINT_UNDEFINED if @@ -3263,7 +3198,7 @@ corrupted: return NULL; } - ut_ad(fsp_flags_is_valid(flags & ~FSP_FLAGS_MEM_MASK, id)); + ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id)); df_default.init(tablename.m_name, flags); df_dict.init(tablename.m_name, flags); df_remote.init(tablename.m_name, flags); @@ -3517,7 +3452,8 @@ skip_validate: df_remote.get_first_page(); fil_space_crypt_t* crypt_data = first_page - ? fil_space_read_crypt_data(page_size_t(flags), first_page) + ? fil_space_read_crypt_data(fil_space_t::zip_size(flags), + first_page) : NULL; fil_space_t* space = fil_space_create( @@ -3866,7 +3802,8 @@ fil_ibd_load( const byte* first_page = file.get_first_page(); fil_space_crypt_t* crypt_data = first_page - ? fil_space_read_crypt_data(page_size_t(flags), first_page) + ? 
fil_space_read_crypt_data(fil_space_t::zip_size(flags), + first_page) : NULL; space = fil_space_create( file.name(), space_id, flags, FIL_TYPE_TABLESPACE, crypt_data); @@ -3929,7 +3866,10 @@ fil_file_readdir_next_file( void fsp_flags_try_adjust(fil_space_t* space, ulint flags) { ut_ad(!srv_read_only_mode); - ut_ad(fsp_flags_is_valid(flags, space->id)); + ut_ad(fil_space_t::is_valid_flags(flags, space->id)); + if (space->full_crc32() || fil_space_t::full_crc32(flags)) { + return; + } if (!space->size && (space->purpose != FIL_TYPE_TABLESPACE || !fil_space_get_size(space->id))) { return; @@ -3940,9 +3880,15 @@ void fsp_flags_try_adjust(fil_space_t* space, ulint flags) mtr_t mtr; mtr.start(); if (buf_block_t* b = buf_page_get( - page_id_t(space->id, 0), page_size_t(flags), + page_id_t(space->id, 0), space->zip_size(), RW_X_LATCH, &mtr)) { ulint f = fsp_header_get_flags(b->frame); + if (fil_space_t::full_crc32(f)) { + goto func_exit; + } + if (fil_space_t::is_flags_equal(f, flags)) { + goto func_exit; + } /* Suppress the message if only the DATA_DIR flag to differs. 
*/ if ((f ^ flags) & ~(1U << FSP_FLAGS_POS_RESERVED)) { ib::warn() @@ -3951,13 +3897,11 @@ void fsp_flags_try_adjust(fil_space_t* space, ulint flags) << "' from " << ib::hex(f) << " to " << ib::hex(flags); } - if (f != flags) { - mtr.set_named_space(space); - mlog_write_ulint(FSP_HEADER_OFFSET - + FSP_SPACE_FLAGS + b->frame, - flags, MLOG_4BYTES, &mtr); - } + mtr.set_named_space(space); + mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + + b->frame, flags, MLOG_4BYTES, &mtr); } +func_exit: mtr.commit(); } @@ -3979,7 +3923,11 @@ fil_space_for_table_exists_in_mem( mutex_enter(&fil_system.mutex); if (fil_space_t* space = fil_space_get_by_id(id)) { - if ((space->flags ^ expected_flags) & ~FSP_FLAGS_MEM_MASK) { + ulint tf = expected_flags & ~FSP_FLAGS_MEM_MASK; + ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK; + + if (!fil_space_t::is_flags_equal(tf, sf) + && !fil_space_t::is_flags_equal(sf, tf)) { goto func_exit; } @@ -3996,7 +3944,8 @@ fil_space_for_table_exists_in_mem( /* Adjust the flags that are in FSP_FLAGS_MEM_MASK. FSP_SPACE_FLAGS will not be written back here. */ - space->flags = expected_flags; + space->flags = (space->flags & ~FSP_FLAGS_MEM_MASK) + | (expected_flags & FSP_FLAGS_MEM_MASK); mutex_exit(&fil_system.mutex); if (!srv_read_only_mode) { fsp_flags_try_adjust(space, expected_flags @@ -4123,12 +4072,21 @@ fil_report_invalid_page_access( : ""); } +inline void IORequest::set_fil_node(fil_node_t* node) +{ + if (!node->space->punch_hole) { + clear_punch_hole(); + } + + m_fil_node = node; +} + /** Reads or writes data. This operation could be asynchronous (aio). 
@param[in,out] type IO context @param[in] sync true if synchronous aio is desired @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] byte_offset remainder of offset in bytes; in aio this must be divisible by the OS block size @param[in] len how many bytes to read or write; this must @@ -4140,14 +4098,14 @@ fil_report_invalid_page_access( @param[in] message message for aio handler if non-sync aio used, else ignored @param[in] ignore_missing_space true=ignore missing space duging read -@return DB_SUCCESS, DB_TABLESPACE_DELETED or DB_TABLESPACE_TRUNCATED +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ dberr_t fil_io( const IORequest& type, bool sync, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint byte_offset, ulint len, void* buf, @@ -4161,7 +4119,7 @@ fil_io( ut_ad(len > 0); ut_ad(byte_offset < srv_page_size); - ut_ad(!page_size.is_compressed() || byte_offset == 0); + ut_ad(!zip_size || byte_offset == 0); ut_ad(srv_page_size == 1UL << srv_page_size_shift); compile_time_assert((1U << UNIV_PAGE_SIZE_SHIFT_MAX) == UNIV_PAGE_SIZE_MAX); @@ -4172,7 +4130,7 @@ fil_io( /* ibuf bitmap pages must be read in the sync AIO mode: */ ut_ad(recv_no_ibuf_operations || req_type.is_write() - || !ibuf_bitmap_page(page_id, page_size) + || !ibuf_bitmap_page(page_id, zip_size) || sync || req_type.is_log()); @@ -4188,7 +4146,7 @@ fil_io( } else if (req_type.is_read() && !recv_no_ibuf_operations - && ibuf_page(page_id, page_size, NULL)) { + && ibuf_page(page_id, zip_size, NULL)) { mode = OS_AIO_IBUF; @@ -4272,19 +4230,6 @@ fil_io( break; } else { - if (space->id != TRX_SYS_SPACE - && UT_LIST_GET_LEN(space->chain) == 1 - && (srv_is_tablespace_truncated(space->id) - || srv_was_tablespace_truncated(space)) - && req_type.is_read()) { - - /* Handle page which is outside the truncated - tablespace bounds when recovering 
from a crash - happened during a truncation */ - mutex_exit(&fil_system.mutex); - return(DB_TABLESPACE_TRUNCATED); - } - cur_page_no -= node->size; node = UT_LIST_GET_NEXT(chain, node); @@ -4343,37 +4288,10 @@ fil_io( /* Now we have made the changes in the data structures of fil_system */ mutex_exit(&fil_system.mutex); - /* Calculate the low 32 bits and the high 32 bits of the file offset */ + if (!zip_size) zip_size = srv_page_size; - if (!page_size.is_compressed()) { - - offset = ((os_offset_t) cur_page_no - << srv_page_size_shift) + byte_offset; - - ut_a(node->size - cur_page_no - >= ((byte_offset + len + (srv_page_size - 1)) - >> srv_page_size_shift)); - } else { - ulint size_shift; - - switch (page_size.physical()) { - case 1024: size_shift = 10; break; - case 2048: size_shift = 11; break; - case 4096: size_shift = 12; break; - case 8192: size_shift = 13; break; - case 16384: size_shift = 14; break; - case 32768: size_shift = 15; break; - case 65536: size_shift = 16; break; - default: ut_error; - } - - offset = ((os_offset_t) cur_page_no << size_shift) - + byte_offset; - - ut_a(node->size - cur_page_no - >= (len + (page_size.physical() - 1)) - / page_size.physical()); - } + offset = os_offset_t(cur_page_no) * zip_size + byte_offset; + ut_ad(node->size - cur_page_no >= (len + (zip_size - 1)) / zip_size); /* Do AIO */ @@ -4387,7 +4305,7 @@ fil_io( ut_ad(!req_type.is_write() || page_id.space() == SRV_LOG_SPACE_FIRST_ID || !fil_is_user_tablespace_id(page_id.space()) - || offset == page_id.page_no() * page_size.physical()); + || offset == page_id.page_no() * zip_size); /* Queue the aio request */ dberr_t err = os_aio( @@ -4507,7 +4425,7 @@ fil_aio_wait( ut_ad(type.is_read()); if (recv_recovery_is_on() && !srv_force_recovery) { - recv_sys->found_corrupt_fs = true; + recv_sys.found_corrupt_fs = true; } if (fil_space_t* space = fil_space_acquire_for_io(space_id)) { @@ -4855,7 +4773,7 @@ fil_space_validate_for_mtr_commit( /* We are serving mtr_commit(). 
While there is an active mini-transaction, we should have !space->stop_new_ops. This is guaranteed by meta-data locks or transactional locks, or - dict_operation_lock (X-lock in DROP, S-lock in purge). + dict_sys.latch (X-lock in DROP, S-lock in purge). However, a file I/O thread can invoke change buffer merge while fil_check_pending_operations() is waiting for operations @@ -5004,116 +4922,6 @@ fil_names_clear( return(do_write); } -/** Truncate a single-table tablespace. The tablespace must be cached -in the memory cache. -@param space_id space id -@param dir_path directory path -@param tablename the table name in the usual - databasename/tablename format of InnoDB -@param flags tablespace flags -@param trunc_to_default truncate to default size if tablespace - is being newly re-initialized. -@return DB_SUCCESS or error */ -dberr_t -truncate_t::truncate( -/*=================*/ - ulint space_id, - const char* dir_path, - const char* tablename, - ulint flags, - bool trunc_to_default) -{ - dberr_t err = DB_SUCCESS; - char* path; - - ut_a(!is_system_tablespace(space_id)); - - if (FSP_FLAGS_HAS_DATA_DIR(flags)) { - ut_ad(dir_path != NULL); - path = fil_make_filepath(dir_path, tablename, IBD, true); - } else { - path = fil_make_filepath(NULL, tablename, IBD, false); - } - - if (path == NULL) { - return(DB_OUT_OF_MEMORY); - } - - mutex_enter(&fil_system.mutex); - - fil_space_t* space = fil_space_get_by_id(space_id); - - /* The following code must change when InnoDB supports - multiple datafiles per tablespace. 
*/ - ut_a(UT_LIST_GET_LEN(space->chain) == 1); - - fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - - if (trunc_to_default) { - space->size = node->size = FIL_IBD_FILE_INITIAL_SIZE; - } - - const bool already_open = node->is_open(); - - if (!already_open) { - - bool ret; - - node->handle = os_file_create_simple_no_error_handling( - innodb_data_file_key, path, OS_FILE_OPEN, - OS_FILE_READ_WRITE, - space->purpose != FIL_TYPE_TEMPORARY - && srv_read_only_mode, &ret); - - if (!ret) { - ib::error() << "Failed to open tablespace file " - << path << "."; - - ut_free(path); - - return(DB_ERROR); - } - - ut_a(node->is_open()); - } - - os_offset_t trunc_size = trunc_to_default - ? FIL_IBD_FILE_INITIAL_SIZE - : space->size; - - const bool success = os_file_truncate( - path, node->handle, trunc_size << srv_page_size_shift); - - if (!success) { - ib::error() << "Cannot truncate file " << path - << " in TRUNCATE TABLESPACE."; - err = DB_ERROR; - } - - space->stop_new_ops = false; - - /* If we opened the file in this function, close it. */ - if (!already_open) { - bool closed = os_file_close(node->handle); - - if (!closed) { - - ib::error() << "Failed to close tablespace file " - << path << "."; - - err = DB_ERROR; - } else { - node->handle = OS_FILE_CLOSED; - } - } - - mutex_exit(&fil_system.mutex); - - ut_free(path); - - return(err); -} - /* Unit Tests */ #ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH #define MF fil_make_filepath @@ -5324,26 +5132,3 @@ fil_space_found_by_id( mutex_exit(&fil_system.mutex); return space; } - -/** -Get should we punch hole to tablespace. -@param[in] node File node -@return true, if punch hole should be tried, false if not. */ -bool -fil_node_should_punch_hole( - const fil_node_t* node) -{ - return (node->space->punch_hole); -} - -/** -Set punch hole to tablespace to given value. -@param[in] node File node -@param[in] val value to be set. 
*/ -void -fil_space_set_punch_hole( - fil_node_t* node, - bool val) -{ - node->space->punch_hole = val; -} diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 7ded1a226f3..b3390a4cd12 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (C) 2013, 2018, MariaDB Corporation. +Copyright (C) 2013, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -72,45 +72,24 @@ Updated 14/02/2015 #include "snappy-c.h" #endif -/** Compress a page_compressed page before writing to a data file. +/** Compress a page for the given compression algorithm. @param[in] buf page to be compressed @param[out] out_buf compressed page -@param[in] level compression level -@param[in] block_size file system block size -@param[in] encrypted whether the page will be subsequently encrypted -@return actual length of compressed page -@retval 0 if the page was not compressed */ -ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, - ulint block_size, bool encrypted) +@param[in] header_len header length of the page +@param[in] comp_algo compression algorithm +@param[in] comp_level compression level +@return actual length of compressed page data +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_low( + const byte* buf, + byte* out_buf, + ulint header_len, + ulint comp_algo, + ulint comp_level) { - int comp_level = int(level); - ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; - /* Cache to avoid change during function execution */ - ulint comp_method = innodb_compression_algorithm; - - if (encrypted) { - header_len += FIL_PAGE_COMPRESSION_METHOD_SIZE; - } - - /* Let's not compress file space header or - extent descriptor 
*/ - switch (fil_page_get_type(buf)) { - case 0: - case FIL_PAGE_TYPE_FSP_HDR: - case FIL_PAGE_TYPE_XDES: - case FIL_PAGE_PAGE_COMPRESSED: - return 0; - } - - /* If no compression level was provided to this table, use system - default level */ - if (comp_level == 0) { - comp_level = int(page_zip_level); - } - ulint write_size = srv_page_size - header_len; - switch (comp_method) { + switch (comp_algo) { default: ut_ad(!"unknown compression method"); /* fall through */ @@ -120,10 +99,9 @@ ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, { ulong len = uLong(write_size); if (Z_OK == compress2( - out_buf + header_len, &len, - buf, uLong(srv_page_size), comp_level)) { - write_size = len; - goto success; + out_buf + header_len, &len, buf, + uLong(srv_page_size), int(comp_level))) { + return len; } } break; @@ -141,10 +119,7 @@ ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, int(srv_page_size), int(write_size)); # endif - if (write_size) { - goto success; - } - break; + return write_size; #endif /* HAVE_LZ4 */ #ifdef HAVE_LZO case PAGE_LZO_ALGORITHM: { @@ -155,8 +130,7 @@ ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, out_buf + header_len, &len, out_buf + srv_page_size) && len <= write_size) { - write_size = len; - goto success; + return len; } break; } @@ -170,8 +144,7 @@ ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, buf, srv_page_size, out_buf + header_len, &out_pos, write_size) && out_pos <= write_size) { - write_size = out_pos; - goto success; + return out_pos; } break; } @@ -187,8 +160,7 @@ ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, reinterpret_cast<const char*>(buf)), unsigned(srv_page_size), 1, 0, 0) && len <= write_size) { - write_size = len; - goto success; + return len; } break; } @@ -204,53 +176,188 @@ ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, reinterpret_cast<char*>(out_buf) + header_len, &len) && len <= write_size) { - 
write_size = len; - goto success; + return len; } break; } #endif /* HAVE_SNAPPY */ } - srv_stats.pages_page_compression_error.inc(); return 0; -success: +} + +/** Compress a page_compressed page for full crc32 format. +@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@return actual length of compressed page +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_for_full_crc32( + const byte* buf, + byte* out_buf, + ulint flags, + ulint block_size, + bool encrypted) +{ + ulint comp_level = fsp_flags_get_page_compression_level(flags); + + if (comp_level == 0) { + comp_level = page_zip_level; + } + + const ulint header_len = FIL_PAGE_COMP_ALGO; + + ulint write_size = fil_page_compress_low( + buf, out_buf, header_len, + fil_space_t::get_compression_algo(flags), comp_level); + + if (write_size == 0) { +fail: + srv_stats.pages_page_compression_error.inc(); + return 0; + } + + write_size += header_len; + const ulint actual_size = write_size; + /* Write the actual length of the data & page type + for full crc32 format. */ + const bool lsb = fil_space_t::full_crc32_page_compressed_len(flags); + /* In the MSB, store the rounded-up page size. 
*/ + write_size = (write_size + lsb + (4 + 255)) & ~255; + if (write_size >= srv_page_size) { + goto fail; + } + + /* Set up the page header */ + memcpy(out_buf, buf, header_len); + out_buf[FIL_PAGE_TYPE] = 1U << (FIL_PAGE_COMPRESS_FCRC32_MARKER - 8); + out_buf[FIL_PAGE_TYPE + 1] = byte(write_size >> 8); + /* Clean up the buffer for the remaining write_size (except checksum) */ + memset(out_buf + actual_size, 0, write_size - actual_size - 4); + if (lsb) { + /* Store the LSB */ + out_buf[write_size - 5] = byte(actual_size + (1 + 4)); + } + + if (!block_size) { + block_size = 512; + } + + ut_ad(write_size); + if (write_size & (block_size - 1)) { + size_t tmp = write_size; + write_size = (write_size + (block_size - 1)) + & ~(block_size - 1); + memset(out_buf + tmp, 0, write_size - tmp); + } + +#ifdef UNIV_DEBUG + /* Verify that page can be decompressed */ + { + page_t tmp_buf[UNIV_PAGE_SIZE_MAX]; + page_t page[UNIV_PAGE_SIZE_MAX]; + memcpy(page, out_buf, write_size); + ut_ad(fil_page_decompress(tmp_buf, page, flags)); + } +#endif + srv_stats.page_compression_saved.add(srv_page_size - write_size); + srv_stats.pages_page_compressed.inc(); + + return write_size; +} + +/** Compress a page_compressed page for non full crc32 format. 
+@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_for_non_full_crc32( + const byte* buf, + byte* out_buf, + ulint flags, + ulint block_size, + bool encrypted) +{ + int comp_level = int(fsp_flags_get_page_compression_level(flags)); + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN; + /* Cache to avoid change during function execution */ + ulint comp_algo = innodb_compression_algorithm; + + if (encrypted) { + header_len += FIL_PAGE_ENCRYPT_COMP_ALGO; + } + + /* If no compression level was provided to this table, use system + default level */ + if (comp_level == 0) { + comp_level = int(page_zip_level); + } + + ulint write_size = fil_page_compress_low( + buf, out_buf, + header_len, comp_algo, comp_level); + + if (write_size == 0) { + srv_stats.pages_page_compression_error.inc(); + return 0; + } + /* Set up the page header */ memcpy(out_buf, buf, FIL_PAGE_DATA); /* Set up the checksum */ - mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + mach_write_to_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); /* Set up the compression algorithm */ - mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, comp_method); + mach_write_to_8(out_buf + FIL_PAGE_COMP_ALGO, comp_algo); if (encrypted) { /* Set up the correct page type */ - mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED); - mach_write_to_2(out_buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, comp_method); + mach_write_to_2(out_buf + FIL_PAGE_TYPE, + FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED); + + mach_write_to_2(out_buf + FIL_PAGE_DATA + + FIL_PAGE_ENCRYPT_COMP_ALGO, comp_algo); } else { /* Set up the correct page type */ - 
mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + mach_write_to_2(out_buf + FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); } /* Set up the actual payload lenght */ - mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + mach_write_to_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE, + write_size); #ifdef UNIV_DEBUG /* Verify */ - ut_ad(fil_page_is_compressed(out_buf) || fil_page_is_compressed_encrypted(out_buf)); - ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); - ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); - ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) == (ulint)comp_method || - mach_read_from_2(out_buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE) == (ulint)comp_method); + ut_ad(fil_page_is_compressed(out_buf) + || fil_page_is_compressed_encrypted(out_buf)); + + ut_ad(mach_read_from_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM) + == BUF_NO_CHECKSUM_MAGIC); + + ut_ad(mach_read_from_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE) + == write_size); + + bool is_compressed = (mach_read_from_8(out_buf + FIL_PAGE_COMP_ALGO) + == (ulint) comp_algo); + + bool is_encrypted_compressed = + (mach_read_from_2(out_buf + FIL_PAGE_DATA + + FIL_PAGE_ENCRYPT_COMP_ALGO) + == (ulint) comp_algo); + + ut_ad(is_compressed || is_encrypted_compressed); /* Verify that page can be decompressed */ { page_t tmp_buf[UNIV_PAGE_SIZE_MAX]; page_t page[UNIV_PAGE_SIZE_MAX]; memcpy(page, out_buf, srv_page_size); - ut_ad(fil_page_decompress(tmp_buf, page)); - ut_ad(!buf_page_is_corrupted(false, page, univ_page_size, - NULL)); + ut_ad(fil_page_decompress(tmp_buf, page, flags)); + ut_ad(!buf_page_is_corrupted(false, page, flags)); } #endif /* UNIV_DEBUG */ @@ -265,7 +372,8 @@ success: /* Actual write needs to be alligned on block size */ if (write_size % block_size) { size_t tmp = write_size; - write_size = (size_t)ut_uint64_align_up((ib_uint64_t)write_size, block_size); + write_size = (size_t)ut_uint64_align_up( + 
(ib_uint64_t)write_size, block_size); /* Clean up the end of buffer */ memset(out_buf+tmp, 0, write_size - tmp); #ifdef UNIV_DEBUG @@ -280,131 +388,245 @@ success: return write_size; } -/** Decompress a page that may be subject to page_compressed compression. -@param[in,out] tmp_buf temporary buffer (of innodb_page_size) -@param[in,out] buf possibly compressed page buffer -@return size of the compressed data -@retval 0 if decompression failed -@retval srv_page_size if the page was not compressed */ -ulint fil_page_decompress(byte* tmp_buf, byte* buf) +/** Compress a page_compressed page before writing to a data file. +@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +ulint fil_page_compress( + const byte* buf, + byte* out_buf, + ulint flags, + ulint block_size, + bool encrypted) { - const unsigned ptype = mach_read_from_2(buf+FIL_PAGE_TYPE); - ulint header_len; - uint64_t compression_alg; - switch (ptype) { - case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: - header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE - + FIL_PAGE_COMPRESSION_METHOD_SIZE; - compression_alg = mach_read_from_2( - FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE + buf); - break; - case FIL_PAGE_PAGE_COMPRESSED: - header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; - compression_alg = mach_read_from_8( - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + buf); - break; - default: - return srv_page_size; - } + /* The full_crc32 page_compressed format assumes this. 
*/ + ut_ad(!(block_size & 255)); + ut_ad(ut_is_2pow(block_size)); - if (mach_read_from_4(buf + FIL_PAGE_SPACE_OR_CHKSUM) - != BUF_NO_CHECKSUM_MAGIC) { + /* Let's not compress file space header or + extent descriptor */ + switch (fil_page_get_type(buf)) { + case 0: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_PAGE_COMPRESSED: return 0; } - ulint actual_size = mach_read_from_2(buf + FIL_PAGE_DATA); - - /* Check if payload size is corrupted */ - if (actual_size == 0 || actual_size > srv_page_size - header_len) { - return 0; + if (fil_space_t::full_crc32(flags)) { + return fil_page_compress_for_full_crc32( + buf, out_buf, flags, block_size, encrypted); } - switch (compression_alg) { + return fil_page_compress_for_non_full_crc32( + buf, out_buf, flags, block_size, encrypted); +} + +/** Decompress a page that may be subject to page_compressed compression. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@param[in] comp_algo compression algorithm +@param[in] header_len header length of the page +@param[in] actual size actual size of the page +@retval true if the page is decompressed or false */ +static bool fil_page_decompress_low( + byte* tmp_buf, + byte* buf, + ulint comp_algo, + ulint header_len, + ulint actual_size) +{ + switch (comp_algo) { default: ib::error() << "Unknown compression algorithm " - << compression_alg; - return 0; + << comp_algo; + return false; case PAGE_ZLIB_ALGORITHM: { uLong len = srv_page_size; - if (Z_OK == uncompress(tmp_buf, &len, + return (Z_OK == uncompress(tmp_buf, &len, buf + header_len, uLong(actual_size)) - && len == srv_page_size) { - break; - } + && len == srv_page_size); } - return 0; #ifdef HAVE_LZ4 case PAGE_LZ4_ALGORITHM: - if (LZ4_decompress_safe(reinterpret_cast<const char*>(buf) - + header_len, - reinterpret_cast<char*>(tmp_buf), - actual_size, srv_page_size) - == int(srv_page_size)) { - break; - } - return 0; + return 
LZ4_decompress_safe( + reinterpret_cast<const char*>(buf) + header_len, + reinterpret_cast<char*>(tmp_buf), + actual_size, srv_page_size) == int(srv_page_size); #endif /* HAVE_LZ4 */ #ifdef HAVE_LZO - case PAGE_LZO_ALGORITHM: { - lzo_uint len_lzo = srv_page_size; - if (LZO_E_OK == lzo1x_decompress_safe( - buf + header_len, - actual_size, tmp_buf, &len_lzo, NULL) - && len_lzo == srv_page_size) { - break; + case PAGE_LZO_ALGORITHM: + { + lzo_uint len_lzo = srv_page_size; + return (LZO_E_OK == lzo1x_decompress_safe( + buf + header_len, + actual_size, tmp_buf, &len_lzo, NULL) + && len_lzo == srv_page_size); } - return 0; - } #endif /* HAVE_LZO */ #ifdef HAVE_LZMA - case PAGE_LZMA_ALGORITHM: { - size_t src_pos = 0; - size_t dst_pos = 0; - uint64_t memlimit = UINT64_MAX; - - if (LZMA_OK == lzma_stream_buffer_decode( - &memlimit, 0, NULL, buf + header_len, - &src_pos, actual_size, tmp_buf, &dst_pos, - srv_page_size) - && dst_pos == srv_page_size) { - break; + case PAGE_LZMA_ALGORITHM: + { + size_t src_pos = 0; + size_t dst_pos = 0; + uint64_t memlimit = UINT64_MAX; + + return LZMA_OK == lzma_stream_buffer_decode( + &memlimit, 0, NULL, buf + header_len, + &src_pos, actual_size, tmp_buf, &dst_pos, + srv_page_size) + && dst_pos == srv_page_size; } - return 0; - } #endif /* HAVE_LZMA */ #ifdef HAVE_BZIP2 - case PAGE_BZIP2_ALGORITHM: { - unsigned int dst_pos = srv_page_size; - if (BZ_OK == BZ2_bzBuffToBuffDecompress( - reinterpret_cast<char*>(tmp_buf), - &dst_pos, - reinterpret_cast<char*>(buf) + header_len, - actual_size, 1, 0) - && dst_pos == srv_page_size) { - break; + case PAGE_BZIP2_ALGORITHM: + { + unsigned int dst_pos = srv_page_size; + return BZ_OK == BZ2_bzBuffToBuffDecompress( + reinterpret_cast<char*>(tmp_buf), + &dst_pos, + reinterpret_cast<char*>(buf) + header_len, + actual_size, 1, 0) + && dst_pos == srv_page_size; } - return 0; - } #endif /* HAVE_BZIP2 */ #ifdef HAVE_SNAPPY - case PAGE_SNAPPY_ALGORITHM: { - size_t olen = srv_page_size; - - if (SNAPPY_OK == 
snappy_uncompress( - reinterpret_cast<const char*>(buf) + header_len, - actual_size, - reinterpret_cast<char*>(tmp_buf), &olen) - && olen == srv_page_size) { - break; + case PAGE_SNAPPY_ALGORITHM: + { + size_t olen = srv_page_size; + + return SNAPPY_OK == snappy_uncompress( + reinterpret_cast<const char*>(buf) + + header_len, + actual_size, + reinterpret_cast<char*>(tmp_buf), &olen) + && olen == srv_page_size; + } +#endif /* HAVE_SNAPPY */ + } + + return false; +} + +/** Decompress a page for full crc32 format. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@param[in] flags tablespace flags +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +ulint fil_page_decompress_for_full_crc32(byte* tmp_buf, byte* buf, ulint flags) +{ + ut_ad(fil_space_t::full_crc32(flags)); + bool compressed = false; + size_t size = buf_page_full_crc32_size(buf, &compressed, NULL); + if (!compressed) { + ut_ad(size == srv_page_size); + return size; + } + + if (!fil_space_t::is_compressed(flags)) { + return 0; + } + + if (size >= srv_page_size) { + return 0; + } + + if (fil_space_t::full_crc32_page_compressed_len(flags)) { + compile_time_assert(FIL_PAGE_FCRC32_CHECKSUM == 4); + if (size_t lsb = buf[size - 5]) { + size += lsb - 0x100; } + size -= 5; + } + + const size_t header_len = FIL_PAGE_COMP_ALGO; + + if (!fil_page_decompress_low(tmp_buf, buf, + fil_space_t::get_compression_algo(flags), + header_len, size - header_len)) { return 0; } -#endif /* HAVE_SNAPPY */ + + srv_stats.pages_page_decompressed.inc(); + memcpy(buf, tmp_buf, srv_page_size); + return size; +} + +/** Decompress a page for non full crc32 format. 
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +ulint fil_page_decompress_for_non_full_crc32( + byte* tmp_buf, + byte* buf) +{ + const unsigned ptype = mach_read_from_2(buf+FIL_PAGE_TYPE); + ulint header_len; + uint comp_algo; + switch (ptype) { + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + header_len= FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN; + comp_algo = mach_read_from_2( + FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_ALGO + buf); + break; + case FIL_PAGE_PAGE_COMPRESSED: + header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN; + if (mach_read_from_6(FIL_PAGE_COMP_ALGO + buf)) { + return 0; + } + comp_algo = mach_read_from_2(FIL_PAGE_COMP_ALGO + 6 + buf); + break; + default: + return srv_page_size; + } + + if (mach_read_from_4(buf + FIL_PAGE_SPACE_OR_CHKSUM) + != BUF_NO_CHECKSUM_MAGIC) { + return 0; + } + + ulint actual_size = mach_read_from_2(buf + FIL_PAGE_DATA + + FIL_PAGE_COMP_SIZE); + + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > srv_page_size - header_len) { + return 0; + } + + if (!fil_page_decompress_low(tmp_buf, buf, comp_algo, header_len, + actual_size)) { + return 0; } srv_stats.pages_page_decompressed.inc(); memcpy(buf, tmp_buf, srv_page_size); return actual_size; } + +/** Decompress a page that may be subject to page_compressed compression. 
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +ulint fil_page_decompress( + byte* tmp_buf, + byte* buf, + ulint flags) +{ + if (fil_space_t::full_crc32(flags)) { + return fil_page_decompress_for_full_crc32(tmp_buf, buf, flags); + } + + return fil_page_decompress_for_non_full_crc32(tmp_buf, buf); +} diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc index 078621097b3..653b74c73be 100644 --- a/storage/innobase/fsp/fsp0file.cc +++ b/storage/innobase/fsp/fsp0file.cc @@ -343,7 +343,7 @@ Datafile::read_first_page(bool read_only_mode) if (m_order == 0) { m_space_id = fsp_header_get_space_id(m_first_page); m_flags = fsp_header_get_flags(m_first_page); - if (!fsp_flags_is_valid(m_flags, m_space_id)) { + if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) { ulint cflags = fsp_flags_convert_from_101(m_flags); if (cflags == ULINT_UNDEFINED) { ib::error() @@ -356,8 +356,9 @@ Datafile::read_first_page(bool read_only_mode) } } - const page_size_t ps(m_flags); - if (ps.physical() > page_size) { + const size_t physical_size = fil_space_t::physical_size(m_flags); + + if (physical_size > page_size) { ib::error() << "File " << m_filepath << " should be longer than " << page_size << " bytes"; @@ -407,7 +408,9 @@ Datafile::validate_to_dd(ulint space_id, ulint flags) /* Make sure the datafile we found matched the space ID. If the datafile is a file-per-table tablespace then also match the row format and zip page size. */ - if (m_space_id == space_id && m_flags == flags) { + if (m_space_id == space_id + && (fil_space_t::is_flags_equal(flags, m_flags) + || fil_space_t::is_flags_equal(m_flags, flags))) { /* Datafile matches the tablespace expected. 
*/ return(DB_SUCCESS); } @@ -537,19 +540,19 @@ err_exit: } } - if (!fsp_flags_is_valid(m_flags, m_space_id)) { + if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) { /* Tablespace flags must be valid. */ error_txt = "Tablespace flags are invalid"; goto err_exit; } - const page_size_t page_size(m_flags); + ulint logical_size = fil_space_t::logical_size(m_flags); - if (srv_page_size != page_size.logical()) { + if (srv_page_size != logical_size) { /* Logical size must be innodb_page_size. */ ib::error() << "Data file '" << m_filepath << "' uses page size " - << page_size.logical() << ", but the innodb_page_size" + << logical_size << ", but the innodb_page_size" " start-up parameter is " << srv_page_size; free_first_page(); @@ -567,7 +570,7 @@ err_exit: goto err_exit; } - if (buf_page_is_corrupted(false, m_first_page, page_size)) { + if (buf_page_is_corrupted(false, m_first_page, m_flags)) { /* Look for checksum and other corruptions. */ error_txt = "Checksum mismatch"; goto err_exit; @@ -629,7 +632,6 @@ Datafile::find_space_id() for (ulint page_size = UNIV_ZIP_SIZE_MIN; page_size <= UNIV_PAGE_SIZE_MAX; page_size <<= 1) { - /* map[space_id] = count of pages */ typedef std::map< ulint, @@ -657,6 +659,20 @@ Datafile::find_space_id() byte* page = static_cast<byte*>( ut_align(buf, UNIV_SECTOR_SIZE)); + ulint fsp_flags; + /* provide dummy value if the first os_file_read() fails */ + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE() + | innodb_compression_algorithm + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + break; + default: + fsp_flags = 0; + } + for (ulint j = 0; j < page_count; ++j) { dberr_t err; @@ -674,33 +690,27 @@ Datafile::find_space_id() continue; } + if (j == 0) { + fsp_flags = mach_read_from_4( + page + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS); + } + bool noncompressed_ok = false; /* For noncompressed 
pages, the page size must be equal to srv_page_size. */ - if (page_size == srv_page_size) { + if (page_size == srv_page_size + && !fil_space_t::zip_size(fsp_flags)) { noncompressed_ok = !buf_page_is_corrupted( - false, page, univ_page_size, NULL); + false, page, fsp_flags); } bool compressed_ok = false; - /* file-per-table tablespaces can be compressed with - the same physical and logical page size. General - tablespaces must have different physical and logical - page sizes in order to be compressed. For this check, - assume the page is compressed if univ_page_size. - logical() is equal to or less than 16k and the - page_size we are checking is equal to or less than - srv_page_size. */ if (srv_page_size <= UNIV_PAGE_SIZE_DEF - && page_size <= srv_page_size) { - const page_size_t compr_page_size( - page_size, srv_page_size, - true); - + && page_size == fil_space_t::zip_size(fsp_flags)) { compressed_ok = !buf_page_is_corrupted( - false, page, compr_page_size, NULL); + false, page, fsp_flags); } if (noncompressed_ok || compressed_ok) { @@ -768,7 +778,7 @@ Datafile::restore_from_doublewrite() } /* Find if double write buffer contains page_no of given space id. */ - const byte* page = recv_sys->dblwr.find_page(m_space_id, 0); + const byte* page = recv_sys.dblwr.find_page(m_space_id, 0); const page_id_t page_id(m_space_id, 0); if (page == NULL) { @@ -787,7 +797,7 @@ Datafile::restore_from_doublewrite() ulint flags = mach_read_from_4( FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); - if (!fsp_flags_is_valid(flags, m_space_id)) { + if (!fil_space_t::is_valid_flags(flags, m_space_id)) { ulint cflags = fsp_flags_convert_from_101(flags); if (cflags == ULINT_UNDEFINED) { ib::warn() @@ -800,21 +810,21 @@ Datafile::restore_from_doublewrite() /* The flags on the page should be converted later. 
*/ } - const page_size_t page_size(flags); + ulint physical_size = fil_space_t::physical_size(flags); ut_a(page_get_page_no(page) == page_id.page_no()); ib::info() << "Restoring page " << page_id << " of datafile '" << m_filepath << "' from the doublewrite buffer. Writing " - << page_size.physical() << " bytes into file '" + << physical_size << " bytes into file '" << m_filepath << "'"; IORequest request(IORequest::WRITE); return(os_file_write( request, - m_filepath, m_handle, page, 0, page_size.physical()) + m_filepath, m_handle, page, 0, physical_size) != DB_SUCCESS); } @@ -921,8 +931,9 @@ RemoteDatafile::create_link_file( prev_filepath = read_link_file(link_filepath); if (prev_filepath) { - /* Truncate will call this with an existing - link file which contains the same filepath. */ + /* Truncate (starting with MySQL 5.6, probably no + longer since MariaDB Server 10.2.19) used to call this + with an existing link file which contains the same filepath. */ bool same = !strcmp(prev_filepath, filepath); ut_free(prev_filepath); if (same) { diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 2c7ffab11b9..b59f9417490 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -50,7 +50,6 @@ typedef ulint page_no_t; /** Return an extent to the free list of a space. @param[in,out] space tablespace @param[in] offset page number in the extent -@param[in] page_size page size @param[in,out] mtr mini-transaction */ MY_ATTRIBUTE((nonnull)) static @@ -58,7 +57,6 @@ void fsp_free_extent( fil_space_t* space, page_no_t offset, - const page_size_t& page_size, mtr_t* mtr); /********************************************************************//** @@ -78,7 +76,6 @@ We think of the extent lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. 
@param[in] inode segment inode @param[in] space tablespace -@param[in] page_size page size @param[in,out] mtr mini-transaction @return the first extent descriptor, or NULL if none */ MY_ATTRIBUTE((nonnull, warn_unused_result)) @@ -87,7 +84,6 @@ xdes_t* fseg_get_first_extent( fseg_inode_t* inode, const fil_space_t* space, - const page_size_t& page_size, mtr_t* mtr); /** Put new extents to the free list if there are free extents above the free @@ -111,7 +107,6 @@ fsp_fill_free_list( This function implements the intelligent allocation strategy which tries to minimize file space fragmentation. @param[in,out] space tablespace -@param[in] page_size page size @param[in,out] seg_inode segment inode @param[in] hint hint of which page would be desirable @param[in] direction if the new page is needed because of @@ -132,7 +127,6 @@ static buf_block_t* fseg_alloc_free_page_low( fil_space_t* space, - const page_size_t& page_size, fseg_inode_t* seg_inode, ulint hint, byte direction, @@ -147,24 +141,16 @@ fseg_alloc_free_page_low( /** Gets a pointer to the space header and x-locks its page. 
@param[in] space tablespace -@param[in] page_size page size @param[in,out] mtr mini-transaction @return pointer to the space header, page x-locked */ -UNIV_INLINE -fsp_header_t* -fsp_get_space_header( - const fil_space_t* space, - const page_size_t& page_size, - mtr_t* mtr) +inline fsp_header_t* fsp_get_space_header(const fil_space_t* space, mtr_t* mtr) { buf_block_t* block; fsp_header_t* header; ut_ad(space->purpose != FIL_TYPE_LOG); - ut_ad(!FSP_FLAGS_GET_ZIP_SSIZE(space->flags) - == !page_size.is_compressed()); - block = buf_page_get(page_id_t(space->id, 0), page_size, + block = buf_page_get(page_id_t(space->id, 0), space->zip_size(), RW_SX_LATCH, mtr); header = FSP_HEADER_OFFSET + buf_block_get_frame(block); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); @@ -371,16 +357,8 @@ xdes_init( xdes_t* descr, /*!< in: descriptor */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - ulint i; - - ut_ad(descr && mtr); ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_SX_FIX)); - ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0); - - for (i = XDES_BITMAP; i < XDES_SIZE; i += 4) { - mlog_write_ulint(descr + i, 0xFFFFFFFFUL, MLOG_4BYTES, mtr); - } - + mlog_memset(descr + XDES_BITMAP, XDES_SIZE - XDES_BITMAP, 0xff, mtr); xdes_set_state(descr, XDES_FREE, mtr); } @@ -427,9 +405,9 @@ xdes_get_descriptor_with_space_hdr( return(NULL); } - const page_size_t page_size(space->flags); + const ulint zip_size = space->zip_size(); - descr_page_no = xdes_calc_descriptor_page(page_size, offset); + descr_page_no = xdes_calc_descriptor_page(zip_size, offset); buf_block_t* block; @@ -440,7 +418,7 @@ xdes_get_descriptor_with_space_hdr( block = NULL; } else { block = buf_page_get( - page_id_t(space->id, descr_page_no), page_size, + page_id_t(space->id, descr_page_no), zip_size, RW_SX_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); @@ -453,7 +431,7 @@ xdes_get_descriptor_with_space_hdr( } return(descr_page + XDES_ARR_OFFSET - + XDES_SIZE * xdes_calc_descriptor_index(page_size, offset)); + 
+ XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset)); } /** Get the extent descriptor of a page. @@ -465,22 +443,17 @@ defined, as they are uninitialized above the free limit. @param[in] space tablespace @param[in] offset page offset; if equal to the free limit, we try to add new extents to the space free list -@param[in] page_size page size @param[in,out] mtr mini-transaction @return the extent descriptor */ MY_ATTRIBUTE((warn_unused_result)) static xdes_t* -xdes_get_descriptor( - const fil_space_t* space, - page_no_t offset, - const page_size_t& page_size, - mtr_t* mtr) +xdes_get_descriptor(const fil_space_t* space, page_no_t offset, mtr_t* mtr) { buf_block_t* block; fsp_header_t* sp_header; - block = buf_page_get(page_id_t(space->id, 0), page_size, + block = buf_page_get(page_id_t(space->id, 0), space->zip_size(), RW_SX_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); @@ -499,7 +472,6 @@ defined, as they are uninitialized above the free limit. @param[in] space tablespace @param[in] page descriptor page offset @param[in] offset page offset -@param[in] page_size page size @param[in,out] mtr mini-transaction @return the extent descriptor @retval NULL if the descriptor is not available */ @@ -510,15 +482,16 @@ xdes_get_descriptor_const( const fil_space_t* space, page_no_t page, page_no_t offset, - const page_size_t& page_size, mtr_t* mtr) { ut_ad(mtr_memo_contains(mtr, &space->latch, MTR_MEMO_S_LOCK)); ut_ad(offset < space->free_limit); ut_ad(offset < space->size_in_header); + const ulint zip_size = space->zip_size(); + if (buf_block_t* block = buf_page_get(page_id_t(space->id, page), - page_size, RW_S_LATCH, mtr)) { + zip_size, RW_S_LATCH, mtr)) { buf_block_dbg_add_level(block, SYNC_FSP_PAGE); ut_ad(page != 0 || space->free_limit == mach_read_from_4( @@ -529,7 +502,7 @@ xdes_get_descriptor_const( + block->frame)); return(block->frame + XDES_ARR_OFFSET + XDES_SIZE - * xdes_calc_descriptor_index(page_size, offset)); + * 
xdes_calc_descriptor_index(zip_size, offset)); } return(NULL); @@ -538,7 +511,6 @@ xdes_get_descriptor_const( /** Get a pointer to the extent descriptor. The page where the extent descriptor resides is x-locked. @param[in] space tablespace -@param[in] page_size page size @param[in] lst_node file address of the list node contained in the descriptor @param[in,out] mtr mini-transaction @@ -548,14 +520,13 @@ UNIV_INLINE xdes_t* xdes_lst_get_descriptor( const fil_space_t* space, - const page_size_t& page_size, fil_addr_t lst_node, mtr_t* mtr) { ut_ad(mtr_memo_contains(mtr, &space->latch, MTR_MEMO_X_LOCK)); - ut_ad(page_size.equals_to(page_size_t(space->flags))); - return(fut_get_ptr(space->id, page_size, lst_node, RW_SX_LATCH, mtr) - - XDES_FLST_NODE); + return fut_get_ptr(space->id, space->zip_size(), + lst_node, RW_SX_LATCH, mtr) + - XDES_FLST_NODE; } /********************************************************************//** @@ -612,9 +583,7 @@ void fil_space_t::modify_check(const mtr_t& mtr) const case MTR_LOG_NO_REDO: ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT - || my_atomic_loadlint(&redo_skipped_count) - || is_being_truncated - || srv_is_tablespace_truncated(id)); + || redo_skipped_count); return; case MTR_LOG_ALL: /* We may only write redo log for a persistent @@ -640,7 +609,7 @@ fsp_header_init_fields( ulint flags) /*!< in: tablespace flags (FSP_SPACE_FLAGS) */ { flags &= ~FSP_FLAGS_MEM_MASK; - ut_a(fsp_flags_is_valid(flags, space_id)); + ut_a(fil_space_t::is_valid_flags(flags, space_id)); mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page, space_id); @@ -654,12 +623,12 @@ fsp_header_init_fields( @param[in,out] mtr mini-transaction */ void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr) { - const page_id_t page_id(space->id, 0); - const page_size_t page_size(space->flags); + const page_id_t page_id(space->id, 0); + const ulint zip_size = space->zip_size(); mtr_x_lock_space(space, mtr); - buf_block_t* block = 
buf_page_create(page_id, page_size, mtr); - buf_page_get(page_id, page_size, RW_SX_LATCH, mtr); + buf_block_t* block = buf_page_create(page_id, zip_size, mtr); + buf_page_get(page_id, zip_size, RW_SX_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); space->size_in_header = size; @@ -675,23 +644,23 @@ void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr) mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_ID + block->frame, space->id, MLOG_4BYTES, mtr); - mlog_write_ulint(FSP_HEADER_OFFSET + FSP_NOT_USED + block->frame, 0, - MLOG_4BYTES, mtr); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED + + block->frame)); mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SIZE + block->frame, size, MLOG_4BYTES, mtr); - mlog_write_ulint(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + block->frame, 0, - MLOG_4BYTES, mtr); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + block->frame)); mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + block->frame, space->flags & ~FSP_FLAGS_MEM_MASK, MLOG_4BYTES, mtr); - mlog_write_ulint(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + block->frame, 0, - MLOG_4BYTES, mtr); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + block->frame)); - flst_init(FSP_HEADER_OFFSET + FSP_FREE + block->frame, mtr); - flst_init(FSP_HEADER_OFFSET + FSP_FREE_FRAG + block->frame, mtr); - flst_init(FSP_HEADER_OFFSET + FSP_FULL_FRAG + block->frame, mtr); - flst_init(FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL + block->frame, mtr); - flst_init(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + block->frame, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FULL_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr); mlog_write_ull(FSP_HEADER_OFFSET + FSP_SEG_ID + block->frame, 1, mtr); @@ -768,6 +737,44 @@ 
fsp_try_extend_data_file_with_pages( return(success); } +/** Calculate the number of physical pages in an extent for this file. +@param[in] physical_size page_size of the datafile +@return number of pages in an extent for this file */ +inline ulint fsp_get_extent_size_in_pages(ulint physical_size) +{ + return (FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size; +} + + +/** Calculate the number of pages to extend a datafile. +We extend single-table tablespaces first one extent at a time, +but 4 at a time for bigger tablespaces. It is not enough to extend always +by one extent, because we need to add at least one extent to FSP_FREE. +A single extent descriptor page will track many extents. And the extent +that uses its extent descriptor page is put onto the FSP_FREE_FRAG list. +Extents that do not use their extent descriptor page are added to FSP_FREE. +The physical page size is used to determine how many extents are tracked +on one extent descriptor page. See xdes_calc_descriptor_page(). +@param[in] physical_size page size in data file +@param[in] size current number of pages in the datafile +@return number of pages to extend the file. */ +static ulint fsp_get_pages_to_extend_ibd(ulint physical_size, ulint size) +{ + ulint extent_size = fsp_get_extent_size_in_pages(physical_size); + /* The threshold is set at 32MiB except when the physical page + size is small enough that it must be done sooner. */ + ulint threshold = std::min(32 * extent_size, physical_size); + + if (size >= threshold) { + /* Below in fsp_fill_free_list() we assume + that we add at most FSP_FREE_ADD extents at + a time */ + extent_size *= FSP_FREE_ADD; + } + + return extent_size; +} + /** Try to extend the last data file of a tablespace if it is auto-extending. 
@param[in,out] space tablespace @param[in,out] header tablespace header @@ -820,8 +827,7 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr) size = mach_read_from_4(header + FSP_SIZE); ut_ad(size == space->size_in_header); - const page_size_t page_size( - mach_read_from_4(header + FSP_SPACE_FLAGS)); + const ulint ps = space->physical_size(); switch (space->id) { case TRX_SYS_SPACE: @@ -831,8 +837,7 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr) size_increase = srv_tmp_space.get_increment(); break; default: - ulint extent_pages - = fsp_get_extent_size_in_pages(page_size); + ulint extent_pages = fsp_get_extent_size_in_pages(ps); if (size < extent_pages) { /* Let us first extend the file to extent_size */ if (!fsp_try_extend_data_file_with_pages( @@ -843,7 +848,7 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr) size = extent_pages; } - size_increase = fsp_get_pages_to_extend_ibd(page_size, size); + size_increase = fsp_get_pages_to_extend_ibd(ps, size); } if (size_increase == 0) { @@ -857,8 +862,7 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr) /* We ignore any fragments of a full megabyte when storing the size to the space header */ - space->size_in_header = ut_2pow_round( - space->size, (1024 * 1024) / page_size.physical()); + space->size_in_header = ut_2pow_round(space->size, (1024 * 1024) / ps); mlog_write_ulint( header + FSP_SIZE, space->size_in_header, MLOG_4BYTES, mtr); @@ -866,47 +870,6 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr) return(size_increase); } -/** Calculate the number of pages to extend a datafile. -We extend single-table tablespaces first one extent at a time, -but 4 at a time for bigger tablespaces. It is not enough to extend always -by one extent, because we need to add at least one extent to FSP_FREE. -A single extent descriptor page will track many extents. 
And the extent -that uses its extent descriptor page is put onto the FSP_FREE_FRAG list. -Extents that do not use their extent descriptor page are added to FSP_FREE. -The physical page size is used to determine how many extents are tracked -on one extent descriptor page. See xdes_calc_descriptor_page(). -@param[in] page_size page_size of the datafile -@param[in] size current number of pages in the datafile -@return number of pages to extend the file. */ -ulint -fsp_get_pages_to_extend_ibd( - const page_size_t& page_size, - ulint size) -{ - ulint size_increase; /* number of pages to extend this file */ - ulint extent_size; /* one megabyte, in pages */ - ulint threshold; /* The size of the tablespace (in number - of pages) where we start allocating more - than one extent at a time. */ - - extent_size = fsp_get_extent_size_in_pages(page_size); - - /* The threshold is set at 32MiB except when the physical page - size is small enough that it must be done sooner. */ - threshold = ut_min(32 * extent_size, page_size.physical()); - - if (size < threshold) { - size_increase = extent_size; - } else { - /* Below in fsp_fill_free_list() we assume - that we add at most FSP_FREE_ADD extents at - a time */ - size_increase = FSP_FREE_ADD * extent_size; - } - - return(size_increase); -} - /** Reset the page type. Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE. In MySQL 3.23.53, only undo log pages and index pages were tagged. 
@@ -957,7 +920,7 @@ fsp_fill_free_list( ut_ad(size == space->size_in_header); ut_ad(limit == space->free_limit); - const page_size_t page_size(space->flags); + const ulint zip_size = space->zip_size(); if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { bool skip_resize = init_space; @@ -981,8 +944,8 @@ fsp_fill_free_list( while ((init_space && i < 1) || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) { - bool init_xdes - = (ut_2pow_remainder(i, page_size.physical()) == 0); + const bool init_xdes = 0 + == ut_2pow_remainder(i, ulint(space->physical_size())); space->free_limit = i + FSP_EXTENT_SIZE; mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE, @@ -1000,10 +963,10 @@ fsp_fill_free_list( const page_id_t page_id(space->id, i); block = buf_page_create( - page_id, page_size, mtr); + page_id, zip_size, mtr); buf_page_get( - page_id, page_size, RW_SX_LATCH, mtr); + page_id, zip_size, RW_SX_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); @@ -1025,30 +988,23 @@ fsp_fill_free_list( mtr_start(&ibuf_mtr); ibuf_mtr.set_named_space(space); - /* Avoid logging while truncate table - fix-up is active. 
*/ - if (srv_is_tablespace_truncated(space->id)) { - mtr_set_log_mode( - &ibuf_mtr, MTR_LOG_NO_REDO); - } - const page_id_t page_id( space->id, i + FSP_IBUF_BITMAP_OFFSET); block = buf_page_create( - page_id, page_size, &ibuf_mtr); + page_id, zip_size, &ibuf_mtr); buf_page_get( - page_id, page_size, RW_SX_LATCH, + page_id, zip_size, RW_SX_LATCH, &ibuf_mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); fsp_init_file_page(space, block, &ibuf_mtr); - - ibuf_bitmap_page_init(block, &ibuf_mtr); - + mlog_write_ulint(block->frame + FIL_PAGE_TYPE, + FIL_PAGE_IBUF_BITMAP, + MLOG_2BYTES, &ibuf_mtr); mtr_commit(&ibuf_mtr); } } @@ -1056,7 +1012,7 @@ fsp_fill_free_list( buf_block_t* desc_block = NULL; descr = xdes_get_descriptor_with_space_hdr( header, space, i, mtr, init_space, &desc_block); - if (desc_block != NULL) { + if (desc_block && !space->full_crc32()) { fil_block_check_type( *desc_block, FIL_PAGE_TYPE_XDES, mtr); } @@ -1093,7 +1049,6 @@ fsp_fill_free_list( /** Allocates a new free extent. 
@param[in,out] space tablespace -@param[in] page_size page size @param[in] hint hint of which extent would be desirable: any page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT @param[in,out] mtr mini-transaction @@ -1102,7 +1057,6 @@ static xdes_t* fsp_alloc_free_extent( fil_space_t* space, - const page_size_t& page_size, ulint hint, mtr_t* mtr) { @@ -1111,12 +1065,12 @@ fsp_alloc_free_extent( xdes_t* descr; buf_block_t* desc_block = NULL; - header = fsp_get_space_header(space, page_size, mtr); + header = fsp_get_space_header(space, mtr); descr = xdes_get_descriptor_with_space_hdr( header, space, hint, mtr, false, &desc_block); - if (desc_block != NULL) { + if (desc_block && !space->full_crc32()) { fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr); } @@ -1137,8 +1091,7 @@ fsp_alloc_free_extent( return(NULL); /* No free extents left */ } - descr = xdes_lst_get_descriptor( - space, page_size, first, mtr); + descr = xdes_lst_get_descriptor(space, first, mtr); } flst_remove(header + FSP_FREE, descr + XDES_FLST_NODE, mtr); @@ -1189,7 +1142,6 @@ not previously x-latched. It is assumed that the block has been x-latched only by mtr, and freed in mtr in that case. 
@param[in,out] space tablespace @param[in] offset page number of the allocated page -@param[in] page_size page size of the allocated page @param[in] rw_latch RW_SX_LATCH, RW_X_LATCH @param[in,out] mtr mini-transaction of the allocation @param[in,out] init_mtr mini-transaction for initializing the page @@ -1200,15 +1152,12 @@ buf_block_t* fsp_page_create( fil_space_t* space, page_no_t offset, - const page_size_t& page_size, rw_lock_type_t rw_latch, mtr_t* mtr, mtr_t* init_mtr) { - ut_ad(page_size.equals_to(page_size_t(space->flags))); - buf_block_t* block = buf_page_create(page_id_t(space->id, offset), - page_size, init_mtr); + space->zip_size(), init_mtr); ut_d(bool latched = mtr_memo_contains_flagged(mtr, block, MTR_MEMO_PAGE_X_FIX @@ -1245,7 +1194,6 @@ fsp_page_create( /** Allocates a single free page from a space. The page is marked as used. @param[in,out] space tablespace -@param[in] page_size page size @param[in] hint hint of which page would be desirable @param[in] rw_latch RW_SX_LATCH, RW_X_LATCH @param[in,out] mtr mini-transaction @@ -1259,7 +1207,6 @@ static MY_ATTRIBUTE((warn_unused_result, nonnull)) buf_block_t* fsp_alloc_free_page( fil_space_t* space, - const page_size_t& page_size, ulint hint, rw_lock_type_t rw_latch, mtr_t* mtr, @@ -1272,7 +1219,7 @@ fsp_alloc_free_page( const ulint space_id = space->id; ut_d(space->modify_check(*mtr)); - header = fsp_get_space_header(space, page_size, mtr); + header = fsp_get_space_header(space, mtr); /* Get the hinted descriptor */ descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr); @@ -1291,8 +1238,7 @@ fsp_alloc_free_page( FREE_FRAG list. But we will allocate our page from the the free extent anyway. 
*/ - descr = fsp_alloc_free_extent(space, page_size, - hint, mtr); + descr = fsp_alloc_free_extent(space, hint, mtr); if (descr == NULL) { /* No free space left */ @@ -1304,8 +1250,7 @@ fsp_alloc_free_page( flst_add_last(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, mtr); } else { - descr = xdes_lst_get_descriptor(space, page_size, - first, mtr); + descr = xdes_lst_get_descriptor(space, first, mtr); } /* Reset the hint */ @@ -1353,23 +1298,17 @@ fsp_alloc_free_page( } fsp_alloc_from_free_frag(header, descr, free, mtr); - return(fsp_page_create(space, page_no, page_size, rw_latch, - mtr, init_mtr)); + return fsp_page_create(space, page_no, rw_latch, mtr, init_mtr); } /** Frees a single page of a space. The page is marked as free and clean. @param[in,out] space tablespace @param[in] offset page number -@param[in] page_size page size +@param[in] log whether to write MLOG_INIT_FREE_PAGE record @param[in,out] mtr mini-transaction */ -static -void -fsp_free_page( - fil_space_t* space, - ulint offset, - const page_size_t& page_size, - mtr_t* mtr) +static void fsp_free_page(fil_space_t* space, page_no_t offset, + bool log, mtr_t* mtr) { fsp_header_t* header; xdes_t* descr; @@ -1381,7 +1320,7 @@ fsp_free_page( /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */ - header = fsp_get_space_header(space, page_size, mtr); + header = fsp_get_space_header(space, mtr); descr = xdes_get_descriptor_with_space_hdr( header, space, offset, mtr); @@ -1423,14 +1362,24 @@ fsp_free_page( return; } + if (UNIV_UNLIKELY(!log)) { + /* The last page freed in BtrBulk::finish() must be + written with redo logging disabled for the page + itself. The modifications of the allocation data + structures are covered by redo log. 
*/ + } else if (byte* log_ptr = mlog_open(mtr, 11)) { + log_ptr = mlog_write_initial_log_record_low( + MLOG_INIT_FREE_PAGE, space->id, offset, log_ptr, mtr); + mlog_close(mtr, log_ptr); + } + const ulint bit = offset % FSP_EXTENT_SIZE; xdes_set_bit(descr, XDES_FREE_BIT, bit, TRUE, mtr); /* xdes_init() should have set all XDES_CLEAN_BIT */ ut_ad(xdes_get_bit(descr, XDES_CLEAN_BIT, bit)); - frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, - mtr); + frag_n_used = mach_read_from_4(header + FSP_FRAG_N_USED); if (state == XDES_FULL_FRAG) { /* The fragment was full: move it to another list */ flst_remove(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, @@ -1451,29 +1400,22 @@ fsp_free_page( /* The extent has become free: move it to another list */ flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, mtr); - fsp_free_extent(space, offset, page_size, mtr); + fsp_free_extent(space, offset, mtr); } } /** Return an extent to the free list of a space. @param[in,out] space tablespace @param[in] offset page number in the extent -@param[in] page_size page size @param[in,out] mtr mini-transaction */ -static -void -fsp_free_extent( - fil_space_t* space, - page_no_t offset, - const page_size_t& page_size, - mtr_t* mtr) +static void fsp_free_extent(fil_space_t* space, page_no_t offset, mtr_t* mtr) { fsp_header_t* header; xdes_t* descr; ut_ad(mtr_memo_contains(mtr, &space->latch, MTR_MEMO_X_LOCK)); - header = fsp_get_space_header(space, page_size, mtr); + header = fsp_get_space_header(space, mtr); descr = xdes_get_descriptor_with_space_hdr( header, space, offset, mtr); @@ -1486,10 +1428,16 @@ fsp_free_extent( space->free_len++; } +/** @return Number of segment inodes which fit on a single page */ +inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size) +{ + return (physical_size - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE; +} + /** Returns the nth inode slot on an inode page. 
@param[in] page segment inode page @param[in] i inode index on page -@param[in] page_size page size +@param[in] physical_size page size @param[in,out] mtr mini-transaction @return segment inode */ UNIV_INLINE @@ -1497,10 +1445,10 @@ fseg_inode_t* fsp_seg_inode_page_get_nth_inode( page_t* page, ulint i, - const page_size_t& page_size, + ulint physical_size, mtr_t* mtr) { - ut_ad(i < FSP_SEG_INODES_PER_PAGE(page_size)); + ut_ad(i < FSP_SEG_INODES_PER_PAGE(physical_size)); ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_SX_FIX)); return(page + FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i); @@ -1508,23 +1456,23 @@ fsp_seg_inode_page_get_nth_inode( /** Looks for a used segment inode on a segment inode page. @param[in] page segment inode page -@param[in] page_size page size +@param[in] physical_size page size @param[in,out] mtr mini-transaction @return segment inode index, or ULINT_UNDEFINED if not found */ static ulint fsp_seg_inode_page_find_used( page_t* page, - const page_size_t& page_size, + ulint physical_size, mtr_t* mtr) { ulint i; fseg_inode_t* inode; - for (i = 0; i < FSP_SEG_INODES_PER_PAGE(page_size); i++) { + for (i = 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) { inode = fsp_seg_inode_page_get_nth_inode( - page, i, page_size, mtr); + page, i, physical_size, mtr); if (mach_read_from_8(inode + FSEG_ID)) { /* This is used */ @@ -1541,7 +1489,7 @@ fsp_seg_inode_page_find_used( /** Looks for an unused segment inode on a segment inode page. 
@param[in] page segment inode page @param[in] i search forward starting from this index -@param[in] page_size page size +@param[in] physical_size page size @param[in,out] mtr mini-transaction @return segment inode index, or ULINT_UNDEFINED if not found */ static @@ -1549,15 +1497,15 @@ ulint fsp_seg_inode_page_find_free( page_t* page, ulint i, - const page_size_t& page_size, + ulint physical_size, mtr_t* mtr) { - for (; i < FSP_SEG_INODES_PER_PAGE(page_size); i++) { + for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) { fseg_inode_t* inode; inode = fsp_seg_inode_page_get_nth_inode( - page, i, page_size, mtr); + page, i, physical_size, mtr); if (!mach_read_from_8(inode + FSEG_ID)) { /* This is unused */ @@ -1589,10 +1537,7 @@ fsp_alloc_seg_inode_page( ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET); ut_ad(page_get_space_id(page_align(space_header)) == space->id); - const page_size_t page_size(space->flags); - - block = fsp_alloc_free_page( - space, page_size, 0, RW_SX_LATCH, mtr, mtr); + block = fsp_alloc_free_page(space, 0, RW_SX_LATCH, mtr, mtr); if (block == NULL) { @@ -1607,7 +1552,7 @@ fsp_alloc_seg_inode_page( #ifdef UNIV_DEBUG const byte* inode = FSEG_ID + FSEG_ARR_OFFSET + block->frame; - for (ulint i = FSP_SEG_INODES_PER_PAGE(page_size); i--; + for (ulint i = FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--; inode += FSEG_INODE_SIZE) { ut_ad(!mach_read_from_8(inode)); } @@ -1646,25 +1591,29 @@ fsp_alloc_seg_inode( && !fsp_alloc_seg_inode_page(space, space_header, mtr)) { return(NULL); } - const page_size_t page_size(space->flags); const page_id_t page_id( space->id, flst_get_first(space_header + FSP_SEG_INODES_FREE, mtr).page); - block = buf_page_get(page_id, page_size, RW_SX_LATCH, mtr); + block = buf_page_get(page_id, space->zip_size(), RW_SX_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); - fil_block_check_type(*block, FIL_PAGE_INODE, mtr); + if (!space->full_crc32()) { + fil_block_check_type(*block, FIL_PAGE_INODE, mtr); + } 
page = buf_block_get_frame(block); - n = fsp_seg_inode_page_find_free(page, 0, page_size, mtr); + const ulint physical_size = space->physical_size(); + + n = fsp_seg_inode_page_find_free(page, 0, physical_size, mtr); ut_a(n != ULINT_UNDEFINED); - inode = fsp_seg_inode_page_get_nth_inode(page, n, page_size, mtr); + inode = fsp_seg_inode_page_get_nth_inode(page, n, physical_size, mtr); if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1, - page_size, mtr)) { + physical_size, + mtr)) { /* There are no other unused headers left on the page: move it to another list */ @@ -1682,14 +1631,10 @@ fsp_alloc_seg_inode( /** Frees a file segment inode. @param[in,out] space tablespace -@param[in] page_size page size @param[in,out] inode segment inode @param[in,out] mtr mini-transaction */ -static -void -fsp_free_seg_inode( +static void fsp_free_seg_inode( fil_space_t* space, - const page_size_t& page_size, fseg_inode_t* inode, mtr_t* mtr) { @@ -1700,12 +1645,14 @@ fsp_free_seg_inode( page = page_align(inode); - space_header = fsp_get_space_header(space, page_size, mtr); + space_header = fsp_get_space_header(space, mtr); ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + const ulint physical_size = space->physical_size(); + if (ULINT_UNDEFINED - == fsp_seg_inode_page_find_free(page, 0, page_size, mtr)) { + == fsp_seg_inode_page_find_free(page, 0, physical_size, mtr)) { /* Move the page to another list */ @@ -1720,21 +1667,21 @@ fsp_free_seg_inode( mlog_write_ulint(inode + FSEG_MAGIC_N, 0xfa051ce3, MLOG_4BYTES, mtr); if (ULINT_UNDEFINED - == fsp_seg_inode_page_find_used(page, page_size, mtr)) { + == fsp_seg_inode_page_find_used(page, physical_size, mtr)) { /* There are no other used headers left on the page: free it */ flst_remove(space_header + FSP_SEG_INODES_FREE, page + FSEG_INODE_PAGE_NODE, mtr); - fsp_free_page(space, page_get_page_no(page), page_size, mtr); + fsp_free_page(space, page_get_page_no(page), true, mtr); } } /** Returns the file 
segment inode, page x-latched. @param[in] header segment header @param[in] space space id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] mtr mini-transaction @param[out] block inode block, or NULL to ignore @return segment inode, page x-latched; NULL if the inode is free */ @@ -1743,7 +1690,7 @@ fseg_inode_t* fseg_inode_try_get( fseg_header_t* header, ulint space, - const page_size_t& page_size, + ulint zip_size, mtr_t* mtr, buf_block_t** block) { @@ -1754,7 +1701,7 @@ fseg_inode_try_get( inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET); ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE)); - inode = fut_get_ptr(space, page_size, inode_addr, RW_SX_LATCH, mtr, + inode = fut_get_ptr(space, zip_size, inode_addr, RW_SX_LATCH, mtr, block); if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID))) { @@ -1771,7 +1718,7 @@ fseg_inode_try_get( /** Returns the file segment inode, page x-latched. @param[in] header segment header @param[in] space space id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] mtr mini-transaction @param[out] block inode block @return segment inode, page x-latched */ @@ -1780,12 +1727,12 @@ fseg_inode_t* fseg_inode_get( fseg_header_t* header, ulint space, - const page_size_t& page_size, + ulint zip_size, mtr_t* mtr, buf_block_t** block = NULL) { fseg_inode_t* inode - = fseg_inode_try_get(header, space, page_size, mtr, block); + = fseg_inode_try_get(header, space, zip_size, mtr, block); ut_a(inode); return(inode); } @@ -1936,7 +1883,6 @@ fseg_create( buf_block_t* block = 0; /* remove warning */ fseg_header_t* header = 0; /* remove warning */ ulint n_reserved; - ulint i; DBUG_ENTER("fseg_create"); @@ -1945,21 +1891,22 @@ fseg_create( <= srv_page_size - FIL_PAGE_DATA_END); mtr_x_lock_space(space, mtr); - const page_size_t page_size(space->flags); ut_d(space->modify_check(*mtr)); if (page != 0) { - block = 
buf_page_get(page_id_t(space->id, page), page_size, + block = buf_page_get(page_id_t(space->id, page), + space->zip_size(), RW_SX_LATCH, mtr); header = byte_offset + buf_block_get_frame(block); - const ulint type = space->id == TRX_SYS_SPACE - && page == TRX_SYS_PAGE_NO - ? FIL_PAGE_TYPE_TRX_SYS - : FIL_PAGE_TYPE_SYS; - - fil_block_check_type(*block, type, mtr); + if (!space->full_crc32()) { + fil_block_check_type(*block, space->id == TRX_SYS_SPACE + && page == TRX_SYS_PAGE_NO + ? FIL_PAGE_TYPE_TRX_SYS + : FIL_PAGE_TYPE_SYS, + mtr); + } } if (!has_done_reservation @@ -1968,7 +1915,7 @@ fseg_create( DBUG_RETURN(NULL); } - space_header = fsp_get_space_header(space, page_size, mtr); + space_header = fsp_get_space_header(space, mtr); inode = fsp_alloc_seg_inode(space, space_header, mtr); @@ -1982,9 +1929,8 @@ fseg_create( seg_id = mach_read_from_8(space_header + FSP_SEG_ID); mlog_write_ull(space_header + FSP_SEG_ID, seg_id + 1, mtr); - mlog_write_ull(inode + FSEG_ID, seg_id, mtr); - mlog_write_ulint(inode + FSEG_NOT_FULL_N_USED, 0, MLOG_4BYTES, mtr); + ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)); flst_init(inode + FSEG_FREE, mtr); flst_init(inode + FSEG_NOT_FULL, mtr); @@ -1992,12 +1938,13 @@ fseg_create( mlog_write_ulint(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE, MLOG_4BYTES, mtr); - for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { - fseg_set_nth_frag_page_no(inode, i, FIL_NULL, mtr); - } + compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4); + compile_time_assert(FIL_NULL == 0xffffffff); + mlog_memset(inode + FSEG_FRAG_ARR, + FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff, mtr); if (page == 0) { - block = fseg_alloc_free_page_low(space, page_size, + block = fseg_alloc_free_page_low(space, inode, 0, FSP_UP, RW_SX_LATCH, mtr, mtr #ifdef UNIV_DEBUG @@ -2010,9 +1957,7 @@ fseg_create( ut_ad(!has_done_reservation || block != NULL); if (block == NULL) { - - fsp_free_seg_inode(space, page_size, inode, mtr); - + fsp_free_seg_inode(space, inode, mtr); goto funct_exit; } @@ 
-2089,9 +2034,7 @@ fseg_n_reserved_pages( space_id = page_get_space_id(page_align(header)); space = mtr_x_lock_space(space_id, mtr); - const page_size_t page_size(space->flags); - - inode = fseg_inode_get(header, space_id, page_size, mtr); + inode = fseg_inode_get(header, space_id, space->zip_size(), mtr); ret = fseg_n_reserved_pages_low(inode, used, mtr); @@ -2104,7 +2047,6 @@ the free list is empty, and the extents can be allocated consecutively from the hint onward. @param[in] inode segment inode @param[in] space tablespace -@param[in] page_size page size @param[in] hint hint which extent would be good as the first extent @param[in,out] mtr mini-transaction */ @@ -2113,7 +2055,6 @@ void fseg_fill_free_list( fseg_inode_t* inode, fil_space_t* space, - const page_size_t& page_size, ulint hint, mtr_t* mtr) { @@ -2143,7 +2084,7 @@ fseg_fill_free_list( } for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) { - descr = xdes_get_descriptor(space, hint, page_size, mtr); + descr = xdes_get_descriptor(space, hint, mtr); if ((descr == NULL) || (XDES_FREE != xdes_get_state(descr, mtr))) { @@ -2153,7 +2094,7 @@ fseg_fill_free_list( return; } - descr = fsp_alloc_free_extent(space, page_size, hint, mtr); + descr = fsp_alloc_free_extent(space, hint, mtr); xdes_set_state(descr, XDES_FSEG, mtr); @@ -2173,7 +2114,6 @@ NOTE that the extent returned still resides in the segment free list, it is not yet taken off it! 
@param[in] inode segment inode @param[in,out] space tablespace -@param[in] page_size page size @param[in,out] mtr mini-transaction @retval NULL if no page could be allocated @retval block rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded @@ -2184,7 +2124,6 @@ xdes_t* fseg_alloc_free_extent( fseg_inode_t* inode, fil_space_t* space, - const page_size_t& page_size, mtr_t* mtr) { xdes_t* descr; @@ -2200,10 +2139,10 @@ fseg_alloc_free_extent( first = flst_get_first(inode + FSEG_FREE, mtr); - descr = xdes_lst_get_descriptor(space, page_size, first, mtr); + descr = xdes_lst_get_descriptor(space, first, mtr); } else { /* Segment free list was empty, allocate from space */ - descr = fsp_alloc_free_extent(space, page_size, 0, mtr); + descr = fsp_alloc_free_extent(space, 0, mtr); if (descr == NULL) { @@ -2217,7 +2156,7 @@ fseg_alloc_free_extent( flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr); /* Try to fill the segment free list */ - fseg_fill_free_list(inode, space, page_size, + fseg_fill_free_list(inode, space, xdes_get_offset(descr) + FSP_EXTENT_SIZE, mtr); } @@ -2229,7 +2168,6 @@ fseg_alloc_free_extent( This function implements the intelligent allocation strategy which tries to minimize file space fragmentation. 
@param[in,out] space tablespace -@param[in] page_size page size @param[in,out] seg_inode segment inode @param[in] hint hint of which page would be desirable @param[in] direction if the new page is needed because of @@ -2250,7 +2188,6 @@ static buf_block_t* fseg_alloc_free_page_low( fil_space_t* space, - const page_size_t& page_size, fseg_inode_t* seg_inode, ulint hint, byte direction, @@ -2285,7 +2222,7 @@ fseg_alloc_free_page_low( reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr); - space_header = fsp_get_space_header(space, page_size, mtr); + space_header = fsp_get_space_header(space, mtr); descr = xdes_get_descriptor_with_space_hdr(space_header, space, hint, mtr); @@ -2294,7 +2231,7 @@ fseg_alloc_free_page_low( hint */ /* The file space header page is always allocated. */ hint = 0; - descr = xdes_get_descriptor(space, hint, page_size, mtr); + descr = xdes_get_descriptor(space, hint, mtr); } /* In the big if-else below we look for ret_page and ret_descr */ @@ -2321,7 +2258,7 @@ take_hinted_page: ========================================================= the hinted page ===============*/ - ret_descr = fsp_alloc_free_extent(space, page_size, hint, mtr); + ret_descr = fsp_alloc_free_extent(space, hint, mtr); ut_a(ret_descr == descr); @@ -2331,7 +2268,7 @@ take_hinted_page: ret_descr + XDES_FLST_NODE, mtr); /* Try to fill the segment free list */ - fseg_fill_free_list(seg_inode, space, page_size, + fseg_fill_free_list(seg_inode, space, hint + FSP_EXTENT_SIZE, mtr); goto take_hinted_page; /*-----------------------------------------------------------*/ @@ -2339,8 +2276,7 @@ take_hinted_page: && ((reserved - used) < reserved / FSEG_FILLFACTOR) && (used >= FSEG_FRAG_LIMIT) && (!!(ret_descr - = fseg_alloc_free_extent( - seg_inode, space, page_size, mtr)))) { + = fseg_alloc_free_extent(seg_inode, space, mtr)))) { /* 3. 
We take any free extent (which was already assigned above =============================================================== @@ -2386,8 +2322,7 @@ take_hinted_page: return(NULL); } - ret_descr = xdes_lst_get_descriptor(space, page_size, - first, mtr); + ret_descr = xdes_lst_get_descriptor(space, first, mtr); ret_page = xdes_get_offset(ret_descr) + xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE, 0, mtr); @@ -2397,7 +2332,7 @@ take_hinted_page: /* 6. We allocate an individual page from the space ===================================================*/ buf_block_t* block = fsp_alloc_free_page( - space, page_size, hint, rw_latch, mtr, init_mtr); + space, hint, rw_latch, mtr, init_mtr); ut_ad(!has_done_reservation || block != NULL); @@ -2419,8 +2354,7 @@ take_hinted_page: } else { /* 7. We allocate a new extent and take its first page ======================================================*/ - ret_descr = fseg_alloc_free_extent(seg_inode, - space, page_size, mtr); + ret_descr = fseg_alloc_free_extent(seg_inode, space, mtr); if (ret_descr == NULL) { ret_page = FIL_NULL; @@ -2468,8 +2402,7 @@ got_hinted_page: The extent is still in the appropriate list (FSEG_NOT_FULL or FSEG_FREE), and the page is not yet marked as used. 
*/ - ut_ad(xdes_get_descriptor(space, ret_page, page_size, mtr) - == ret_descr); + ut_ad(xdes_get_descriptor(space, ret_page, mtr) == ret_descr); ut_ad(xdes_mtr_get_bit( ret_descr, XDES_FREE_BIT, @@ -2478,8 +2411,7 @@ got_hinted_page: fseg_mark_page_used(seg_inode, ret_page, ret_descr, mtr); } - return(fsp_page_create(space, ret_page, page_size, rw_latch, - mtr, init_mtr)); + return fsp_page_create(space, ret_page, rw_latch, mtr, init_mtr); } /**********************************************************************//** @@ -2521,10 +2453,11 @@ fseg_alloc_free_page_general( space_id = page_get_space_id(page_align(seg_header)); space = mtr_x_lock_space(space_id, mtr); - const page_size_t page_size(space->flags); - - inode = fseg_inode_get(seg_header, space_id, page_size, mtr, &iblock); - fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + inode = fseg_inode_get(seg_header, space_id, space->zip_size(), + mtr, &iblock); + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } if (!has_done_reservation && !fsp_reserve_free_extents(&n_reserved, space, 2, @@ -2532,7 +2465,7 @@ fseg_alloc_free_page_general( return(NULL); } - block = fseg_alloc_free_page_low(space, page_size, + block = fseg_alloc_free_page_low(space, inode, hint, direction, RW_X_LATCH, mtr, init_mtr #ifdef UNIV_DEBUG @@ -2650,9 +2583,9 @@ fsp_reserve_free_extents( *n_reserved = n_ext; mtr_x_lock_space(space, mtr); - const page_size_t page_size(space->flags); + const ulint physical_size = space->physical_size(); - space_header = fsp_get_space_header(space, page_size, mtr); + space_header = fsp_get_space_header(space, mtr); try_again: size = mach_read_from_4(space_header + FSP_SIZE); ut_ad(size == space->size_in_header); @@ -2667,8 +2600,7 @@ try_again: n_free_list_ext = flst_get_len(space_header + FSP_FREE); ut_ad(space->free_len == n_free_list_ext); - free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT, - MLOG_4BYTES, mtr); + free_limit = mach_read_from_4(space_header + 
FSP_FREE_LIMIT); ut_ad(space->free_limit == free_limit); /* Below we play safe when counting free extents above the free limit: @@ -2684,8 +2616,7 @@ try_again: if (n_free_up > 0) { n_free_up--; - n_free_up -= n_free_up / (page_size.physical() - / FSP_EXTENT_SIZE); + n_free_up -= n_free_up / (physical_size / FSP_EXTENT_SIZE); } n_free = n_free_list_ext + n_free_up; @@ -2751,9 +2682,7 @@ fseg_mark_page_used( ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); - - ut_ad(mtr_read_ulint(seg_inode + FSEG_ID, MLOG_4BYTES, mtr) - == mtr_read_ulint(descr + XDES_ID, MLOG_4BYTES, mtr)); + ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4)); if (xdes_is_free(descr, mtr)) { /* We move the extent from the free list to the @@ -2770,8 +2699,7 @@ fseg_mark_page_used( /* We mark the page as used */ xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, FALSE, mtr); - not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED, - MLOG_4BYTES, mtr); + not_full_n_used = mach_read_from_4(seg_inode + FSEG_NOT_FULL_N_USED); not_full_n_used++; mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, not_full_n_used, MLOG_4BYTES, mtr); @@ -2793,7 +2721,7 @@ fseg_mark_page_used( @param[in] seg_inode segment inode @param[in,out] space tablespace @param[in] offset page number -@param[in] page_size page size +@param[in] log whether to write MLOG_INIT_FREE_PAGE record @param[in,out] mtr mini-transaction */ static void @@ -2801,7 +2729,7 @@ fseg_free_page_low( fseg_inode_t* seg_inode, fil_space_t* space, page_no_t offset, - const page_size_t& page_size, + bool log, mtr_t* mtr) { xdes_t* descr; @@ -2817,7 +2745,7 @@ fseg_free_page_low( ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); ut_d(space->modify_check(*mtr)); - descr = xdes_get_descriptor(space, offset, page_size, mtr); + descr = xdes_get_descriptor(space, offset, mtr); if (xdes_mtr_get_bit(descr, XDES_FREE_BIT, 
offset % FSP_EXTENT_SIZE, mtr)) { @@ -2834,19 +2762,19 @@ fseg_free_page_low( if (state != XDES_FSEG) { /* The page is in the fragment pages of the segment */ - for (ulint i = 0;; i++) { if (fseg_get_nth_frag_page_no(seg_inode, i, mtr) - == offset) { - - fseg_set_nth_frag_page_no(seg_inode, i, - FIL_NULL, mtr); - break; + != offset) { + continue; } - } - fsp_free_page(space, offset, page_size, mtr); + compile_time_assert(FIL_NULL == 0xffffffff); + mlog_memset(seg_inode + FSEG_FRAG_ARR + + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff, mtr); + break; + } + fsp_free_page(space, offset, log, mtr); return; } @@ -2870,8 +2798,7 @@ fseg_free_page_low( << FORCE_RECOVERY_MSG; } - not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED, - MLOG_4BYTES, mtr); + not_full_n_used = mach_read_from_4(seg_inode + FSEG_NOT_FULL_N_USED); if (xdes_is_full(descr, mtr)) { /* The fragment is full: move it to another list */ flst_remove(seg_inode + FSEG_FULL, @@ -2897,7 +2824,7 @@ fseg_free_page_low( /* The extent has become free: free it to space */ flst_remove(seg_inode + FSEG_NOT_FULL, descr + XDES_FLST_NODE, mtr); - fsp_free_extent(space, offset, page_size, mtr); + fsp_free_extent(space, offset, mtr); } } @@ -2905,28 +2832,32 @@ fseg_free_page_low( @param[in,out] seg_header file segment header @param[in,out] space tablespace @param[in] offset page number +@param[in] log whether to write MLOG_INIT_FREE_PAGE record @param[in,out] mtr mini-transaction */ void fseg_free_page( fseg_header_t* seg_header, fil_space_t* space, ulint offset, + bool log, mtr_t* mtr) { DBUG_ENTER("fseg_free_page"); fseg_inode_t* seg_inode; buf_block_t* iblock; mtr_x_lock_space(space, mtr); - const page_size_t page_size(space->flags); DBUG_LOG("fseg_free_page", "space_id: " << space->id << ", page_no: " << offset); - seg_inode = fseg_inode_get(seg_header, space->id, page_size, mtr, + seg_inode = fseg_inode_get(seg_header, space->id, space->zip_size(), + mtr, &iblock); - fil_block_check_type(*iblock, FIL_PAGE_INODE, 
mtr); + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } - fseg_free_page_low(seg_inode, space, offset, page_size, mtr); + fseg_free_page_low(seg_inode, space, offset, log, mtr); ut_d(buf_page_set_file_page_was_freed(page_id_t(space->id, offset))); @@ -2942,8 +2873,8 @@ fseg_page_is_free(fil_space_t* space, unsigned page) { bool is_free; mtr_t mtr; - page_size_t page_size(space->flags); - page_no_t dpage = xdes_calc_descriptor_page(page_size, page); + page_no_t dpage = xdes_calc_descriptor_page(space->zip_size(), + page); mtr.start(); mtr_s_lock_space(space, &mtr); @@ -2951,7 +2882,7 @@ fseg_page_is_free(fil_space_t* space, unsigned page) if (page >= space->free_limit || page >= space->size_in_header) { is_free = true; } else if (const xdes_t* descr = xdes_get_descriptor_const( - space, dpage, page, page_size, &mtr)) { + space, dpage, page, &mtr)) { is_free = xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE); } else { @@ -2965,7 +2896,6 @@ fseg_page_is_free(fil_space_t* space, unsigned page) /** Free an extent of a segment to the space free list. 
@param[in,out] seg_inode segment inode @param[in,out] space tablespace -@param[in] page_size page size @param[in] page page number in the extent @param[in,out] mtr mini-transaction */ MY_ATTRIBUTE((nonnull)) @@ -2974,7 +2904,6 @@ void fseg_free_extent( fseg_inode_t* seg_inode, fil_space_t* space, - const page_size_t& page_size, ulint page, mtr_t* mtr) { @@ -2984,7 +2913,7 @@ fseg_free_extent( ut_ad(mtr != NULL); - descr = xdes_get_descriptor(space, page, page_size, mtr); + descr = xdes_get_descriptor(space, page, mtr); ut_a(xdes_get_state(descr, mtr) == XDES_FSEG); ut_a(!memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8)); @@ -3003,9 +2932,8 @@ fseg_free_extent( flst_remove(seg_inode + FSEG_NOT_FULL, descr + XDES_FLST_NODE, mtr); - not_full_n_used = mtr_read_ulint( - seg_inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr); - + not_full_n_used = mach_read_from_4(FSEG_NOT_FULL_N_USED + + seg_inode); descr_n_used = xdes_get_n_used(descr, mtr); ut_a(not_full_n_used >= descr_n_used); mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, @@ -3013,7 +2941,7 @@ fseg_free_extent( MLOG_4BYTES, mtr); } - fsp_free_extent(space, page, page_size, mtr); + fsp_free_extent(space, page, mtr); #ifdef UNIV_DEBUG for (ulint i = 0; i < FSP_EXTENT_SIZE; i++) { @@ -3051,9 +2979,8 @@ fseg_free_step( header_page = page_get_page_no(page_align(header)); fil_space_t* space = mtr_x_lock_space(space_id, mtr); - const page_size_t page_size(space->flags); - descr = xdes_get_descriptor(space, header_page, page_size, mtr); + descr = xdes_get_descriptor(space, header_page, mtr); /* Check that the header resides on a page which has not been freed yet */ @@ -3061,8 +2988,8 @@ fseg_free_step( ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT, header_page % FSP_EXTENT_SIZE, mtr) == FALSE); buf_block_t* iblock; - - inode = fseg_inode_try_get(header, space_id, page_size, mtr, &iblock); + const ulint zip_size = space->zip_size(); + inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock); if (inode == NULL) { 
ib::info() << "Double free of inode from " @@ -3070,15 +2997,15 @@ fseg_free_step( DBUG_RETURN(true); } - fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); - descr = fseg_get_first_extent(inode, space, page_size, mtr); + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + descr = fseg_get_first_extent(inode, space, mtr); if (descr != NULL) { /* Free the extent held by the segment */ page = xdes_get_offset(descr); - - fseg_free_extent(inode, space, page_size, page, mtr); - + fseg_free_extent(inode, space, page, mtr); DBUG_RETURN(false); } @@ -3087,7 +3014,7 @@ fseg_free_step( if (n == ULINT_UNDEFINED) { /* Freeing completed: free the segment inode */ - fsp_free_seg_inode(space, page_size, inode, mtr); + fsp_free_seg_inode(space, inode, mtr); DBUG_RETURN(true); } @@ -3095,13 +3022,13 @@ fseg_free_step( fseg_free_page_low( inode, space, fseg_get_nth_frag_page_no(inode, n, mtr), - page_size, mtr); + true, mtr); n = fseg_find_last_used_frag_page_slot(inode, mtr); if (n == ULINT_UNDEFINED) { /* Freeing completed: free the segment inode */ - fsp_free_seg_inode(space, page_size, inode, mtr); + fsp_free_seg_inode(space, inode, mtr); DBUG_RETURN(true); } @@ -3130,20 +3057,20 @@ fseg_free_step_not_header( ut_ad(mtr->is_named_space(space_id)); fil_space_t* space = mtr_x_lock_space(space_id, mtr); - const page_size_t page_size(space->flags); buf_block_t* iblock; - inode = fseg_inode_get(header, space_id, page_size, mtr, &iblock); - fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + inode = fseg_inode_get(header, space_id, space->zip_size(), mtr, + &iblock); + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } - descr = fseg_get_first_extent(inode, space, page_size, mtr); + descr = fseg_get_first_extent(inode, space, mtr); if (descr != NULL) { /* Free the extent held by the segment */ page = xdes_get_offset(descr); - - fseg_free_extent(inode, space, page_size, page, mtr); - + fseg_free_extent(inode, 
space, page, mtr); return false; } @@ -3156,12 +3083,10 @@ fseg_free_step_not_header( page_no = fseg_get_nth_frag_page_no(inode, n, mtr); if (page_no == page_get_page_no(page_align(header))) { - return true; } - fseg_free_page_low(inode, space, page_no, page_size, mtr); - + fseg_free_page_low(inode, space, page_no, true, mtr); return false; } @@ -3170,7 +3095,6 @@ We think of the extent lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. @param[in] inode segment inode @param[in] space tablespace -@param[in] page_size page size @param[in,out] mtr mini-transaction @return the first extent descriptor, or NULL if none */ MY_ATTRIBUTE((nonnull, warn_unused_result)) @@ -3179,7 +3103,6 @@ xdes_t* fseg_get_first_extent( fseg_inode_t* inode, const fil_space_t* space, - const page_size_t& page_size, mtr_t* mtr) { fil_addr_t first; @@ -3205,7 +3128,7 @@ fseg_get_first_extent( ut_ad(first.page != FIL_NULL); return(first.page == FIL_NULL ? NULL - : xdes_lst_get_descriptor(space, page_size, first, mtr)); + : xdes_lst_get_descriptor(space, first, mtr)); } #ifdef UNIV_BTR_PRINT @@ -3236,9 +3159,7 @@ fseg_print_low( reserved = fseg_n_reserved_pages_low(inode, &used, mtr); seg_id = mach_read_from_8(inode + FSEG_ID); - - n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, - MLOG_4BYTES, mtr); + n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED); n_frag = fseg_get_n_frag_pages(inode, mtr); n_free = flst_get_len(inode + FSEG_FREE); n_not_full = flst_get_len(inode + FSEG_NOT_FULL); @@ -3269,32 +3190,20 @@ fseg_print( space_id = page_get_space_id(page_align(header)); const fil_space_t* space = mtr_x_lock_space(space_id, mtr); - const page_size_t page_size(space->flags); - inode = fseg_inode_get(header, space_id, page_size, mtr); + inode = fseg_inode_get(header, space_id, space->zip_size(), mtr); fseg_print_low(inode, mtr); } #endif /* UNIV_BTR_PRINT */ #ifdef UNIV_DEBUG -/** Print the file segment header to the given output stream. 
-@param[in] out the output stream into which the object is printed. -@retval the output stream into which the object was printed. */ -std::ostream& -fseg_header::to_stream(std::ostream& out) const +std::ostream &fseg_header::to_stream(std::ostream &out) const { - const ulint space = mtr_read_ulint(m_header + FSEG_HDR_SPACE, - MLOG_4BYTES, m_mtr); - const ulint page_no = mtr_read_ulint(m_header + FSEG_HDR_PAGE_NO, - MLOG_4BYTES, m_mtr); - - const ulint offset = mtr_read_ulint(m_header + FSEG_HDR_OFFSET, - MLOG_2BYTES, m_mtr); - - out << "[fseg_header_t: space=" << space << ", page=" - << page_no << ", offset=" << offset << "]"; - - return(out); + out << "[fseg_header_t: space=" + << mach_read_from_4(m_header + FSEG_HDR_SPACE) + << ", page=" << mach_read_from_4(m_header + FSEG_HDR_PAGE_NO) + << ", offset=" << mach_read_from_2(m_header + FSEG_HDR_OFFSET) << "]"; + return out; } #endif /* UNIV_DEBUG */ diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc index 757eeaf90ae..1ed4af86367 100644 --- a/storage/innobase/fsp/fsp0space.cc +++ b/storage/innobase/fsp/fsp0space.cc @@ -118,8 +118,20 @@ Tablespace::open_or_create(bool is_temp) /* Create the tablespace entry for the multi-file tablespace in the tablespace manager. */ + ulint fsp_flags = 0; + + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE()); + break; + default: + fsp_flags = FSP_FLAGS_PAGE_SSIZE(); + } + space = fil_space_create( - m_name, m_space_id, FSP_FLAGS_PAGE_SSIZE(), + m_name, m_space_id, fsp_flags, is_temp ? 
FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, NULL); diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc index 451187a35d9..fd5ac3c368f 100644 --- a/storage/innobase/fsp/fsp0sysspace.cc +++ b/storage/innobase/fsp/fsp0sysspace.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -918,7 +919,7 @@ SysTablespace::open_or_create( ut_ad(!fil_system.sys_space); ut_ad(space_id() == TRX_SYS_SPACE); space = fil_space_create( - name(), TRX_SYS_SPACE, flags(), + name(), TRX_SYS_SPACE, it->flags(), FIL_TYPE_TABLESPACE, NULL); mutex_enter(&fil_system.mutex); diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc index 51770eb6b69..8ae10c2465d 100644 --- a/storage/innobase/fts/fts0config.cc +++ b/storage/innobase/fts/fts0config.cc @@ -120,9 +120,9 @@ fts_config_get_value( error = fts_eval_sql(trx, graph); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); que_graph_free(graph); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return(error); } diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index bf0f2da30df..199941e71f8 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -455,7 +455,7 @@ fts_load_user_stopword( fts_stopword_t* stopword_info) /*!< in: Stopword info */ { if (!fts->dict_locked) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } /* Validate the user table existence in the right format */ @@ -464,7 +464,7 @@ fts_load_user_stopword( if (!stopword_info->charset) { cleanup: if (!fts->dict_locked) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } return ret; @@ 
-904,7 +904,7 @@ fts_drop_index( } /****************************************************************//** -Free the query graph but check whether dict_sys->mutex is already +Free the query graph but check whether dict_sys.mutex is already held */ void fts_que_graph_free_check_lock( @@ -926,15 +926,15 @@ fts_que_graph_free_check_lock( } if (!has_dict) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); que_graph_free(graph); if (!has_dict) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } } @@ -3200,15 +3200,10 @@ fts_fetch_doc_from_rec( documents */ { dict_index_t* index; - dict_table_t* table; const rec_t* clust_rec; - ulint num_field; const dict_field_t* ifield; - const dict_col_t* col; ulint clust_pos; - ulint i; ulint doc_len = 0; - ulint processed_doc = 0; st_mysql_ftparser* parser; if (!get_doc) { @@ -3216,19 +3211,15 @@ fts_fetch_doc_from_rec( } index = get_doc->index_cache->index; - table = get_doc->index_cache->index->table; parser = get_doc->index_cache->index->parser; clust_rec = btr_pcur_get_rec(pcur); ut_ad(!page_rec_is_comp(clust_rec) || rec_get_status(clust_rec) == REC_STATUS_ORDINARY); - num_field = dict_index_get_n_fields(index); - - for (i = 0; i < num_field; i++) { + for (ulint i = 0; i < index->n_fields; i++) { ifield = dict_index_get_nth_field(index, i); - col = dict_field_get_col(ifield); - clust_pos = dict_col_get_clust_pos(col, clust_index); + clust_pos = dict_col_get_clust_pos(ifield->col, clust_index); if (!get_doc->index_cache->charset) { get_doc->index_cache->charset = fts_get_charset( @@ -3239,7 +3230,7 @@ fts_fetch_doc_from_rec( doc->text.f_str = btr_rec_copy_externally_stored_field( clust_rec, offsets, - dict_table_page_size(table), + btr_pcur_get_block(pcur)->zip_size(), clust_pos, &doc->text.f_len, static_cast<mem_heap_t*>( doc->self_heap->arg)); @@ -3257,13 +3248,12 @@ fts_fetch_doc_from_rec( continue; } - if (processed_doc 
== 0) { + if (!doc_len) { fts_tokenize_document(doc, NULL, parser); } else { fts_tokenize_document_next(doc, doc_len, NULL, parser); } - processed_doc++; doc_len += doc->text.f_len + 1; } } @@ -3628,8 +3618,7 @@ fts_read_ulint( dfield_t* dfield = que_node_get_val(exp); void* data = dfield_get_data(dfield); - *value = static_cast<ulint>(mach_read_from_4( - static_cast<const byte*>(data))); + *value = mach_read_from_4(static_cast<const byte*>(data)); return(TRUE); } @@ -3670,13 +3659,6 @@ fts_get_max_doc_id( if (!page_is_empty(btr_pcur_get_page(&pcur))) { const rec_t* rec = NULL; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets = offsets_; - mem_heap_t* heap = NULL; - ulint len; - const void* data; - - rec_offs_init(offsets_); do { rec = btr_pcur_get_rec(&pcur); @@ -3686,18 +3668,11 @@ fts_get_max_doc_id( } } while (btr_pcur_move_to_prev(&pcur, &mtr)); - if (!rec) { + if (!rec || rec_is_metadata(rec, *index)) { goto func_exit; } - ut_ad(!rec_is_metadata(rec, index)); - offsets = rec_get_offsets( - rec, index, offsets, true, ULINT_UNDEFINED, &heap); - - data = rec_get_nth_field(rec, offsets, 0, &len); - - doc_id = static_cast<doc_id_t>(fts_read_doc_id( - static_cast<const byte*>(data))); + doc_id = fts_read_doc_id(rec); } func_exit: @@ -5159,49 +5134,23 @@ fts_get_doc_id_from_row( } /** Extract the doc id from the record that belongs to index. 
-@param[in] table table -@param[in] rec record contains FTS_DOC_ID +@param[in] rec record containing FTS_DOC_ID @param[in] index index of rec -@param[in] heap heap memory +@param[in] offsets rec_get_offsets(rec,index) @return doc id that was extracted from rec */ doc_id_t fts_get_doc_id_from_rec( - dict_table_t* table, const rec_t* rec, const dict_index_t* index, - mem_heap_t* heap) + const rec_offs* offsets) { - ulint len; - const byte* data; - ulint col_no; - doc_id_t doc_id = 0; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets = offsets_; - mem_heap_t* my_heap = heap; - - ut_a(table->fts->doc_col != ULINT_UNDEFINED); - - rec_offs_init(offsets_); - - offsets = rec_get_offsets( - rec, index, offsets, true, ULINT_UNDEFINED, &my_heap); - - col_no = dict_col_get_index_pos( - &table->cols[table->fts->doc_col], index); - - ut_ad(col_no != ULINT_UNDEFINED); - - data = rec_get_nth_field(rec, offsets, col_no, &len); - - ut_a(len == 8); - ut_ad(8 == sizeof(doc_id)); - doc_id = static_cast<doc_id_t>(mach_read_from_8(data)); - - if (my_heap && !heap) { - mem_heap_free(my_heap); - } - - return(doc_id); + ulint f = dict_col_get_index_pos( + &index->table->cols[index->table->fts->doc_col], index); + ulint len; + doc_id_t doc_id = mach_read_from_8( + rec_get_nth_field(rec, offsets, f, &len)); + ut_ad(len == 8); + return doc_id; } /*********************************************************************//** @@ -7340,7 +7289,7 @@ fts_init_recover_doc( doc.text.f_str = btr_copy_externally_stored_field( &doc.text.f_len, static_cast<byte*>(dfield_get_data(dfield)), - dict_table_page_size(table), len, + table->space->zip_size(), len, static_cast<mem_heap_t*>(doc.self_heap->arg)); } else { doc.text.f_str = static_cast<byte*>( @@ -7394,7 +7343,7 @@ fts_init_index( fts_cache_t* cache = table->fts->cache; bool need_init = false; - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); /* First check cache->get_docs is initialized */ if (!has_cache_lock) { @@ 
-7459,10 +7408,10 @@ func_exit: } if (need_init) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); /* Register the table with the optimize thread. */ fts_optimize_add_table(table); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } return(TRUE); diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc index 48fd22e3fb0..487e3d5b419 100644 --- a/storage/innobase/fts/fts0opt.cc +++ b/storage/innobase/fts/fts0opt.cc @@ -999,9 +999,9 @@ fts_table_fetch_doc_ids( error = fts_eval_sql(trx, graph); fts_sql_commit(trx); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); que_graph_free(graph); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (error == DB_SUCCESS) { ib_vector_sort(doc_ids->doc_ids, fts_doc_id_cmp); @@ -2927,8 +2927,8 @@ fts_optimize_init(void) /* Add fts tables to fts_slots which could be skipped during dict_load_table_one() because fts_optimize_thread wasn't even started. */ - mutex_enter(&dict_sys->mutex); - for (dict_table_t* table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + mutex_enter(&dict_sys.mutex); + for (dict_table_t* table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table != NULL; table = UT_LIST_GET_NEXT(table_LRU, table)) { if (!table->fts || !dict_table_has_fts_index(table)) { @@ -2942,7 +2942,7 @@ fts_optimize_init(void) fts_optimize_new_table(table); table->fts->in_queue = true; } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); fts_opt_shutdown_event = os_event_create(0); last_check_sync_time = time(NULL); diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc index dfa23d3a425..de592e7fb7f 100644 --- a/storage/innobase/fts/fts0que.cc +++ b/storage/innobase/fts/fts0que.cc @@ -206,7 +206,7 @@ struct fts_phrase_t { distance(0), charset(NULL), heap(NULL), - page_size(dict_table_page_size(table)), + zip_size(table->space->zip_size()), proximity_pos(NULL), parser(NULL) { @@ -230,8 +230,8 @@ struct fts_phrase_t { /** Heap for word 
processing */ mem_heap_t* heap; - /** Row page size */ - const page_size_t page_size; + /** ROW_FORMAT=COMPRESSED page size, or 0 */ + const ulint zip_size; /** Position info for proximity search verification. Records the min and max position of words matched */ @@ -2013,7 +2013,7 @@ fts_query_fetch_document( if (dfield_is_ext(dfield)) { data = btr_copy_externally_stored_field( - &cur_len, data, phrase->page_size, + &cur_len, data, phrase->zip_size, dfield_get_len(dfield), phrase->heap); } else { cur_len = dfield_get_len(dfield); diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc index e3736f3277d..6873df102bf 100644 --- a/storage/innobase/fts/fts0sql.cc +++ b/storage/innobase/fts/fts0sql.cc @@ -91,14 +91,14 @@ fts_get_table_id( /** Construct the name of an internal FTS table for the given table. @param[in] fts_table metadata on fulltext-indexed table -@param[in] dict_locked whether dict_sys->mutex is being held +@param[in] dict_locked whether dict_sys.mutex is being held @return the prefix, must be freed with ut_free() */ char* fts_get_table_name_prefix(const fts_table_t* fts_table) { char table_id[FTS_AUX_MIN_TABLE_ID_LENGTH]; const size_t table_id_len = size_t(fts_get_table_id(fts_table, table_id)) + 1; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); /* Include the separator as well. 
*/ const size_t dbname_len = fts_table->table->name.dblen() + 1; ut_ad(dbname_len > 1); @@ -106,7 +106,7 @@ char* fts_get_table_name_prefix(const fts_table_t* fts_table) char* prefix_name = static_cast<char*>( ut_malloc_nokey(prefix_name_len)); memcpy(prefix_name, fts_table->table->name.m_name, dbname_len); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); memcpy(prefix_name + dbname_len, "FTS_", 4); memcpy(prefix_name + dbname_len + 4, table_id, table_id_len); return prefix_name; @@ -115,20 +115,20 @@ char* fts_get_table_name_prefix(const fts_table_t* fts_table) /** Construct the name of an internal FTS table for the given table. @param[in] fts_table metadata on fulltext-indexed table @param[out] table_name a name up to MAX_FULL_NAME_LEN -@param[in] dict_locked whether dict_sys->mutex is being held */ +@param[in] dict_locked whether dict_sys.mutex is being held */ void fts_get_table_name(const fts_table_t* fts_table, char* table_name, bool dict_locked) { if (!dict_locked) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* Include the separator as well. */ const size_t dbname_len = fts_table->table->name.dblen() + 1; ut_ad(dbname_len > 1); memcpy(table_name, fts_table->table->name.m_name, dbname_len); if (!dict_locked) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } memcpy(table_name += dbname_len, "FTS_", 4); table_name += 4; @@ -157,17 +157,17 @@ fts_parse_sql( && fts_table->table->fts->dict_locked); if (!dict_locked) { - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); /* The InnoDB SQL parser is not re-entrant. 
*/ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } graph = pars_sql(info, str); ut_a(graph); if (!dict_locked) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } ut_free(str); @@ -187,7 +187,7 @@ fts_parse_sql_no_dict_lock( char* str; que_t* graph; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); str = ut_str3cat(fts_sql_begin, sql, fts_sql_end); diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc index 203820037e1..e9a4c3b8636 100644 --- a/storage/innobase/fut/fut0lst.cc +++ b/storage/innobase/fut/fut0lst.cc @@ -58,8 +58,8 @@ flst_add_to_empty( flst_write_addr(base + FLST_LAST, node_addr, mtr); /* Set prev and next fields of node to add */ - flst_write_addr(node + FLST_PREV, fil_addr_null, mtr); - flst_write_addr(node + FLST_NEXT, fil_addr_null, mtr); + flst_zero_addr(node + FLST_PREV, mtr); + flst_zero_addr(node + FLST_NEXT, mtr); /* Update len of base node */ mlog_write_ulint(base + FLST_LEN, 1, MLOG_4BYTES, mtr); @@ -120,13 +120,11 @@ flst_add_last( if (last_addr.page == node_addr.page) { last_node = page_align(node) + last_addr.boffset; } else { - bool found; - const page_size_t& page_size - = fil_space_get_page_size(space, &found); + fil_space_t* s = fil_space_acquire_silent(space); + ulint zip_size = s ? s->zip_size() : 0; + if (s) s->release(); - ut_ad(found); - - last_node = fut_get_ptr(space, page_size, last_addr, + last_node = fut_get_ptr(space, zip_size, last_addr, RW_SX_LATCH, mtr); } @@ -170,13 +168,11 @@ flst_add_first( if (first_addr.page == node_addr.page) { first_node = page_align(node) + first_addr.boffset; } else { - bool found; - const page_size_t& page_size - = fil_space_get_page_size(space, &found); - - ut_ad(found); + fil_space_t* s = fil_space_acquire_silent(space); + ulint zip_size = s ? 
s->zip_size() : 0; + if (s) s->release(); - first_node = fut_get_ptr(space, page_size, first_addr, + first_node = fut_get_ptr(space, zip_size, first_addr, RW_SX_LATCH, mtr); } @@ -230,13 +226,11 @@ flst_insert_after( if (!fil_addr_is_null(node3_addr)) { /* Update prev field of node3 */ - bool found; - const page_size_t& page_size - = fil_space_get_page_size(space, &found); - - ut_ad(found); + fil_space_t* s = fil_space_acquire_silent(space); + ulint zip_size = s ? s->zip_size() : 0; + if (s) s->release(); - node3 = fut_get_ptr(space, page_size, + node3 = fut_get_ptr(space, zip_size, node3_addr, RW_SX_LATCH, mtr); flst_write_addr(node3 + FLST_PREV, node2_addr, mtr); } else { @@ -294,14 +288,12 @@ flst_insert_before( flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr); if (!fil_addr_is_null(node1_addr)) { - bool found; - const page_size_t& page_size - = fil_space_get_page_size(space, &found); - - ut_ad(found); + fil_space_t* s = fil_space_acquire_silent(space); + ulint zip_size = s ? s->zip_size() : 0; + if (s) s->release(); /* Update next field of node1 */ - node1 = fut_get_ptr(space, page_size, node1_addr, + node1 = fut_get_ptr(space, zip_size, node1_addr, RW_SX_LATCH, mtr); flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr); } else { @@ -344,11 +336,9 @@ flst_remove( buf_ptr_get_fsp_addr(node2, &space, &node2_addr); - bool found; - const page_size_t& page_size = fil_space_get_page_size(space, - &found); - - ut_ad(found); + fil_space_t* s = fil_space_acquire_silent(space); + ulint zip_size = s ? 
s->zip_size() : 0; + if (s) s->release(); node1_addr = flst_get_prev_addr(node2, mtr); node3_addr = flst_get_next_addr(node2, mtr); @@ -361,7 +351,7 @@ flst_remove( node1 = page_align(node2) + node1_addr.boffset; } else { - node1 = fut_get_ptr(space, page_size, + node1 = fut_get_ptr(space, zip_size, node1_addr, RW_SX_LATCH, mtr); } @@ -380,7 +370,7 @@ flst_remove( node3 = page_align(node2) + node3_addr.boffset; } else { - node3 = fut_get_ptr(space, page_size, + node3 = fut_get_ptr(space, zip_size, node3_addr, RW_SX_LATCH, mtr); } @@ -431,11 +421,9 @@ flst_validate( /* Find out the space id */ buf_ptr_get_fsp_addr(base, &space, &base_addr); - bool found; - const page_size_t& page_size = fil_space_get_page_size(space, - &found); - - ut_ad(found); + fil_space_t* s = fil_space_acquire_silent(space); + ulint zip_size = s ? s->zip_size() : 0; + if (s) s->release(); len = flst_get_len(base); node_addr = flst_get_first(base, mtr1); @@ -443,7 +431,7 @@ flst_validate( for (i = 0; i < len; i++) { mtr_start(&mtr2); - node = fut_get_ptr(space, page_size, + node = fut_get_ptr(space, zip_size, node_addr, RW_SX_LATCH, &mtr2); node_addr = flst_get_next_addr(node, &mtr2); @@ -458,7 +446,7 @@ flst_validate( for (i = 0; i < len; i++) { mtr_start(&mtr2); - node = fut_get_ptr(space, page_size, + node = fut_get_ptr(space, zip_size, node_addr, RW_SX_LATCH, &mtr2); node_addr = flst_get_prev_addr(node, &mtr2); diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc index 31344372c80..170fb2e8a57 100644 --- a/storage/innobase/gis/gis0rtree.cc +++ b/storage/innobase/gis/gis0rtree.cc @@ -740,6 +740,8 @@ rtr_adjust_upper_level( mem_heap_free(heap); + ut_ad(block->zip_size() == index->table->space->zip_size()); + const uint32_t next_page_no = btr_page_get_next(page); if (next_page_no != FIL_NULL) { @@ -747,8 +749,8 @@ rtr_adjust_upper_level( next_page_no); buf_block_t* next_block = btr_block_get( - next_page_id, dict_table_page_size(index->table), - RW_X_LATCH, index, 
mtr); + next_page_id, block->zip_size(), RW_X_LATCH, + index, mtr); #ifdef UNIV_BTR_DEBUG ut_a(page_is_comp(next_block->frame) == page_is_comp(page)); ut_a(btr_page_get_prev(next_block->frame) @@ -892,7 +894,7 @@ rtr_split_page_move_rec_list( mtr_set_log_mode(mtr, log_mode); if (!page_zip_compress(new_page_zip, new_page, index, - page_zip_level, NULL, mtr)) { + page_zip_level, mtr)) { ulint ret_pos; /* Before trying to reorganize the page, @@ -1847,7 +1849,7 @@ rtr_estimate_n_rows_in_range( buf_block_t* block = btr_block_get( page_id_t(index->table->space_id, index->page), - page_size_t(index->table->space->flags), + index->table->space->zip_size(), RW_S_LATCH, index, &mtr); const page_t* page = buf_block_get_frame(block); const unsigned n_recs = page_header_get_field(page, PAGE_N_RECS); diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc index 65079260aae..b90e0444f45 100644 --- a/storage/innobase/gis/gis0sea.cc +++ b/storage/innobase/gis/gis0sea.cc @@ -145,7 +145,7 @@ rtr_pcur_getnext_from_path( | MTR_MEMO_X_LOCK)); } - const page_size_t page_size(index->table->space->flags); + const ulint zip_size = index->table->space->zip_size(); /* Pop each node/page to be searched from "path" structure and do a search on it. 
Please note, any pages that are in @@ -269,7 +269,7 @@ rtr_pcur_getnext_from_path( block = buf_page_get_gen( page_id_t(index->table->space_id, - next_rec.page_no), page_size, + next_rec.page_no), zip_size, rw_latch, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err); if (block == NULL) { @@ -424,7 +424,7 @@ rtr_pcur_getnext_from_path( block, page_id_t(index->table->space_id, block->page.id.page_no()), - page_size, BTR_MODIFY_TREE, + zip_size, BTR_MODIFY_TREE, btr_cur, mtr); } @@ -970,7 +970,7 @@ rtr_create_rtr_info( &rtr_info->rtr_path_mutex); mutex_enter(&index->rtr_track->rtr_active_mutex); - index->rtr_track->rtr_active->push_back(rtr_info); + index->rtr_track->rtr_active.push_front(rtr_info); mutex_exit(&index->rtr_track->rtr_active_mutex); return(rtr_info); } @@ -1043,7 +1043,7 @@ rtr_init_rtr_info( rtr_info->index = index; mutex_enter(&index->rtr_track->rtr_active_mutex); - index->rtr_track->rtr_active->push_back(rtr_info); + index->rtr_track->rtr_active.push_front(rtr_info); mutex_exit(&index->rtr_track->rtr_active_mutex); } @@ -1095,7 +1095,7 @@ rtr_clean_rtr_info( } if (index) { - index->rtr_track->rtr_active->remove(rtr_info); + index->rtr_track->rtr_active.remove(rtr_info); mutex_exit(&index->rtr_track->rtr_active_mutex); } @@ -1200,36 +1200,22 @@ rtr_check_discard_page( the root page */ buf_block_t* block) /*!< in: block of page to be discarded */ { - ulint pageno = block->page.id.page_no(); - rtr_info_t* rtr_info; - rtr_info_active::iterator it; + const ulint pageno = block->page.id.page_no(); mutex_enter(&index->rtr_track->rtr_active_mutex); - for (it = index->rtr_track->rtr_active->begin(); - it != index->rtr_track->rtr_active->end(); ++it) { - rtr_info = *it; - rtr_node_path_t::iterator rit; - bool found = false; - + for (const auto& rtr_info : index->rtr_track->rtr_active) { if (cursor && rtr_info == cursor->rtr_info) { continue; } mutex_enter(&rtr_info->rtr_path_mutex); - for (rit = rtr_info->path->begin(); - rit != rtr_info->path->end(); ++rit) { - 
node_visit_t node = *rit; - + for (const node_visit_t& node : *rtr_info->path) { if (node.page_no == pageno) { - found = true; + rtr_rebuild_path(rtr_info, pageno); break; } } - - if (found) { - rtr_rebuild_path(rtr_info, pageno); - } mutex_exit(&rtr_info->rtr_path_mutex); if (rtr_info->matches) { @@ -1342,8 +1328,7 @@ rtr_cur_restore_position( page_cur_t* page_cursor; node_visit_t* node = rtr_get_parent_node(btr_cur, level, false); node_seq_t path_ssn = node->seq_no; - const page_size_t page_size(index->table->space->flags); - + const ulint zip_size = index->table->space->zip_size(); ulint page_no = node->page_no; heap = mem_heap_create(256); @@ -1359,7 +1344,7 @@ search_again: block = buf_page_get_gen( page_id_t(index->table->space_id, page_no), - page_size, RW_X_LATCH, NULL, + zip_size, RW_X_LATCH, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err); ut_ad(block); @@ -1559,14 +1544,13 @@ rtr_copy_buf( matches->block.n_fields = block->n_fields; matches->block.left_side = block->left_side; #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - matches->block.n_pointers = block->n_pointers; + matches->block.n_pointers = 0; #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ matches->block.curr_n_fields = block->curr_n_fields; matches->block.curr_left_side = block->curr_left_side; matches->block.index = block->index; #endif /* BTR_CUR_HASH_ADAPT */ - ut_d(matches->block.debug_latch = block->debug_latch); - + ut_d(matches->block.debug_latch = NULL); } /****************************************************************//** diff --git a/storage/innobase/ha/ha0ha.cc b/storage/innobase/ha/ha0ha.cc index f7e3c53495b..8e8a3369b7c 100644 --- a/storage/innobase/ha/ha0ha.cc +++ b/storage/innobase/ha/ha0ha.cc @@ -245,11 +245,8 @@ ha_insert_for_fold_func( buf_block_t* prev_block = prev_node->block; ut_a(prev_block->frame == page_align(prev_node->data)); - ut_a(my_atomic_addlint(&prev_block->n_pointers, - ulint(-1)) - < MAX_N_POINTERS); - ut_a(my_atomic_addlint(&block->n_pointers, 1) - < 
MAX_N_POINTERS); + ut_a(prev_block->n_pointers-- < MAX_N_POINTERS); + ut_a(block->n_pointers++ < MAX_N_POINTERS); } prev_node->block = block; @@ -280,8 +277,7 @@ ha_insert_for_fold_func( #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG if (table->adaptive) { - ut_a(my_atomic_addlint(&block->n_pointers, 1) - < MAX_N_POINTERS); + ut_a(block->n_pointers++ < MAX_N_POINTERS); } #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ @@ -342,8 +338,7 @@ ha_delete_hash_node( #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG if (table->adaptive) { ut_a(del_node->block->frame = page_align(del_node->data)); - ut_a(my_atomic_addlint(&del_node->block->n_pointers, ulint(-1)) - < MAX_N_POINTERS); + ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS); } #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ @@ -385,11 +380,8 @@ ha_search_and_update_if_found_func( if (node) { #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG if (table->adaptive) { - ut_a(my_atomic_addlint(&node->block->n_pointers, - ulint(-1)) - < MAX_N_POINTERS); - ut_a(my_atomic_addlint(&new_block->n_pointers, 1) - < MAX_N_POINTERS); + ut_a(node->block->n_pointers-- < MAX_N_POINTERS); + ut_a(new_block->n_pointers++ < MAX_N_POINTERS); } node->block = new_block; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index f4d3b49c4a4..c4d5882dfd0 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -54,6 +54,8 @@ this program; if not, write to the Free Software Foundation, Inc., #include <my_bitmap.h> #include <mysql/service_thd_alloc.h> #include <mysql/service_thd_wait.h> +#include "field.h" +#include "scope.h" // MYSQL_PLUGIN_IMPORT extern my_bool lower_case_file_system; // MYSQL_PLUGIN_IMPORT extern char mysql_unpacked_real_data_home[]; @@ -97,7 +99,6 @@ this program; if not, write to the Free Software Foundation, Inc., #include "row0mysql.h" #include "row0quiesce.h" #include "row0sel.h" -#include "row0trunc.h" #include "row0upd.h" #include "fil0crypt.h" 
#include "srv0mon.h" @@ -133,6 +134,9 @@ void close_thread_tables(THD* thd); #define tdc_size 400 #endif +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> + #include "ha_innodb.h" #include "i_s.h" #include "sync0sync.h" @@ -140,28 +144,10 @@ void close_thread_tables(THD* thd); #include <string> #include <sstream> -#include <mysql/plugin.h> -#include <mysql/service_wsrep.h> - #ifdef WITH_WSREP #include "dict0priv.h" #include <mysql/service_md5.h> #include "wsrep_sst.h" - -static inline wsrep_ws_handle_t* -wsrep_ws_handle(THD* thd, const trx_t* trx) { - return wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), - (wsrep_trx_id_t)trx->id); -} - -extern void wsrep_cleanup_transaction(THD *thd); -static int -wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, - my_bool signal); -static void -wsrep_fake_trx_id(handlerton* hton, THD *thd); -static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid); -static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid); #endif /* WITH_WSREP */ /** to force correct commit order in binlog */ @@ -269,7 +255,7 @@ is_partition( /** Signal to shut down InnoDB (NULL if shutdown was signaled, or if running in innodb_read_only mode, srv_read_only_mode) */ -st_my_thread_var *srv_running; +std::atomic <st_my_thread_var *> srv_running; /** Service thread that waits for the server shutdown and stops purge threads. Purge workers have THDs that are needed to calculate virtual columns. This THDs must be destroyed rather early in the server shutdown sequence. 
@@ -297,16 +283,12 @@ thd_destructor_proxy(void *) mysql_mutex_lock(&thd_destructor_mutex); - my_atomic_storeptr_explicit(reinterpret_cast<void**>(&srv_running), - myvar, - MY_MEMORY_ORDER_RELAXED); + srv_running.store(myvar, std::memory_order_relaxed); /* wait until the server wakes the THD to abort and die */ - while (!srv_running->abort) + while (!myvar->abort) mysql_cond_wait(&thd_destructor_cond, &thd_destructor_mutex); mysql_mutex_unlock(&thd_destructor_mutex); - my_atomic_storeptr_explicit(reinterpret_cast<void**>(&srv_running), - NULL, - MY_MEMORY_ORDER_RELAXED); + srv_running.store(NULL, std::memory_order_relaxed); while (srv_fast_shutdown == 0 && (trx_sys.any_active_transactions() || @@ -378,6 +360,8 @@ const char* innodb_checksum_algorithm_names[] = { "strict_innodb", "none", "strict_none", + "full_crc32", + "strict_full_crc32", NullS }; @@ -477,7 +461,8 @@ static TYPELIB innodb_change_buffering_typelib = { /** Allowed values of innodb_instant_alter_column_allowed */ const char* innodb_instant_alter_column_allowed_names[] = { "never", /* compatible with MariaDB 5.5 to 10.2 */ - "add_last",/* allow instant ADD COLUMN */ + "add_last",/* allow instant ADD COLUMN ... 
LAST */ + "add_drop_reorder", /* allow instant ADD anywhere & DROP & reorder */ NullS }; @@ -1058,8 +1043,6 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_pages_created, SHOW_LONG}, {"pages_read", (char*) &export_vars.innodb_pages_read, SHOW_LONG}, - {"pages0_read", - (char*) &export_vars.innodb_page0_read, SHOW_LONG}, {"pages_written", (char*) &export_vars.innodb_pages_written, SHOW_LONG}, {"row_lock_current_waits", @@ -1849,8 +1832,13 @@ thd_to_trx_id( { return(thd_to_trx(thd)->id); } -#endif /* WITH_WSREP */ +static int +wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, + my_bool signal); +static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid); +static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid); +#endif /* WITH_WSREP */ /********************************************************************//** Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth time calls srv_active_wake_master_thread. This function should be used @@ -2599,8 +2587,7 @@ ha_innobase::innobase_reset_autoinc( if (error == DB_SUCCESS) { dict_table_autoinc_initialize(m_prebuilt->table, autoinc); - - dict_table_autoinc_unlock(m_prebuilt->table); + mutex_exit(&m_prebuilt->table->autoinc_mutex); } return(error); @@ -2888,8 +2875,9 @@ ha_innobase::ha_innobase( | HA_CAN_EXPORT | HA_CAN_RTREEKEYS | HA_CAN_TABLES_WITHOUT_ROLLBACK + | HA_CAN_ONLINE_BACKUPS | HA_CONCURRENT_OPTIMIZE - | (srv_force_primary_key ? HA_WANTS_PRIMARY_KEY : 0) + | (srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0) ), m_start_of_scan(), m_mysql_has_locked() @@ -3424,6 +3412,10 @@ ha_innobase::reset_template(void) in ha_innobase::write_row(). 
*/ m_prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE; } + if (m_prebuilt->pk_filter) { + m_prebuilt->pk_filter = NULL; + m_prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE; + } } /*****************************************************************//** @@ -3812,13 +3804,18 @@ static int innodb_init_params() DBUG_RETURN(HA_ERR_INITIALIZATION); } - /* This is the first time univ_page_size is used. - It was initialized to 16k pages before srv_page_size was set */ - univ_page_size.copy_from( - page_size_t(srv_page_size, srv_page_size, false)); - srv_sys_space.set_space_id(TRX_SYS_SPACE); - srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE()); + + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + srv_sys_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE()); + break; + default: + srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE()); + } + srv_sys_space.set_name("innodb_system"); srv_sys_space.set_path(srv_data_home); @@ -3831,7 +3828,10 @@ static int innodb_init_params() srv_tmp_space.set_name("innodb_temporary"); srv_tmp_space.set_path(srv_data_home); - srv_tmp_space.set_flags(FSP_FLAGS_PAGE_SSIZE()); + + /* Temporary tablespace is in full crc32 format. 
*/ + srv_tmp_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE()); if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) { ib::error() << "Unable to parse innodb_temp_data_file_path=" @@ -4120,13 +4120,12 @@ static int innodb_init(void* p) innobase_hton->show_status = innobase_show_status; innobase_hton->flags = HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS - | HTON_NATIVE_SYS_VERSIONING; + | HTON_NATIVE_SYS_VERSIONING | HTON_WSREP_REPLICATION; #ifdef WITH_WSREP innobase_hton->abort_transaction=wsrep_abort_transaction; innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint; innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint; - innobase_hton->fake_trx_id=wsrep_fake_trx_id; #endif /* WITH_WSREP */ innobase_hton->tablefile_extensions = ha_innobase_exts; @@ -4218,9 +4217,7 @@ static int innodb_init(void* p) mysql_thread_create(thd_destructor_thread_key, &thd_destructor_thread, NULL, thd_destructor_proxy, NULL); - while (!my_atomic_loadptr_explicit(reinterpret_cast<void**> - (&srv_running), - MY_MEMORY_ORDER_RELAXED)) + while (!srv_running.load(std::memory_order_relaxed)) os_thread_sleep(20); } @@ -4300,11 +4297,7 @@ innobase_end(handlerton*, ha_panic_function) } } - st_my_thread_var* r = reinterpret_cast<st_my_thread_var*>( - my_atomic_loadptr_explicit( - reinterpret_cast<void**>(&srv_running), - MY_MEMORY_ORDER_RELAXED)); - if (r) { + if (auto r = srv_running.load(std::memory_order_relaxed)) { ut_ad(!srv_read_only_mode); if (!abort_loop) { // may be UNINSTALL PLUGIN statement @@ -4473,6 +4466,14 @@ innobase_commit_ordered_2( trx->flush_log_later = true; } +#ifdef WITH_WSREP + /* If the transaction is not run in 2pc, we must assign wsrep + XID here in order to get it written in rollback segment. 
*/ + if (trx->is_wsrep()) { + thd_get_xid(thd, (MYSQL_XID*)trx->xid); + } +#endif /* WITH_WSREP */ + innobase_commit_low(trx); if (!read_only) { @@ -4675,6 +4676,15 @@ innobase_rollback( dberr_t error; +#ifdef WITH_WSREP + /* If trx was assigned wsrep XID in prepare phase and the + trx is being rolled back due to BF abort, clear XID in order + to avoid writing it to rollback segment out of order. The XID + will be reassigned when the transaction is replayed. */ + if (trx->state != TRX_STATE_NOT_STARTED && wsrep_is_wsrep_xid(trx->xid)) { + trx->xid->null(); + } +#endif /* WITH_WSREP */ if (rollback_trx || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { @@ -4711,7 +4721,8 @@ innobase_rollback_trx( if (!trx->has_logged()) { trx->will_lock = 0; #ifdef WITH_WSREP - trx->wsrep = false; + trx->wsrep= false; + trx->lock.was_chosen_as_wsrep_victim= false; #endif DBUG_RETURN(0); } @@ -5079,23 +5090,19 @@ UNIV_INTERN void lock_cancel_waiting_and_release(lock_t* lock); /** Cancel any pending lock request associated with the current THD. @sa THD::awake() @sa ha_kill_query() */ -static void innobase_kill_query(handlerton*, THD* thd, enum thd_kill_levels) +static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels) { - DBUG_ENTER("innobase_kill_query"); -#ifdef WITH_WSREP - if (wsrep_thd_get_conflict_state(thd) != NO_CONFLICT) { - /* if victim has been signaled by BF thread and/or aborting - is already progressing, following query aborting is not necessary - any more. - Also, BF thread should own trx mutex for the victim, which would - conflict with trx_mutex_enter() below - */ - DBUG_VOID_RETURN; - } -#endif /* WITH_WSREP */ + DBUG_ENTER("innobase_kill_query"); if (trx_t* trx= thd_to_trx(thd)) { +#ifdef WITH_WSREP + if (trx->is_wsrep() && wsrep_thd_is_aborting(thd)) + /* if victim has been signaled by BF thread and/or aborting is already + progressing, following query aborting is not necessary any more. 
+ Also, BF thread should own trx mutex for the victim. */ + DBUG_VOID_RETURN; +#endif /* WITH_WSREP */ lock_mutex_enter(); mutex_enter(&trx_sys.mutex); trx_mutex_enter(trx); @@ -5207,17 +5214,6 @@ ha_innobase::index_type( } /****************************************************************//** -Returns the table file name extension. -@return file extension string */ - -const char** -ha_innobase::bas_ext() const -/*========================*/ -{ - return(ha_innobase_exts); -} - -/****************************************************************//** Returns the operations supported for indexes. @return flags of supported operations */ @@ -5232,24 +5228,21 @@ ha_innobase::index_flags( return(0); } - ulong extra_flag= 0; - - if (table && key == table->s->primary_key) { - extra_flag= HA_CLUSTERED_INDEX; - } - - ulong flags = HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER - | HA_READ_RANGE | HA_KEYREAD_ONLY - | extra_flag - | HA_DO_INDEX_COND_PUSHDOWN; - /* For spatial index, we don't support descending scan and ICP so far. */ if (table_share->key_info[key].flags & HA_SPATIAL) { - flags = HA_READ_NEXT | HA_READ_ORDER| HA_READ_RANGE + return HA_READ_NEXT | HA_READ_ORDER| HA_READ_RANGE | HA_KEYREAD_ONLY | HA_KEY_SCAN_NOT_ROR; } + ulong flags= key == table_share->primary_key + ? HA_CLUSTERED_INDEX : 0; + + flags |= HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER + | HA_READ_RANGE | HA_KEYREAD_ONLY + | HA_DO_INDEX_COND_PUSHDOWN + | HA_DO_RANGE_FILTER_PUSHDOWN; + return(flags); } @@ -5316,7 +5309,7 @@ ha_innobase::keys_to_use_for_scanning() /****************************************************************//** Ensures that if there's a concurrent inplace ADD INDEX, being-indexed virtual columns are computed. 
They are not marked as indexed in the old table, so the -server won't add them to the vcol_set automatically */ +server won't add them to the read_set automatically */ void ha_innobase::column_bitmaps_signal() /*================================*/ @@ -5336,7 +5329,7 @@ ha_innobase::column_bitmaps_signal() if (col->ord_part || (dict_index_is_online_ddl(clust_index) && row_log_col_is_indexed(clust_index, num_v))) { - table->mark_virtual_col(table->vfield[j]); + table->mark_virtual_column_with_deps(table->vfield[j]); } num_v++; } @@ -5775,12 +5768,12 @@ innobase_build_v_templ( ut_ad(n_v_col > 0); if (!locked) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } if (s_templ->vtempl) { if (!locked) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } DBUG_VOID_RETURN; } @@ -5801,9 +5794,8 @@ innobase_build_v_templ( const dict_v_col_t* vcol = dict_table_get_nth_v_col( ib_table, i); - for (ulint j = 0; j < vcol->num_base; j++) { - ulint col_no = vcol->base_col[j]->ind; - marker[col_no] = true; + for (ulint j = vcol->num_base; j--; ) { + marker[vcol->base_col[j]->ind] = true; } } @@ -5811,9 +5803,8 @@ innobase_build_v_templ( for (ulint i = 0; i < add_v->n_v_col; i++) { const dict_v_col_t* vcol = &add_v->v_col[i]; - for (ulint j = 0; j < vcol->num_base; j++) { - ulint col_no = vcol->base_col[j]->ind; - marker[col_no] = true; + for (ulint j = vcol->num_base; j--; ) { + marker[vcol->base_col[j]->ind] = true; } } } @@ -5888,7 +5879,7 @@ innobase_build_v_templ( } if (!locked) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } s_templ->db_name = table->s->db.str; @@ -6025,7 +6016,7 @@ initialize_auto_increment(dict_table_t* table, const Field* field) const unsigned col_no = innodb_col_no(field); - dict_table_autoinc_lock(table); + mutex_enter(&table->autoinc_mutex); table->persistent_autoinc = 1 + dict_table_get_nth_col_pos(table, col_no, NULL); @@ -6033,7 +6024,7 @@ initialize_auto_increment(dict_table_t* table, const Field* field) 
if (table->autoinc) { /* Already initialized. Our caller checked table->persistent_autoinc without - dict_table_autoinc_lock(), and there might be multiple + autoinc_mutex protection, and there might be multiple ha_innobase::open() executing concurrently. */ } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { /* If the recovery level is set so high that writes @@ -6055,7 +6046,7 @@ initialize_auto_increment(dict_table_t* table, const Field* field) innobase_get_int_col_max_value(field)); } - dict_table_autoinc_unlock(table); + mutex_exit(&table->autoinc_mutex); } /** Open an InnoDB table @@ -6065,6 +6056,14 @@ initialize_auto_increment(dict_table_t* table, const Field* field) int ha_innobase::open(const char* name, int, uint) { + /* TODO: If trx_rollback_recovered(bool all=false) is ever + removed, the first-time open() must hold (or acquire and release) + a table lock that conflicts with trx_resurrect_table_locks(), + to ensure that any recovered incomplete ALTER TABLE will have been + rolled back. Otherwise, dict_table_t::instant could be cleared by + the rollback invoking dict_index_t::clear_instant_alter() while + open table handles exist in client connections. 
*/ + char norm_name[FN_REFLEN]; DBUG_ENTER("ha_innobase::open"); @@ -6090,7 +6089,6 @@ ha_innobase::open(const char* name, int, uint) sql_print_error("Failed to open table %s.\n", norm_name); } -no_such_table: set_my_errno(ENOENT); DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); @@ -6116,7 +6114,8 @@ no_such_table: ib_table->file_unreadable = true; ib_table->corrupted = true; dict_table_close(ib_table, FALSE, FALSE); - goto no_such_table; + set_my_errno(ENOENT); + DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); } innobase_copy_frm_flags_from_table_share(ib_table, table->s); @@ -6186,7 +6185,7 @@ no_such_table: key_used_on_scan = m_primary_key; if (ib_table->n_v_cols) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); if (ib_table->vc_templ == NULL) { ib_table->vc_templ = UT_NEW_NOKEY(dict_vcol_templ_t()); innobase_build_v_templ( @@ -6194,7 +6193,7 @@ no_such_table: true); } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } if (!check_index_consistency(table, ib_table)) { @@ -7232,19 +7231,19 @@ static const Field* build_template_needs_field( /*=======================*/ - ibool index_contains, /*!< in: - dict_index_contains_col_or_prefix( - index, i) */ - ibool read_just_key, /*!< in: TRUE when MySQL calls + bool index_contains, /*!< in: + dict_index_t::contains_col_or_prefix( + i) */ + bool read_just_key, /*!< in: TRUE when MySQL calls ha_innobase::extra with the argument HA_EXTRA_KEYREAD; it is enough to read just columns defined in the index (i.e., no read of the clustered index record necessary) */ - ibool fetch_all_in_key, + bool fetch_all_in_key, /*!< in: true=fetch all fields in the index */ - ibool fetch_primary_key_cols, + bool fetch_primary_key_cols, /*!< in: true=fetch the primary key columns */ dict_index_t* index, /*!< in: InnoDB index to use */ @@ -7306,11 +7305,11 @@ build_template_needs_field_in_icp( bool is_virtual) /*!< in: a virtual column or not */ { - ut_ad(contains == dict_index_contains_col_or_prefix(index, i, is_virtual)); + 
ut_ad(contains == index->contains_col_or_prefix(i, is_virtual)); return(index == prebuilt->index ? contains - : dict_index_contains_col_or_prefix(prebuilt->index, i, is_virtual)); + : prebuilt->index->contains_col_or_prefix(i, is_virtual)); } /**************************************************************//** @@ -7555,6 +7554,13 @@ ha_innobase::build_template( /* Below we check column by column if we need to access the clustered index. */ + if (pushed_rowid_filter && rowid_filter_is_active) { + fetch_primary_key_cols = TRUE; + m_prebuilt->pk_filter = this; + } else { + m_prebuilt->pk_filter = NULL; + } + const bool skip_virtual = omits_virtual_cols(*table_share); const ulint n_fields = table_share->fields; @@ -7578,8 +7584,9 @@ ha_innobase::build_template( ulint num_v = 0; - if (active_index != MAX_KEY - && active_index == pushed_idx_cond_keyno) { + if ((active_index != MAX_KEY + && active_index == pushed_idx_cond_keyno) + || (pushed_rowid_filter && rowid_filter_is_active)) { /* Push down an index condition or an end_range check. */ for (ulint i = 0; i < n_fields; i++) { const Field* field = table->field[i]; @@ -7588,9 +7595,8 @@ ha_innobase::build_template( num_v++; continue; } - ibool index_contains - = dict_index_contains_col_or_prefix( - index, is_v ? num_v : i - num_v, is_v); + bool index_contains = index->contains_col_or_prefix( + is_v ? num_v : i - num_v, is_v); if (is_v && index_contains) { m_prebuilt->n_template = 0; num_v = 0; @@ -7728,9 +7734,8 @@ ha_innobase::build_template( continue; } - ibool index_contains - = dict_index_contains_col_or_prefix( - index, is_v ? num_v : i - num_v, is_v); + bool index_contains = index->contains_col_or_prefix( + is_v ? 
num_v : i - num_v, is_v); if (!build_template_needs_field_in_icp( index, m_prebuilt, index_contains, @@ -7762,8 +7767,9 @@ ha_innobase::build_template( } } } - - m_prebuilt->idx_cond = this; + if (active_index == pushed_idx_cond_keyno) { + m_prebuilt->idx_cond = this; + } } else { no_icp: /* No index condition pushdown */ @@ -7787,8 +7793,8 @@ no_icp: cluster index. */ if (is_v && m_prebuilt->read_just_key - && !dict_index_contains_col_or_prefix( - m_prebuilt->index, num_v, true)) + && !m_prebuilt->index->contains_col_or_prefix( + num_v, true)) { /* Turn off ROW_MYSQL_WHOLE_ROW */ m_prebuilt->template_type = @@ -7797,21 +7803,15 @@ no_icp: continue; } } else { - ibool contain; - - if (!is_v) { - contain = dict_index_contains_col_or_prefix( - index, i - num_v, - false); - } else if (skip_virtual - || dict_index_is_clust(index)) { + if (is_v + && (skip_virtual || index->is_primary())) { num_v++; continue; - } else { - contain = dict_index_contains_col_or_prefix( - index, num_v, true); } + bool contain = index->contains_col_or_prefix( + is_v ? num_v: i - num_v, is_v); + field = build_template_needs_field( contain, m_prebuilt->read_just_key, @@ -7869,7 +7869,7 @@ ha_innobase::innobase_lock_autoinc(void) switch (innobase_autoinc_lock_mode) { case AUTOINC_NO_LOCKING: /* Acquire only the AUTOINC mutex. */ - dict_table_autoinc_lock(m_prebuilt->table); + mutex_enter(&m_prebuilt->table->autoinc_mutex); break; case AUTOINC_NEW_STYLE_LOCKING: @@ -7884,14 +7884,14 @@ ha_innobase::innobase_lock_autoinc(void) ) { /* Acquire the AUTOINC mutex. */ - dict_table_autoinc_lock(m_prebuilt->table); + mutex_enter(&m_prebuilt->table->autoinc_mutex); /* We need to check that another transaction isn't already holding the AUTOINC lock on the table. */ if (m_prebuilt->table->n_waiting_or_granted_auto_inc_locks) { /* Release the mutex to avoid deadlocks and fall back to old style locking. 
*/ - dict_table_autoinc_unlock(m_prebuilt->table); + mutex_exit(&m_prebuilt->table->autoinc_mutex); } else { /* Do not fall back to old style locking. */ break; @@ -7907,7 +7907,7 @@ ha_innobase::innobase_lock_autoinc(void) if (error == DB_SUCCESS) { /* Acquire the AUTOINC mutex. */ - dict_table_autoinc_lock(m_prebuilt->table); + mutex_enter(&m_prebuilt->table->autoinc_mutex); } break; @@ -7935,8 +7935,7 @@ ha_innobase::innobase_set_max_autoinc( if (error == DB_SUCCESS) { dict_table_autoinc_update_if_greater(m_prebuilt->table, auto_inc); - - dict_table_autoinc_unlock(m_prebuilt->table); + mutex_exit(&m_prebuilt->table->autoinc_mutex); } return(error); @@ -7950,7 +7949,7 @@ handle. int ha_innobase::write_row( /*===================*/ - uchar* record) /*!< in: a row in MySQL format */ + const uchar* record) /*!< in: a row in MySQL format */ { dberr_t error; #ifdef WITH_WSREP @@ -7975,16 +7974,6 @@ ha_innobase::write_row( ++trx->will_lock; } -#ifdef WITH_WSREP - if (trx->is_wsrep() && wsrep_is_load_multi_commit(m_user_thd)) - { - /* Note that this transaction is still active. */ - trx_register_for_2pc(m_prebuilt->trx); - /* We will need an IX lock on the destination table. */ - m_prebuilt->sql_stat_start = TRUE; - } -#endif /* WITH_WSREP */ - ins_mode_t vers_set_fields; /* Handling of Auto-Increment Columns. */ if (table->next_number_field && record == table->record[0]) { @@ -8102,9 +8091,9 @@ ha_innobase::write_row( "retrying insert: %s", wsrep_thd_query(m_user_thd)); error= DB_SUCCESS; - wsrep_thd_set_conflict_state( - m_user_thd, MUST_ABORT); - innobase_srv_conc_exit_innodb(m_prebuilt); + wsrep_thd_self_abort(m_user_thd); + innobase_srv_conc_exit_innodb( + m_prebuilt); /* jump straight to func exit over * later wsrep hooks */ goto func_exit; @@ -8141,20 +8130,18 @@ set_max_autoinc: properly assigned. Fetch values from server side. 
*/ if (trx->is_wsrep() && - wsrep_thd_exec_mode(m_user_thd) == REPL_RECV) + wsrep_thd_is_applying(m_user_thd)) { wsrep_thd_auto_increment_variables( m_user_thd, &offset, &increment); } else - { #endif /* WITH_WSREP */ + { ut_a(m_prebuilt->autoinc_increment > 0); offset = m_prebuilt->autoinc_offset; increment = m_prebuilt->autoinc_increment; -#ifdef WITH_WSREP } -#endif /* WITH_WSREP */ auto_inc = innobase_next_autoinc( auto_inc, 1, increment, offset, @@ -8190,10 +8177,14 @@ report_error: #ifdef WITH_WSREP if (!error_result && trx->is_wsrep() - && wsrep_thd_exec_mode(m_user_thd) == LOCAL_STATE + && wsrep_thd_is_local(m_user_thd) + && !wsrep_thd_ignore_table(m_user_thd) && !wsrep_consistency_check(m_user_thd) - && !wsrep_thd_ignore_table(m_user_thd)) { - if (wsrep_append_keys(m_user_thd, WSREP_KEY_EXCLUSIVE, record, + && (thd_sql_command(m_user_thd) != SQLCOM_CREATE_TABLE) + && (thd_sql_command(m_user_thd) != SQLCOM_LOAD || + thd_binlog_format(m_user_thd) == BINLOG_FORMAT_ROW)) { + if (wsrep_append_keys(m_user_thd, WSREP_SERVICE_KEY_EXCLUSIVE, + record, NULL)) { DBUG_PRINT("wsrep", ("row key failed")); error_result = HA_ERR_INTERNAL_ERROR; @@ -8852,20 +8843,13 @@ ha_innobase::update_row( m_prebuilt autoinc values don't get properly assigned. Fetch values from server side. 
*/ - if (trx->is_wsrep() && - wsrep_thd_exec_mode(m_user_thd) == REPL_RECV) - { - wsrep_thd_auto_increment_variables( - m_user_thd, &offset, &increment); - } + if (trx->is_wsrep() && wsrep_thd_is_applying(m_user_thd)) + wsrep_thd_auto_increment_variables( + m_user_thd, &offset, &increment); else - { -#endif /* WITH_WSREP */ - offset = m_prebuilt->autoinc_offset; - increment = m_prebuilt->autoinc_increment; -#ifdef WITH_WSREP - } #endif /* WITH_WSREP */ + offset = m_prebuilt->autoinc_offset, + increment = m_prebuilt->autoinc_increment; autoinc = innobase_next_autoinc( autoinc, 1, increment, offset, @@ -8903,13 +8887,16 @@ func_exit: innobase_active_small(); #ifdef WITH_WSREP - if (error == DB_SUCCESS && trx->is_wsrep() && - wsrep_thd_exec_mode(m_user_thd) == LOCAL_STATE && - !wsrep_thd_ignore_table(m_user_thd)) { + if (error == DB_SUCCESS && trx->is_wsrep() + && wsrep_thd_is_local(m_user_thd) + && !wsrep_thd_ignore_table(m_user_thd)) { DBUG_PRINT("wsrep", ("update row key")); - if (wsrep_append_keys(m_user_thd, WSREP_KEY_EXCLUSIVE, old_row, - new_row)) { + if (wsrep_append_keys(m_user_thd, + wsrep_protocol_version >= 4 + ? 
WSREP_SERVICE_KEY_UPDATE + : WSREP_SERVICE_KEY_EXCLUSIVE, + old_row, new_row)){ WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED"); DBUG_PRINT("wsrep", ("row key failed")); DBUG_RETURN(HA_ERR_INTERNAL_ERROR); @@ -8967,10 +8954,11 @@ ha_innobase::delete_row( #ifdef WITH_WSREP if (error == DB_SUCCESS && trx->is_wsrep() - && wsrep_thd_exec_mode(m_user_thd) == LOCAL_STATE + && wsrep_thd_is_local(m_user_thd) && !wsrep_thd_ignore_table(m_user_thd)) { - if (wsrep_append_keys(m_user_thd, WSREP_KEY_EXCLUSIVE, record, - NULL)) { + if (wsrep_append_keys(m_user_thd, WSREP_SERVICE_KEY_EXCLUSIVE, + record, + NULL)) { DBUG_PRINT("wsrep", ("delete fail")); DBUG_RETURN(HA_ERR_INTERNAL_ERROR); } @@ -9491,21 +9479,21 @@ ha_innobase::change_active_index( } } } else { - dtuple_set_n_fields(m_prebuilt->search_tuple, - m_prebuilt->index->n_fields); + ulint n_fields = dict_index_get_n_unique_in_tree( + m_prebuilt->index); + + dtuple_set_n_fields(m_prebuilt->search_tuple, n_fields); dict_index_copy_types( m_prebuilt->search_tuple, m_prebuilt->index, - m_prebuilt->index->n_fields); + n_fields); /* If it's FTS query and FTS_DOC_ID exists FTS_DOC_ID field is always added to read_set. 
*/ m_prebuilt->fts_doc_id_in_read_set = m_prebuilt->in_fts_query && m_prebuilt->read_just_key - && dict_index_contains_col_or_prefix( - m_prebuilt->index, - m_prebuilt->table->fts->doc_col, - false); + && m_prebuilt->index->contains_col_or_prefix( + m_prebuilt->table->fts->doc_col, false); } /* MySQL changes the active index for a handle also during some @@ -10139,20 +10127,22 @@ next_record: #ifdef WITH_WSREP inline const char* -wsrep_key_type_to_str(wsrep_key_type type) +wsrep_key_type_to_str(Wsrep_service_key_type type) { switch (type) { - case WSREP_KEY_SHARED: + case WSREP_SERVICE_KEY_SHARED: return "shared"; - case WSREP_KEY_SEMI: - return "semi"; - case WSREP_KEY_EXCLUSIVE: + case WSREP_SERVICE_KEY_REFERENCE: + return "reference"; + case WSREP_SERVICE_KEY_UPDATE: + return "update"; + case WSREP_SERVICE_KEY_EXCLUSIVE: return "exclusive"; }; return "unknown"; } -ulint +extern dberr_t wsrep_append_foreign_key( /*===========================*/ trx_t* trx, /*!< in: trx */ @@ -10160,15 +10150,15 @@ wsrep_append_foreign_key( const rec_t* rec, /*!<in: clustered index record */ dict_index_t* index, /*!<in: clustered index */ ibool referenced, /*!<in: is check for referenced table */ - wsrep_key_type key_type) /*!< in: access type of this key - (shared, exclusive, semi...) */ + Wsrep_service_key_type key_type) /*!< in: access type of this key + (shared, exclusive, reference...) */ { - THD* thd = trx->mysql_thd; - - if (!trx->is_wsrep() || wsrep_thd_exec_mode(thd) != LOCAL_STATE) { + if (!trx->is_wsrep() || !wsrep_thd_is_local(trx->mysql_thd)) { return DB_SUCCESS; } + THD* thd = trx->mysql_thd; + if (!foreign || (!foreign->referenced_table && !foreign->foreign_table)) { WSREP_INFO("FK: %s missing in: %s", @@ -10182,13 +10172,12 @@ wsrep_append_foreign_key( ulint rcode = DB_SUCCESS; char cache_key[513] = {'\0'}; int cache_key_len=0; - bool const copy = true; if ( !((referenced) ? 
foreign->referenced_table : foreign->foreign_table)) { WSREP_DEBUG("pulling %s table into cache", (referenced) ? "referenced" : "foreign"); - mutex_enter(&(dict_sys->mutex)); + mutex_enter(&dict_sys.mutex); if (referenced) { foreign->referenced_table = @@ -10218,7 +10207,7 @@ wsrep_append_foreign_key( TRUE, FALSE); } } - mutex_exit(&(dict_sys->mutex)); + mutex_exit(&dict_sys.mutex); } if ( !((referenced) ? @@ -10258,11 +10247,11 @@ wsrep_append_foreign_key( if (rcode != DB_SUCCESS) { WSREP_ERROR( "FK key set failed: " ULINTPF - " (" ULINTPF " %s), index: %s %s, %s", + " (" ULINTPF "%s), index: %s %s, %s", rcode, referenced, wsrep_key_type_to_str(key_type), - index ? index->name() : "void index", + (index) ? index->name() : "void index", (index && index->table) ? index->table->name.m_name : - "void table", + "void table", wsrep_thd_query(thd)); return DB_ERROR; } @@ -10278,7 +10267,7 @@ wsrep_append_foreign_key( #ifdef WSREP_DEBUG_PRINT ulint j; fprintf(stderr, "FK parent key, table: %s %s len: %lu ", - cache_key, (shared) ? 
"shared" : "exclusive", len+1); + cache_key, wsrep_key_type_to_str(key_type), len+1); for (j=0; j<len+1; j++) { fprintf(stderr, " %hhX, ", key[j]); } @@ -10297,7 +10286,8 @@ wsrep_append_foreign_key( wsrep_buf_t wkey_part[3]; wsrep_key_t wkey = {wkey_part, 3}; - if (!wsrep_prepare_key( + if (!wsrep_prepare_key_for_innodb( + thd, (const uchar*)cache_key, cache_key_len + 1, (const uchar*)key, len+1, @@ -10308,17 +10298,7 @@ wsrep_append_foreign_key( wsrep_thd_query(thd) : "void"); return DB_ERROR; } - - wsrep_t *wsrep= get_wsrep(); - - rcode = (int)wsrep->append_key( - wsrep, - wsrep_ws_handle(thd, trx), - &wkey, - 1, - key_type, - copy); - + rcode = wsrep_thd_append_key(thd, &wkey, 1, key_type); if (rcode) { DBUG_PRINT("wsrep", ("row key failed: " ULINTPF, rcode)); WSREP_ERROR("Appending cascaded fk row key failed: %s, " @@ -10339,17 +10319,19 @@ wsrep_append_key( TABLE_SHARE *table_share, const char* key, uint16_t key_len, - wsrep_key_type key_type /*!< in: access type of this key + Wsrep_service_key_type key_type /*!< in: access type of this key (shared, exclusive, semi...) 
*/ ) { DBUG_ENTER("wsrep_append_key"); - bool const copy = true; + DBUG_PRINT("enter", + ("thd: %lu trx: %lld", thd_get_thread_id(thd), + (long long)trx->id)); #ifdef WSREP_DEBUG_PRINT - fprintf(stderr, "%s conn %ld, trx %llu, keylen %d, table %s\n Query: %s ", + fprintf(stderr, "%s conn %lu, trx " TRX_ID_FMT ", keylen %d, key %s.%s\n", wsrep_key_type_to_str(key_type), - wsrep_thd_thread_id(thd), trx->id, key_len, - table_share->table_name.str, wsrep_thd_query(thd)); + thd_get_thread_id(thd), trx->id, key_len, + table_share->table_name.str, key); for (int i=0; i<key_len; i++) { fprintf(stderr, "%hhX, ", key[i]); } @@ -10358,7 +10340,8 @@ wsrep_append_key( wsrep_buf_t wkey_part[3]; wsrep_key_t wkey = {wkey_part, 3}; - if (!wsrep_prepare_key( + if (!wsrep_prepare_key_for_innodb( + thd, (const uchar*)table_share->table_cache_key.str, table_share->table_cache_key.length, (const uchar*)key, key_len, @@ -10370,15 +10353,7 @@ wsrep_append_key( DBUG_RETURN(HA_ERR_INTERNAL_ERROR); } - wsrep_t *wsrep= get_wsrep(); - - int rcode = (int)wsrep->append_key( - wsrep, - wsrep_ws_handle(thd, trx), - &wkey, - 1, - key_type, - copy); + int rcode = wsrep_thd_append_key(thd, &wkey, 1, key_type); if (rcode) { DBUG_PRINT("wsrep", ("row key failed: %d", rcode)); WSREP_WARN("Appending row key failed: %s, %d", @@ -10419,17 +10394,30 @@ int ha_innobase::wsrep_append_keys( /*===========================*/ THD *thd, - wsrep_key_type key_type, /*!< in: access type of this key - (shared, exclusive, semi...) */ + Wsrep_service_key_type key_type, /*!< in: access type of this row + operation: + (shared, exclusive, reference...) 
*/ const uchar* record0, /* in: row in MySQL format */ const uchar* record1) /* in: row in MySQL format */ { + /* Sanity check: newly inserted records should always be passed with + EXCLUSIVE key type, all the rest are expected to carry a pre-image + */ + ut_a(record1 != NULL || key_type == WSREP_SERVICE_KEY_EXCLUSIVE); + int rcode; DBUG_ENTER("wsrep_append_keys"); bool key_appended = false; trx_t *trx = thd_to_trx(thd); +#ifdef WSREP_DEBUG_PRINT + fprintf(stderr, "%s conn %lu, trx " TRX_ID_FMT ", table %s\nSQL: %s\n", + wsrep_key_type_to_str(key_type), + thd_get_thread_id(thd), trx->id, + table_share->table_name.str, wsrep_thd_query(thd)); +#endif + if (table_share && table_share->tmp_table != NO_TMP_TABLE) { WSREP_DEBUG("skipping tmp table DML: THD: %lu tmp: %d SQL: %s", thd_get_thread_id(thd), @@ -10454,7 +10442,9 @@ ha_innobase::wsrep_append_keys( thd, trx, table_share, keyval, len, key_type); - if (rcode) DBUG_RETURN(rcode); + if (rcode) { + DBUG_RETURN(rcode); + } } else { WSREP_DEBUG("NULL key skipped (proto 0): %s", wsrep_thd_query(thd)); @@ -10468,68 +10458,91 @@ ha_innobase::wsrep_append_keys( KEY* key_info = table->key_info + i; if (key_info->flags & HA_NOSAME) { hasPK = true; + break; } } for (i=0; i<table->s->keys; ++i) { - uint len; - char keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; - char keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; - char* key0 = &keyval0[1]; - char* key1 = &keyval1[1]; KEY* key_info = table->key_info + i; - ibool is_null; dict_index_t* idx = innobase_get_index(i); dict_table_t* tab = (idx) ? idx->table : NULL; + /* keyval[] shall contain an ordinal number at byte 0 + and the actual key data shall be written at byte 1. 
+ Hence the total data length is the key length + 1 */ + char keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; keyval0[0] = (char)i; keyval1[0] = (char)i; + char* key0 = &keyval0[1]; + char* key1 = &keyval1[1]; if (!tab) { WSREP_WARN("MariaDB-InnoDB key mismatch %s %s", table->s->table_name.str, key_info->name.str); } - /* !hasPK == table with no PK, must append all non-unique keys */ + /* !hasPK == table with no PK, + must append all non-unique keys */ if (!hasPK || key_info->flags & HA_NOSAME || ((tab && referenced_by_foreign_key2(tab, idx)) || (!tab && referenced_by_foreign_key()))) { - len = wsrep_store_key_val_for_row( + ibool is_null0; + uint len0 = wsrep_store_key_val_for_row( thd, table, i, key0, WSREP_MAX_SUPPORTED_KEY_LENGTH, - record0, &is_null); - if (!is_null) { - rcode = wsrep_append_key( - thd, trx, table_share, - keyval0, len+1, key_type); - if (rcode) DBUG_RETURN(rcode); - - if (key_info->flags & HA_NOSAME || - key_type == WSREP_KEY_SHARED) - key_appended = true; - } else { - WSREP_DEBUG("NULL key skipped: %s", - wsrep_thd_query(thd)); - } + record0, &is_null0); if (record1) { - len = wsrep_store_key_val_for_row( + ibool is_null1; + uint len1 = wsrep_store_key_val_for_row( thd, table, i, key1, WSREP_MAX_SUPPORTED_KEY_LENGTH, - record1, &is_null); + record1, &is_null1); + + if (is_null0 != is_null1 || + len0 != len1 || + memcmp(key0, key1, len0)) { + /* This key has chaged. 
If it + is unique, this is an exclusive + operation -> upgrade key type */ + if (key_info->flags & HA_NOSAME) { + key_type = WSREP_SERVICE_KEY_EXCLUSIVE; + } - if (!is_null - && memcmp(key0, key1, len)) { - rcode = wsrep_append_key( + if (!is_null1) { + rcode = wsrep_append_key( thd, trx, table_share, - keyval1, len+1, - key_type); - if (rcode) DBUG_RETURN(rcode); + keyval1, + /* for len1+1 see keyval1 + initialization comment */ + len1+1, key_type); + if (rcode) + DBUG_RETURN(rcode); + } } } + + if (!is_null0) { + rcode = wsrep_append_key( + thd, trx, table_share, + /* for len0+1 see keyval0 + initialization comment */ + keyval0, len0+1, key_type); + if (rcode) + DBUG_RETURN(rcode); + + if (key_info->flags & HA_NOSAME || + key_type == WSREP_SERVICE_KEY_SHARED|| + key_type == WSREP_SERVICE_KEY_REFERENCE) + key_appended = true; + } else { + WSREP_DEBUG("NULL key skipped: %s", + wsrep_thd_query(thd)); + } } } } @@ -10683,9 +10696,8 @@ prepare_vcol_for_base_setup( ut_ad(col->base_col == NULL); MY_BITMAP *old_read_set = field->table->read_set; - MY_BITMAP *old_vcol_set = field->table->vcol_set; - field->table->read_set = field->table->vcol_set = &field->table->tmp_set; + field->table->read_set = &field->table->tmp_set; bitmap_clear_all(&field->table->tmp_set); field->vcol_info->expr->walk( @@ -10697,7 +10709,6 @@ prepare_vcol_for_base_setup( * col->base_col))); } field->table->read_set= old_read_set; - field->table->vcol_set= old_vcol_set; } @@ -10875,6 +10886,7 @@ create_table_info_t::create_table_def() } heap = mem_heap_create(1000); + auto _ = make_scope_exit([heap]() { mem_heap_free(heap); }); ut_d(bool have_vers_start = false); ut_d(bool have_vers_end = false); @@ -10884,10 +10896,10 @@ create_table_info_t::create_table_def() ulint vers_row = 0; if (m_form->versioned()) { - if (i == m_form->s->row_start_field) { + if (i == m_form->s->vers.start_fieldno) { vers_row = DATA_VERS_START; ut_d(have_vers_start = true); - } else if (i == m_form->s->row_end_field) { + } 
else if (i == m_form->s->vers.end_fieldno) { vers_row = DATA_VERS_END; ut_d(have_vers_end = true); } else if (!(field->flags @@ -10935,7 +10947,6 @@ create_table_info_t::create_table_def() " must be below 256." " Unsupported code " ULINTPF ".", charset_no); - mem_heap_free(heap); dict_mem_table_free(table); DBUG_RETURN(ER_CANT_CREATE_TABLE); @@ -10966,7 +10977,6 @@ create_table_info_t::create_table_def() field->field_name.str); err_col: dict_mem_table_free(table); - mem_heap_free(heap); ut_ad(trx_state_eq(m_trx, TRX_STATE_NOT_STARTED)); DBUG_RETURN(HA_ERR_GENERIC); } @@ -11033,7 +11043,7 @@ err_col: } /** Fill base columns for the stored column present in the list. */ - if (table->s_cols && table->s_cols->size()) { + if (table->s_cols && !table->s_cols->empty()) { for (ulint i = 0; i < n_cols; i++) { Field* field = m_form->field[i]; @@ -11074,9 +11084,8 @@ err_col: "temporary table creation."); } - /* Get a new table ID. FIXME: Make this a private - sequence, not shared with persistent tables! */ - dict_table_assign_new_id(table, m_trx); + m_trx->table_id = table->id + = dict_sys.get_temporary_table_id(); ut_ad(dict_tf_get_rec_format(table->flags) != REC_FORMAT_COMPRESSED); table->space_id = SRV_TMP_SPACE_ID; @@ -11095,8 +11104,6 @@ err_col: DBUG_SUICIDE();); } - mem_heap_free(heap); - DBUG_EXECUTE_IF("ib_create_err_tablespace_exist", err = DB_TABLESPACE_EXISTS;); @@ -11328,7 +11335,7 @@ create_table_info_t::create_option_data_directory_is_valid() } /* Do not use DATA DIRECTORY with TEMPORARY TABLE. 
*/ - if (m_create_info->options & HA_LEX_CREATE_TMP_TABLE) { + if (m_create_info->tmp_table()) { push_warning( m_thd, Sql_condition::WARN_LEVEL_WARN, ER_ILLEGAL_HA_CREATE_OPTION, @@ -11357,8 +11364,7 @@ create_table_info_t::create_options_are_invalid() const char* ret = NULL; enum row_type row_format = m_create_info->row_type; - const bool is_temp - = m_create_info->options & HA_LEX_CREATE_TMP_TABLE; + const bool is_temp = m_create_info->tmp_table(); ut_ad(m_thd != NULL); @@ -11544,9 +11550,12 @@ create_table_info_t::check_table_options() return "ENCRYPTION_KEY_ID"; } - /* Currently we do not support encryption for spatial indexes. + /* We do not support encryption for spatial indexes, + except if innodb_checksum_algorithm=full_crc32. Do not allow ENCRYPTED=YES if any SPATIAL INDEX exists. */ - if (options->encryption != FIL_ENCRYPTION_ON) { + if (options->encryption != FIL_ENCRYPTION_ON + || srv_checksum_algorithm + >= SRV_CHECKSUM_ALGORITHM_FULL_CRC32) { break; } for (ulint i = 0; i < m_form->s->keys; i++) { @@ -11720,7 +11729,7 @@ create_table_info_t::parse_table_name( if (m_innodb_file_per_table && !mysqld_embedded - && !(m_create_info->options & HA_LEX_CREATE_TMP_TABLE)) { + && !m_create_info->tmp_table()) { if ((name[1] == ':') || (name[0] == '\\' && name[1] == '\\')) { @@ -11755,13 +11764,19 @@ create_table_info_t::parse_table_name( } if (m_create_info->index_file_name) { - my_error(WARN_OPTION_IGNORED, ME_JUST_WARNING, + my_error(WARN_OPTION_IGNORED, ME_WARNING, "INDEX DIRECTORY"); } DBUG_RETURN(0); } +/** @return whether innodb_strict_mode is active */ +bool ha_innobase::is_innodb_strict_mode(THD *thd) +{ + return THDVAR(thd, strict_mode); +} + /** Determine InnoDB table flags. If strict_mode=OFF, this will adjust the flags to what should be assumed. 
@retval true on success @@ -11775,10 +11790,8 @@ bool create_table_info_t::innobase_table_flags() enum row_type row_type; rec_format_t innodb_row_format = get_row_format(m_default_row_format); - const bool is_temp - = m_create_info->options & HA_LEX_CREATE_TMP_TABLE; - bool zip_allowed - = !is_temp; + const bool is_temp = m_create_info->tmp_table(); + bool zip_allowed = !is_temp; const ulint zip_ssize_max = ut_min(static_cast<ulint>(UNIV_PAGE_SSIZE_MAX), @@ -12178,9 +12191,8 @@ create_table_info_t::set_tablespace_type( /* Ignore the current innodb-file-per-table setting if we are creating a temporary table. */ - m_use_file_per_table = - m_allow_file_per_table - && !(m_create_info->options & HA_LEX_CREATE_TMP_TABLE); + m_use_file_per_table = m_allow_file_per_table + && !m_create_info->tmp_table(); /* DATA DIRECTORY must have m_use_file_per_table but cannot be used with TEMPORARY tables. */ @@ -12507,13 +12519,12 @@ dict_index_t::record_size_info_t dict_index_t::record_size_info() const /* maximum allowed size of a node pointer record */ ulint page_ptr_max; - const bool comp= dict_table_is_comp(table); + const bool comp= table->not_redundant(); /* table->space == NULL after DISCARD TABLESPACE */ - const page_size_t page_size(dict_tf_get_page_size(table->flags)); + const ulint zip_size= dict_tf_get_zip_size(table->flags); record_size_info_t result; - if (page_size.is_compressed() && - page_size.physical() < univ_page_size.physical()) + if (zip_size && zip_size < srv_page_size) { /* On a ROW_FORMAT=COMPRESSED page, two records must fit in the uncompressed page modification log. On compressed pages @@ -12524,7 +12535,7 @@ dict_index_t::record_size_info_t dict_index_t::record_size_info() const an empty page, minus a byte for recoding the heap number in the page modification log. The maximum allowed node pointer size is half that. 
*/ - result.max_leaf_size= page_zip_empty_size(n_fields, page_size.physical()); + result.max_leaf_size= page_zip_empty_size(n_fields, zip_size); if (result.max_leaf_size) { result.max_leaf_size--; @@ -12695,11 +12706,15 @@ bool create_table_info_t::row_size_is_acceptable( const size_t idx= info.get_first_overrun_field_index(); const dict_field_t *field= dict_index_get_nth_field(&index, idx); + ut_ad((!field->name) == field->col->is_dropped()); if (innodb_strict_mode || global_system_variables.log_warnings > 2) { - ib::error_or_warn(strict && innodb_strict_mode) - << "Cannot add field " << field->name << " in table " - << index.table->name << " because after adding it, the row size is " + ib::error_or_warn eow(strict && innodb_strict_mode); + if (field->name) + eow << "Cannot add field " << field->name << " in table "; + else + eow << "Cannot add an instantly dropped column in table "; + eow << index.table->name << " because after adding it, the row size is " << info.get_overrun_size() << " which is greater than maximum allowed size (" << info.max_leaf_size << " bytes) for a record on index leaf page."; @@ -12757,9 +12772,9 @@ create_table_info_t::create_table_update_dict() DBUG_RETURN(-1); } - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); fts_optimize_add_table(innobase_table); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } if (const Field* ai = m_form->found_next_number_field) { @@ -12771,7 +12786,7 @@ create_table_info_t::create_table_update_dict() autoinc = 1; } - dict_table_autoinc_lock(innobase_table); + mutex_enter(&innobase_table->autoinc_mutex); dict_table_autoinc_initialize(innobase_table, autoinc); if (innobase_table->is_temporary()) { @@ -12796,7 +12811,7 @@ create_table_info_t::create_table_update_dict() } } - dict_table_autoinc_unlock(innobase_table); + mutex_exit(&innobase_table->autoinc_mutex); } innobase_parse_hint_from_comment(m_thd, innobase_table, m_form->s); @@ -13028,12 +13043,12 @@ 
ha_innobase::discard_or_import_tablespace( btr_cur_instant_init(). */ table_id_t id = m_prebuilt->table->id; ut_ad(id); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); dict_table_close(m_prebuilt->table, TRUE, FALSE); - dict_table_remove_from_cache(m_prebuilt->table); + dict_sys.remove(m_prebuilt->table); m_prebuilt->table = dict_table_open_on_id(id, TRUE, DICT_TABLE_OP_NORMAL); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!m_prebuilt->table) { err = DB_TABLE_NOT_FOUND; } else { @@ -14086,7 +14101,7 @@ fsp_get_available_space_in_free_extents(const fil_space_t& space) ulint n_free_up = (size_in_header - space.free_limit) / FSP_EXTENT_SIZE; - const ulint size = page_size_t(space.flags).physical(); + const ulint size = space.physical_size(); if (n_free_up > 0) { n_free_up--; n_free_up -= n_free_up / (size / FSP_EXTENT_SIZE); @@ -14163,7 +14178,7 @@ ha_innobase::info_low( opt = DICT_STATS_RECALC_TRANSIENT; } - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); ret = dict_stats_update(ib_table, opt); if (ret != DB_SUCCESS) { @@ -14185,7 +14200,7 @@ ha_innobase::info_low( ulint stat_sum_of_other_index_sizes; if (!(flag & HA_STATUS_NO_LOCK)) { - dict_table_stats_lock(ib_table, RW_S_LATCH); + rw_lock_s_lock(&ib_table->stats_latch); } ut_a(ib_table->stat_initialized); @@ -14199,7 +14214,7 @@ ha_innobase::info_low( = ib_table->stat_sum_of_other_index_sizes; if (!(flag & HA_STATUS_NO_LOCK)) { - dict_table_stats_unlock(ib_table, RW_S_LATCH); + rw_lock_s_unlock(&ib_table->stats_latch); } /* @@ -14236,8 +14251,7 @@ ha_innobase::info_low( stats.records = (ha_rows) n_rows; stats.deleted = 0; if (fil_space_t* space = ib_table->space) { - const ulint size = page_size_t(space->flags) - .physical(); + const ulint size = space->physical_size(); stats.data_file_length = ulonglong(stat_clustered_index_size) * size; @@ -14304,7 +14318,7 @@ ha_innobase::info_low( } if (!(flag & HA_STATUS_NO_LOCK)) { - 
dict_table_stats_lock(ib_table, RW_S_LATCH); + rw_lock_s_lock(&ib_table->stats_latch); } ut_a(ib_table->stat_initialized); @@ -14386,7 +14400,7 @@ ha_innobase::info_low( } if (!(flag & HA_STATUS_NO_LOCK)) { - dict_table_stats_unlock(ib_table, RW_S_LATCH); + rw_lock_s_unlock(&ib_table->stats_latch); } snprintf(path, sizeof(path), "%s/%s%s", @@ -14740,20 +14754,9 @@ ha_innobase::check( if (!(check_opt->flags & T_QUICK) && !index->is_corrupted()) { - /* Enlarge the fatal lock wait timeout during - CHECK TABLE. */ - my_atomic_addlong( - &srv_fatal_semaphore_wait_threshold, - SRV_SEMAPHORE_WAIT_EXTENSION); dberr_t err = btr_validate_index( - index, m_prebuilt->trx, false); - - /* Restore the fatal lock wait timeout after - CHECK TABLE. */ - my_atomic_addlong( - &srv_fatal_semaphore_wait_threshold, - -SRV_SEMAPHORE_WAIT_EXTENSION); + index, m_prebuilt->trx); if (err != DB_SUCCESS) { is_ok = false; @@ -15108,7 +15111,7 @@ get_foreign_key_info( dict_table_t* ref_table; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ref_table = dict_table_open_on_name( foreign->referenced_table_name_lookup, TRUE, FALSE, DICT_ERR_IGNORE_NONE); @@ -15163,7 +15166,7 @@ ha_innobase::get_foreign_key_list( m_prebuilt->trx->op_info = "getting list of foreign keys"; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); for (dict_foreign_set::iterator it = m_prebuilt->table->foreign_set.begin(); @@ -15180,7 +15183,7 @@ ha_innobase::get_foreign_key_list( } } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); m_prebuilt->trx->op_info = ""; @@ -15201,7 +15204,7 @@ ha_innobase::get_parent_foreign_key_list( m_prebuilt->trx->op_info = "getting list of referencing foreign keys"; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); for (dict_foreign_set::iterator it = m_prebuilt->table->referenced_set.begin(); @@ -15218,7 +15221,7 @@ ha_innobase::get_parent_foreign_key_list( } } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); 
m_prebuilt->trx->op_info = ""; @@ -15253,143 +15256,6 @@ struct tablename_compare { } }; -/** Get the table name and database name for the given table. -@param[in,out] thd user thread handle -@param[out] f_key_info pointer to table_name_info object -@param[in] foreign foreign key constraint. */ -static -void -get_table_name_info( - THD* thd, - st_handler_tablename* f_key_info, - const dict_foreign_t* foreign) -{ -#define FILENAME_CHARSET_MBMAXLEN 5 - char tmp_buff[NAME_CHAR_LEN * FILENAME_CHARSET_MBMAXLEN + 1]; - char name_buff[NAME_CHAR_LEN * FILENAME_CHARSET_MBMAXLEN + 1]; - const char* ptr; - - size_t len = dict_get_db_name_len( - foreign->referenced_table_name_lookup); - ut_memcpy(tmp_buff, foreign->referenced_table_name_lookup, len); - tmp_buff[len] = 0; - - ut_ad(len < sizeof(tmp_buff)); - - len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff)); - f_key_info->db = thd_strmake(thd, name_buff, len); - - ptr = dict_remove_db_name(foreign->referenced_table_name_lookup); - len = filename_to_tablename(ptr, name_buff, sizeof(name_buff)); - f_key_info->tablename = thd_strmake(thd, name_buff, len); -} - -/** Get the list of tables ordered by the dependency on the other tables using -the 'CASCADE' foreign key constraint. -@param[in,out] thd user thread handle -@param[out] fk_table_list set of tables name info for the - dependent table -@retval 0 for success. */ -int -ha_innobase::get_cascade_foreign_key_table_list( - THD* thd, - List<st_handler_tablename>* fk_table_list) -{ - m_prebuilt->trx->op_info = "getting cascading foreign keys"; - - std::list<table_list_item, ut_allocator<table_list_item> > table_list; - - typedef std::set<st_handler_tablename, tablename_compare, - ut_allocator<st_handler_tablename> > cascade_fk_set; - - cascade_fk_set fk_set; - - mutex_enter(&dict_sys->mutex); - - /* Initialize the table_list with prebuilt->table name. 
*/ - struct table_list_item item = {m_prebuilt->table, - m_prebuilt->table->name.m_name}; - - table_list.push_back(item); - - /* Get the parent table, grand parent table info from the - table list by depth-first traversal. */ - do { - const dict_table_t* parent_table; - dict_table_t* parent = NULL; - std::pair<cascade_fk_set::iterator,bool> ret; - - item = table_list.back(); - table_list.pop_back(); - parent_table = item.table; - - if (parent_table == NULL) { - - ut_ad(item.name != NULL); - - parent_table = parent = dict_table_open_on_name( - item.name, TRUE, FALSE, - DICT_ERR_IGNORE_NONE); - - if (parent_table == NULL) { - /* foreign_key_checks is or was probably - disabled; ignore the constraint */ - continue; - } - } - - for (dict_foreign_set::const_iterator it = - parent_table->foreign_set.begin(); - it != parent_table->foreign_set.end(); ++it) { - - const dict_foreign_t* foreign = *it; - st_handler_tablename f1; - - /* Skip the table if there is no - cascading operation. */ - if (0 == (foreign->type - & ~(DICT_FOREIGN_ON_DELETE_NO_ACTION - | DICT_FOREIGN_ON_UPDATE_NO_ACTION))) { - continue; - } - - if (foreign->referenced_table_name_lookup != NULL) { - get_table_name_info(thd, &f1, foreign); - ret = fk_set.insert(f1); - - /* Ignore the table if it is already - in the set. */ - if (!ret.second) { - continue; - } - - struct table_list_item item1 = { - foreign->referenced_table, - foreign->referenced_table_name_lookup}; - - table_list.push_back(item1); - - st_handler_tablename* fk_table = - (st_handler_tablename*) thd_memdup( - thd, &f1, sizeof(*fk_table)); - - fk_table_list->push_back(fk_table); - } - } - - if (parent != NULL) { - dict_table_close(parent, true, false); - } - - } while(!table_list.empty()); - - mutex_exit(&dict_sys->mutex); - - m_prebuilt->trx->op_info = ""; - - return(0); -} - /*****************************************************************//** Checks if ALTER TABLE may change the storage engine of the table. 
Changing storage engines is not allowed for tables for which there @@ -15533,12 +15399,9 @@ ha_innobase::extra( } /** -MySQL calls this method at the end of each statement. This method -exists for readability only. ha_innobase::reset() doesn't give any -clue about the method. */ - +MySQL calls this method at the end of each statement */ int -ha_innobase::end_stmt() +ha_innobase::reset() { if (m_prebuilt->blob_heap) { row_mysql_prebuilt_free_blob_heap(m_prebuilt); @@ -15557,15 +15420,6 @@ ha_innobase::end_stmt() return(0); } -/** -MySQL calls this method at the end of each statement */ - -int -ha_innobase::reset() -{ - return(end_stmt()); -} - /******************************************************************//** MySQL calls this function at the start of each SQL statement inside LOCK TABLES. Inside LOCK TABLES the ::external_lock method does not work to @@ -15731,8 +15585,7 @@ ha_innobase::external_lock( && thd_sqlcom_can_generate_row_events(thd)) { bool skip = false; #ifdef WITH_WSREP - skip = trx->is_wsrep() - && wsrep_thd_exec_mode(thd) != LOCAL_STATE; + skip = trx->is_wsrep() && !wsrep_thd_is_local(thd); #endif /* WITH_WSREP */ /* used by test case */ DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;); @@ -16648,7 +16501,7 @@ ha_innobase::innobase_get_autoinc( /* It should have been initialized during open. 
*/ if (*value == 0) { m_prebuilt->autoinc_error = DB_UNSUPPORTED; - dict_table_autoinc_unlock(m_prebuilt->table); + mutex_exit(&m_prebuilt->table->autoinc_mutex); } } @@ -16672,7 +16525,7 @@ ha_innobase::innobase_peek_autoinc(void) innodb_table = m_prebuilt->table; - dict_table_autoinc_lock(innodb_table); + mutex_enter(&innodb_table->autoinc_mutex); auto_inc = dict_table_autoinc_read(innodb_table); @@ -16681,7 +16534,7 @@ ha_innobase::innobase_peek_autoinc(void) " '" << innodb_table->name << "'"; } - dict_table_autoinc_unlock(innodb_table); + mutex_exit(&innodb_table->autoinc_mutex); return(auto_inc); } @@ -16788,7 +16641,7 @@ ha_innobase::get_auto_increment( /* Out of range number. Let handler::update_auto_increment() take care of this */ m_prebuilt->autoinc_last_value = 0; - dict_table_autoinc_unlock(m_prebuilt->table); + mutex_exit(&m_prebuilt->table->autoinc_mutex); *nb_reserved_values= 0; return; } @@ -16831,7 +16684,7 @@ ha_innobase::get_auto_increment( m_prebuilt->autoinc_offset = offset; m_prebuilt->autoinc_increment = increment; - dict_table_autoinc_unlock(m_prebuilt->table); + mutex_exit(&m_prebuilt->table->autoinc_mutex); } /*******************************************************************//** @@ -17437,9 +17290,7 @@ fast_shutdown_validate( uint new_val = *reinterpret_cast<uint*>(save); if (srv_fast_shutdown && !new_val - && !my_atomic_loadptr_explicit(reinterpret_cast<void**> - (&srv_running), - MY_MEMORY_ORDER_RELAXED)) { + && !srv_running.load(std::memory_order_relaxed)) { return(1); } @@ -17666,7 +17517,7 @@ func_exit: buf_block_t* block = buf_page_get( page_id_t(space_id, srv_saved_page_number_debug), - page_size_t(space->flags), RW_X_LATCH, &mtr); + space->zip_size(), RW_X_LATCH, &mtr); if (block != NULL) { byte* page = block->frame; @@ -18780,278 +18631,145 @@ static struct st_mysql_storage_engine innobase_storage_engine= { MYSQL_HANDLERTON_INTERFACE_VERSION }; #ifdef WITH_WSREP -void -wsrep_abort_slave_trx( -/*==================*/ - 
wsrep_seqno_t bf_seqno, - wsrep_seqno_t victim_seqno) -{ - WSREP_ERROR("Trx %lld tries to abort slave trx %lld. This could be " - "caused by:\n\t" - "1) unsupported configuration options combination, please check documentation.\n\t" - "2) a bug in the code.\n\t" - "3) a database corruption.\n Node consistency compromized, " - "need to abort. Restart the node to resync with cluster.", - (long long)bf_seqno, (long long)victim_seqno); - abort(); -} -/*******************************************************************//** -This function is used to kill one transaction in BF. */ + +/** This function is used to kill one transaction. + +This transaction was open on this node (not-yet-committed), and a +conflicting writeset from some other node that was being applied +caused a locking conflict. First committed (from other node) +wins, thus open transaction is rolled back. BF stands for +brute-force: any transaction can get aborted by galera any time +it is necessary. + +This conflict can happen only when the replicated writeset (from +other node) is being applied, not when it’s waiting in the queue. +If our local transaction reached its COMMIT and this conflicting +writeset was in the queue, then it should fail the local +certification test instead. + +A brute force abort is only triggered by a locking conflict +between a writeset being applied by an applier thread (slave thread) +and an open transaction on the node, not by a Galera writeset +comparison as in the local certification failure. 
+ +@param[in] bf_thd Brute force (BF) thread +@param[in,out] victim_trx Vimtim trx to be killed +@param[in] signal Should victim be signaled */ UNIV_INTERN int wsrep_innobase_kill_one_trx( -/*========================*/ - void * const bf_thd_ptr, - const trx_t * const bf_trx, + THD* bf_thd, trx_t *victim_trx, - ibool signal) + bool signal) { - ut_ad(lock_mutex_own()); - ut_ad(trx_mutex_own(victim_trx)); - ut_ad(bf_thd_ptr); - ut_ad(victim_trx); + ut_ad(bf_thd); + ut_ad(victim_trx); + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(victim_trx)); DBUG_ENTER("wsrep_innobase_kill_one_trx"); - THD *bf_thd = bf_thd_ptr ? (THD*) bf_thd_ptr : NULL; - THD *thd = (THD *) victim_trx->mysql_thd; - int64_t bf_seqno = (bf_thd) ? wsrep_thd_trx_seqno(bf_thd) : 0; - - if (!thd) { - DBUG_PRINT("wsrep", ("no thd for conflicting lock")); - WSREP_WARN("no THD for trx: " TRX_ID_FMT, victim_trx->id); - DBUG_RETURN(1); - } - if (!bf_thd) { - DBUG_PRINT("wsrep", ("no BF thd for conflicting lock")); - WSREP_WARN("no BF THD for trx: " TRX_ID_FMT, - bf_trx ? bf_trx->id : 0); - DBUG_RETURN(1); - } - - WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); - - WSREP_DEBUG("BF kill (" ULINTPF ", seqno: " INT64PF - "), victim: (%lu) trx: " TRX_ID_FMT, - signal, bf_seqno, - thd_get_thread_id(thd), - victim_trx->id); - - WSREP_DEBUG("Aborting query: %s conf %d trx: %" PRId64, - (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void", - wsrep_thd_conflict_state(thd, FALSE), - wsrep_thd_ws_handle(thd)->trx_id); + THD *thd= (THD *) victim_trx->mysql_thd; + ut_ad(thd); + /* Note that bf_trx might not exist here e.g. on MDL conflict + case (test: galera_concurrent_ctas). Similarly, BF thread + could be also acquiring MDL-lock causing victim to be + aborted. 
However, we have not yet called innobase_trx_init() + for BF transaction (test: galera_many_columns)*/ + trx_t* bf_trx= thd_to_trx(bf_thd); + DBUG_ASSERT(wsrep_on(bf_thd)); wsrep_thd_LOCK(thd); - DBUG_EXECUTE_IF("sync.wsrep_after_BF_victim_lock", - { - const char act[]= - "now " - "wait_for signal.wsrep_after_BF_victim_lock"; - DBUG_ASSERT(!debug_sync_set_action(bf_thd, - STRING_WITH_LEN(act))); - };); - - - if (wsrep_thd_query_state(thd) == QUERY_EXITING) { - WSREP_DEBUG("kill trx EXITING for " TRX_ID_FMT, - victim_trx->id); - wsrep_thd_UNLOCK(thd); - DBUG_RETURN(0); - } - if (wsrep_thd_exec_mode(thd) != LOCAL_STATE) { - WSREP_DEBUG("withdraw for BF trx: " TRX_ID_FMT ", state: %d", - victim_trx->id, - wsrep_thd_get_conflict_state(thd)); - } - - switch (wsrep_thd_get_conflict_state(thd)) { - case NO_CONFLICT: - wsrep_thd_set_conflict_state(thd, MUST_ABORT); - break; - case MUST_ABORT: - WSREP_DEBUG("victim " TRX_ID_FMT " in MUST ABORT state", - victim_trx->id); - wsrep_thd_UNLOCK(thd); - wsrep_thd_awake(thd, signal); - DBUG_RETURN(0); - break; - case ABORTED: - case ABORTING: // fall through - default: - WSREP_DEBUG("victim " TRX_ID_FMT " in state %d", - victim_trx->id, wsrep_thd_get_conflict_state(thd)); - wsrep_thd_UNLOCK(thd); - DBUG_RETURN(0); - break; - } - - switch (wsrep_thd_query_state(thd)) { - case QUERY_COMMITTING: - enum wsrep_status rcode; - - WSREP_DEBUG("kill query for: %ld", - thd_get_thread_id(thd)); - WSREP_DEBUG("kill trx QUERY_COMMITTING for " TRX_ID_FMT, - victim_trx->id); - - if (wsrep_thd_exec_mode(thd) == REPL_RECV) { - wsrep_abort_slave_trx(bf_seqno, - wsrep_thd_trx_seqno(thd)); - } else { - wsrep_t *wsrep= get_wsrep(); - rcode = wsrep->abort_pre_commit( - wsrep, bf_seqno, - (wsrep_trx_id_t)wsrep_thd_ws_handle(thd)->trx_id - ); - - switch (rcode) { - case WSREP_WARNING: - WSREP_DEBUG("cancel commit warning: " - TRX_ID_FMT, - victim_trx->id); - wsrep_thd_UNLOCK(thd); - wsrep_thd_awake(thd, signal); - DBUG_RETURN(1); - break; - case WSREP_OK: 
- break; - default: - WSREP_ERROR( - "cancel commit bad exit: %d " - TRX_ID_FMT, - rcode, victim_trx->id); - /* unable to interrupt, must abort */ - /* note: kill_mysql() will block, if we cannot. - * kill the lock holder first. - */ - abort(); - break; - } - } - wsrep_thd_UNLOCK(thd); - wsrep_thd_awake(thd, signal); - break; - case QUERY_EXEC: - /* it is possible that victim trx is itself waiting for some - * other lock. We need to cancel this waiting - */ - WSREP_DEBUG("kill trx QUERY_EXEC for " TRX_ID_FMT, - victim_trx->id); - - victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; - - if (victim_trx->lock.wait_lock) { - WSREP_DEBUG("victim has wait flag: %ld", - thd_get_thread_id(thd)); - lock_t* wait_lock = victim_trx->lock.wait_lock; - - if (wait_lock) { - WSREP_DEBUG("canceling wait lock"); - victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; - lock_cancel_waiting_and_release(wait_lock); - } + WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); - wsrep_thd_UNLOCK(thd); - wsrep_thd_awake(thd, signal); - } else { - /* abort currently executing query */ - DBUG_PRINT("wsrep",("sending KILL_QUERY to: %lu", - thd_get_thread_id(thd))); - WSREP_DEBUG("kill query for: %ld", - thd_get_thread_id(thd)); - /* Note that innobase_kill_query will take lock_mutex - and trx_mutex */ - wsrep_thd_UNLOCK(thd); - wsrep_thd_awake(thd, signal); - - /* for BF thd, we need to prevent him from committing */ - if (wsrep_thd_exec_mode(thd) == REPL_RECV) { - wsrep_abort_slave_trx(bf_seqno, - wsrep_thd_trx_seqno(thd)); - } - } - break; - case QUERY_IDLE: + WSREP_DEBUG("Aborter %s trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld client_state: %s client_mode: %s transaction_mode: %s " + "query: %s", + wsrep_thd_is_BF(bf_thd, false) ? "BF" : "normal", + bf_trx ? 
bf_trx->id : TRX_ID_MAX, + thd_get_thread_id(bf_thd), + wsrep_thd_trx_seqno(bf_thd), + wsrep_thd_client_state_str(bf_thd), + wsrep_thd_client_mode_str(bf_thd), + wsrep_thd_transaction_state_str(bf_thd), + wsrep_thd_query(bf_thd)); + + WSREP_DEBUG("Victim %s trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld client_state: %s client_mode: %s transaction_mode: %s " + "query: %s", + wsrep_thd_is_BF(thd, false) ? "BF" : "normal", + victim_trx->id, + thd_get_thread_id(thd), + wsrep_thd_trx_seqno(thd), + wsrep_thd_client_state_str(thd), + wsrep_thd_client_mode_str(thd), + wsrep_thd_transaction_state_str(thd), + wsrep_thd_query(thd)); + + /* Mark transaction as a victim for Galera abort */ + victim_trx->lock.was_chosen_as_wsrep_victim= true; + + /* Note that we need to release this as it will be acquired + below in wsrep-lib */ + wsrep_thd_UNLOCK(thd); + + if (wsrep_thd_bf_abort(bf_thd, thd, signal)) { - WSREP_DEBUG("kill IDLE for " TRX_ID_FMT, victim_trx->id); + lock_t* wait_lock = victim_trx->lock.wait_lock; + if (wait_lock) { + DBUG_ASSERT(victim_trx->is_wsrep()); + WSREP_DEBUG("victim has wait flag: %lu", + thd_get_thread_id(thd)); - if (wsrep_thd_exec_mode(thd) == REPL_RECV) { - WSREP_DEBUG("kill BF IDLE, seqno: %lld", - (long long)wsrep_thd_trx_seqno(thd)); - wsrep_thd_UNLOCK(thd); - wsrep_abort_slave_trx(bf_seqno, - wsrep_thd_trx_seqno(thd)); - DBUG_RETURN(0); + WSREP_DEBUG("canceling wait lock"); + victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; + lock_cancel_waiting_and_release(wait_lock); } - /* This will lock thd from proceeding after net_read() */ - wsrep_thd_set_conflict_state(thd, ABORTING); - - wsrep_lock_rollback(); - - if (wsrep_aborting_thd_contains(thd)) { - WSREP_WARN("duplicate thd aborter %lu", - (ulong) thd_get_thread_id(thd)); - } else { - wsrep_aborting_thd_enqueue(thd); - DBUG_PRINT("wsrep",("enqueuing trx abort for %lu", - thd_get_thread_id(thd))); - WSREP_DEBUG("enqueuing trx abort for (%lu)", - thd_get_thread_id(thd)); - } - - 
DBUG_PRINT("wsrep",("signalling wsrep rollbacker")); - WSREP_DEBUG("signaling aborter"); - wsrep_unlock_rollback(); - wsrep_thd_UNLOCK(thd); - - break; - } - default: - WSREP_WARN("bad wsrep query state: %d", - wsrep_thd_query_state(thd)); - wsrep_thd_UNLOCK(thd); - break; } DBUG_RETURN(0); } +/** This function forces the victim transaction to abort. Aborting the + transaction does NOT end it, it still has to be rolled back. + + @param bf_thd brute force THD asking for the abort + @param victim_thd victim THD to be aborted + + @return 0 victim was aborted + @return -1 victim thread was aborted (no transaction) +*/ static int wsrep_abort_transaction( -/*====================*/ handlerton*, THD *bf_thd, THD *victim_thd, my_bool signal) { DBUG_ENTER("wsrep_innobase_abort_thd"); + ut_ad(bf_thd); + ut_ad(victim_thd); trx_t* victim_trx = thd_to_trx(victim_thd); - trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL; - WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %d", + WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %s", wsrep_thd_query(bf_thd), wsrep_thd_query(victim_thd), - wsrep_thd_conflict_state(victim_thd, FALSE)); + wsrep_thd_transaction_state_str(victim_thd)); if (victim_trx) { lock_mutex_enter(); trx_mutex_enter(victim_trx); - int rcode = wsrep_innobase_kill_one_trx(bf_thd, bf_trx, - victim_trx, signal); - lock_mutex_exit(); + int rcode= wsrep_innobase_kill_one_trx(bf_thd, + victim_trx, signal); trx_mutex_exit(victim_trx); + lock_mutex_exit(); wsrep_srv_conc_cancel_wait(victim_trx); DBUG_RETURN(rcode); } else { - WSREP_DEBUG("victim does not have transaction"); - wsrep_thd_LOCK(victim_thd); - wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT); - wsrep_thd_UNLOCK(victim_thd); - wsrep_thd_awake(victim_thd, signal); + wsrep_thd_bf_abort(bf_thd, victim_thd, signal); } DBUG_RETURN(-1); @@ -19087,15 +18805,6 @@ innobase_wsrep_get_checkpoint( trx_rseg_read_wsrep_checkpoint(*xid); return 0; } - -static void wsrep_fake_trx_id(handlerton 
*, THD *thd) -{ - trx_id_t trx_id = trx_sys.get_new_trx_id(); - WSREP_DEBUG("innodb fake trx id: " TRX_ID_FMT " thd: %s", - trx_id, wsrep_thd_query(thd)); - wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), trx_id); -} - #endif /* WITH_WSREP */ /* plugin options */ @@ -19103,7 +18812,11 @@ static void wsrep_fake_trx_id(handlerton *, THD *thd) static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm, PLUGIN_VAR_RQCMDARG, "The algorithm InnoDB uses for page checksumming. Possible values are" - " CRC32 (hardware accelerated if the CPU supports it)" + " FULL_CRC32" + " for new files, always use CRC-32C; for old, see CRC32 below;" + " STRICT_FULL_CRC32" + " for new files, always use CRC-32C; for old, see STRICT_CRC32 below;" + " CRC32" " write crc32, allow any of the other checksums to match when reading;" " STRICT_CRC32" " write crc32, do not allow other algorithms to match when reading;" @@ -19120,7 +18833,8 @@ static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm, " write a constant magic number, do not allow values other than that" " magic number when reading;" " Files updated when this option is set to crc32 or strict_crc32 will" - " not be readable by MariaDB versions older than 10.0.4", + " not be readable by MariaDB versions older than 10.0.4;" + " new files created with full_crc32 are readable by MariaDB 10.4.3+", NULL, NULL, SRV_CHECKSUM_ALGORITHM_CRC32, &innodb_checksum_algorithm_typelib); @@ -19167,7 +18881,7 @@ static MYSQL_SYSVAR_BOOL(stats_include_delete_marked, static MYSQL_SYSVAR_ENUM(instant_alter_column_allowed, innodb_instant_alter_column_allowed, PLUGIN_VAR_RQCMDARG, - "File format constraint for ALTER TABLE", NULL, NULL, 1/*add_last*/, + "File format constraint for ALTER TABLE", NULL, NULL, 2/*add_drop_reorder*/, &innodb_instant_alter_column_allowed_typelib); static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, @@ -20686,20 +20400,6 @@ ha_innobase::multi_range_read_explain_info( return 
m_ds_mrr.dsmrr_explain_info(mrr_mode, str, size); } -/** -Index Condition Pushdown interface implementation */ - -/*************************************************************//** -InnoDB index push-down condition check -@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */ -ICP_RESULT -innobase_index_cond( -/*================*/ - void* file) /*!< in/out: pointer to ha_innobase */ -{ - return handler_index_cond_check(file); -} - /** Parse the table file name into table name and database name. @param[in] tbl_name InnoDB table name @param[out] dbname database name buffer (NAME_LEN + 1 bytes) @@ -20844,10 +20544,10 @@ static TABLE* innodb_find_table_for_vc(THD* thd, dict_table_t* table) "SIGNAL got_no_such_table")));); if (THDVAR(thd, background_thread)) { - /* Purge thread acquires dict_operation_lock while - processing undo log record. Release the dict_operation_lock + /* Purge thread acquires dict_sys.latch while + processing undo log record. Release it before acquiring MDL on the table. */ - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); return innodb_acquire_mdl(thd, table); } else { if (table->vc_templ->mysql_table_query_id @@ -20893,9 +20593,9 @@ TABLE* innobase_init_vc_templ(dict_table_t* table) DBUG_RETURN(NULL); } - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); innobase_build_v_templ(mysql_table, table, table->vc_templ, NULL, true); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); DBUG_RETURN(mysql_table); } @@ -21091,9 +20791,9 @@ innobase_get_computed_value( dfield_t* field; ulint len; - const page_size_t page_size = (old_table == NULL) - ? dict_table_page_size(index->table) - : dict_table_page_size(old_table); + const ulint zip_size = old_table + ? 
old_table->space->zip_size() + : dict_tf_get_zip_size(index->table->flags); ulint ret = 0; @@ -21118,7 +20818,7 @@ innobase_get_computed_value( buf = rec_buf2; } - for (ulint i = 0; i < col->num_base; i++) { + for (ulint i = 0; i < unsigned{col->num_base}; i++) { dict_col_t* base_col = col->base_col[i]; const dfield_t* row_field = NULL; ulint col_no = base_col->ind; @@ -21146,7 +20846,7 @@ innobase_get_computed_value( } data = btr_copy_externally_stored_field( - &len, data, page_size, + &len, data, zip_size, dfield_get_len(row_field), *local_heap); } @@ -21261,6 +20961,190 @@ ha_innobase::idx_cond_push( DBUG_RETURN(NULL); } + +/** Push a primary key filter. +@param[in] pk_filter filter against which primary keys + are to be checked +@retval false if pushed (always) */ +bool ha_innobase::rowid_filter_push(Rowid_filter* pk_filter) +{ + DBUG_ENTER("ha_innobase::rowid_filter_push"); + DBUG_ASSERT(pk_filter != NULL); + pushed_rowid_filter= pk_filter; + DBUG_RETURN(false); +} + +static bool +is_part_of_a_primary_key(const Field* field) +{ + const TABLE_SHARE* s = field->table->s; + + return s->primary_key != MAX_KEY + && field->part_of_key.is_set(s->primary_key); +} + +bool +ha_innobase::can_convert_string(const Field_string* field, + const Column_definition& new_type) const +{ + DBUG_ASSERT(!field->compression_method()); + if (new_type.type_handler() != field->type_handler()) { + return false; + } + + if (new_type.char_length < field->char_length()) { + return false; + } + + if (new_type.charset != field->charset()) { + if (new_type.length != field->max_display_length() + && !m_prebuilt->table->not_redundant()) { + return IS_EQUAL_NO; + } + + Charset field_cs(field->charset()); + if (!field_cs.encoding_allows_reinterpret_as( + new_type.charset)) { + return false; + } + + if (!field_cs.eq_collation_specific_names(new_type.charset)) { + return !is_part_of_a_primary_key(field); + } + + return true; + } + + if (new_type.length != field->max_display_length()) { + return 
false; + } + + return true; +} + +static bool +supports_enlarging(const dict_table_t* table, const Field_varstring* field, + const Column_definition& new_type) +{ + return field->field_length <= 127 || new_type.length <= 255 + || field->field_length > 255 || !table->not_redundant(); +} + +bool +ha_innobase::can_convert_varstring(const Field_varstring* field, + const Column_definition& new_type) const +{ + if (new_type.length < field->field_length) { + return false; + } + + if (new_type.char_length < field->char_length()) { + return false; + } + + if (!new_type.compression_method() != !field->compression_method()) { + return false; + } + + if (new_type.type_handler() != field->type_handler()) { + return false; + } + + if (new_type.charset != field->charset()) { + if (!supports_enlarging(m_prebuilt->table, field, new_type)) { + return false; + } + + Charset field_cs(field->charset()); + if (!field_cs.encoding_allows_reinterpret_as( + new_type.charset)) { + return false; + } + + if (!field_cs.eq_collation_specific_names(new_type.charset)) { + return !is_part_of_a_primary_key(field); + } + + return true; + } + + if (new_type.length != field->field_length) { + if (!supports_enlarging(m_prebuilt->table, field, new_type)) { + return false; + } + + return true; + } + + return true; +} + +bool +ha_innobase::can_convert_blob(const Field_blob* field, + const Column_definition& new_type) const +{ + if (new_type.type_handler() != field->type_handler()) { + return false; + } + + if (!new_type.compression_method() != !field->compression_method()) { + return false; + } + + if (new_type.pack_length != field->pack_length()) { + return false; + } + + if (new_type.charset != field->charset()) { + Charset field_cs(field->charset()); + if (!field_cs.encoding_allows_reinterpret_as( + new_type.charset)) { + return false; + } + + if (!field_cs.eq_collation_specific_names(new_type.charset)) { + bool is_part_of_a_key + = !field->part_of_key.is_clear_all(); + return !is_part_of_a_key; + } + + 
return true; + } + + return true; +} + +Compare_keys ha_innobase::compare_key_parts( + const Field &old_field, const Column_definition &new_field, + const KEY_PART_INFO &old_part, const KEY_PART_INFO &new_part) const +{ + const bool is_equal= old_field.is_equal(new_field); + const CHARSET_INFO *old_cs= old_field.charset(); + const CHARSET_INFO *new_cs= new_field.charset; + + if (!is_equal) + { + if (!old_field.can_be_converted_by_engine(new_field)) + return Compare_keys::NotEqual; + + if (!Charset(old_cs).eq_collation_specific_names(new_cs)) + return Compare_keys::NotEqual; + } + + if (old_part.length / old_cs->mbmaxlen != new_part.length / new_cs->mbmaxlen) + { + if (old_part.length != old_field.field_length) + return Compare_keys::NotEqual; + + if (old_part.length >= new_part.length) + return Compare_keys::NotEqual; + + return Compare_keys::EqualButKeyPartLength; + } + + return Compare_keys::Equal; +} + /******************************************************************//** Use this when the args are passed to the format string from errmsg-utf8.txt directly as is. 
@@ -21296,10 +21180,10 @@ ib_senderrf( switch (level) { case IB_LOG_LEVEL_INFO: - l = ME_JUST_INFO; + l = ME_NOTE; break; case IB_LOG_LEVEL_WARN: - l = ME_JUST_WARNING; + l = ME_WARNING; break; default: l = 0; diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 56b70d816c6..0a26d8e5671 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -16,10 +16,9 @@ this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA *****************************************************************************/ - #ifdef WITH_WSREP -# include <mysql/service_wsrep.h> -# include "../../../wsrep/wsrep_api.h" +#include "wsrep_api.h" +#include <mysql/service_wsrep.h> #endif /* WITH_WSREP */ #include "table.h" @@ -59,35 +58,33 @@ struct st_handler_tablename const char *tablename; }; /** The class defining a handle to an Innodb table */ -class ha_innobase: public handler +class ha_innobase final: public handler { public: ha_innobase(handlerton* hton, TABLE_SHARE* table_arg); - ~ha_innobase(); + ~ha_innobase() override; /** Get the row type from the storage engine. If this method returns ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used. 
*/ - enum row_type get_row_type() const; - - const char* table_type() const; + enum row_type get_row_type() const override; - const char* index_type(uint key_number); + const char* table_type() const; - const char** bas_ext() const; + const char* index_type(uint key_number) override; - Table_flags table_flags() const; + Table_flags table_flags() const override; - ulong index_flags(uint idx, uint part, bool all_parts) const; + ulong index_flags(uint idx, uint part, bool all_parts) const override; - uint max_supported_keys() const; + uint max_supported_keys() const override; - uint max_supported_key_length() const; + uint max_supported_key_length() const override; - uint max_supported_key_part_length() const; + uint max_supported_key_part_length() const override; - const key_map* keys_to_use_for_scanning(); + const key_map* keys_to_use_for_scanning() override; - void column_bitmaps_signal(); + void column_bitmaps_signal() override; /** Opens dictionary table object using table name. For partition, we need to try alternative lower/upper case names to support moving data files across @@ -103,97 +100,97 @@ public: bool is_partition, dict_err_ignore_t ignore_err); - int open(const char *name, int mode, uint test_if_locked); + int open(const char *name, int mode, uint test_if_locked) override; - handler* clone(const char *name, MEM_ROOT *mem_root); + handler* clone(const char *name, MEM_ROOT *mem_root) override; - int close(void); + int close(void) override; - double scan_time(); + double scan_time() override; - double read_time(uint index, uint ranges, ha_rows rows); + double read_time(uint index, uint ranges, ha_rows rows) override; - int delete_all_rows(); + int delete_all_rows() override; - int write_row(uchar * buf); + int write_row(const uchar * buf) override; - int update_row(const uchar * old_data, const uchar * new_data); + int update_row(const uchar * old_data, const uchar * new_data) override; - int delete_row(const uchar * buf); + int delete_row(const uchar * 
buf) override; - bool was_semi_consistent_read(); + bool was_semi_consistent_read() override; - void try_semi_consistent_read(bool yes); + void try_semi_consistent_read(bool yes) override; - void unlock_row(); + void unlock_row() override; - int index_init(uint index, bool sorted); + int index_init(uint index, bool sorted) override; - int index_end(); + int index_end() override; int index_read( uchar* buf, const uchar* key, uint key_len, - ha_rkey_function find_flag); + ha_rkey_function find_flag) override; - int index_read_last(uchar * buf, const uchar * key, uint key_len); + int index_read_last(uchar * buf, const uchar * key, + uint key_len) override; - int index_next(uchar * buf); + int index_next(uchar * buf) override; - int index_next_same(uchar * buf, const uchar *key, uint keylen); + int index_next_same(uchar * buf, const uchar * key, + uint keylen) override; - int index_prev(uchar * buf); + int index_prev(uchar * buf) override; - int index_first(uchar * buf); + int index_first(uchar * buf) override; - int index_last(uchar * buf); + int index_last(uchar * buf) override; /* Copy a cached MySQL row. If requested, also avoids overwriting non-read columns. 
*/ void copy_cached_row(uchar *to_rec, const uchar *from_rec, uint rec_length); - int rnd_init(bool scan); + int rnd_init(bool scan) override; - int rnd_end(); + int rnd_end() override; - int rnd_next(uchar *buf); + int rnd_next(uchar *buf) override; - int rnd_pos(uchar * buf, uchar *pos); + int rnd_pos(uchar * buf, uchar *pos) override; - int ft_init(); - void ft_end() { rnd_end(); } - FT_INFO *ft_init_ext(uint flags, uint inx, String* key); - int ft_read(uchar* buf); + int ft_init() override; + void ft_end() override { rnd_end(); } + FT_INFO *ft_init_ext(uint flags, uint inx, String* key) override; + int ft_read(uchar* buf) override; - void position(const uchar *record); + void position(const uchar *record) override; - int info(uint); + int info(uint) override; - int analyze(THD* thd,HA_CHECK_OPT* check_opt); + int analyze(THD* thd,HA_CHECK_OPT* check_opt) override; - int optimize(THD* thd,HA_CHECK_OPT* check_opt); + int optimize(THD* thd,HA_CHECK_OPT* check_opt) override; - int discard_or_import_tablespace(my_bool discard); + int discard_or_import_tablespace(my_bool discard) override; - int extra(ha_extra_function operation); + int extra(ha_extra_function operation) override; - int reset(); + int reset() override; - int external_lock(THD *thd, int lock_type); + int external_lock(THD *thd, int lock_type) override; - int start_stmt(THD *thd, thr_lock_type lock_type); - - void position(uchar *record); + int start_stmt(THD *thd, thr_lock_type lock_type) override; ha_rows records_in_range( uint inx, key_range* min_key, - key_range* max_key); + key_range* max_key) override; - ha_rows estimate_rows_upper_bound(); + ha_rows estimate_rows_upper_bound() override; - void update_create_info(HA_CREATE_INFO* create_info); + void update_create_info(HA_CREATE_INFO* create_info) override; inline int create( const char* name, @@ -205,63 +202,57 @@ public: int create( const char* name, TABLE* form, - HA_CREATE_INFO* create_info); - - const char* check_table_options(THD *thd, 
TABLE* table, - HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format); + HA_CREATE_INFO* create_info) override; inline int delete_table(const char* name, enum_sql_command sqlcom); - int truncate(); + int truncate() override; - int delete_table(const char *name); + int delete_table(const char *name) override; - int rename_table(const char* from, const char* to); + int rename_table(const char* from, const char* to) override; int defragment_table(const char* name, const char* index_name, bool async); - int check(THD* thd, HA_CHECK_OPT* check_opt); - char* update_table_comment(const char* comment); + int check(THD* thd, HA_CHECK_OPT* check_opt) override; + char* update_table_comment(const char* comment) override; - char* get_foreign_key_create_info(); + char* get_foreign_key_create_info() override; - int get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list); + int get_foreign_key_list(THD *thd, + List<FOREIGN_KEY_INFO> *f_key_list) override; int get_parent_foreign_key_list( THD* thd, - List<FOREIGN_KEY_INFO>* f_key_list); - int get_cascade_foreign_key_table_list( - THD* thd, - List<st_handler_tablename>* fk_table_list); + List<FOREIGN_KEY_INFO>* f_key_list) override; + bool can_switch_engines() override; - bool can_switch_engines(); + uint referenced_by_foreign_key() override; - uint referenced_by_foreign_key(); + void free_foreign_key_create_info(char* str) override; - void free_foreign_key_create_info(char* str); - - uint lock_count(void) const; + uint lock_count(void) const override; THR_LOCK_DATA** store_lock( THD* thd, THR_LOCK_DATA** to, - thr_lock_type lock_type); + thr_lock_type lock_type) override; - void init_table_handle_for_HANDLER(); + void init_table_handle_for_HANDLER() override; - virtual void get_auto_increment( + void get_auto_increment( ulonglong offset, ulonglong increment, ulonglong nb_desired_values, ulonglong* first_value, - ulonglong* nb_reserved_values); - int reset_auto_increment(ulonglong value); + 
ulonglong* nb_reserved_values) override; + int reset_auto_increment(ulonglong value) override; - virtual bool get_error_message(int error, String *buf); + bool get_error_message(int error, String *buf) override; - virtual bool get_foreign_dup_key(char*, uint, char*, uint); + bool get_foreign_dup_key(char*, uint, char*, uint) override; - uint8 table_cache_type(); + uint8 table_cache_type() override; /** Ask handler about permission to cache table during query registration @@ -271,11 +262,11 @@ public: const char* table_key, uint key_length, qc_engine_callback* call_back, - ulonglong* engine_data); + ulonglong* engine_data) override; - bool primary_key_is_clustered(); + bool primary_key_is_clustered() override; - int cmp_ref(const uchar* ref1, const uchar* ref2); + int cmp_ref(const uchar* ref1, const uchar* ref2) override; /** On-line ALTER TABLE interface @see handler0alter.cc @{ */ @@ -305,7 +296,7 @@ public: enum_alter_inplace_result check_if_supported_inplace_alter( TABLE* altered_table, - Alter_inplace_info* ha_alter_info); + Alter_inplace_info* ha_alter_info) override; /** Allows InnoDB to update internal structures with concurrent writes blocked (provided that check_if_supported_inplace_alter() @@ -321,7 +312,7 @@ public: */ bool prepare_inplace_alter_table( TABLE* altered_table, - Alter_inplace_info* ha_alter_info); + Alter_inplace_info* ha_alter_info) override; /** Alter the table structure in-place with operations specified using HA_ALTER_FLAGS and Alter_inplace_information. 
@@ -337,7 +328,7 @@ public: */ bool inplace_alter_table( TABLE* altered_table, - Alter_inplace_info* ha_alter_info); + Alter_inplace_info* ha_alter_info) override; /** Commit or rollback the changes made during prepare_inplace_alter_table() and inplace_alter_table() inside @@ -356,12 +347,12 @@ public: bool commit_inplace_alter_table( TABLE* altered_table, Alter_inplace_info* ha_alter_info, - bool commit); + bool commit) override; /** @} */ bool check_if_incompatible_data( HA_CREATE_INFO* info, - uint table_changes); + uint table_changes) override; /** @name Multi Range Read interface @{ */ @@ -376,11 +367,11 @@ public: void* seq_init_param, uint n_ranges, uint mode, - HANDLER_BUFFER* buf); + HANDLER_BUFFER* buf) override; /** Process next multi range read @see DsMrr_impl::dsmrr_next @param range_info */ - int multi_range_read_next(range_id_t *range_info); + int multi_range_read_next(range_id_t *range_info) override; /** Initialize multi range read and get information. @see ha_myisam::multi_range_read_info_const @@ -399,7 +390,7 @@ public: uint n_ranges, uint* bufsz, uint* flags, - Cost_estimate* cost); + Cost_estimate* cost) override; /** Initialize multi range read and get information. @see DsMrr_impl::dsmrr_info @@ -412,16 +403,16 @@ public: @param cost */ ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, uint key_parts, uint* bufsz, uint* flags, - Cost_estimate* cost); + Cost_estimate* cost) override; int multi_range_read_explain_info(uint mrr_mode, - char *str, size_t size); + char *str, size_t size) override; /** Attempt to push down an index condition. @param[in] keyno MySQL key number @param[in] idx_cond Index condition to be checked @return idx_cond if pushed; NULL if not pushed */ - Item* idx_cond_push(uint keyno, Item* idx_cond); + Item* idx_cond_push(uint keyno, Item* idx_cond) override; /* @} */ /** Check if InnoDB is not storing virtual column metadata for a table. 
@@ -432,13 +423,35 @@ public: return s.frm_version<FRM_VER_EXPRESSSIONS && s.virtual_fields; } -protected: - /** - MySQL calls this method at the end of each statement. This method - exists for readability only, called from reset(). The name reset() - doesn't give any clue that it is called at the end of a statement. */ - int end_stmt(); + /** Push a primary key filter. + @param[in] pk_filter filter against which primary keys + are to be checked + @retval false if pushed (always) */ + bool rowid_filter_push(Rowid_filter *rowid_filter) override; + + bool + can_convert_string(const Field_string* field, + const Column_definition& new_field) const override; + bool can_convert_varstring( + const Field_varstring* field, + const Column_definition& new_field) const override; + bool + can_convert_blob(const Field_blob* field, + const Column_definition& new_field) const override; + + /** @return whether innodb_strict_mode is active */ + static bool is_innodb_strict_mode(THD* thd); + + /** @return whether innodb_strict_mode is active */ + bool is_innodb_strict_mode() + { return is_innodb_strict_mode(m_user_thd); } + Compare_keys + compare_key_parts(const Field& old_field, + const Column_definition& new_field, + const KEY_PART_INFO& old_part, + const KEY_PART_INFO& new_part) const override; +protected: dberr_t innobase_get_autoinc(ulonglong* value); dberr_t innobase_lock_autoinc(); ulonglong innobase_peek_autoinc(); @@ -457,8 +470,11 @@ protected: dict_index_t* innobase_get_index(uint keynr); #ifdef WITH_WSREP - int wsrep_append_keys(THD *thd, wsrep_key_type key_type, - const uchar* record0, const uchar* record1); + int wsrep_append_keys( + THD *thd, + Wsrep_service_key_type key_type, + const uchar* record0, + const uchar* record1); #endif /** Builds a 'template' to the prebuilt struct. 
@@ -468,7 +484,7 @@ protected: false if accessing individual fields is enough */ void build_template(bool whole_row); - virtual int info_low(uint, bool); + int info_low(uint, bool); /** The multi range read session object */ DsMrr_impl m_ds_mrr; @@ -565,23 +581,8 @@ extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char ** struct trx_t; #ifdef WITH_WSREP -//extern "C" int wsrep_trx_order_before(void *thd1, void *thd2); - -extern "C" bool wsrep_thd_is_wsrep_on(THD *thd); - - -extern "C" void wsrep_thd_set_exec_mode(THD *thd, enum wsrep_exec_mode mode); -extern "C" void wsrep_thd_set_query_state( - THD *thd, enum wsrep_query_state state); - -extern "C" void wsrep_thd_set_trx_to_replay(THD *thd, uint64 trx_id); - -extern "C" uint32 wsrep_thd_wsrep_rand(THD *thd); -extern "C" time_t wsrep_thd_query_start(THD *thd); -extern "C" query_id_t wsrep_thd_query_id(THD *thd); -extern "C" query_id_t wsrep_thd_wsrep_last_query_id(THD *thd); -extern "C" void wsrep_thd_set_wsrep_last_query_id(THD *thd, query_id_t id); -#endif +#include <mysql/service_wsrep.h> +#endif /* WITH_WSREP */ extern const struct _ft_vft ft_vft_result; diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index ac36cff6173..a217a6cfcac 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -109,10 +109,12 @@ static const alter_table_operations INNOBASE_INPLACE_IGNORE | ALTER_PARTITIONED | ALTER_COLUMN_COLUMN_FORMAT | ALTER_COLUMN_STORAGE_TYPE + | ALTER_CONVERT_TO | ALTER_VIRTUAL_GCOL_EXPR | ALTER_DROP_CHECK_CONSTRAINT | ALTER_RENAME - | ALTER_COLUMN_INDEX_LENGTH; + | ALTER_COLUMN_INDEX_LENGTH + | ALTER_CHANGE_INDEX_COMMENT; /** Operations on foreign key definitions (changing the schema only) */ static const alter_table_operations INNOBASE_FOREIGN_OPERATIONS @@ -144,10 +146,722 @@ static const alter_table_operations INNOBASE_ALTER_INSTANT | ALTER_COLUMN_NAME | ALTER_ADD_VIRTUAL_COLUMN | 
INNOBASE_FOREIGN_OPERATIONS - | ALTER_COLUMN_EQUAL_PACK_LENGTH + | ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE | ALTER_COLUMN_UNVERSIONED + | ALTER_RENAME_INDEX | ALTER_DROP_VIRTUAL_COLUMN; +/** Acquire a page latch on the possible metadata record, +to prevent concurrent invocation of dict_index_t::clear_instant_alter() +by purge when the table turns out to be empty. +@param[in,out] index clustered index +@param[in,out] mtr mini-transaction */ +static void instant_metadata_lock(dict_index_t& index, mtr_t& mtr) +{ + DBUG_ASSERT(index.is_primary()); + + if (!index.is_instant()) { + /* dict_index_t::clear_instant_alter() cannot be called. + No need for a latch. */ + return; + } + + btr_cur_t btr_cur; + btr_cur_open_at_index_side(true, &index, BTR_SEARCH_LEAF, + &btr_cur, 0, &mtr); + ut_ad(page_cur_is_before_first(btr_cur_get_page_cur(&btr_cur))); + ut_ad(page_is_leaf(btr_cur_get_page(&btr_cur))); + ut_ad(!page_has_prev(btr_cur_get_page(&btr_cur))); + ut_ad(!buf_block_get_page_zip(btr_cur_get_block(&btr_cur))); +} + +/** Initialize instant->field_map. 
+@param[in] table table definition to copy from */ +inline void dict_table_t::init_instant(const dict_table_t& table) +{ + const dict_index_t& oindex __attribute__((unused))= *table.indexes.start; + dict_index_t& index = *indexes.start; + const unsigned u = index.first_user_field(); + DBUG_ASSERT(u == oindex.first_user_field()); + DBUG_ASSERT(index.n_fields >= oindex.n_fields); + + field_map_element_t* field_map_it = static_cast<field_map_element_t*>( + mem_heap_zalloc(heap, (index.n_fields - u) + * sizeof *field_map_it)); + instant->field_map = field_map_it; + + ut_d(unsigned n_drop = 0); + ut_d(unsigned n_nullable = 0); + for (unsigned i = u; i < index.n_fields; i++) { + auto& f = index.fields[i]; + DBUG_ASSERT(dict_col_get_fixed_size(f.col, not_redundant()) + <= DICT_MAX_FIXED_COL_LEN); + ut_d(n_nullable += f.col->is_nullable()); + + if (!f.col->is_dropped()) { + (*field_map_it++).set_ind(f.col->ind); + continue; + } + + auto fixed_len = dict_col_get_fixed_size( + f.col, not_redundant()); + field_map_it->set_dropped(); + if (!f.col->is_nullable()) { + field_map_it->set_not_null(); + } + field_map_it->set_ind(fixed_len + ? uint16_t(fixed_len + 1) + : DATA_BIG_COL(f.col)); + field_map_it++; + ut_ad(f.col >= table.instant->dropped); + ut_ad(f.col < table.instant->dropped + + table.instant->n_dropped); + ut_d(n_drop++); + size_t d = f.col - table.instant->dropped; + ut_ad(f.col == &table.instant->dropped[d]); + ut_ad(d <= instant->n_dropped); + f.col = &instant->dropped[d]; + } + ut_ad(n_drop == n_dropped()); + ut_ad(field_map_it == &instant->field_map[index.n_fields - u]); + ut_ad(index.n_nullable == n_nullable); +} + +/** Set is_instant() before instant_column(). 
+@param[in] old previous table definition +@param[in] col_map map from old.cols[] and old.v_cols[] to this +@param[out] first_alter_pos 0, or 1 + first changed column position */ +inline void dict_table_t::prepare_instant(const dict_table_t& old, + const ulint* col_map, + unsigned& first_alter_pos) +{ + DBUG_ASSERT(!is_instant()); + DBUG_ASSERT(n_dropped() == 0); + DBUG_ASSERT(old.n_cols == old.n_def); + DBUG_ASSERT(n_cols == n_def); + DBUG_ASSERT(old.supports_instant()); + DBUG_ASSERT(!persistent_autoinc + || persistent_autoinc == old.persistent_autoinc); + /* supports_instant() does not necessarily hold here, + in case ROW_FORMAT=COMPRESSED according to the + MariaDB data dictionary, and ALTER_OPTIONS was not set. + If that is the case, the instant ALTER TABLE would keep + the InnoDB table in its current format. */ + + dict_index_t& oindex = *old.indexes.start; + dict_index_t& index = *indexes.start; + first_alter_pos = 0; + + mtr_t mtr; + mtr.start(); + /* Protect oindex.n_core_fields and others, so that + purge cannot invoke dict_index_t::clear_instant_alter(). */ + instant_metadata_lock(oindex, mtr); + + for (unsigned i = 0; i + DATA_N_SYS_COLS < old.n_cols; i++) { + if (col_map[i] != i) { + first_alter_pos = 1 + i; + goto add_metadata; + } + } + + if (!old.instant) { + /* Columns were not dropped or reordered. + Therefore columns must have been added at the end, + or modified instantly in place. 
*/ + DBUG_ASSERT(index.n_fields >= oindex.n_fields); + DBUG_ASSERT(index.n_fields > oindex.n_fields + || !not_redundant()); +#ifdef UNIV_DEBUG + if (index.n_fields == oindex.n_fields) { + ut_ad(!not_redundant()); + for (unsigned i = index.n_fields; i--; ) { + ut_ad(index.fields[i].col->same_format( + *oindex.fields[i].col)); + } + } +#endif +set_core_fields: + index.n_core_fields = oindex.n_core_fields; + index.n_core_null_bytes = oindex.n_core_null_bytes; + } else { +add_metadata: + const unsigned n_old_drop = old.n_dropped(); + unsigned n_drop = n_old_drop; + for (unsigned i = old.n_cols; i--; ) { + if (col_map[i] == ULINT_UNDEFINED) { + DBUG_ASSERT(i + DATA_N_SYS_COLS + < uint(old.n_cols)); + n_drop++; + } + } + + instant = new (mem_heap_alloc(heap, sizeof(dict_instant_t))) + dict_instant_t(); + instant->n_dropped = n_drop; + if (n_drop) { + instant->dropped + = static_cast<dict_col_t*>( + mem_heap_alloc(heap, n_drop + * sizeof(dict_col_t))); + if (n_old_drop) { + memcpy(instant->dropped, old.instant->dropped, + n_old_drop * sizeof(dict_col_t)); + } + } else { + instant->dropped = NULL; + } + + for (unsigned i = 0, d = n_old_drop; i < old.n_cols; i++) { + if (col_map[i] == ULINT_UNDEFINED) { + (new (&instant->dropped[d++]) + dict_col_t(old.cols[i]))->set_dropped(); + } + } +#ifndef DBUG_OFF + for (unsigned i = 0; i < n_drop; i++) { + DBUG_ASSERT(instant->dropped[i].is_dropped()); + } +#endif + const uint n_fields = index.n_fields + n_dropped(); + + DBUG_ASSERT(n_fields >= oindex.n_fields); + dict_field_t* fields = static_cast<dict_field_t*>( + mem_heap_zalloc(heap, n_fields * sizeof *fields)); + uint i = 0, j = 0, n_nullable = 0; + ut_d(uint core_null = 0); + for (; i < oindex.n_fields; i++) { + DBUG_ASSERT(j <= i); + dict_field_t&f = fields[i] = oindex.fields[i]; + if (f.col->is_dropped()) { + /* The column has been instantly + dropped earlier. 
*/ + DBUG_ASSERT(f.col >= old.instant->dropped); + { + size_t d = f.col + - old.instant->dropped; + DBUG_ASSERT(d < n_old_drop); + DBUG_ASSERT(&old.instant->dropped[d] + == f.col); + DBUG_ASSERT(!f.name); + f.col = instant->dropped + d; + } + if (f.col->is_nullable()) { +found_nullable: + n_nullable++; + ut_d(core_null + += i < oindex.n_core_fields); + } + continue; + } + + const ulint col_ind = col_map[f.col->ind]; + if (col_ind != ULINT_UNDEFINED) { + if (index.fields[j].col->ind != col_ind) { + /* The fields for instantly + added columns must be placed + last in the clustered index. + Keep pre-existing fields in + the same position. */ + uint k; + for (k = j + 1; k < index.n_fields; + k++) { + if (index.fields[k].col->ind + == col_ind) { + goto found_j; + } + } + DBUG_ASSERT(!"no such col"); +found_j: + std::swap(index.fields[j], + index.fields[k]); + } + DBUG_ASSERT(index.fields[j].col->ind + == col_ind); + fields[i] = index.fields[j++]; + DBUG_ASSERT(!fields[i].col->is_dropped()); + DBUG_ASSERT(fields[i].name + == fields[i].col->name(*this)); + if (fields[i].col->is_nullable()) { + goto found_nullable; + } + continue; + } + + /* This column is being dropped. */ + unsigned d = n_old_drop; + for (unsigned c = 0; c < f.col->ind; c++) { + d += col_map[c] == ULINT_UNDEFINED; + } + DBUG_ASSERT(d < n_drop); + f.col = &instant->dropped[d]; + f.name = NULL; + if (f.col->is_nullable()) { + goto found_nullable; + } + } + /* The n_core_null_bytes only matters for + ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. 
*/ + ut_ad(UT_BITS_IN_BYTES(core_null) == oindex.n_core_null_bytes + || !not_redundant()); + DBUG_ASSERT(i >= oindex.n_core_fields); + DBUG_ASSERT(j <= i); + DBUG_ASSERT(n_fields - (i - j) == index.n_fields); + std::sort(index.fields + j, index.fields + index.n_fields, + [](const dict_field_t& a, const dict_field_t& b) + { return a.col->ind < b.col->ind; }); + for (; i < n_fields; i++) { + fields[i] = index.fields[j++]; + n_nullable += fields[i].col->is_nullable(); + DBUG_ASSERT(!fields[i].col->is_dropped()); + DBUG_ASSERT(fields[i].name + == fields[i].col->name(*this)); + } + DBUG_ASSERT(j == index.n_fields); + index.n_fields = index.n_def = n_fields; + index.fields = fields; + DBUG_ASSERT(n_nullable >= index.n_nullable); + DBUG_ASSERT(n_nullable >= oindex.n_nullable); + index.n_nullable = n_nullable; + goto set_core_fields; + } + + DBUG_ASSERT(n_cols + n_dropped() >= old.n_cols + old.n_dropped()); + DBUG_ASSERT(n_dropped() >= old.n_dropped()); + DBUG_ASSERT(index.n_core_fields == oindex.n_core_fields); + DBUG_ASSERT(index.n_core_null_bytes == oindex.n_core_null_bytes); + mtr.commit(); +} + +/** Adjust index metadata for instant ADD/DROP/reorder COLUMN. +@param[in] clustered index definition after instant ALTER TABLE */ +inline void dict_index_t::instant_add_field(const dict_index_t& instant) +{ + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(instant.is_primary()); + DBUG_ASSERT(!has_virtual()); + DBUG_ASSERT(!instant.has_virtual()); + DBUG_ASSERT(instant.n_core_fields <= instant.n_fields); + DBUG_ASSERT(n_def == n_fields); + DBUG_ASSERT(instant.n_def == instant.n_fields); + DBUG_ASSERT(type == instant.type); + DBUG_ASSERT(trx_id_offset == instant.trx_id_offset); + DBUG_ASSERT(n_user_defined_cols == instant.n_user_defined_cols); + DBUG_ASSERT(n_uniq == instant.n_uniq); + DBUG_ASSERT(instant.n_fields >= n_fields); + DBUG_ASSERT(instant.n_nullable >= n_nullable); + /* dict_table_t::prepare_instant() initialized n_core_fields + to be equal. 
However, after that purge could have emptied the + table and invoked dict_index_t::clear_instant_alter(). */ + DBUG_ASSERT(instant.n_core_fields <= n_core_fields); + DBUG_ASSERT(instant.n_core_null_bytes <= n_core_null_bytes); + DBUG_ASSERT(instant.n_core_fields == n_core_fields + || (!is_instant() && instant.is_instant())); + DBUG_ASSERT(instant.n_core_null_bytes == n_core_null_bytes + || (!is_instant() && instant.is_instant())); + + /* instant will have all fields (including ones for columns + that have been or are being instantly dropped) in the same position + as this index. Fields for any added columns are appended at the end. */ +#ifndef DBUG_OFF + for (unsigned i = 0; i < n_fields; i++) { + DBUG_ASSERT(fields[i].same(instant.fields[i])); + DBUG_ASSERT(instant.fields[i].col->same_format(*fields[i] + .col)); + /* Instant conversion from NULL to NOT NULL is not allowed. */ + DBUG_ASSERT(!fields[i].col->is_nullable() + || instant.fields[i].col->is_nullable()); + DBUG_ASSERT(fields[i].col->is_nullable() + == instant.fields[i].col->is_nullable() + || !table->not_redundant()); + } +#endif + n_fields = instant.n_fields; + n_def = instant.n_def; + n_nullable = instant.n_nullable; + fields = static_cast<dict_field_t*>( + mem_heap_dup(heap, instant.fields, n_fields * sizeof *fields)); + + ut_d(unsigned n_null = 0); + ut_d(unsigned n_dropped = 0); + + for (unsigned i = 0; i < n_fields; i++) { + const dict_col_t* icol = instant.fields[i].col; + dict_field_t& f = fields[i]; + ut_d(n_null += icol->is_nullable()); + DBUG_ASSERT(!icol->is_virtual()); + if (icol->is_dropped()) { + ut_d(n_dropped++); + f.col->set_dropped(); + f.name = NULL; + } else { + f.col = &table->cols[icol - instant.table->cols]; + f.name = f.col->name(*table); + } + } + + ut_ad(n_null == n_nullable); + ut_ad(n_dropped == instant.table->n_dropped()); +} + +/** Adjust table metadata for instant ADD/DROP/reorder COLUMN. 
+@param[in] table altered table (with dropped columns) +@param[in] col_map mapping from cols[] and v_cols[] to table +@return whether the metadata record must be updated */ +inline bool dict_table_t::instant_column(const dict_table_t& table, + const ulint* col_map) +{ + DBUG_ASSERT(!table.cached); + DBUG_ASSERT(table.n_def == table.n_cols); + DBUG_ASSERT(table.n_t_def == table.n_t_cols); + DBUG_ASSERT(n_def == n_cols); + DBUG_ASSERT(n_t_def == n_t_cols); + DBUG_ASSERT(n_v_def == n_v_cols); + DBUG_ASSERT(table.n_v_def == table.n_v_cols); + DBUG_ASSERT(table.n_cols + table.n_dropped() >= n_cols + n_dropped()); + DBUG_ASSERT(!table.persistent_autoinc + || persistent_autoinc == table.persistent_autoinc); + ut_ad(mutex_own(&dict_sys.mutex)); + + { + const char* end = table.col_names; + for (unsigned i = table.n_cols; i--; ) end += strlen(end) + 1; + + col_names = static_cast<char*>( + mem_heap_dup(heap, table.col_names, + ulint(end - table.col_names))); + } + const dict_col_t* const old_cols = cols; + cols = static_cast<dict_col_t*>(mem_heap_dup(heap, table.cols, + table.n_cols + * sizeof *cols)); + + /* Preserve the default values of previously instantly added + columns, or copy the new default values to this->heap. 
*/ + for (ulint i = 0; i < ulint(table.n_cols); i++) { + dict_col_t& c = cols[i]; + + if (const dict_col_t* o = find(old_cols, col_map, n_cols, i)) { + c.def_val = o->def_val; + DBUG_ASSERT(!((c.prtype ^ o->prtype) + & ~(DATA_NOT_NULL | DATA_VERSIONED + | CHAR_COLL_MASK << 16 + | DATA_LONG_TRUE_VARCHAR))); + DBUG_ASSERT(c.same_type(*o)); + DBUG_ASSERT(c.len >= o->len); + + if (o->vers_sys_start()) { + ut_ad(o->ind == vers_start); + vers_start = i; + } else if (o->vers_sys_end()) { + ut_ad(o->ind == vers_end); + vers_end = i; + } + continue; + } + + DBUG_ASSERT(c.is_added()); + if (c.def_val.len <= sizeof field_ref_zero + && (!c.def_val.len + || !memcmp(c.def_val.data, field_ref_zero, + c.def_val.len))) { + c.def_val.data = field_ref_zero; + } else if (const void*& d = c.def_val.data) { + d = mem_heap_dup(heap, d, c.def_val.len); + } else { + DBUG_ASSERT(c.def_val.len == UNIV_SQL_NULL); + } + } + + n_t_def += table.n_cols - n_cols; + n_t_cols += table.n_cols - n_cols; + n_def = table.n_cols; + + const dict_v_col_t* const old_v_cols = v_cols; + + if (const char* end = table.v_col_names) { + for (unsigned i = table.n_v_cols; i--; ) { + end += strlen(end) + 1; + } + + v_col_names = static_cast<char*>( + mem_heap_dup(heap, table.v_col_names, + ulint(end - table.v_col_names))); + v_cols = static_cast<dict_v_col_t*>( + mem_heap_dup(heap, table.v_cols, + table.n_v_cols * sizeof *v_cols)); + } else { + ut_ad(table.n_v_cols == 0); + v_col_names = NULL; + v_cols = NULL; + } + + n_t_def += table.n_v_cols - n_v_cols; + n_t_cols += table.n_v_cols - n_v_cols; + n_v_def = table.n_v_cols; + + for (unsigned i = 0; i < n_v_def; i++) { + dict_v_col_t& v = v_cols[i]; + DBUG_ASSERT(v.v_indexes.empty()); + v.n_v_indexes = 0; + v.base_col = static_cast<dict_col_t**>( + mem_heap_dup(heap, v.base_col, + v.num_base * sizeof *v.base_col)); + + for (ulint n = v.num_base; n--; ) { + dict_col_t*& base = v.base_col[n]; + if (base->is_virtual()) { + } else if (base >= table.cols + && base < 
table.cols + table.n_cols) { + /* The base column was instantly added. */ + size_t c = base - table.cols; + DBUG_ASSERT(base == &table.cols[c]); + base = &cols[c]; + } else { + DBUG_ASSERT(base >= old_cols); + size_t c = base - old_cols; + DBUG_ASSERT(c + DATA_N_SYS_COLS < n_cols); + DBUG_ASSERT(base == &old_cols[c]); + DBUG_ASSERT(col_map[c] + DATA_N_SYS_COLS + < n_cols); + base = &cols[col_map[c]]; + } + } + } + + dict_index_t* index = dict_table_get_first_index(this); + bool metadata_changed; + { + const dict_index_t& i = *dict_table_get_first_index(&table); + metadata_changed = i.n_fields > index->n_fields; + ut_ad(i.n_fields >= index->n_fields); + index->instant_add_field(i); + } + + if (instant || table.instant) { + const auto old_instant = instant; + /* FIXME: add instant->heap, and transfer ownership here */ + if (!instant) { + instant = new (mem_heap_zalloc(heap, sizeof *instant)) + dict_instant_t(); + goto dup_dropped; + } else if (n_dropped() < table.n_dropped()) { +dup_dropped: + instant->dropped = static_cast<dict_col_t*>( + mem_heap_dup(heap, table.instant->dropped, + table.instant->n_dropped + * sizeof *instant->dropped)); + instant->n_dropped = table.instant->n_dropped; + } else if (table.instant->n_dropped) { + memcpy(instant->dropped, table.instant->dropped, + table.instant->n_dropped + * sizeof *instant->dropped); + } + + const field_map_element_t* field_map = old_instant + ? old_instant->field_map : NULL; + + init_instant(table); + + if (!metadata_changed) { + metadata_changed = !field_map + || memcmp(field_map, + instant->field_map, + (index->n_fields + - index->first_user_field()) + * sizeof *field_map); + } + } + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (index->to_be_dropped) { + continue; + } + for (unsigned i = 0; i < index->n_fields; i++) { + dict_field_t& f = index->fields[i]; + if (f.col >= table.cols + && f.col < table.cols + table.n_cols) { + /* This is an instantly added column + in a newly added index. 
*/ + DBUG_ASSERT(!f.col->is_virtual()); + size_t c = f.col - table.cols; + DBUG_ASSERT(f.col == &table.cols[c]); + f.col = &cols[c]; + } else if (f.col >= &table.v_cols->m_col + && f.col < &table.v_cols[n_v_cols].m_col) { + /* This is an instantly added virtual column + in a newly added index. */ + DBUG_ASSERT(f.col->is_virtual()); + size_t c = reinterpret_cast<dict_v_col_t*>( + f.col) - table.v_cols; + DBUG_ASSERT(f.col == &table.v_cols[c].m_col); + f.col = &v_cols[c].m_col; + } else if (f.col < old_cols + || f.col >= old_cols + n_cols) { + DBUG_ASSERT(f.col->is_virtual()); + f.col = &v_cols[col_map[ + reinterpret_cast<dict_v_col_t*>( + f.col) + - old_v_cols + n_cols]].m_col; + } else { + f.col = &cols[col_map[f.col - old_cols]]; + DBUG_ASSERT(!f.col->is_virtual()); + } + f.name = f.col->name(*this); + if (f.col->is_virtual()) { + dict_v_col_t* v_col = reinterpret_cast + <dict_v_col_t*>(f.col); + v_col->v_indexes.push_front( + dict_v_idx_t(index, i)); + v_col->n_v_indexes++; + } + } + } + + n_cols = table.n_cols; + n_v_cols = table.n_v_cols; + return metadata_changed; +} + +/** Find the old column number for the given new column position. +@param[in] col_map column map from old column to new column +@param[in] pos new column position +@param[in] n number of columns present in the column map +@return old column position for the given new column position. */ +static ulint find_old_col_no(const ulint* col_map, ulint pos, ulint n) +{ + do { + ut_ad(n); + } while (col_map[--n] != pos); + return n; +} + +/** Roll back instant_column(). 
+@param[in] old_n_cols original n_cols +@param[in] old_cols original cols +@param[in] old_col_names original col_names +@param[in] old_instant original instant structure +@param[in] old_fields original fields +@param[in] old_n_fields original number of fields +@param[in] old_n_core_fields original number of core fields +@param[in] old_n_v_cols original n_v_cols +@param[in] old_v_cols original v_cols +@param[in] old_v_col_names original v_col_names +@param[in] col_map column map */ +inline void dict_table_t::rollback_instant( + unsigned old_n_cols, + dict_col_t* old_cols, + const char* old_col_names, + dict_instant_t* old_instant, + dict_field_t* old_fields, + unsigned old_n_fields, + unsigned old_n_core_fields, + unsigned old_n_v_cols, + dict_v_col_t* old_v_cols, + const char* old_v_col_names, + const ulint* col_map) +{ + ut_d(dict_sys.assert_locked()); + dict_index_t* index = indexes.start; + mtr_t mtr; + mtr.start(); + /* Prevent concurrent execution of dict_index_t::clear_instant_alter() + by acquiring a latch on the leftmost leaf page. */ + instant_metadata_lock(*index, mtr); + /* index->is_instant() does not necessarily hold here, because + the table may have been emptied */ + DBUG_ASSERT(old_n_cols >= DATA_N_SYS_COLS); + DBUG_ASSERT(n_cols == n_def); + DBUG_ASSERT(index->n_def == index->n_fields); + DBUG_ASSERT(index->n_core_fields <= index->n_fields); + DBUG_ASSERT(old_n_core_fields <= old_n_fields); + DBUG_ASSERT(instant || !old_instant); + + instant = old_instant; + + index->n_nullable = 0; + + for (unsigned i = old_n_fields; i--; ) { + if (old_fields[i].col->is_nullable()) { + index->n_nullable++; + } + } + + for (unsigned i = n_v_cols; i--; ) { + v_cols[i].~dict_v_col_t(); + } + + index->n_core_fields = (index->n_fields == index->n_core_fields) + ? 
old_n_fields + : old_n_core_fields; + index->n_def = index->n_fields = old_n_fields; + index->n_core_null_bytes = UT_BITS_IN_BYTES( + index->get_n_nullable(index->n_core_fields)); + + const dict_col_t* const new_cols = cols; + const dict_col_t* const new_cols_end __attribute__((unused)) = cols + n_cols; + const dict_v_col_t* const new_v_cols = v_cols; + const dict_v_col_t* const new_v_cols_end __attribute__((unused))= v_cols + n_v_cols; + + cols = old_cols; + col_names = old_col_names; + v_cols = old_v_cols; + v_col_names = old_v_col_names; + n_def = n_cols = old_n_cols; + n_v_def = n_v_cols = old_n_v_cols; + n_t_def = n_t_cols = n_cols + n_v_cols; + + if (versioned()) { + for (unsigned i = 0; i < n_cols; ++i) { + if (cols[i].vers_sys_start()) { + vers_start = i; + } else if (cols[i].vers_sys_end()) { + vers_end = i; + } + } + } + + index->fields = old_fields; + mtr.commit(); + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (index->to_be_dropped) { + /* instant_column() did not adjust these indexes. 
*/ + continue; + } + + for (unsigned i = 0; i < index->n_fields; i++) { + dict_field_t& f = index->fields[i]; + if (f.col->is_virtual()) { + DBUG_ASSERT(f.col >= &new_v_cols->m_col); + DBUG_ASSERT(f.col < &new_v_cols_end->m_col); + size_t n = size_t( + reinterpret_cast<dict_v_col_t*>(f.col) + - new_v_cols); + DBUG_ASSERT(n <= n_v_cols); + + ulint old_col_no = find_old_col_no( + col_map + n_cols, n, n_v_cols); + DBUG_ASSERT(old_col_no <= n_v_cols); + f.col = &v_cols[old_col_no].m_col; + DBUG_ASSERT(f.col->is_virtual()); + } else { + DBUG_ASSERT(f.col >= new_cols); + DBUG_ASSERT(f.col < new_cols_end); + size_t n = size_t(f.col - new_cols); + DBUG_ASSERT(n <= n_cols); + + ulint old_col_no = find_old_col_no(col_map, + n, n_cols); + DBUG_ASSERT(old_col_no < n_cols); + f.col = &cols[old_col_no]; + DBUG_ASSERT(!f.col->is_virtual()); + } + f.name = f.col->name(*this); + } + } +} + struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx { /** Dummy query graph */ @@ -164,10 +878,6 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx dict_index_t** drop_index; /** number of InnoDB indexes being dropped */ const ulint num_to_drop_index; - /** InnoDB indexes being renamed */ - dict_index_t** rename; - /** number of InnoDB indexes being renamed */ - const ulint num_to_rename; /** InnoDB foreign key constraints being dropped */ dict_foreign_t** drop_fk; /** number of InnoDB foreign key constraints being dropped */ @@ -186,7 +896,7 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx dict_table_t* old_table; /** table where the indexes are being created or dropped */ dict_table_t* new_table; - /** table definition for instant ADD COLUMN */ + /** table definition for instant ADD/DROP/reorder COLUMN */ dict_table_t* instant_table; /** mapping of old column numbers to new ones, or NULL */ const ulint* col_map; @@ -220,7 +930,22 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx dict_col_t* const old_cols; /** original 
column names of the table */ const char* const old_col_names; - + /** original instantly dropped or reordered columns */ + dict_instant_t* const old_instant; + /** original index fields */ + dict_field_t* const old_fields; + /** size of old_fields */ + const unsigned old_n_fields; + /** original old_table->n_core_fields */ + const unsigned old_n_core_fields; + /** original number of virtual columns in the table */ + const unsigned old_n_v_cols; + /** original virtual columns of the table */ + dict_v_col_t* const old_v_cols; + /** original virtual column names of the table */ + const char* const old_v_col_names; + /** 0, or 1 + first column whose position changes in instant ALTER */ + unsigned first_alter_pos; /** Allow non-null conversion. (1) Alter ignore should allow the conversion irrespective of sql mode. @@ -234,8 +959,6 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg, dict_index_t** drop_arg, ulint num_to_drop_arg, - dict_index_t** rename_arg, - ulint num_to_rename_arg, dict_foreign_t** drop_fk_arg, ulint num_to_drop_fk_arg, dict_foreign_t** add_fk_arg, @@ -254,7 +977,6 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx prebuilt (prebuilt_arg), add_index (0), add_key_numbers (0), num_to_add_index (0), drop_index (drop_arg), num_to_drop_index (num_to_drop_arg), - rename (rename_arg), num_to_rename (num_to_rename_arg), drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg), add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg), online (online_arg), heap (heap_arg), trx (0), @@ -277,6 +999,15 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx old_n_cols(prebuilt_arg->table->n_cols), old_cols(prebuilt_arg->table->cols), old_col_names(prebuilt_arg->table->col_names), + old_instant(prebuilt_arg->table->instant), + old_fields(prebuilt_arg->table->indexes.start->fields), + old_n_fields(prebuilt_arg->table->indexes.start->n_fields), + 
old_n_core_fields(prebuilt_arg->table->indexes.start + ->n_core_fields), + old_n_v_cols(prebuilt_arg->table->n_v_cols), + old_v_cols(prebuilt_arg->table->v_cols), + old_v_col_names(prebuilt_arg->table->v_col_names), + first_alter_pos(0), allow_not_null(allow_not_null_flag), page_compression_level(page_compressed ? (page_compression_level_arg @@ -310,6 +1041,9 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx rw_lock_free(&index->lock); dict_mem_index_free(index); } + for (unsigned i = old_n_v_cols; i--; ) { + old_v_cols[i].~dict_v_col_t(); + } if (instant_table->fts) { fts_free(instant_table); } @@ -334,14 +1068,24 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx { DBUG_ASSERT(need_rebuild()); DBUG_ASSERT(!is_instant()); - DBUG_ASSERT(old_table->n_cols == old_table->n_def); - DBUG_ASSERT(new_table->n_cols == new_table->n_def); DBUG_ASSERT(old_table->n_cols == old_n_cols); - DBUG_ASSERT(new_table->n_cols > old_table->n_cols); - instant_table = new_table; + instant_table = new_table; new_table = old_table; export_vars.innodb_instant_alter_column++; + + instant_table->prepare_instant(*old_table, col_map, + first_alter_pos); + } + + /** Adjust table metadata for instant ADD/DROP/reorder COLUMN. + @return whether the metadata record must be updated */ + bool instant_column() + { + DBUG_ASSERT(is_instant()); + DBUG_ASSERT(old_n_fields + == old_table->indexes.start->n_fields); + return old_table->instant_column(*instant_table, col_map); } /** Revert prepare_instant() if the transaction is rolled back. 
*/ @@ -349,7 +1093,13 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx { if (!is_instant()) return; old_table->rollback_instant(old_n_cols, - old_cols, old_col_names); + old_cols, old_col_names, + old_instant, + old_fields, old_n_fields, + old_n_core_fields, + old_n_v_cols, old_v_cols, + old_v_col_names, + col_map); } /** @return whether this is instant ALTER TABLE */ @@ -359,6 +1109,40 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx return instant_table; } + /** Create an index table where indexes are ordered as follows: + + IF a new primary key is defined for the table THEN + + 1) New primary key + 2) The remaining keys in key_info + + ELSE + + 1) All new indexes in the order they arrive from MySQL + + ENDIF + + @return key definitions */ + MY_ATTRIBUTE((nonnull, warn_unused_result, malloc)) + inline index_def_t* + create_key_defs( + const Alter_inplace_info* ha_alter_info, + /*!< in: alter operation */ + const TABLE* altered_table, + /*!< in: MySQL table that is being altered */ + ulint& n_fts_add, + /*!< out: number of FTS indexes to be created */ + ulint& fts_doc_id_col, + /*!< in: The column number for Doc ID */ + bool& add_fts_doc_id, + /*!< in: whether we need to add new DOC ID + column for FTS index */ + bool& add_fts_doc_idx, + /*!< in: whether we need to add new DOC ID + index for FTS index */ + const TABLE* table); + /*!< in: MySQL table that is being altered */ + /** Share context between partitions. 
@param[in] ctx context from another partition of the table */ void set_shared_data(const inplace_alter_handler_ctx& ctx) @@ -617,18 +1401,14 @@ check_v_col_in_order( & ALTER_ADD_VIRTUAL_COLUMN) { bool has_new = false; - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); - - cf_it.rewind(); - - while (const Create_field* new_field = cf_it++) { - if (new_field->stored_in_db()) { + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + if (new_field.stored_in_db()) { continue; } /* Found a new added virtual column. */ - if (!new_field->field) { + if (!new_field.field) { has_new = true; continue; } @@ -692,20 +1472,237 @@ check_v_col_in_order( } /** Determine if an instant operation is possible for altering columns. +@param[in] ib_table InnoDB table definition @param[in] ha_alter_info the ALTER TABLE operation -@param[in] table table definition before ALTER TABLE */ +@param[in] table table definition before ALTER TABLE +@param[in] altered_table table definition after ALTER TABLE +@param[in] strict whether to ensure that user records fit */ static bool instant_alter_column_possible( + const dict_table_t& ib_table, const Alter_inplace_info* ha_alter_info, - const TABLE* table) + const TABLE* table, + const TABLE* altered_table, + bool strict) { + const dict_index_t* const pk = ib_table.indexes.start; + ut_ad(pk->is_primary()); + ut_ad(!pk->has_virtual()); + + if (ha_alter_info->handler_flags + & (ALTER_STORED_COLUMN_ORDER | ALTER_DROP_STORED_COLUMN + | ALTER_ADD_STORED_BASE_COLUMN)) { +#if 1 // MDEV-17459: adjust fts_fetch_doc_from_rec() and friends; remove this + if (ib_table.fts || innobase_fulltext_exist(altered_table)) + return false; +#endif +#if 1 // MDEV-17468: fix bugs with indexed virtual columns & remove this + for (const dict_index_t* index = ib_table.indexes.start; + index; index = index->indexes.next) { + if (index->has_virtual()) { + ut_ad(ib_table.n_v_cols); + return false; + } + } +#endif + uint 
n_add = 0, n_nullable = 0, lenlen = 0; + const uint blob_prefix = dict_table_has_atomic_blobs(&ib_table) + ? 0 + : REC_ANTELOPE_MAX_INDEX_COL_LEN; + const uint min_local_len = blob_prefix + ? blob_prefix + FIELD_REF_SIZE + : 2 * FIELD_REF_SIZE; + size_t min_size = 0, max_size = 0; + Field** af = altered_table->field; + Field** const end = altered_table->field + + altered_table->s->fields; + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + + for (; af < end; af++) { + const Create_field* cf = cf_it++; + if (!(*af)->stored_in_db() || cf->field) { + /* Virtual or pre-existing column */ + continue; + } + const bool nullable = (*af)->real_maybe_null(); + const bool is_null = (*af)->is_real_null(); + ut_ad(!is_null || nullable); + n_nullable += nullable; + n_add++; + uint l; + switch ((*af)->type()) { + case MYSQL_TYPE_VARCHAR: + l = reinterpret_cast<const Field_varstring*> + (*af)->get_length(); + variable_length: + if (l >= min_local_len) { + max_size += blob_prefix + + FIELD_REF_SIZE; + if (!is_null) { + min_size += blob_prefix + + FIELD_REF_SIZE; + } + lenlen += 2; + } else { + if (!is_null) { + min_size += l; + } + l = (*af)->pack_length(); + max_size += l; + lenlen += l > 255 ? 
2 : 1; + } + break; + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + l = reinterpret_cast<const Field_blob*> + ((*af))->get_length(); + goto variable_length; + default: + l = (*af)->pack_length(); + if (l > 255 && ib_table.not_redundant()) { + goto variable_length; + } + max_size += l; + if (!is_null) { + min_size += l; + } + } + } + + ulint n_fields = pk->n_fields + n_add; + + if (n_fields >= REC_MAX_N_USER_FIELDS + DATA_N_SYS_COLS) { + return false; + } + + if (pk->is_gen_clust()) { + min_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + + DATA_ROW_ID_LEN; + max_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + + DATA_ROW_ID_LEN; + } else { + min_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + max_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } + + uint i = pk->n_fields; + while (i-- > pk->n_core_fields) { + const dict_field_t& f = pk->fields[i]; + if (f.col->is_nullable()) { + n_nullable++; + if (!f.col->is_dropped() + && f.col->def_val.data) { + goto instantly_added_column; + } + } else if (f.fixed_len + && (f.fixed_len <= 255 + || !ib_table.not_redundant())) { + if (ib_table.not_redundant() + || !f.col->is_dropped()) { + min_size += f.fixed_len; + max_size += f.fixed_len; + } + } else if (f.col->is_dropped() || !f.col->is_added()) { + lenlen++; + goto set_max_size; + } else { +instantly_added_column: + ut_ad(f.col->is_added()); + if (f.col->def_val.len >= min_local_len) { + min_size += blob_prefix + + FIELD_REF_SIZE; + lenlen += 2; + } else { + min_size += f.col->def_val.len; + lenlen += f.col->def_val.len + > 255 ? 
2 : 1; + } +set_max_size: + if (f.fixed_len + && (f.fixed_len <= 255 + || !ib_table.not_redundant())) { + max_size += f.fixed_len; + } else if (f.col->len >= min_local_len) { + max_size += blob_prefix + + FIELD_REF_SIZE; + } else { + max_size += f.col->len; + } + } + } + + do { + const dict_field_t& f = pk->fields[i]; + if (f.col->is_nullable()) { + n_nullable++; + } else if (f.fixed_len) { + min_size += f.fixed_len; + } else { + lenlen++; + } + } while (i--); + + if (ib_table.instant + || (ha_alter_info->handler_flags + & (ALTER_STORED_COLUMN_ORDER + | ALTER_DROP_STORED_COLUMN))) { + n_fields++; + lenlen += 2; + min_size += FIELD_REF_SIZE; + } + + if (ib_table.not_redundant()) { + min_size += REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(n_nullable) + + lenlen; + } else { + min_size += (n_fields > 255 || min_size > 255) + ? n_fields * 2 : n_fields; + min_size += REC_N_OLD_EXTRA_BYTES; + } + + if (page_zip_rec_needs_ext(min_size, ib_table.not_redundant(), + 0, 0)) { + return false; + } + + if (strict && page_zip_rec_needs_ext(max_size, + ib_table.not_redundant(), + 0, 0)) { + return false; + } + } // Making table system-versioned instantly is not implemented yet. if (ha_alter_info->handler_flags & ALTER_ADD_SYSTEM_VERSIONING) { return false; } - if (~ha_alter_info->handler_flags & ALTER_ADD_STORED_BASE_COLUMN) { + static constexpr alter_table_operations avoid_rebuild + = ALTER_ADD_STORED_BASE_COLUMN + | ALTER_DROP_STORED_COLUMN + | ALTER_STORED_COLUMN_ORDER + | ALTER_COLUMN_NULLABLE; + + if (!(ha_alter_info->handler_flags & avoid_rebuild)) { + alter_table_operations flags = ha_alter_info->handler_flags + & ~avoid_rebuild; + /* None of the flags are set that we can handle + specially to avoid rebuild. In this case, we can + allow ALGORITHM=INSTANT, except if some requested + operation requires that the table be rebuilt. 
*/ + if (flags & INNOBASE_ALTER_REBUILD) { + return false; + } + if ((flags & ALTER_OPTIONS) + && alter_options_need_rebuild(ha_alter_info, table)) { + return false; + } + } else if (!ib_table.supports_instant()) { return false; } @@ -728,12 +1725,59 @@ instant_alter_column_possible( columns. */ if (ha_alter_info->handler_flags & ((INNOBASE_ALTER_REBUILD | INNOBASE_ONLINE_CREATE) - & ~ALTER_ADD_STORED_BASE_COLUMN & ~ALTER_OPTIONS)) { + & ~ALTER_DROP_STORED_COLUMN + & ~ALTER_STORED_COLUMN_ORDER + & ~ALTER_ADD_STORED_BASE_COLUMN + & ~ALTER_COLUMN_NULLABLE + & ~ALTER_OPTIONS)) { return false; } - return !(ha_alter_info->handler_flags & ALTER_OPTIONS) - || !alter_options_need_rebuild(ha_alter_info, table); + if ((ha_alter_info->handler_flags & ALTER_OPTIONS) + && alter_options_need_rebuild(ha_alter_info, table)) { + return false; + } + + if (ha_alter_info->handler_flags & ALTER_COLUMN_NULLABLE) { + if (ib_table.not_redundant()) { + /* Instantaneous removal of NOT NULL is + only supported for ROW_FORMAT=REDUNDANT. */ + return false; + } + if (ib_table.fts_doc_id_index + && !innobase_fulltext_exist(altered_table)) { + /* Removing hidden FTS_DOC_ID_INDEX(FTS_DOC_ID) + requires that the table be rebuilt. */ + return false; + } + + Field** af = altered_table->field; + Field** const end = altered_table->field + + altered_table->s->fields; + for (unsigned c = 0; af < end; af++) { + if (!(*af)->stored_in_db()) { + continue; + } + + const dict_col_t* col = dict_table_get_nth_col( + &ib_table, c++); + + if (!col->ord_part || col->is_nullable() + || !(*af)->real_maybe_null()) { + continue; + } + + /* The column would be changed from NOT NULL. + Ensure that it is not a clustered index key. 
*/ + for (auto i = pk->n_uniq; i--; ) { + if (pk->fields[i].col == col) { + return false; + } + } + } + } + + return true; } /** Check whether the non-const default value for the field @@ -880,7 +1924,7 @@ static bool innobase_table_is_empty(const dict_table_t *table) btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); btr_pcur_move_to_next_user_rec(&pcur, &mtr); - if (!rec_is_metadata(btr_pcur_get_rec(&pcur), clust_index)) + if (!rec_is_metadata(btr_pcur_get_rec(&pcur), *clust_index)) btr_pcur_move_to_prev_on_page(&pcur); scan_leaf: cur= btr_pcur_get_page_cur(&pcur); @@ -898,7 +1942,7 @@ next_page: next_page= false; block= page_cur_get_block(cur); block= btr_block_get(page_id_t(block->page.id.space(), next_page_no), - block->page.size, BTR_SEARCH_LEAF, clust_index, + block->page.zip_size(), BTR_SEARCH_LEAF, clust_index, &mtr); btr_leaf_page_release(page_cur_get_block(cur), BTR_SEARCH_LEAF, &mtr); page_cur_set_before_first(block, cur); @@ -1013,10 +2057,13 @@ ha_innobase::check_if_supported_inplace_alter( switch (innodb_instant_alter_column_allowed) { case 0: /* never */ if ((ha_alter_info->handler_flags - & ALTER_ADD_STORED_BASE_COLUMN) + & (ALTER_ADD_STORED_BASE_COLUMN + | ALTER_STORED_COLUMN_ORDER + | ALTER_DROP_STORED_COLUMN)) || m_prebuilt->table->is_instant()) { reason_rebuild = "innodb_instant_alter_column_allowed=never"; +innodb_instant_alter_column_allowed_reason: if (ha_alter_info->handler_flags & ALTER_RECREATE_TABLE) { reason_rebuild = NULL; @@ -1028,6 +2075,14 @@ ha_innobase::check_if_supported_inplace_alter( } } break; + case 1: /* add_last */ + if ((ha_alter_info->handler_flags + & (ALTER_STORED_COLUMN_ORDER | ALTER_DROP_STORED_COLUMN)) + || m_prebuilt->table->instant) { + reason_rebuild = "innodb_instant_atler_column_allowed=" + "add_last"; + goto innodb_instant_alter_column_allowed_reason; + } } switch (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) { @@ -1139,55 +2194,16 @@ 
ha_innobase::check_if_supported_inplace_alter( DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } - bool add_drop_v_cols = false; - - /* If there is add or drop virtual columns, we will support operations - with these 2 options alone with inplace interface for now */ - - if (ha_alter_info->handler_flags - & (ALTER_ADD_VIRTUAL_COLUMN - | ALTER_DROP_VIRTUAL_COLUMN - | ALTER_VIRTUAL_COLUMN_ORDER)) { - ulonglong flags = ha_alter_info->handler_flags; - - /* TODO: uncomment the flags below, once we start to - support them */ - - flags &= ~(ALTER_ADD_VIRTUAL_COLUMN - | ALTER_DROP_VIRTUAL_COLUMN - | ALTER_VIRTUAL_COLUMN_ORDER - | ALTER_VIRTUAL_GCOL_EXPR - | ALTER_COLUMN_VCOL - /* - | ALTER_ADD_STORED_BASE_COLUMN - | ALTER_DROP_STORED_COLUMN - | ALTER_STORED_COLUMN_ORDER - | ALTER_ADD_UNIQUE_INDEX - */ - | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX - | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX); - - if (flags != 0 - || IF_PARTITIONING((altered_table->s->partition_info_str - && altered_table->s->partition_info_str_len), 0) - || (!check_v_col_in_order( - this->table, altered_table, ha_alter_info))) { - ha_alter_info->unsupported_reason = - MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; - DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); - } - - add_drop_v_cols = true; - } + const bool add_drop_v_cols = !!(ha_alter_info->handler_flags + & (ALTER_ADD_VIRTUAL_COLUMN + | ALTER_DROP_VIRTUAL_COLUMN + | ALTER_VIRTUAL_COLUMN_ORDER)); /* We should be able to do the operation in-place. See if we can do it online (LOCK=NONE) or without rebuild. */ bool online = true, need_rebuild = false; const uint fulltext_indexes = innobase_fulltext_exist(altered_table); - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); - /* Fix the key parts. 
*/ for (KEY* new_key = ha_alter_info->key_info_buffer; new_key < ha_alter_info->key_info_buffer @@ -1208,18 +2224,12 @@ ha_innobase::check_if_supported_inplace_alter( key_part < (new_key->key_part + new_key->user_defined_key_parts); key_part++) { - const Create_field* new_field; - DBUG_ASSERT(key_part->fieldnr < altered_table->s->fields); - cf_it.rewind(); - for (uint fieldnr = 0; (new_field = cf_it++); - fieldnr++) { - if (fieldnr == key_part->fieldnr) { - break; - } - } + const Create_field* new_field + = ha_alter_info->alter_info->create_list.elem( + key_part->fieldnr); DBUG_ASSERT(new_field); @@ -1346,17 +2356,17 @@ ha_innobase::check_if_supported_inplace_alter( DEFAULT value, ensure that the DEFAULT expression is a constant. Also, in ADD COLUMN, for now we only support a constant DEFAULT expression. */ - cf_it.rewind(); Field **af = altered_table->field; - bool add_column_not_last = false; - uint n_stored_cols = 0, n_add_cols = 0; + bool fts_need_rebuild = false; + need_rebuild = need_rebuild + || innobase_need_rebuild(ha_alter_info, table); - while (Create_field* cf = cf_it++) { - DBUG_ASSERT(cf->field + for (Create_field& cf : ha_alter_info->alter_info->create_list) { + DBUG_ASSERT(cf.field || (ha_alter_info->handler_flags & ALTER_ADD_COLUMN)); - if (const Field* f = cf->field) { + if (const Field* f = cf.field) { if (!f->real_maybe_null() || (*af)->real_maybe_null()) goto next_column; /* We are changing an existing column @@ -1398,43 +2408,74 @@ ha_innobase::check_if_supported_inplace_alter( ha_alter_info->unsupported_reason = my_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL); - } else if (!is_non_const_value(*af)) { - - n_add_cols++; - - if (af < &altered_table->field[table_share->fields]) { - add_column_not_last = true; - } - - if (set_default_value(*af)) { - goto next_column; + } else if (!is_non_const_value(*af) + && set_default_value(*af)) { + if (fulltext_indexes > 1 + && !my_strcasecmp(system_charset_info, + (*af)->field_name.str, + 
FTS_DOC_ID_COL_NAME)) { + /* If a hidden FTS_DOC_ID column exists + (because of FULLTEXT INDEX), it cannot + be replaced with a user-created one + except when using ALGORITHM=COPY. */ + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_INNODB_FT_LIMIT); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } + goto next_column; } DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); next_column: - n_stored_cols += (*af++)->stored_in_db(); + af++; } - if (!add_column_not_last - && uint(m_prebuilt->table->n_cols) - DATA_N_SYS_COLS + n_add_cols - == n_stored_cols - && m_prebuilt->table->supports_instant() - && instant_alter_column_possible(ha_alter_info, table)) { + const bool supports_instant = instant_alter_column_possible( + *m_prebuilt->table, ha_alter_info, table, altered_table, + is_innodb_strict_mode()); + if (add_drop_v_cols) { + ulonglong flags = ha_alter_info->handler_flags; - DBUG_RETURN(HA_ALTER_INPLACE_INSTANT); + /* TODO: uncomment the flags below, once we start to + support them */ + + flags &= ~(ALTER_ADD_VIRTUAL_COLUMN + | ALTER_DROP_VIRTUAL_COLUMN + | ALTER_VIRTUAL_COLUMN_ORDER + | ALTER_VIRTUAL_GCOL_EXPR + | ALTER_COLUMN_VCOL + /* + | ALTER_ADD_STORED_BASE_COLUMN + | ALTER_DROP_STORED_COLUMN + | ALTER_STORED_COLUMN_ORDER + | ALTER_ADD_UNIQUE_INDEX + */ + | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX); + if (supports_instant) { + flags &= ~(ALTER_DROP_STORED_COLUMN +#if 0 /* MDEV-17468: remove check_v_col_in_order() and fix the code */ + | ALTER_ADD_STORED_BASE_COLUMN +#endif + | ALTER_STORED_COLUMN_ORDER); + } + if (flags != 0 + || IF_PARTITIONING((altered_table->s->partition_info_str + && altered_table->s->partition_info_str_len), 0) + || (!check_v_col_in_order( + this->table, altered_table, ha_alter_info))) { + ha_alter_info->unsupported_reason = + MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } } - if (!(ha_alter_info->handler_flags & ~(INNOBASE_ALTER_INSTANT - | 
INNOBASE_INPLACE_IGNORE))) { + if (supports_instant && !(ha_alter_info->handler_flags + & INNOBASE_ALTER_NOREBUILD)) { DBUG_RETURN(HA_ALTER_INPLACE_INSTANT); } - bool fts_need_rebuild = false; - need_rebuild = need_rebuild - || innobase_need_rebuild(ha_alter_info, table); - if (need_rebuild && (fulltext_indexes || innobase_spatial_exist(altered_table) @@ -1541,7 +2582,7 @@ cannot_create_many_fulltext_index: online = false; } - if (need_rebuild || fts_need_rebuild) { + if ((need_rebuild && !supports_instant) || fts_need_rebuild) { ha_alter_info->handler_flags |= ALTER_RECREATE_TABLE; DBUG_RETURN(online ? HA_ALTER_INPLACE_COPY_NO_LOCK @@ -1585,7 +2626,7 @@ innobase_init_foreign( ulint referenced_num_field) /*!< in: number of referenced columns */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); if (constraint_name) { ulint db_len; @@ -1919,8 +2960,6 @@ innobase_get_foreign_key_info( const trx_t* trx, dict_s_col_list*s_cols) { - Key* key; - Foreign_key* fk_key; dict_table_t* referenced_table = NULL; char* referenced_table_name = NULL; ulint num_fk = 0; @@ -1930,10 +2969,8 @@ innobase_get_foreign_key_info( *n_add_fk = 0; - List_iterator<Key> key_iterator(alter_info->key_list); - - while ((key=key_iterator++)) { - if (key->type != Key::FOREIGN_KEY) { + for (Key& key : alter_info->key_list) { + if (key.type != Key::FOREIGN_KEY) { continue; } @@ -1951,18 +2988,15 @@ innobase_get_foreign_key_info( char db_name[MAX_DATABASE_NAME_LEN]; char tbl_name[MAX_TABLE_NAME_LEN]; - fk_key = static_cast<Foreign_key*>(key); + Foreign_key* fk_key = static_cast<Foreign_key*>(&key); if (fk_key->columns.elements > 0) { ulint i = 0; - Key_part_spec* column; - List_iterator<Key_part_spec> key_part_iterator( - fk_key->columns); /* Get all the foreign key column info for the current table */ - while ((column = key_part_iterator++)) { - column_names[i] = column->field_name.str; + for (const Key_part_spec& column : fk_key->columns) { + column_names[i] = 
column.field_name.str; ut_ad(i < MAX_NUM_FK_COLUMNS); i++; } @@ -2029,7 +3063,7 @@ innobase_get_foreign_key_info( db_namep = &db_name[0]; } #endif - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); referenced_table_name = dict_get_referenced_table( table->name.m_name, @@ -2047,7 +3081,7 @@ innobase_get_foreign_key_info( referenced_table = NULL;); if (!referenced_table && trx->check_foreigns) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); my_error(ER_FK_CANNOT_OPEN_PARENT, MYF(0), tbl_namep); @@ -2056,13 +3090,10 @@ innobase_get_foreign_key_info( if (fk_key->ref_columns.elements > 0) { ulint i = 0; - Key_part_spec* column; - List_iterator<Key_part_spec> key_part_iterator( - fk_key->ref_columns); - while ((column = key_part_iterator++)) { + for (Key_part_spec &column : fk_key->ref_columns) { referenced_column_names[i] = - column->field_name.str; + column.field_name.str; ut_ad(i < MAX_NUM_FK_COLUMNS); i++; } @@ -2083,7 +3114,7 @@ innobase_get_foreign_key_info( /* Check whether there exist such index in the the index create clause */ if (!referenced_index) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); my_error(ER_FK_NO_INDEX_PARENT, MYF(0), fk_key->name.str ? 
fk_key->name.str : "", @@ -2098,7 +3129,7 @@ innobase_get_foreign_key_info( } else { /* Not possible to add a foreign key without a referenced column */ - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); my_error(ER_CANNOT_ADD_FOREIGN, MYF(0), tbl_namep); goto err_exit; } @@ -2109,7 +3140,7 @@ innobase_get_foreign_key_info( num_col, referenced_table_name, referenced_table, referenced_index, referenced_column_names, referenced_num_col)) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); my_error( ER_DUP_CONSTRAINT_NAME, MYF(0), @@ -2117,7 +3148,7 @@ innobase_get_foreign_key_info( goto err_exit; } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); correct_option = innobase_set_foreign_key_option( add_fk[num_fk], fk_key); @@ -2397,9 +3428,9 @@ innobase_row_to_mysql( } } if (table->vfield) { - my_bitmap_map* old_vcol_set = tmp_use_all_columns(table, table->vcol_set); + my_bitmap_map* old_read_set = tmp_use_all_columns(table, table->read_set); table->update_virtual_fields(table->file, VCOL_UPDATE_FOR_READ); - tmp_restore_column_map(table->vcol_set, old_vcol_set); + tmp_restore_column_map(table->read_set, old_read_set); } } @@ -2474,7 +3505,6 @@ innobase_check_index_keys( } } - my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key.name.str); return(ER_WRONG_NAME_FOR_INDEX); @@ -2827,8 +3857,7 @@ innobase_fts_check_doc_id_index_in_def( return(FTS_NOT_EXIST_DOC_ID_INDEX); } -/*******************************************************************//** -Create an index table where indexes are ordered as follows: +/** Create an index table where indexes are ordered as follows: IF a new primary key is defined for the table THEN @@ -2842,23 +3871,15 @@ ELSE ENDIF @return key definitions */ -static MY_ATTRIBUTE((nonnull, warn_unused_result, malloc)) -index_def_t* -innobase_create_key_defs( -/*=====================*/ - mem_heap_t* heap, - /*!< in/out: memory heap where space for key - definitions are allocated */ +MY_ATTRIBUTE((nonnull, 
warn_unused_result, malloc)) +inline index_def_t* +ha_innobase_inplace_ctx::create_key_defs( const Alter_inplace_info* ha_alter_info, /*!< in: alter operation */ const TABLE* altered_table, /*!< in: MySQL table that is being altered */ - ulint& n_add, - /*!< in/out: number of indexes to be created */ ulint& n_fts_add, /*!< out: number of FTS indexes to be created */ - bool got_default_clust, - /*!< in: whether the table lacks a primary key */ ulint& fts_doc_id_col, /*!< in: The column number for Doc ID */ bool& add_fts_doc_id, @@ -2870,6 +3891,9 @@ innobase_create_key_defs( const TABLE* table) /*!< in: MySQL table that is being altered */ { + ulint& n_add = num_to_add_index; + const bool got_default_clust = new_table->indexes.start->is_gen_clust(); + index_def_t* indexdef; index_def_t* indexdefs; bool new_primary; @@ -2878,7 +3902,7 @@ innobase_create_key_defs( const KEY*const key_info = ha_alter_info->key_info_buffer; - DBUG_ENTER("innobase_create_key_defs"); + DBUG_ENTER("ha_innobase_inplace_ctx::create_key_defs"); DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_idx); DBUG_ASSERT(ha_alter_info->index_add_count == n_add); @@ -3061,7 +4085,7 @@ online_retry_drop_indexes_low( dict_table_t* table, /*!< in/out: table */ trx_t* trx) /*!< in/out: transaction */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); @@ -3098,9 +4122,9 @@ online_retry_drop_indexes( trx_free(trx); } - ut_d(mutex_enter(&dict_sys->mutex)); + ut_d(mutex_enter(&dict_sys.mutex)); ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE)); - ut_d(mutex_exit(&dict_sys->mutex)); + ut_d(mutex_exit(&dict_sys.mutex)); ut_ad(!table->drop_aborted); } @@ -3175,7 +4199,7 @@ innobase_check_foreigns_low( bool drop) { dict_foreign_t* foreign; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* Check if any FOREIGN KEY constraints are defined on this 
column. */ @@ -3274,26 +4298,21 @@ innobase_check_foreigns( dict_foreign_t** drop_fk, ulint n_drop_fk) { - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); - for (Field** fp = old_table->field; *fp; fp++) { - cf_it.rewind(); - const Create_field* new_field; - ut_ad(!(*fp)->real_maybe_null() == !!((*fp)->flags & NOT_NULL_FLAG)); - while ((new_field = cf_it++)) { - if (new_field->field == *fp) { - break; - } - } + auto end = ha_alter_info->alter_info->create_list.end(); + auto it = std::find_if( + ha_alter_info->alter_info->create_list.begin(), end, + [fp](const Create_field& field) { + return field.field == *fp; + }); - if (!new_field || (new_field->flags & NOT_NULL_FLAG)) { + if (it == end || (it->flags & NOT_NULL_FLAG)) { if (innobase_check_foreigns_low( user_table, drop_fk, n_drop_fk, - (*fp)->field_name.str, !new_field)) { + (*fp)->field_name.str, it == end)) { return(true); } } @@ -3306,7 +4325,7 @@ innobase_check_foreigns( @param[in,out] heap Memory heap where allocated @param[out] dfield InnoDB data field to copy to @param[in] field MySQL value for the column -@param[in] old_field Old field or NULL if new col is added +@param[in] old_field Old column if altering; NULL for ADD COLUMN @param[in] comp nonzero if in compact format. */ static void innobase_build_col_map_add( mem_heap_t* heap, @@ -3325,14 +4344,13 @@ static void innobase_build_col_map_add( return; } - ulint size = field->pack_length(); + const Field& from = old_field ? *old_field : *field; + ulint size = from.pack_length(); byte* buf = static_cast<byte*>(mem_heap_alloc(heap, size)); - const byte* mysql_data = old_field ? 
old_field->ptr : field->ptr; - row_mysql_store_col_in_innobase_format( - dfield, buf, true, mysql_data, size, comp); + dfield, buf, true, from.ptr, size, comp); } /** Construct the translation table for reordering, dropping or @@ -3354,7 +4372,7 @@ innobase_build_col_map( Alter_inplace_info* ha_alter_info, const TABLE* altered_table, const TABLE* table, - const dict_table_t* new_table, + dict_table_t* new_table, const dict_table_t* old_table, dtuple_t* defaults, mem_heap_t* heap) @@ -3385,8 +4403,6 @@ innobase_build_col_map( heap, (size_t(old_table->n_cols) + old_n_v_cols) * sizeof *col_map)); - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); uint i = 0; uint num_v = 0; @@ -3402,14 +4418,15 @@ innobase_build_col_map( const bool omits_virtual = ha_innobase::omits_virtual_cols(*table->s); - while (const Create_field* new_field = cf_it++) { - bool is_v = !new_field->stored_in_db(); + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + bool is_v = !new_field.stored_in_db(); ulint num_old_v = 0; for (uint old_i = 0; table->field[old_i]; old_i++) { const Field* field = table->field[old_i]; if (!field->stored_in_db()) { - if (is_v && new_field->field == field) { + if (is_v && new_field.field == field) { if (!omits_virtual) { col_map[old_table->n_cols + num_v] @@ -3422,7 +4439,7 @@ innobase_build_col_map( continue; } - if (new_field->field == field) { + if (new_field.field == field) { const Field* altered_field = altered_table->field[i + num_v]; @@ -3439,16 +4456,25 @@ innobase_build_col_map( } col_map[old_i - num_old_v] = i; + if (old_table->versioned() + && altered_table->versioned()) { + if (old_i == old_table->vers_start) { + new_table->vers_start = i + num_v; + } else if (old_i == old_table->vers_end) { + new_table->vers_end = i + num_v; + } + } goto found_col; } } - ut_ad(!is_v); - innobase_build_col_map_add( - heap, dtuple_get_nth_field(defaults, i), - altered_table->field[i + num_v], - NULL, - 
dict_table_is_comp(new_table)); + if (!is_v) { + innobase_build_col_map_add( + heap, dtuple_get_nth_field(defaults, i), + altered_table->field[i + num_v], + NULL, + dict_table_is_comp(new_table)); + } found_col: if (is_v) { num_v++; @@ -3565,21 +4591,20 @@ innobase_get_col_names( mem_heap_zalloc(heap, user_table->n_def * sizeof *cols)); i = 0; - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); - while (const Create_field* new_field = cf_it++) { + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { ulint num_v = 0; DBUG_ASSERT(i < altered_table->s->fields); - if (!new_field->stored_in_db()) { + if (!new_field.stored_in_db()) { continue; } for (uint old_i = 0; table->field[old_i]; old_i++) { num_v += !table->field[old_i]->stored_in_db(); - if (new_field->field == table->field[old_i]) { - cols[old_i - num_v] = new_field->field_name.str; + if (new_field.field == table->field[old_i]) { + cols[old_i - num_v] = new_field.field_name.str; break; } } @@ -3800,8 +4825,7 @@ innobase_update_gis_column_type( DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); info = pars_info_create(); @@ -3914,13 +4938,12 @@ prepare_inplace_add_virtual( ha_innobase_inplace_ctx* ctx; ulint i = 0; ulint j = 0; - const Create_field* new_field; ctx = static_cast<ha_innobase_inplace_ctx*> (ha_alter_info->handler_ctx); - ctx->num_to_add_vcol = altered_table->s->fields - + ctx->num_to_drop_vcol - table->s->fields; + ctx->num_to_add_vcol = altered_table->s->virtual_fields + + ctx->num_to_drop_vcol - table->s->virtual_fields; ctx->add_vcol = static_cast<dict_v_col_t*>( mem_heap_zalloc(ctx->heap, ctx->num_to_add_vcol @@ -3929,46 +4952,22 @@ prepare_inplace_add_virtual( mem_heap_alloc(ctx->heap, ctx->num_to_add_vcol * sizeof *ctx->add_vcol_name)); - 
List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); - - while ((new_field = (cf_it++)) != NULL) { - const Field* field = new_field->field; - ulint old_i; - - for (old_i = 0; table->field[old_i]; old_i++) { - const Field* n_field = table->field[old_i]; - if (field == n_field) { - break; - } - } - - i++; + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + const Field* field = altered_table->field[i++]; - if (table->field[old_i]) { + if (new_field.field || field->stored_in_db()) { continue; } - ut_ad(!field); - - ulint col_len; ulint is_unsigned; - ulint field_type; ulint charset_no; - - field = altered_table->field[i - 1]; - ulint col_type = get_innobase_type_from_mysql_type( &is_unsigned, field); - - if (field->stored_in_db()) { - continue; - } - - col_len = field->pack_length(); - field_type = (ulint) field->type(); + ulint col_len = field->pack_length(); + ulint field_type = (ulint) field->type(); if (!field->real_maybe_null()) { field_type |= DATA_NOT_NULL; @@ -4010,7 +5009,7 @@ prepare_inplace_add_virtual( } } - + new (&ctx->add_vcol[j]) dict_v_col_t(); ctx->add_vcol[j].m_col.prtype = dtype_form_prtype( field_type, charset_no); @@ -4027,8 +5026,8 @@ prepare_inplace_add_virtual( ctx->add_vcol[j].v_pos = ctx->old_table->n_v_cols - ctx->num_to_drop_vcol + j; - /* No need to track the list */ - ctx->add_vcol[j].v_indexes = NULL; + ctx->add_vcol[j].n_v_indexes = 0; + /* MDEV-17468: Do this on ctx->instant_table later */ innodb_base_col_setup(ctx->old_table, field, &ctx->add_vcol[j]); j++; } @@ -4155,33 +5154,96 @@ prepare_inplace_drop_virtual( @param[in] pos virtual column column no @param[in] base_pos base column pos @param[in] trx transaction -@return DB_SUCCESS if successful, otherwise error code */ -static -dberr_t -innobase_insert_sys_virtual( +@retval false on success +@retval true on failure (my_error() will have been called) */ +static bool innobase_insert_sys_virtual( const dict_table_t* table, 
ulint pos, ulint base_pos, trx_t* trx) { pars_info_t* info = pars_info_create(); - pars_info_add_ull_literal(info, "id", table->id); + pars_info_add_int4_literal(info, "pos", pos); + pars_info_add_int4_literal(info, "base_pos", base_pos); + + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_VIRTUAL VALUES (:id, :pos, :base_pos);\n" + "END;\n", + FALSE, trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: ADD COLUMN...VIRTUAL"); + return true; + } + return false; +} + +/** Insert a record to the SYS_COLUMNS dictionary table. +@param[in] table_id table id +@param[in] pos position of the column +@param[in] field_name field name +@param[in] mtype main type +@param[in] prtype precise type +@param[in] len fixed length in bytes, or 0 +@param[in] n_base number of base columns of virtual columns, or 0 +@param[in] update whether to update instead of inserting +@retval false on success +@retval true on failure (my_error() will have been called) */ +static bool innodb_insert_sys_columns( + table_id_t table_id, + ulint pos, + const char* field_name, + ulint mtype, + ulint prtype, + ulint len, + ulint n_base, + trx_t* trx, + bool update = false) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table_id); pars_info_add_int4_literal(info, "pos", pos); + pars_info_add_str_literal(info, "name", field_name); + pars_info_add_int4_literal(info, "mtype", mtype); + pars_info_add_int4_literal(info, "prtype", prtype); + pars_info_add_int4_literal(info, "len", len); + pars_info_add_int4_literal(info, "base", n_base); + + if (update) { + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE UPD_COL () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET\n" + "NAME=:name, MTYPE=:mtype, PRTYPE=:prtype, " + "LEN=:len, PREC=:base\n" + "WHERE TABLE_ID=:id AND POS=:pos;\n" + "END;\n", FALSE, trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Updating SYS_COLUMNS failed"); + return true; + } - 
pars_info_add_int4_literal(info, "base_pos", base_pos); + return false; + } - dberr_t error = que_eval_sql( - info, - "PROCEDURE P () IS\n" - "BEGIN\n" - "INSERT INTO SYS_VIRTUAL VALUES" - "(:id, :pos, :base_pos);\n" - "END;\n", - FALSE, trx); + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE ADD_COL () IS\n" + "BEGIN\n" + "INSERT INTO SYS_COLUMNS VALUES" + "(:id,:pos,:name,:mtype,:prtype,:len,:base);\n" + "END;\n", FALSE, trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Insert into SYS_COLUMNS failed"); + return true; + } - return(error); + return false; } /** Update INNODB SYS_COLUMNS on new virtual columns @@ -4189,10 +5251,9 @@ innobase_insert_sys_virtual( @param[in] col_name column name @param[in] vcol virtual column @param[in] trx transaction -@return DB_SUCCESS if successful, otherwise error code */ -static -dberr_t -innobase_add_one_virtual( +@retval false on success +@retval true on failure (my_error() will have been called) */ +static bool innobase_add_one_virtual( const dict_table_t* table, const char* col_name, dict_v_col_t* vcol, @@ -4200,67 +5261,41 @@ innobase_add_one_virtual( { ulint pos = dict_create_v_col_pos(vcol->v_pos, vcol->m_col.ind); - ulint mtype = vcol->m_col.mtype; - ulint prtype = vcol->m_col.prtype; - ulint len = vcol->m_col.len; - pars_info_t* info = pars_info_create(); - - pars_info_add_ull_literal(info, "id", table->id); - - pars_info_add_int4_literal(info, "pos", pos); - - pars_info_add_str_literal(info, "name", col_name); - pars_info_add_int4_literal(info, "mtype", mtype); - pars_info_add_int4_literal(info, "prtype", prtype); - pars_info_add_int4_literal(info, "len", len); - pars_info_add_int4_literal(info, "prec", vcol->num_base); - dberr_t error = que_eval_sql( - info, - "PROCEDURE P () IS\n" - "BEGIN\n" - "INSERT INTO SYS_COLUMNS VALUES" - "(:id, :pos, :name, :mtype, :prtype, :len, :prec);\n" - "END;\n", - FALSE, trx); - - if (error != DB_SUCCESS) { - return(error); + if (innodb_insert_sys_columns(table->id, pos, 
col_name, + vcol->m_col.mtype, vcol->m_col.prtype, + vcol->m_col.len, vcol->num_base, trx)) { + return true; } - for (ulint i = 0; i < vcol->num_base; i++) { - error = innobase_insert_sys_virtual( - table, pos, vcol->base_col[i]->ind, trx); - if (error != DB_SUCCESS) { - return(error); + for (ulint i = 0; i < unsigned{vcol->num_base}; i++) { + if (innobase_insert_sys_virtual( + table, pos, vcol->base_col[i]->ind, trx)) { + return true; } } - return(error); + return false; } /** Update SYS_TABLES.N_COLS in the data dictionary. @param[in] user_table InnoDB table -@param[in] n_cols the new value of SYS_TABLES.N_COLS +@param[in] n the new value of SYS_TABLES.N_COLS @param[in] trx transaction @return whether the operation failed */ -static -bool -innodb_update_n_cols(const dict_table_t* table, ulint n_cols, trx_t* trx) +static bool innodb_update_cols(const dict_table_t* table, ulint n, trx_t* trx) { pars_info_t* info = pars_info_create(); - pars_info_add_int4_literal(info, "n", n_cols); + pars_info_add_int4_literal(info, "n", n); pars_info_add_ull_literal(info, "id", table->id); - dberr_t err = que_eval_sql(info, - "PROCEDURE UPDATE_N_COLS () IS\n" - "BEGIN\n" - "UPDATE SYS_TABLES SET N_COLS = :n" - " WHERE ID = :id;\n" - "END;\n", FALSE, trx); - - if (err != DB_SUCCESS) { + if (DB_SUCCESS != que_eval_sql(info, + "PROCEDURE UPDATE_N_COLS () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET N_COLS = :n" + " WHERE ID = :id;\n" + "END;\n", FALSE, trx)) { my_error(ER_INTERNAL_ERROR, MYF(0), "InnoDB: Updating SYS_TABLES.N_COLS failed"); return true; @@ -4278,297 +5313,47 @@ innodb_update_n_cols(const dict_table_t* table, ulint n_cols, trx_t* trx) static bool innobase_add_virtual_try( - Alter_inplace_info* ha_alter_info, - const dict_table_t* user_table, - trx_t* trx) + const Alter_inplace_info* ha_alter_info, + const dict_table_t* user_table, + trx_t* trx) { - ha_innobase_inplace_ctx* ctx; - dberr_t err = DB_SUCCESS; - - ctx = static_cast<ha_innobase_inplace_ctx*>( + 
ha_innobase_inplace_ctx* ctx = static_cast<ha_innobase_inplace_ctx*>( ha_alter_info->handler_ctx); for (ulint i = 0; i < ctx->num_to_add_vcol; i++) { - - err = innobase_add_one_virtual( - user_table, ctx->add_vcol_name[i], - &ctx->add_vcol[i], trx); - - if (err != DB_SUCCESS) { - my_error(ER_INTERNAL_ERROR, MYF(0), - "InnoDB: ADD COLUMN...VIRTUAL"); - return(true); + if (innobase_add_one_virtual( + user_table, ctx->add_vcol_name[i], + &ctx->add_vcol[i], trx)) { + return true; } } - - ulint n_col = unsigned(user_table->n_cols) - DATA_N_SYS_COLS; - ulint n_v_col = unsigned(user_table->n_v_cols) - + ctx->num_to_add_vcol - ctx->num_to_drop_vcol; - ulint new_n = dict_table_encode_n_col(n_col, n_v_col) - + (unsigned(user_table->flags & DICT_TF_COMPACT) << 31); - - return innodb_update_n_cols(user_table, new_n, trx); + return false; } -/** Insert into SYS_COLUMNS and insert/update the hidden metadata record -for instant ADD COLUMN. -@param[in,out] ctx ALTER TABLE context for the current partition -@param[in] altered_table MySQL table that is being altered -@param[in] table MySQL table as it is before the ALTER operation -@param[in,out] trx dictionary transaction -@retval true failure -@retval false success */ -static -bool -innobase_add_instant_try( - ha_innobase_inplace_ctx*ctx, - const TABLE* altered_table, - const TABLE* table, - trx_t* trx) +/** Delete metadata from SYS_COLUMNS and SYS_VIRTUAL. +@param[in] id table id +@param[in] pos first SYS_COLUMNS.POS +@param[in,out] trx data dictionary transaction +@retval true Failure +@retval false Success. 
*/ +static bool innobase_instant_drop_cols(table_id_t id, ulint pos, trx_t* trx) { - DBUG_ASSERT(!ctx->need_rebuild()); - - if (!ctx->is_instant()) return false; - - DBUG_ASSERT(altered_table->s->fields > table->s->fields); - DBUG_ASSERT(ctx->old_table->n_cols == ctx->old_n_cols); - - dict_table_t* user_table = ctx->old_table; - user_table->instant_add_column(*ctx->instant_table); - dict_index_t* index = dict_table_get_first_index(user_table); - /* The table may have been emptied and may have lost its - 'instant-add-ness' during this instant ADD COLUMN. */ - - /* Construct a table row of default values for the stored columns. */ - dtuple_t* row = dtuple_create(ctx->heap, user_table->n_cols); - dict_table_copy_types(row, user_table); - Field** af = altered_table->field; - Field** const end = altered_table->field + altered_table->s->fields; - - for (uint i = 0; af < end; af++) { - if (!(*af)->stored_in_db()) { - continue; - } - - dict_col_t* col = dict_table_get_nth_col(user_table, i); - DBUG_ASSERT(!strcmp((*af)->field_name.str, - dict_table_get_col_name(user_table, i))); - - dfield_t* d = dtuple_get_nth_field(row, i); - - if (col->is_instant()) { - dfield_set_data(d, col->def_val.data, - col->def_val.len); - } else if ((*af)->real_maybe_null()) { - /* Store NULL for nullable 'core' columns. */ - dfield_set_null(d); - } else { - switch ((*af)->type()) { - case MYSQL_TYPE_VARCHAR: - case MYSQL_TYPE_GEOMETRY: - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - variable_length: - /* Store the empty string for 'core' - variable-length NOT NULL columns. */ - dfield_set_data(d, field_ref_zero, 0); - break; - case MYSQL_TYPE_STRING: - if (col->mbminlen != col->mbmaxlen - && dict_table_is_comp(user_table)) { - goto variable_length; - } - /* fall through */ - default: - /* For fixed-length NOT NULL 'core' columns, - get a dummy default value from SQL. 
Note that - we will preserve the old values of these - columns when updating the metadata - record, to avoid unnecessary updates. */ - ulint len = (*af)->pack_length(); - DBUG_ASSERT(d->type.mtype != DATA_INT - || len <= 8); - row_mysql_store_col_in_innobase_format( - d, d->type.mtype == DATA_INT - ? static_cast<byte*>( - mem_heap_alloc(ctx->heap, len)) - : NULL, true, (*af)->ptr, len, - dict_table_is_comp(user_table)); - } - } - - if (i + DATA_N_SYS_COLS < ctx->old_n_cols) { - i++; - continue; - } - - pars_info_t* info = pars_info_create(); - pars_info_add_ull_literal(info, "id", user_table->id); - pars_info_add_int4_literal(info, "pos", i); - pars_info_add_str_literal(info, "name", (*af)->field_name.str); - pars_info_add_int4_literal(info, "mtype", d->type.mtype); - pars_info_add_int4_literal(info, "prtype", d->type.prtype); - pars_info_add_int4_literal(info, "len", d->type.len); + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", id); + pars_info_add_int4_literal(info, "pos", pos); - dberr_t err = que_eval_sql( + dberr_t err = que_eval_sql( info, - "PROCEDURE ADD_COL () IS\n" + "PROCEDURE DELETE_COL () IS\n" "BEGIN\n" - "INSERT INTO SYS_COLUMNS VALUES" - "(:id,:pos,:name,:mtype,:prtype,:len,0);\n" + "DELETE FROM SYS_COLUMNS WHERE\n" + "TABLE_ID = :id AND POS >= :pos;\n" + "DELETE FROM SYS_VIRTUAL WHERE TABLE_ID = :id;\n" "END;\n", FALSE, trx); - if (err != DB_SUCCESS) { - my_error(ER_INTERNAL_ERROR, MYF(0), - "InnoDB: Insert into SYS_COLUMNS failed"); - return(true); - } - - i++; - } - - if (innodb_update_n_cols(user_table, dict_table_encode_n_col( - unsigned(user_table->n_cols) - - DATA_N_SYS_COLS, - user_table->n_v_cols) - | (user_table->flags & DICT_TF_COMPACT) << 31, - trx)) { - return true; - } - - /* If the table has been discarded then change the metadata alone - and make the index to non-instant format */ - if (!user_table->space) { - index->remove_instant(); - return false; - } - - unsigned i = unsigned(user_table->n_cols) 
- DATA_N_SYS_COLS; - byte trx_id[DATA_TRX_ID_LEN], roll_ptr[DATA_ROLL_PTR_LEN]; - dfield_set_data(dtuple_get_nth_field(row, i++), field_ref_zero, - DATA_ROW_ID_LEN); - dfield_set_data(dtuple_get_nth_field(row, i++), trx_id, sizeof trx_id); - dfield_set_data(dtuple_get_nth_field(row, i),roll_ptr,sizeof roll_ptr); - DBUG_ASSERT(i + 1 == user_table->n_cols); - - trx_write_trx_id(trx_id, trx->id); - /* The DB_ROLL_PTR will be assigned later, when allocating undo log. - Silence a Valgrind warning in dtuple_validate() when - row_ins_clust_index_entry_low() searches for the insert position. */ - memset(roll_ptr, 0, sizeof roll_ptr); - - dtuple_t* entry = row_build_index_entry(row, NULL, index, ctx->heap); - entry->info_bits = REC_INFO_METADATA; - - mtr_t mtr; - mtr.start(); - index->set_modified(mtr); - btr_pcur_t pcur; - btr_pcur_open_at_index_side(true, index, BTR_MODIFY_TREE, &pcur, true, - 0, &mtr); - ut_ad(btr_pcur_is_before_first_on_page(&pcur)); - btr_pcur_move_to_next_on_page(&pcur); - - buf_block_t* block = btr_pcur_get_block(&pcur); - ut_ad(page_is_leaf(block->frame)); - ut_ad(!page_has_prev(block->frame)); - ut_ad(!buf_block_get_page_zip(block)); - const rec_t* rec = btr_pcur_get_rec(&pcur); - que_thr_t* thr = pars_complete_graph_for_exec( - NULL, trx, ctx->heap, NULL); - - dberr_t err; - if (rec_is_metadata(rec, index)) { - ut_ad(page_rec_is_user_rec(rec)); - if (!page_has_next(block->frame) - && page_rec_is_last(rec, block->frame)) { - goto empty_table; - } - /* Extend the record with the instantly added columns. */ - const unsigned n = user_table->n_cols - ctx->old_n_cols; - /* Reserve room for DB_TRX_ID,DB_ROLL_PTR and any - non-updated off-page columns in case they are moved off - page as a result of the update. 
*/ - upd_t* update = upd_create(index->n_fields, ctx->heap); - update->n_fields = n; - update->info_bits = REC_INFO_METADATA; - /* Add the default values for instantly added columns */ - for (unsigned i = 0; i < n; i++) { - upd_field_t* uf = upd_get_nth_field(update, i); - unsigned f = index->n_fields - n + i; - uf->field_no = f; - uf->new_val = entry->fields[f]; - } - rec_offs* offsets = NULL; - mem_heap_t* offsets_heap = NULL; - big_rec_t* big_rec; - err = btr_cur_pessimistic_update( - BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, - btr_pcur_get_btr_cur(&pcur), - &offsets, &offsets_heap, ctx->heap, - &big_rec, update, UPD_NODE_NO_ORD_CHANGE, - thr, trx->id, &mtr); - if (big_rec) { - if (err == DB_SUCCESS) { - err = btr_store_big_rec_extern_fields( - &pcur, offsets, big_rec, &mtr, - BTR_STORE_UPDATE); - } - - dtuple_big_rec_free(big_rec); - } - if (offsets_heap) { - mem_heap_free(offsets_heap); - } - btr_pcur_close(&pcur); - goto func_exit; - } else if (page_rec_is_supremum(rec)) { -empty_table: - /* The table is empty. */ - ut_ad(fil_page_index_page_check(block->frame)); - ut_ad(!page_has_siblings(block->frame)); - ut_ad(block->page.id.page_no() == index->page); - btr_page_empty(block, NULL, index, 0, &mtr); - index->remove_instant(); - err = DB_SUCCESS; - goto func_exit; - } - - /* Convert the table to the instant ADD COLUMN format. 
*/ - ut_ad(user_table->is_instant()); - mtr.commit(); - mtr.start(); - index->set_modified(mtr); - if (page_t* root = btr_root_get(index, &mtr)) { - if (fil_page_get_type(root) != FIL_PAGE_INDEX) { - DBUG_ASSERT(!"wrong page type"); - goto err_exit; - } - - DBUG_ASSERT(!page_is_comp(root) || !page_get_instant(root)); - mlog_write_ulint(root + FIL_PAGE_TYPE, - FIL_PAGE_TYPE_INSTANT, MLOG_2BYTES, - &mtr); - page_set_instant(root, index->n_core_fields, &mtr); - mtr.commit(); - mtr.start(); - index->set_modified(mtr); - err = row_ins_clust_index_entry_low( - BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index, - index->n_uniq, entry, 0, thr); - } else { -err_exit: - err = DB_CORRUPTION; - } - -func_exit: - mtr.commit(); - if (err != DB_SUCCESS) { - my_error_innodb(err, table->s->table_name.str, - user_table->flags); + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: DELETE from SYS_COLUMNS/SYS_VIRTUAL failed"); return true; } @@ -4746,9 +5531,9 @@ innobase_drop_one_virtual_sys_virtual( static bool innobase_drop_virtual_try( - Alter_inplace_info* ha_alter_info, - const dict_table_t* user_table, - trx_t* trx) + const Alter_inplace_info* ha_alter_info, + const dict_table_t* user_table, + trx_t* trx) { ha_innobase_inplace_ctx* ctx; dberr_t err = DB_SUCCESS; @@ -4781,14 +5566,474 @@ innobase_drop_virtual_try( } } + return false; +} + +/** Serialise metadata of dropped or reordered columns. 
+@param[in,out] heap memory heap for allocation +@param[out] field data field with the metadata */ +inline +void dict_table_t::serialise_columns(mem_heap_t* heap, dfield_t* field) const +{ + DBUG_ASSERT(instant); + const dict_index_t& index = *UT_LIST_GET_FIRST(indexes); + unsigned n_fixed = index.first_user_field(); + unsigned num_non_pk_fields = index.n_fields - n_fixed; + + ulint len = 4 + num_non_pk_fields * 2; + + byte* data = static_cast<byte*>(mem_heap_alloc(heap, len)); + + dfield_set_data(field, data, len); - ulint n_col = unsigned(user_table->n_cols) - DATA_N_SYS_COLS; - ulint n_v_col = unsigned(user_table->n_v_cols) - - ctx->num_to_drop_vcol; - ulint new_n = dict_table_encode_n_col(n_col, n_v_col) - | ((user_table->flags & DICT_TF_COMPACT) << 31); + mach_write_to_4(data, num_non_pk_fields); - return innodb_update_n_cols(user_table, new_n, trx); + data += 4; + + for (ulint i = n_fixed; i < index.n_fields; i++) { + mach_write_to_2(data, instant->field_map[i - n_fixed]); + data += 2; + } +} + +/** Construct the metadata record for instant ALTER TABLE. 
+@param[in] row dummy or default values for existing columns +@param[in,out] heap memory heap for allocations +@return metadata record */ +inline +dtuple_t* +dict_index_t::instant_metadata(const dtuple_t& row, mem_heap_t* heap) const +{ + ut_ad(is_primary()); + dtuple_t* entry; + + if (!table->instant) { + entry = row_build_index_entry(&row, NULL, this, heap); + entry->info_bits = REC_INFO_METADATA_ADD; + return entry; + } + + entry = dtuple_create(heap, n_fields + 1); + entry->n_fields_cmp = n_uniq; + entry->info_bits = REC_INFO_METADATA_ALTER; + + const dict_field_t* field = fields; + + for (uint i = 0; i <= n_fields; i++, field++) { + dfield_t* dfield = dtuple_get_nth_field(entry, i); + + if (i == first_user_field()) { + table->serialise_columns(heap, dfield); + dfield->type.metadata_blob_init(); + field--; + continue; + } + + ut_ad(!field->col->is_virtual()); + + if (field->col->is_dropped()) { + dict_col_copy_type(field->col, &dfield->type); + if (field->col->is_nullable()) { + dfield_set_null(dfield); + } else { + dfield_set_data(dfield, field_ref_zero, + field->fixed_len); + } + continue; + } + + const dfield_t* s = dtuple_get_nth_field(&row, field->col->ind); + ut_ad(dict_col_type_assert_equal(field->col, &s->type)); + *dfield = *s; + + if (dfield_is_null(dfield)) { + continue; + } + + if (dfield_is_ext(dfield)) { + ut_ad(i > first_user_field()); + ut_ad(!field->prefix_len); + ut_ad(dfield->len >= FIELD_REF_SIZE); + dfield_set_len(dfield, dfield->len - FIELD_REF_SIZE); + } + + if (!field->prefix_len) { + continue; + } + + ut_ad(field->col->ord_part); + ut_ad(i < n_uniq); + + ulint len = dtype_get_at_most_n_mbchars( + field->col->prtype, + field->col->mbminlen, field->col->mbmaxlen, + field->prefix_len, dfield->len, + static_cast<char*>(dfield_get_data(dfield))); + dfield_set_len(dfield, len); + } + + return entry; +} + +/** Insert or update SYS_COLUMNS and the hidden metadata record +for instant ALTER TABLE. 
+@param[in] ha_alter_info ALTER TABLE context +@param[in,out] ctx ALTER TABLE context for the current partition +@param[in] altered_table MySQL table that is being altered +@param[in] table MySQL table as it is before the ALTER operation +@param[in,out] trx dictionary transaction +@retval true failure +@retval false success */ +static bool innobase_instant_try( + const Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx* ctx, + const TABLE* altered_table, + const TABLE* table, + trx_t* trx) +{ + DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(ctx->is_instant()); + + dict_table_t* user_table = ctx->old_table; + + dict_index_t* index = dict_table_get_first_index(user_table); + mtr_t mtr; + mtr.start(); + /* Prevent purge from calling dict_index_t::clear_instant_add(), + to protect index->n_core_fields, index->table->instant and others + from changing during ctx->instant_column(). */ + instant_metadata_lock(*index, mtr); + const unsigned n_old_fields = index->n_fields; + const dict_col_t* old_cols = user_table->cols; + DBUG_ASSERT(user_table->n_cols == ctx->old_n_cols); + + const bool metadata_changed = ctx->instant_column(); + + DBUG_ASSERT(index->n_fields >= n_old_fields); + /* Release the page latch. Between this and the next + btr_pcur_open_at_index_side(), data fields such as + index->n_core_fields and index->table->instant could change, + but we would handle that in empty_table: below. */ + mtr.commit(); + /* The table may have been emptied and may have lost its + 'instantness' during this ALTER TABLE. */ + + /* Construct a table row of default values for the stored columns. 
*/ + dtuple_t* row = dtuple_create(ctx->heap, user_table->n_cols); + dict_table_copy_types(row, user_table); + Field** af = altered_table->field; + Field** const end = altered_table->field + altered_table->s->fields; + ut_d(List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list)); + if (ctx->first_alter_pos + && innobase_instant_drop_cols(user_table->id, + ctx->first_alter_pos - 1, trx)) { + return true; + } + for (uint i = 0; af < end; af++) { + if (!(*af)->stored_in_db()) { + ut_d(cf_it++); + continue; + } + + const dict_col_t* old = dict_table_t::find(old_cols, + ctx->col_map, + ctx->old_n_cols, i); + DBUG_ASSERT(!old || i >= ctx->old_n_cols - DATA_N_SYS_COLS + || old->ind == i + || (ctx->first_alter_pos + && old->ind >= ctx->first_alter_pos - 1)); + + dfield_t* d = dtuple_get_nth_field(row, i); + const dict_col_t* col = dict_table_get_nth_col(user_table, i); + DBUG_ASSERT(!col->is_virtual()); + DBUG_ASSERT(!col->is_dropped()); + DBUG_ASSERT(col->mtype != DATA_SYS); + DBUG_ASSERT(!strcmp((*af)->field_name.str, + dict_table_get_col_name(user_table, i))); + DBUG_ASSERT(old || col->is_added()); + + ut_d(const Create_field* new_field = cf_it++); + /* new_field->field would point to an existing column. + If it is NULL, the column was added by this ALTER TABLE. */ + ut_ad(!new_field->field == !old); + + if (col->is_added()) { + dfield_set_data(d, col->def_val.data, + col->def_val.len); + } else if ((*af)->real_maybe_null()) { + /* Store NULL for nullable 'core' columns. */ + dfield_set_null(d); + } else { + switch ((*af)->type()) { + case MYSQL_TYPE_VARCHAR: + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + variable_length: + /* Store the empty string for 'core' + variable-length NOT NULL columns. 
*/ + dfield_set_data(d, field_ref_zero, 0); + break; + case MYSQL_TYPE_STRING: + if (col->mbminlen != col->mbmaxlen + && user_table->not_redundant()) { + goto variable_length; + } + /* fall through */ + default: + /* For fixed-length NOT NULL 'core' columns, + get a dummy default value from SQL. Note that + we will preserve the old values of these + columns when updating the metadata + record, to avoid unnecessary updates. */ + ulint len = (*af)->pack_length(); + DBUG_ASSERT(d->type.mtype != DATA_INT + || len <= 8); + row_mysql_store_col_in_innobase_format( + d, d->type.mtype == DATA_INT + ? static_cast<byte*>( + mem_heap_alloc(ctx->heap, len)) + : NULL, true, (*af)->ptr, len, + dict_table_is_comp(user_table)); + ut_ad(new_field->field->pack_length() == len); + } + } + + bool update = old && (!ctx->first_alter_pos + || i < ctx->first_alter_pos - 1); + DBUG_ASSERT(!old || col->same_format(*old)); + if (update + && old->prtype == d->type.prtype) { + /* The record is already present in SYS_COLUMNS. 
*/ + } else if (innodb_insert_sys_columns(user_table->id, i, + (*af)->field_name.str, + d->type.mtype, + d->type.prtype, + d->type.len, 0, trx, + update)) { + return true; + } + + i++; + } + + if (innodb_update_cols(user_table, dict_table_encode_n_col( + unsigned(user_table->n_cols) + - DATA_N_SYS_COLS, + user_table->n_v_cols) + | (user_table->flags & DICT_TF_COMPACT) << 31, + trx)) { + return true; + } + + if (ctx->first_alter_pos) { +add_all_virtual: + for (uint i = 0; i < user_table->n_v_cols; i++) { + if (innobase_add_one_virtual( + user_table, + dict_table_get_v_col_name(user_table, i), + &user_table->v_cols[i], trx)) { + return true; + } + } + } else if (ha_alter_info->handler_flags & ALTER_DROP_VIRTUAL_COLUMN) { + if (innobase_instant_drop_cols(user_table->id, 65536, trx)) { + return true; + } + goto add_all_virtual; + } else if ((ha_alter_info->handler_flags & ALTER_ADD_VIRTUAL_COLUMN) + && innobase_add_virtual_try(ha_alter_info, user_table, + trx)) { + return true; + } + + if (!user_table->space) { + /* In case of ALTER TABLE...DISCARD TABLESPACE, + update only the metadata and transform the dictionary + cache entry to the canonical format. 
*/ + index->clear_instant_alter(); + return false; + } + + unsigned i = unsigned(user_table->n_cols) - DATA_N_SYS_COLS; + DBUG_ASSERT(i >= altered_table->s->stored_fields); + DBUG_ASSERT(i <= altered_table->s->stored_fields + 1); + if (i > altered_table->s->fields) { + const dict_col_t& fts_doc_id = user_table->cols[i - 1]; + DBUG_ASSERT(!strcmp(fts_doc_id.name(*user_table), + FTS_DOC_ID_COL_NAME)); + DBUG_ASSERT(!fts_doc_id.is_nullable()); + DBUG_ASSERT(fts_doc_id.len == 8); + dfield_set_data(dtuple_get_nth_field(row, i - 1), + field_ref_zero, fts_doc_id.len); + } + byte trx_id[DATA_TRX_ID_LEN], roll_ptr[DATA_ROLL_PTR_LEN]; + dfield_set_data(dtuple_get_nth_field(row, i++), field_ref_zero, + DATA_ROW_ID_LEN); + dfield_set_data(dtuple_get_nth_field(row, i++), trx_id, sizeof trx_id); + dfield_set_data(dtuple_get_nth_field(row, i),roll_ptr,sizeof roll_ptr); + DBUG_ASSERT(i + 1 == user_table->n_cols); + + trx_write_trx_id(trx_id, trx->id); + /* The DB_ROLL_PTR will be assigned later, when allocating undo log. + Silence a Valgrind warning in dtuple_validate() when + row_ins_clust_index_entry_low() searches for the insert position. 
*/ + memset(roll_ptr, 0, sizeof roll_ptr); + + dtuple_t* entry = index->instant_metadata(*row, ctx->heap); + mtr.start(); + index->set_modified(mtr); + btr_pcur_t pcur; + btr_pcur_open_at_index_side(true, index, BTR_MODIFY_TREE, &pcur, true, + 0, &mtr); + ut_ad(btr_pcur_is_before_first_on_page(&pcur)); + btr_pcur_move_to_next_on_page(&pcur); + + buf_block_t* block = btr_pcur_get_block(&pcur); + ut_ad(page_is_leaf(block->frame)); + ut_ad(!page_has_prev(block->frame)); + ut_ad(!buf_block_get_page_zip(block)); + const rec_t* rec = btr_pcur_get_rec(&pcur); + que_thr_t* thr = pars_complete_graph_for_exec( + NULL, trx, ctx->heap, NULL); + + dberr_t err = DB_SUCCESS; + if (rec_is_metadata(rec, *index)) { + ut_ad(page_rec_is_user_rec(rec)); + if (!rec_is_alter_metadata(rec, *index) + && !index->table->instant + && !page_has_next(block->frame) + && page_rec_is_last(rec, block->frame)) { + goto empty_table; + } + + if (!metadata_changed) { + goto func_exit; + } + + /* Ensure that the root page is in the correct format. */ + buf_block_t* root = btr_root_block_get(index, RW_X_LATCH, + &mtr); + DBUG_ASSERT(root); + if (fil_page_get_type(root->frame) != FIL_PAGE_TYPE_INSTANT) { + DBUG_ASSERT(!"wrong page type"); + err = DB_CORRUPTION; + goto func_exit; + } + + btr_set_instant(root, *index, &mtr); + + /* Extend the record with any added columns. */ + uint n = uint(index->n_fields) - n_old_fields; + /* Reserve room for DB_TRX_ID,DB_ROLL_PTR and any + non-updated off-page columns in case they are moved off + page as a result of the update. */ + const unsigned f = user_table->instant != NULL; + upd_t* update = upd_create(index->n_fields + f, ctx->heap); + update->n_fields = n + f; + update->info_bits = f + ? 
REC_INFO_METADATA_ALTER + : REC_INFO_METADATA_ADD; + if (f) { + upd_field_t* uf = upd_get_nth_field(update, 0); + uf->field_no = index->first_user_field(); + uf->new_val = entry->fields[uf->field_no]; + DBUG_ASSERT(!dfield_is_ext(&uf->new_val)); + DBUG_ASSERT(!dfield_is_null(&uf->new_val)); + } + + /* Add the default values for instantly added columns */ + unsigned j = f; + + for (unsigned k = n_old_fields; k < index->n_fields; k++) { + upd_field_t* uf = upd_get_nth_field(update, j++); + uf->field_no = k + f; + uf->new_val = entry->fields[k + f]; + + ut_ad(j <= n + f); + } + + ut_ad(j == n + f); + + rec_offs* offsets = NULL; + mem_heap_t* offsets_heap = NULL; + big_rec_t* big_rec; + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &offsets, &offsets_heap, ctx->heap, + &big_rec, update, UPD_NODE_NO_ORD_CHANGE, + thr, trx->id, &mtr); + + offsets = rec_get_offsets( + btr_pcur_get_rec(&pcur), index, offsets, + true, ULINT_UNDEFINED, &offsets_heap); + if (big_rec) { + if (err == DB_SUCCESS) { + err = btr_store_big_rec_extern_fields( + &pcur, offsets, big_rec, &mtr, + BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + btr_pcur_close(&pcur); + goto func_exit; + } else if (page_rec_is_supremum(rec) && !index->table->instant) { +empty_table: + /* The table is empty. */ + ut_ad(fil_page_index_page_check(block->frame)); + ut_ad(!page_has_siblings(block->frame)); + ut_ad(block->page.id.page_no() == index->page); + /* MDEV-17383: free metadata BLOBs! */ + btr_page_empty(block, NULL, index, 0, &mtr); + if (index->is_instant()) { + index->clear_instant_add(); + } + goto func_exit; + } else if (!user_table->is_instant()) { + ut_ad(!user_table->not_redundant()); + goto func_exit; + } + + /* Convert the table to the instant ALTER TABLE format. 
*/ + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr)) { + if (fil_page_get_type(root->frame) != FIL_PAGE_INDEX) { + DBUG_ASSERT(!"wrong page type"); + goto err_exit; + } + + btr_set_instant(root, *index, &mtr); + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + err = row_ins_clust_index_entry_low( + BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index, + index->n_uniq, entry, 0, thr); + } else { +err_exit: + err = DB_CORRUPTION; + } + +func_exit: + mtr.commit(); + + if (err != DB_SUCCESS) { + my_error_innodb(err, table->s->table_name.str, + user_table->flags); + return true; + } + + return false; } /** Adjust the create index column number from "New table" to @@ -4808,8 +6053,6 @@ innodb_v_adjust_idx_col( ulint num_v_dropped, index_def_t* index_def) { - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); for (ulint i = 0; i < index_def->n_fields; i++) { #ifdef UNIV_DEBUG bool col_found = false; @@ -4827,15 +6070,14 @@ innodb_v_adjust_idx_col( const Field* field = NULL; - cf_it.rewind(); - /* Found the field in the new table */ - while (const Create_field* new_field = cf_it++) { - if (new_field->stored_in_db()) { + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + if (new_field.stored_in_db()) { continue; } - field = new_field->field; + field = new_field.field; if (num_v == index_field->col_no) { break; @@ -5018,11 +6260,9 @@ prepare_inplace_alter_table_dict( const char* path = thd_innodb_tmpdir( ctx->prebuilt->trx->mysql_thd); - index_defs = innobase_create_key_defs( - ctx->heap, ha_alter_info, altered_table, ctx->num_to_add_index, + index_defs = ctx->create_key_defs( + ha_alter_info, altered_table, num_fts_index, - dict_index_is_auto_gen_clust(dict_table_get_first_index( - ctx->new_table)), fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx, old_table); @@ -5217,10 +6457,10 @@ new_clustered_failed: } if 
(altered_table->versioned()) { - if (i == altered_table->s->row_start_field) { + if (i == altered_table->s->vers.start_fieldno) { field_type |= DATA_VERS_START; } else if (i == - altered_table->s->row_end_field) { + altered_table->s->vers.end_fieldno) { field_type |= DATA_VERS_END; } else if (!(field->flags & VERS_UPDATE_UNVERSIONED_FLAG)) { @@ -5396,26 +6636,19 @@ new_clustered_failed: == !!new_clustered); } - if (ctx->need_rebuild() && user_table->supports_instant()) { - if (!instant_alter_column_possible(ha_alter_info, old_table)) { - goto not_instant_add_column; - } - - for (uint i = uint(ctx->old_table->n_cols) - DATA_N_SYS_COLS; - i--; ) { - if (ctx->col_map[i] != i) { - goto not_instant_add_column; - } - } - - DBUG_ASSERT(ctx->new_table->n_cols > ctx->old_table->n_cols); + DBUG_ASSERT(!ctx->need_rebuild() + || !ctx->new_table->persistent_autoinc); + if (ctx->need_rebuild() && instant_alter_column_possible( + *user_table, ha_alter_info, old_table, altered_table, + ha_innobase::is_innodb_strict_mode(ctx->trx->mysql_thd))) { for (uint a = 0; a < ctx->num_to_add_index; a++) { ctx->add_index[a]->table = ctx->new_table; error = dict_index_add_to_cache( ctx->add_index[a], FIL_NULL, add_v); ut_a(error == DB_SUCCESS); } + DBUG_ASSERT(ha_alter_info->key_count /* hidden GEN_CLUST_INDEX in InnoDB */ + dict_index_is_auto_gen_clust( @@ -5427,24 +6660,23 @@ new_clustered_failed: altered_table->key_info) != FTS_EXIST_DOC_ID_INDEX) == ctx->num_to_add_index); + ctx->num_to_add_index = 0; ctx->add_index = NULL; uint i = 0; // index of stored columns ctx->new_table->cols[] Field **af = altered_table->field; - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); - - while (const Create_field* new_field = cf_it++) { - DBUG_ASSERT(!new_field->field + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + DBUG_ASSERT(!new_field.field || std::find(old_table->field, old_table->field + old_table->s->fields, - new_field->field) 
!= + new_field.field) != old_table->field + old_table->s->fields); - DBUG_ASSERT(new_field->field - || !strcmp(new_field->field_name.str, + DBUG_ASSERT(new_field.field + || !strcmp(new_field.field_name.str, (*af)->field_name.str)); if (!(*af)->stored_in_db()) { @@ -5457,26 +6689,11 @@ new_clustered_failed: DBUG_ASSERT(!strcmp((*af)->field_name.str, dict_table_get_col_name(ctx->new_table, i))); - DBUG_ASSERT(!col->is_instant()); + DBUG_ASSERT(!col->is_added()); - if (new_field->field) { - ut_d(const dict_col_t* old_col - = dict_table_get_nth_col(user_table, i)); - ut_d(const dict_index_t* index - = user_table->indexes.start); - DBUG_SLOW_ASSERT(col->mtype == old_col->mtype); - ut_ad(col->prtype == old_col->prtype - || col->prtype - == (old_col->prtype & ~DATA_VERSIONED)); - DBUG_SLOW_ASSERT(col->mbminlen - == old_col->mbminlen); - DBUG_SLOW_ASSERT(col->mbmaxlen - == old_col->mbmaxlen); - DBUG_SLOW_ASSERT(col->len >= old_col->len); - DBUG_SLOW_ASSERT(old_col->is_instant() - == (dict_col_get_clust_pos( - old_col, index) - >= index->n_core_fields)); + if (new_field.field) { + /* This is a pre-existing column, + possibly at a different position. */ } else if ((*af)->is_real_null()) { /* DEFAULT NULL */ col->def_val.len = UNIV_SQL_NULL; @@ -5544,11 +6761,15 @@ new_clustered_failed: ctx->new_table, i), FTS_DOC_ID_COL_NAME))); + if (altered_table->found_next_number_field) { + ctx->new_table->persistent_autoinc + = ctx->old_table->persistent_autoinc; + } + ctx->prepare_instant(); } if (ctx->need_rebuild()) { -not_instant_add_column: DBUG_ASSERT(ctx->need_rebuild()); DBUG_ASSERT(!ctx->is_instant()); DBUG_ASSERT(num_fts_index <= 1); @@ -5601,15 +6822,15 @@ not_instant_add_column: before we can use it we need to open the table. The new_table must be in the data dictionary cache, because we are still holding - the dict_sys->mutex. */ - ut_ad(mutex_own(&dict_sys->mutex)); + the dict_sys.mutex. 
*/ + ut_ad(mutex_own(&dict_sys.mutex)); temp_table = dict_table_open_on_name( ctx->new_table->name.m_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); ut_a(ctx->new_table == temp_table); /* n_ref_count must be 1, because purge cannot be executing on this very table as we are - holding dict_operation_lock X-latch. */ + holding dict_sys.latch X-latch. */ DBUG_ASSERT(ctx->new_table->get_ref_count() == 1); DBUG_ASSERT(ctx->new_table->id != 0); DBUG_ASSERT(ctx->new_table->id == ctx->trx->table_id); @@ -5695,7 +6916,6 @@ error_handling_drop_uncached_1: ut_ad(new_clust_index->n_core_null_bytes == UT_BITS_IN_BYTES(new_clust_index->n_nullable)); - DBUG_ASSERT(!ctx->new_table->persistent_autoinc); if (const Field* ai = altered_table->found_next_number_field) { const unsigned col_no = innodb_col_no(ai); @@ -5844,8 +7064,7 @@ error_handling_drop_uncached: op_ok: #endif /* UNIV_DEBUG */ ut_ad(ctx->trx->dict_operation_lock_mode == RW_X_LATCH); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS); if (ctx->need_rebuild()) { @@ -5868,7 +7087,7 @@ op_ok: goto error_handling; } - trx_commit(ctx->trx); + ctx->trx->commit(); trx_start_for_ddl(ctx->trx, op); if (!ctx->new_table->fts @@ -5923,10 +7142,10 @@ error_handling: case DB_SUCCESS: ut_a(!dict_locked); - ut_d(mutex_enter(&dict_sys->mutex)); + ut_d(mutex_enter(&dict_sys.mutex)); ut_d(dict_table_check_for_dup_indexes( user_table, CHECK_PARTIAL_OK)); - ut_d(mutex_exit(&dict_sys->mutex)); + ut_d(mutex_exit(&dict_sys.mutex)); DBUG_RETURN(false); case DB_TABLESPACE_EXISTS: my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)"); @@ -5987,7 +7206,7 @@ error_handled: trx_commit_for_mysql(ctx->trx); /* n_ref_count must be 1, because purge cannot be executing on this very table as we are - holding dict_operation_lock X-latch. */ + holding dict_sys.latch X-latch. 
*/ ut_ad(!stats_wait || ctx->online || user_table->get_ref_count() == 1); @@ -6113,6 +7332,116 @@ innobase_check_foreign_key_index( return(false); } +/** +Rename a given index in the InnoDB data dictionary. + +@param index index to rename +@param new_name new name of the index +@param[in,out] trx dict transaction to use, not going to be committed here + +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +rename_index_try( + const dict_index_t* index, + const char* new_name, + trx_t* trx) +{ + DBUG_ENTER("rename_index_try"); + ut_d(dict_sys.assert_locked()); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + + pars_info_t* pinfo; + dberr_t err; + + pinfo = pars_info_create(); + + pars_info_add_ull_literal(pinfo, "table_id", index->table->id); + pars_info_add_ull_literal(pinfo, "index_id", index->id); + pars_info_add_str_literal(pinfo, "new_name", new_name); + + trx->op_info = "Renaming an index in SYS_INDEXES"; + + DBUG_EXECUTE_IF( + "ib_rename_index_fail1", + DBUG_SET("+d,innodb_report_deadlock"); + ); + + err = que_eval_sql( + pinfo, + "PROCEDURE RENAME_INDEX_IN_SYS_INDEXES () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET\n" + "NAME = :new_name\n" + "WHERE\n" + "ID = :index_id AND\n" + "TABLE_ID = :table_id;\n" + "END;\n", + FALSE, trx); /* pinfo is freed by que_eval_sql() */ + + DBUG_EXECUTE_IF( + "ib_rename_index_fail1", + DBUG_SET("-d,innodb_report_deadlock"); + ); + + trx->op_info = ""; + + if (err != DB_SUCCESS) { + my_error_innodb(err, index->table->name.m_name, 0); + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + + +/** +Rename a given index in the InnoDB data dictionary cache. 
+ +@param[in,out] index index to rename +@param new_name new index name +*/ +static +void +innobase_rename_index_cache(dict_index_t* index, const char* new_name) +{ + DBUG_ENTER("innobase_rename_index_cache"); + ut_d(dict_sys.assert_locked()); + + size_t old_name_len = strlen(index->name); + size_t new_name_len = strlen(new_name); + + if (old_name_len < new_name_len) { + index->name = static_cast<char*>( + mem_heap_alloc(index->heap, new_name_len + 1)); + } + + memcpy(const_cast<char*>(index->name()), new_name, new_name_len + 1); + + DBUG_VOID_RETURN; +} + + +/** Rename the index name in cache. +@param[in] ctx alter context +@param[in] ha_alter_info Data used during inplace alter. */ +static void innobase_rename_indexes_cache( + const ha_innobase_inplace_ctx* ctx, + const Alter_inplace_info* ha_alter_info) +{ + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_RENAME_INDEX); + + for (const Alter_inplace_info::Rename_key_pair& pair : + ha_alter_info->rename_keys) { + dict_index_t* index = dict_table_get_index_on_name( + ctx->old_table, pair.old_key->name.str); + ut_ad(index); + + innobase_rename_index_cache(index, pair.new_key->name.str); + } +} + /** Fill the stored column information in s_cols list. 
@param[in] altered_table mysql table object @@ -6164,7 +7493,7 @@ alter_fill_stored_column( s_col.num_base = num_base; innodb_base_col_setup_for_stored(table, field, &s_col); - (*s_cols)->push_back(s_col); + (*s_cols)->push_front(s_col); } } @@ -6190,8 +7519,6 @@ ha_innobase::prepare_inplace_alter_table( { dict_index_t** drop_index; /*!< Index to be dropped */ ulint n_drop_index; /*!< Number of indexes to drop */ - dict_index_t** rename_index; /*!< Indexes to be dropped */ - ulint n_rename_index; /*!< Number of indexes to rename */ dict_foreign_t**drop_fk; /*!< Foreign key constraints to drop */ ulint n_drop_fk; /*!< Number of foreign keys to drop */ dict_foreign_t**add_fk = NULL; /*!< Foreign key constraints to drop */ @@ -6230,10 +7557,10 @@ ha_innobase::prepare_inplace_alter_table( } #endif /* UNIV_DEBUG */ - ut_d(mutex_enter(&dict_sys->mutex)); + ut_d(mutex_enter(&dict_sys.mutex)); ut_d(dict_table_check_for_dup_indexes( m_prebuilt->table, CHECK_ABORTED_OK)); - ut_d(mutex_exit(&dict_sys->mutex)); + ut_d(mutex_exit(&dict_sys.mutex)); if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { /* Nothing to do */ @@ -6345,9 +7672,6 @@ err_exit_no_heap: already contains. 
*/ if (ha_alter_info->handler_flags & ALTER_COLUMN_NAME) { - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); - for (Field** fp = table->field; *fp; fp++) { if (!((*fp)->flags & FIELD_IS_RENAMED)) { continue; @@ -6355,10 +7679,10 @@ err_exit_no_heap: const char* name = 0; - cf_it.rewind(); - while (Create_field* cf = cf_it++) { - if (cf->field == *fp) { - name = cf->field_name.str; + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { + name = cf.field_name.str; goto check_if_ok_to_rename; } } @@ -6517,11 +7841,8 @@ check_if_ok_to_rename: ha_alter_info->alter_info->drop_list.elements * sizeof(dict_foreign_t*))); - List_iterator<Alter_drop> drop_it( - ha_alter_info->alter_info->drop_list); - - while (Alter_drop* drop = drop_it++) { - if (drop->type != Alter_drop::FOREIGN_KEY) { + for (Alter_drop& drop : ha_alter_info->alter_info->drop_list) { + if (drop.type != Alter_drop::FOREIGN_KEY) { continue; } @@ -6542,13 +7863,13 @@ check_if_ok_to_rename: fid = fid ? 
fid + 1 : foreign->id; if (!my_strcasecmp(system_charset_info, - fid, drop->name)) { + fid, drop.name)) { goto found_fk; } } my_error(ER_CANT_DROP_FIELD_OR_KEY, MYF(0), - drop->type_name(), drop->name); + drop.type_name(), drop.name); goto err_exit; found_fk: for (ulint i = n_drop_fk; i--; ) { @@ -6707,9 +8028,6 @@ check_if_can_drop_indexes: } } - n_rename_index = 0; - rename_index = NULL; - n_add_fk = 0; if (ha_alter_info->handler_flags @@ -6763,6 +8081,20 @@ err_exit: } } + if (ha_alter_info->handler_flags & ALTER_RENAME_INDEX) { + for (const Alter_inplace_info::Rename_key_pair& pair : + ha_alter_info->rename_keys) { + dict_index_t* index = dict_table_get_index_on_name( + indexed_table, pair.old_key->name.str); + + if (!index || index->is_corrupted()) { + my_error(ER_INDEX_CORRUPT, MYF(0), + index->name()); + goto err_exit; + } + } + } + const ha_table_option_struct& alt_opt= *ha_alter_info->create_info->option_struct; @@ -6778,7 +8110,6 @@ err_exit: = new ha_innobase_inplace_ctx( m_prebuilt, drop_index, n_drop_index, - rename_index, n_rename_index, drop_fk, n_drop_fk, add_fk, n_add_fk, ha_alter_info->online, @@ -6857,21 +8188,20 @@ err_exit: /* See if an AUTO_INCREMENT column was added. */ uint i = 0; ulint num_v = 0; - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); - while (const Create_field* new_field = cf_it++) { + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { const Field* field; DBUG_ASSERT(i < altered_table->s->fields); for (uint old_i = 0; table->field[old_i]; old_i++) { - if (new_field->field == table->field[old_i]) { + if (new_field.field == table->field[old_i]) { goto found_col; } } /* This is an added column. 
*/ - DBUG_ASSERT(!new_field->field); + DBUG_ASSERT(!new_field.field); DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_ADD_COLUMN); @@ -6895,7 +8225,7 @@ err_exit: autoinc_col_max_value = innobase_get_int_col_max_value(field); } found_col: - num_v += !new_field->stored_in_db(); + num_v += !new_field.stored_in_db(); i++; } @@ -6906,7 +8236,6 @@ found_col: ha_alter_info->handler_ctx = new ha_innobase_inplace_ctx( m_prebuilt, drop_index, n_drop_index, - rename_index, n_rename_index, drop_fk, n_drop_fk, add_fk, n_add_fk, ha_alter_info->online, heap, m_prebuilt->table, col_names, @@ -6969,16 +8298,14 @@ alter_templ_needs_rebuild( dict_table_t* table) { ulint i = 0; - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); for (Field** fp = altered_table->field; *fp; fp++, i++) { - cf_it.rewind(); - while (const Create_field* cf = cf_it++) { + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { for (ulint j=0; j < table->n_cols; j++) { dict_col_t* cols = dict_table_get_nth_col(table, j); - if (cf->length > cols->len + if (cf.length > cols->len && dict_col_in_v_indexes(table, cols)) { return(true); } @@ -7037,9 +8364,8 @@ ha_innobase::inplace_alter_table( bool rebuild_templ = false; DBUG_ENTER("inplace_alter_table"); DBUG_ASSERT(!srv_read_only_mode); - ut_ad(!sync_check_iterate(sync_check())); - ut_ad(!rw_lock_own_flagged(&dict_operation_lock, + ut_ad(!rw_lock_own_flagged(&dict_sys.latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); DEBUG_SYNC(m_user_thd, "innodb_inplace_alter_table_enter"); @@ -7092,7 +8418,7 @@ ok_exit: rebuild_templ = ctx->need_rebuild() || ((ha_alter_info->handler_flags - & ALTER_COLUMN_EQUAL_PACK_LENGTH) + & ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE) && alter_templ_needs_rebuild( altered_table, ha_alter_info, ctx->new_table)); @@ -7187,10 +8513,10 @@ oom: KEY* dup_key; all_done: case DB_SUCCESS: - ut_d(mutex_enter(&dict_sys->mutex)); + ut_d(mutex_enter(&dict_sys.mutex)); ut_d(dict_table_check_for_dup_indexes( 
m_prebuilt->table, CHECK_PARTIAL_OK)); - ut_d(mutex_exit(&dict_sys->mutex)); + ut_d(mutex_exit(&dict_sys.mutex)); /* prebuilt->table->n_ref_count can be anything here, given that we hold at most a shared lock on the table. */ goto ok_exit; @@ -7252,10 +8578,7 @@ innobase_online_rebuild_log_free( dict_table_t* table) { dict_index_t* clust_index = dict_table_get_first_index(table); - - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - + ut_d(dict_sys.assert_locked()); rw_lock_x_lock(&clust_index->lock); if (clust_index->online_log) { @@ -7528,8 +8851,7 @@ innobase_drop_foreign_try( DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); /* Drop the constraint from the data dictionary. */ static const char sql[] = @@ -7565,7 +8887,6 @@ innobase_drop_foreign_try( @param[in] ctx ALTER TABLE context @param[in,out] trx Data dictionary transaction @param[in] table_name Table name in MySQL -@param[in] nth_col 0-based index of the column @param[in] from old column name @param[in] to new column name @retval true Failure @@ -7576,52 +8897,22 @@ innobase_rename_column_try( const ha_innobase_inplace_ctx& ctx, trx_t* trx, const char* table_name, - ulint nth_col, const char* from, const char* to) { - pars_info_t* info; dberr_t error; DBUG_ENTER("innobase_rename_column_try"); DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); if (ctx.need_rebuild()) { goto rename_foreign; } - info = pars_info_create(); - - pars_info_add_ull_literal(info, "tableid", ctx.old_table->id); - pars_info_add_int4_literal(info, "nth", nth_col); - pars_info_add_str_literal(info, "new", to); - - 
trx->op_info = "renaming column in SYS_COLUMNS"; - - error = que_eval_sql( - info, - "PROCEDURE RENAME_SYS_COLUMNS_PROC () IS\n" - "BEGIN\n" - "UPDATE SYS_COLUMNS SET NAME=:new\n" - "WHERE TABLE_ID=:tableid\n" - "AND POS=:nth;\n" - "END;\n", - FALSE, trx); - - DBUG_EXECUTE_IF("ib_rename_column_error", - error = DB_OUT_OF_FILE_SPACE;); - - if (error != DB_SUCCESS) { -err_exit: - my_error_innodb(error, table_name, 0); - trx->error_state = DB_SUCCESS; - trx->op_info = ""; - DBUG_RETURN(true); - } + error = DB_SUCCESS; trx->op_info = "renaming column in SYS_FIELDS"; @@ -7639,19 +8930,16 @@ err_exit: } for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { - const dict_field_t* field - = dict_index_get_nth_field(index, i); - if (my_strcasecmp(system_charset_info, field->name, - from)) { + const dict_field_t& f = index->fields[i]; + DBUG_ASSERT(!f.name == f.col->is_dropped()); + + if (!f.name || my_strcasecmp(system_charset_info, + f.name, from)) { continue; } - info = pars_info_create(); - - ulint pos = i; - if (has_prefixes) { - pos = (pos << 16) + field->prefix_len; - } + pars_info_t* info = pars_info_create(); + ulint pos = has_prefixes ? 
i << 16 | f.prefix_len : i; pars_info_add_ull_literal(info, "indexid", index->id); pars_info_add_int4_literal(info, "nth", pos); @@ -7666,6 +8954,8 @@ err_exit: "AND POS=:nth;\n" "END;\n", FALSE, trx); + DBUG_EXECUTE_IF("ib_rename_column_error", + error = DB_OUT_OF_FILE_SPACE;); if (error != DB_SUCCESS) { goto err_exit; @@ -7673,6 +8963,14 @@ err_exit: } } + if (error != DB_SUCCESS) { +err_exit: + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + rename_foreign: trx->op_info = "renaming column in SYS_FOREIGN_COLS"; @@ -7701,7 +8999,7 @@ rename_foreign: continue; } - info = pars_info_create(); + pars_info_t* info = pars_info_create(); pars_info_add_str_literal(info, "id", foreign->id); pars_info_add_int4_literal(info, "nth", i); @@ -7743,7 +9041,7 @@ rename_foreign: continue; } - info = pars_info_create(); + pars_info_t* info = pars_info_create(); pars_info_add_str_literal(info, "id", foreign->id); pars_info_add_int4_literal(info, "nth", i); @@ -7798,11 +9096,10 @@ innobase_rename_columns_try( trx_t* trx, const char* table_name) { - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); uint i = 0; ulint num_v = 0; + DBUG_ASSERT(ctx->need_rebuild()); DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_COLUMN_NAME); @@ -7812,20 +9109,13 @@ innobase_rename_columns_try( goto processed_field; } - cf_it.rewind(); - - while (Create_field* cf = cf_it++) { - if (cf->field == *fp) { - ulint col_n = is_virtual - ? 
dict_create_v_col_pos( - num_v, i) - : i - num_v; - + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { if (innobase_rename_column_try( *ctx, trx, table_name, - col_n, - cf->field->field_name.str, - cf->field_name.str)) { + cf.field->field_name.str, + cf.field_name.str)) { return(true); } goto processed_field; @@ -7844,63 +9134,99 @@ processed_field: return(false); } +/** Convert field type and length to InnoDB format */ +static void get_type(const Field& f, ulint& prtype, ulint& mtype, ulint& len) +{ + mtype = get_innobase_type_from_mysql_type(&prtype, &f); + len = f.pack_length(); + prtype |= f.type(); + if (f.type() == MYSQL_TYPE_VARCHAR) { + auto l = static_cast<const Field_varstring&>(f).length_bytes; + len -= l; + if (l == 2) prtype |= DATA_LONG_TRUE_VARCHAR; + } + if (!f.real_maybe_null()) prtype |= DATA_NOT_NULL; + if (f.binary()) prtype |= DATA_BINARY_TYPE; + if (f.table->versioned()) { + if (&f == f.table->field[f.table->s->vers.start_fieldno]) { + prtype |= DATA_VERS_START; + } else if (&f == f.table->field[f.table->s->vers.end_fieldno]) { + prtype |= DATA_VERS_END; + } else if (!(f.flags & VERS_UPDATE_UNVERSIONED_FLAG)) { + prtype |= DATA_VERSIONED; + } + } + if (!f.stored_in_db()) prtype |= DATA_VIRTUAL; + + if (dtype_is_string_type(mtype)) { + prtype |= ulint(f.charset()->number) << 16; + } +} + /** Enlarge a column in the data dictionary tables. 
-@param user_table InnoDB table that was being altered +@param ctx In-place ALTER TABLE context @param trx data dictionary transaction @param table_name Table name in MySQL -@param nth_col 0-based index of the column -@param new_len new column length, in bytes +@param pos 0-based index to user_table->cols[] or user_table->v_cols[] +@param f new column @param is_v if it's a virtual column @retval true Failure @retval false Success */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) bool -innobase_enlarge_column_try( -/*========================*/ - const dict_table_t* user_table, +innobase_rename_or_enlarge_column_try( + ha_innobase_inplace_ctx*ctx, trx_t* trx, const char* table_name, - ulint nth_col, - ulint new_len, + ulint pos, + const Field& f, bool is_v) { - pars_info_t* info; - dberr_t error; -#ifdef UNIV_DEBUG dict_col_t* col; -#endif /* UNIV_DEBUG */ - dict_v_col_t* v_col; - ulint pos; + dict_table_t* user_table = ctx->old_table; - DBUG_ENTER("innobase_enlarge_column_try"); + DBUG_ENTER("innobase_rename_or_enlarge_column_try"); + DBUG_ASSERT(!ctx->need_rebuild()); DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); + + ulint n_base; if (is_v) { - v_col = dict_table_get_nth_v_col(user_table, nth_col); + dict_v_col_t* v_col= dict_table_get_nth_v_col(user_table, pos); pos = dict_create_v_col_pos(v_col->v_pos, v_col->m_col.ind); -#ifdef UNIV_DEBUG col = &v_col->m_col; -#endif /* UNIV_DEBUG */ + n_base = v_col->num_base; } else { -#ifdef UNIV_DEBUG - col = dict_table_get_nth_col(user_table, nth_col); -#endif /* UNIV_DEBUG */ - pos = nth_col; + col = dict_table_get_nth_col(user_table, pos); + n_base = 0; } + ulint prtype, mtype, len; + get_type(f, prtype, mtype, len); + DBUG_ASSERT(!dtype_is_string_type(col->mtype) + || col->mbminlen == f.charset()->mbminlen); + 
DBUG_ASSERT(col->len <= len); + #ifdef UNIV_DEBUG - ut_ad(col->len < new_len); - switch (col->mtype) { + ut_ad(col->mbminlen <= col->mbmaxlen); + switch (mtype) { case DATA_MYSQL: - /* NOTE: we could allow this when !(prtype & DATA_BINARY_TYPE) - and ROW_FORMAT is not REDUNDANT and mbminlen<mbmaxlen. - That is, we treat a UTF-8 CHAR(n) column somewhat like - a VARCHAR. */ - ut_error; + if (!(prtype & DATA_BINARY_TYPE) || user_table->not_redundant() + || col->mbminlen != col->mbmaxlen) { + /* NOTE: we could allow this when !(prtype & + DATA_BINARY_TYPE) and ROW_FORMAT is not REDUNDANT and + mbminlen<mbmaxlen. That is, we treat a UTF-8 CHAR(n) + column somewhat like a VARCHAR. */ + break; + } + /* fall through */ + case DATA_FIXBINARY: + case DATA_CHAR: + ut_ad(col->len == len); + break; case DATA_BINARY: case DATA_VARCHAR: case DATA_VARMYSQL: @@ -7908,60 +9234,61 @@ innobase_enlarge_column_try( case DATA_BLOB: break; default: - ut_error; + ut_ad(!((col->prtype ^ prtype) & ~DATA_VERSIONED)); + ut_ad(col->mtype == mtype); + ut_ad(col->len == len); } #endif /* UNIV_DEBUG */ - info = pars_info_create(); - - pars_info_add_ull_literal(info, "tableid", user_table->id); - pars_info_add_int4_literal(info, "nth", pos); - pars_info_add_int4_literal(info, "new", new_len); - - trx->op_info = "resizing column in SYS_COLUMNS"; - error = que_eval_sql( - info, - "PROCEDURE RESIZE_SYS_COLUMNS_PROC () IS\n" - "BEGIN\n" - "UPDATE SYS_COLUMNS SET LEN=:new\n" - "WHERE TABLE_ID=:tableid AND POS=:nth;\n" - "END;\n", - FALSE, trx); - - DBUG_EXECUTE_IF("ib_resize_column_error", - error = DB_OUT_OF_FILE_SPACE;); + const char* col_name = col->name(*user_table); + const bool same_name = !strcmp(col_name, f.field_name.str); - trx->op_info = ""; - trx->error_state = DB_SUCCESS; - - if (error != DB_SUCCESS) { - my_error_innodb(error, table_name, 0); + if (!same_name + && innobase_rename_column_try(*ctx, trx, table_name, + col_name, f.field_name.str)) { DBUG_RETURN(true); } - DBUG_RETURN(false); + 
if (same_name + && col->prtype == prtype && col->mtype == mtype + && col->len == len) { + DBUG_RETURN(false); + } + + DBUG_RETURN(innodb_insert_sys_columns(user_table->id, pos, + f.field_name.str, + mtype, prtype, len, + n_base, trx, true)); } -/** Enlarge columns in the data dictionary tables. +/** Rename or enlarge columns in the data dictionary cache +as part of commit_try_norebuild(). @param ha_alter_info Data used during in-place alter. -@param table the TABLE -@param user_table InnoDB table that was being altered +@param ctx In-place ALTER TABLE context +@param altered_table metadata after ALTER TABLE +@param table metadata before ALTER TABLE @param trx data dictionary transaction @param table_name Table name in MySQL @retval true Failure @retval false Success */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) bool -innobase_enlarge_columns_try( -/*=========================*/ +innobase_rename_or_enlarge_columns_try( Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, const TABLE* table, - const dict_table_t* user_table, trx_t* trx, const char* table_name) { - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); + DBUG_ENTER("innobase_rename_or_enlarge_columns_try"); + + if (!(ha_alter_info->handler_flags + & (ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE + | ALTER_COLUMN_NAME))) { + DBUG_RETURN(false); + } + ulint i = 0; ulint num_v = 0; @@ -7969,77 +9296,83 @@ innobase_enlarge_columns_try( const bool is_v = !(*fp)->stored_in_db(); ulint idx = is_v ? 
num_v++ : i - num_v; - cf_it.rewind(); - while (Create_field* cf = cf_it++) { - if (cf->field == *fp) { - if ((*fp)->is_equal(cf) - == IS_EQUAL_PACK_LENGTH - && innobase_enlarge_column_try( - user_table, trx, table_name, - idx, static_cast<ulint>(cf->length), is_v)) { - return(true); + Field** af = altered_table->field; + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { + if (innobase_rename_or_enlarge_column_try( + ctx, trx, table_name, + idx, **af, is_v)) { + DBUG_RETURN(true); } - break; } + af++; } } - return(false); + DBUG_RETURN(false); } /** Rename or enlarge columns in the data dictionary cache as part of commit_cache_norebuild(). @param ha_alter_info Data used during in-place alter. -@param table the TABLE +@param altered_table metadata after ALTER TABLE +@param table metadata before ALTER TABLE @param user_table InnoDB table that was being altered */ static MY_ATTRIBUTE((nonnull)) void innobase_rename_or_enlarge_columns_cache( /*=====================================*/ Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, const TABLE* table, dict_table_t* user_table) { if (!(ha_alter_info->handler_flags - & (ALTER_COLUMN_EQUAL_PACK_LENGTH + & (ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE | ALTER_COLUMN_NAME))) { return; } - List_iterator_fast<Create_field> cf_it( - ha_alter_info->alter_info->create_list); uint i = 0; ulint num_v = 0; for (Field** fp = table->field; *fp; fp++, i++) { const bool is_virtual = !(*fp)->stored_in_db(); - cf_it.rewind(); - while (Create_field* cf = cf_it++) { - if (cf->field != *fp) { + Field** af = altered_table->field; + for (Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field != *fp) { + af++; continue; } ulint col_n = is_virtual ? 
num_v : i - num_v; - - if ((*fp)->is_equal(cf) == IS_EQUAL_PACK_LENGTH) { - if (is_virtual) { - dict_table_get_nth_v_col( - user_table, col_n)->m_col.len - = cf->length; - } else { - dict_table_get_nth_col( - user_table, col_n)->len - = cf->length; - } - } + dict_col_t *col = is_virtual + ? &dict_table_get_nth_v_col(user_table, col_n) + ->m_col + : dict_table_get_nth_col(user_table, col_n); + const bool is_string= dtype_is_string_type(col->mtype); + DBUG_ASSERT(col->mbminlen + == (is_string + ? (*af)->charset()->mbminlen : 0)); + ulint prtype, mtype, len; + get_type(**af, prtype, mtype, len); + DBUG_ASSERT(is_string == dtype_is_string_type(mtype)); + + col->prtype = prtype; + col->mtype = mtype; + col->len = len; + col->mbmaxlen = is_string + ? (*af)->charset()->mbmaxlen : 0; if ((*fp)->flags & FIELD_IS_RENAMED) { dict_mem_table_col_rename( user_table, col_n, - cf->field->field_name.str, - cf->field_name.str, is_virtual); + cf.field->field_name.str, + (*af)->field_name.str, is_virtual); } break; @@ -8279,7 +9612,7 @@ innobase_update_foreign_cache( DBUG_ENTER("innobase_update_foreign_cache"); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); user_table = ctx->old_table; @@ -8432,27 +9765,24 @@ vers_change_fields_try( DBUG_ASSERT(ha_alter_info); DBUG_ASSERT(ctx); - List_iterator_fast<Create_field> it( - ha_alter_info->alter_info->create_list); - - while (const Create_field* create_field = it++) { - if (!create_field->field) { + for (const Create_field& create_field : ha_alter_info->alter_info->create_list) { + if (!create_field.field) { continue; } - if (create_field->versioning + if (create_field.versioning == Column_definition::VERSIONING_NOT_SET) { continue; } const dict_table_t* new_table = ctx->new_table; - const uint pos = innodb_col_no(create_field->field); + const uint pos = innodb_col_no(create_field.field); const dict_col_t* col = dict_table_get_nth_col(new_table, pos); DBUG_ASSERT(!col->vers_sys_start()); 
DBUG_ASSERT(!col->vers_sys_end()); ulint new_prtype - = create_field->versioning + = create_field.versioning == Column_definition::WITHOUT_VERSIONING ? col->prtype & ~DATA_VERSIONED : col->prtype | DATA_VERSIONED; @@ -8485,23 +9815,21 @@ vers_change_fields_cache( DBUG_ASSERT(ctx); DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED); - List_iterator_fast<Create_field> it( - ha_alter_info->alter_info->create_list); - - while (const Create_field* create_field = it++) { - if (!create_field->field || create_field->field->vcol_info) { + for (const Create_field& create_field : + ha_alter_info->alter_info->create_list) { + if (!create_field.field || create_field.field->vcol_info) { continue; } dict_col_t* col = dict_table_get_nth_col( - ctx->new_table, innodb_col_no(create_field->field)); + ctx->new_table, innodb_col_no(create_field.field)); - if (create_field->versioning + if (create_field.versioning == Column_definition::WITHOUT_VERSIONING) { DBUG_ASSERT(!col->vers_sys_start()); DBUG_ASSERT(!col->vers_sys_end()); col->prtype &= ~DATA_VERSIONED; - } else if (create_field->versioning + } else if (create_field.versioning == Column_definition::WITH_VERSIONING) { DBUG_ASSERT(!col->vers_sys_start()); @@ -8625,6 +9953,38 @@ commit_try_rebuild( } } +/** Rename indexes in dictionary. +@param[in] ctx alter info context +@param[in] ha_alter_info Operation used during inplace alter +@param[out] trx transaction to change the index name + in dictionary +@return true if it failed to rename +@return false if it is success. 
*/ +static +bool +rename_indexes_try( + const ha_innobase_inplace_ctx* ctx, + const Alter_inplace_info* ha_alter_info, + trx_t* trx) +{ + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_RENAME_INDEX); + + for (const Alter_inplace_info::Rename_key_pair& pair : + ha_alter_info->rename_keys) { + dict_index_t* index = dict_table_get_index_on_name( + ctx->old_table, pair.old_key->name.str); + // This was checked previously in + // ha_innobase::prepare_inplace_alter_table() + ut_ad(index); + + if (rename_index_try(index, pair.new_key->name.str, trx)) { + return true; + } + } + + return false; +} + /** Apply the changes made during commit_try_rebuild(), to the data dictionary cache and the file system. @param ctx In-place ALTER TABLE context */ @@ -8660,8 +10020,7 @@ commit_cache_rebuild( /** Set of column numbers */ typedef std::set<ulint, std::less<ulint>, ut_allocator<ulint> > col_set; -/** Store the column number of the columns in a list belonging -to indexes which are not being dropped. +/** Collect (not instantly dropped) columns from dropped indexes @param[in] ctx In-place ALTER TABLE context @param[in, out] drop_col_list list which will be set, containing columns which is part of index being dropped @@ -8670,7 +10029,7 @@ to indexes which are not being dropped. 
being dropped */ static void -get_col_list_to_be_dropped( +collect_columns_from_dropped_indexes( const ha_innobase_inplace_ctx* ctx, col_set& drop_col_list, col_set& drop_v_col_list) @@ -8691,6 +10050,12 @@ get_col_list_to_be_dropped( } else { ulint col_no = dict_col_get_no(idx_col); + if (ctx->col_map + && ctx->col_map[col_no] + == ULINT_UNDEFINED) { + // this column was instantly dropped + continue; + } drop_col_list.insert(col_no); } } @@ -8876,34 +10241,49 @@ commit_try_norebuild( } } - if ((ha_alter_info->handler_flags - & ALTER_COLUMN_NAME) - && innobase_rename_columns_try(ha_alter_info, ctx, old_table, - trx, table_name)) { + if (innobase_rename_or_enlarge_columns_try(ha_alter_info, ctx, + altered_table, old_table, + trx, table_name)) { DBUG_RETURN(true); } - if ((ha_alter_info->handler_flags - & ALTER_COLUMN_EQUAL_PACK_LENGTH) - && innobase_enlarge_columns_try(ha_alter_info, old_table, - ctx->old_table, trx, table_name)) { + if ((ha_alter_info->handler_flags & ALTER_RENAME_INDEX) + && rename_indexes_try(ctx, ha_alter_info, trx)) { DBUG_RETURN(true); } - if ((ha_alter_info->handler_flags - & ALTER_DROP_VIRTUAL_COLUMN) - && innobase_drop_virtual_try(ha_alter_info, ctx->old_table, trx)) { - DBUG_RETURN(true); + if (ctx->is_instant()) { + DBUG_RETURN(innobase_instant_try(ha_alter_info, ctx, + altered_table, old_table, + trx)); } - if ((ha_alter_info->handler_flags - & ALTER_ADD_VIRTUAL_COLUMN) - && innobase_add_virtual_try(ha_alter_info, ctx->old_table, trx)) { - DBUG_RETURN(true); - } + if (ha_alter_info->handler_flags + & (ALTER_DROP_VIRTUAL_COLUMN | ALTER_ADD_VIRTUAL_COLUMN)) { + if ((ha_alter_info->handler_flags & ALTER_DROP_VIRTUAL_COLUMN) + && innobase_drop_virtual_try(ha_alter_info, ctx->old_table, + trx)) { + DBUG_RETURN(true); + } - if (innobase_add_instant_try(ctx, altered_table, old_table, trx)) { - DBUG_RETURN(true); + if ((ha_alter_info->handler_flags & ALTER_ADD_VIRTUAL_COLUMN) + && innobase_add_virtual_try(ha_alter_info, ctx->old_table, + trx)) { 
+ DBUG_RETURN(true); + } + + ulint n_col = unsigned(ctx->old_table->n_cols) + - DATA_N_SYS_COLS; + ulint n_v_col = unsigned(ctx->old_table->n_v_cols) + + ctx->num_to_add_vcol - ctx->num_to_drop_vcol; + + if (innodb_update_cols( + ctx->old_table, + dict_table_encode_n_col(n_col, n_v_col) + | unsigned(ctx->old_table->flags & DICT_TF_COMPACT) + << 31, trx)) { + DBUG_RETURN(true); + } } DBUG_RETURN(false); @@ -8913,6 +10293,7 @@ commit_try_norebuild( after a successful commit_try_norebuild() call. @param ha_alter_info algorithm=inplace context @param ctx In-place ALTER TABLE context for the current partition +@param altered_table the TABLE after the ALTER @param table the TABLE before the ALTER @param trx Data dictionary transaction (will be started and committed, for DROP INDEX) @@ -8923,6 +10304,7 @@ commit_cache_norebuild( /*===================*/ Alter_inplace_info* ha_alter_info, ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, const TABLE* table, trx_t* trx) { @@ -8945,11 +10327,17 @@ commit_cache_norebuild( bool update = !(space->flags & FSP_FLAGS_MASK_PAGE_COMPRESSION); mutex_enter(&fil_system.mutex); - space->flags = (~FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL - & (space->flags - | FSP_FLAGS_MASK_PAGE_COMPRESSION)) - | ctx->page_compression_level + space->flags &= ~FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL; + space->flags |= ctx->page_compression_level << FSP_FLAGS_MEM_COMPRESSION_LEVEL; + if (!space->full_crc32()) { + space->flags + |= FSP_FLAGS_MASK_PAGE_COMPRESSION; + } else if (!space->is_compressed()) { + space->flags + |= innodb_compression_algorithm + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + } mutex_exit(&fil_system.mutex); if (update) { @@ -8967,7 +10355,7 @@ commit_cache_norebuild( mtr.start(); if (buf_block_t* b = buf_page_get( page_id_t(space->id, 0), - page_size_t(space->flags), + space->zip_size(), RW_X_LATCH, &mtr)) { mtr.set_named_space(space); mlog_write_ulint( @@ -8984,25 +10372,21 @@ commit_cache_norebuild( col_set drop_list; col_set 
v_drop_list; - col_set::const_iterator col_it; /* Check if the column, part of an index to be dropped is part of any other index which is not being dropped. If it so, then set the ord_part of the column to 0. */ - get_col_list_to_be_dropped(ctx, drop_list, v_drop_list); + collect_columns_from_dropped_indexes(ctx, drop_list, v_drop_list); - for (col_it = drop_list.begin(); col_it != drop_list.end(); ++col_it) { - if (!check_col_exists_in_indexes(ctx->new_table, - *col_it, false)) { - ctx->new_table->cols[*col_it].ord_part = 0; + for (ulint col : drop_list) { + if (!check_col_exists_in_indexes(ctx->new_table, col, false)) { + ctx->new_table->cols[col].ord_part = 0; } } - for (col_it = v_drop_list.begin(); - col_it != v_drop_list.end(); ++col_it) { - if (!check_col_exists_in_indexes(ctx->new_table, - *col_it, true)) { - ctx->new_table->v_cols[*col_it].m_col.ord_part = 0; + for (ulint col : v_drop_list) { + if (!check_col_exists_in_indexes(ctx->new_table, col, true)) { + ctx->new_table->v_cols[col].m_col.ord_part = 0; } } @@ -9071,13 +10455,57 @@ commit_cache_norebuild( if (!ctx->is_instant()) { innobase_rename_or_enlarge_columns_cache( - ha_alter_info, table, ctx->new_table); + ha_alter_info, altered_table, table, ctx->new_table); + } else { + ut_ad(ctx->col_map); + + if (fts_t* fts = ctx->new_table->fts) { + ut_ad(fts->doc_col != ULINT_UNDEFINED); + ut_ad(ctx->new_table->n_cols > DATA_N_SYS_COLS); + const ulint c = ctx->col_map[fts->doc_col]; + ut_ad(c < ulint(ctx->new_table->n_cols) + - DATA_N_SYS_COLS); + ut_d(const dict_col_t& col = ctx->new_table->cols[c]); + ut_ad(!col.is_nullable()); + ut_ad(!col.is_virtual()); + ut_ad(!col.is_added()); + ut_ad(col.prtype & DATA_UNSIGNED); + ut_ad(col.mtype == DATA_INT); + ut_ad(col.len == 8); + ut_ad(col.ord_part); + fts->doc_col = c; + } + + if (ha_alter_info->handler_flags & ALTER_DROP_STORED_COLUMN) { + const dict_index_t* index = ctx->new_table->indexes.start; + + for (const dict_field_t* f = index->fields, + * const end = 
f + index->n_fields; + f != end; f++) { + dict_col_t& c = *f->col; + if (c.is_dropped()) { + c.set_dropped(!c.is_nullable(), + DATA_LARGE_MTYPE(c.mtype) + || (!f->fixed_len + && c.len > 255), + f->fixed_len); + } + } + } + + if (!ctx->instant_table->persistent_autoinc) { + ctx->new_table->persistent_autoinc = 0; + } } if (ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED) { vers_change_fields_cache(ha_alter_info, ctx, table); } + if (ha_alter_info->handler_flags & ALTER_RENAME_INDEX) { + innobase_rename_indexes_cache(ctx, ha_alter_info); + } + ctx->new_table->fts_doc_id_index = ctx->new_table->fts ? dict_table_get_index_on_name( @@ -9116,7 +10544,7 @@ alter_stats_norebuild( in a separate transaction from trx, because lock waits are not allowed in a data dictionary transaction. (Lock waits are possible on the statistics table, because it is directly accessible by users, - not covered by the dict_operation_lock.) + not covered by the dict_sys.latch.) Because the data dictionary changes were already committed, orphaned rows may be left in the statistics table if the system crashes. 
@@ -9147,6 +10575,27 @@ alter_stats_norebuild( } } + for (const Alter_inplace_info::Rename_key_pair& pair : + ha_alter_info->rename_keys) { + dberr_t err = dict_stats_rename_index(ctx->new_table, + pair.old_key->name.str, + pair.new_key->name.str); + + if (err != DB_SUCCESS) { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ERROR_ON_RENAME, + "Error renaming an index of table '%s'" + " from '%s' to '%s' in InnoDB persistent" + " statistics storage: %s", + ctx->new_table->name.m_name, + pair.old_key->name.str, + pair.new_key->name.str, + ut_strerr(err)); + } + } + for (i = 0; i < ctx->num_to_add_index; i++) { dict_index_t* index = ctx->add_index[i]; DBUG_ASSERT(index->table == ctx->new_table); @@ -9182,23 +10631,8 @@ alter_stats_rebuild( DBUG_VOID_RETURN; } -#ifndef DBUG_OFF - bool file_unreadable_orig = false; -#endif /* DBUG_OFF */ - - DBUG_EXECUTE_IF( - "ib_rename_index_fail2", - file_unreadable_orig = table->file_unreadable; - table->file_unreadable = true; - ); - dberr_t ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT); - DBUG_EXECUTE_IF( - "ib_rename_index_fail2", - table->file_unreadable = file_unreadable_orig; - ); - if (ret != DB_SUCCESS) { push_warning_printf( thd, @@ -9677,7 +11111,7 @@ ha_innobase::commit_inplace_alter_table( logical sense the commit in the file-based data structures happens here. */ - trx_commit_low(trx, &mtr); + trx->commit_low(&mtr); } /* If server crashes here, the dictionary in @@ -9772,7 +11206,8 @@ foreign_fail: bool fk_fail = innobase_update_foreign_cache( ctx, m_user_thd) != DB_SUCCESS; - if (!commit_cache_norebuild(ha_alter_info, ctx, table, + if (!commit_cache_norebuild(ha_alter_info, ctx, + altered_table, table, trx)) { fk_fail = true; } @@ -9842,7 +11277,13 @@ foreign_fail: } } - if (ctx0->num_to_drop_vcol || ctx0->num_to_add_vcol) { + /* MDEV-17468: Avoid this at least when ctx->is_instant(). 
+ Currently dict_load_column_low() is the only place where + num_base for virtual columns is assigned to nonzero. */ + if (ctx0->num_to_drop_vcol || ctx0->num_to_add_vcol + || (ctx0->is_instant() + && m_prebuilt->table->n_v_cols + && ha_alter_info->handler_flags & ALTER_STORED_COLUMN_ORDER)) { /* FIXME: this workaround does not seem to work with partitioned tables */ DBUG_ASSERT(ctx0->old_table->get_ref_count() == 1); @@ -9852,7 +11293,13 @@ foreign_fail: char tb_name[NAME_LEN * 2 + 1 + 1]; strcpy(tb_name, m_prebuilt->table->name.m_name); dict_table_close(m_prebuilt->table, true, false); - dict_table_remove_from_cache(m_prebuilt->table); + if (ctx0->is_instant()) { + for (unsigned i = ctx0->old_n_v_cols; i--; ) { + ctx0->old_v_cols[i].~dict_v_col_t(); + } + const_cast<unsigned&>(ctx0->old_n_v_cols) = 0; + } + dict_sys.remove(m_prebuilt->table); m_prebuilt->table = dict_table_open_on_name( tb_name, TRUE, TRUE, DICT_ERR_IGNORE_NONE); @@ -9936,11 +11383,6 @@ foreign_fail: DBUG_ASSERT(0 == strcmp(ctx->old_table->name.m_name, ctx->tmp_name)); - DBUG_EXECUTE_IF( - "ib_rename_index_fail3", - DBUG_SET("+d,innodb_report_deadlock"); - ); - if (dict_stats_drop_table( ctx->new_table->name.m_name, errstr, sizeof(errstr)) @@ -9956,11 +11398,6 @@ foreign_fail: errstr); } - DBUG_EXECUTE_IF( - "ib_rename_index_fail3", - DBUG_SET("-d,innodb_report_deadlock"); - ); - DBUG_EXECUTE_IF("ib_ddl_crash_before_commit", DBUG_SUICIDE();); diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index 3e52c288876..749e5102ad9 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -1650,12 +1650,12 @@ i_s_cmp_per_index_fill_low( RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); /* Create a snapshot of the stats so we do not bump into lock - order violations with dict_sys->mutex below. */ + order violations with dict_sys.mutex below. 
*/ mutex_enter(&page_zip_stat_per_index_mutex); page_zip_stat_per_index_t snap (page_zip_stat_per_index); mutex_exit(&page_zip_stat_per_index_mutex); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); page_zip_stat_per_index_t::iterator iter; ulint i; @@ -1713,13 +1713,13 @@ i_s_cmp_per_index_fill_low( contents of INFORMATION_SCHEMA.innodb_cmp_per_index being inconsistent, but it is an acceptable compromise. */ if (i == 1000) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); i = 0; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (reset) { page_zip_reset_stat_per_index(); @@ -2871,17 +2871,17 @@ i_s_fts_deleted_generic_fill( /* Prevent DROP of the internal tables for fulltext indexes. FIXME: acquire DDL-blocking MDL on the user table name! */ - rw_lock_s_lock(&dict_operation_lock); + rw_lock_s_lock(&dict_sys.latch); user_table = dict_table_open_on_id( innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL); if (!user_table) { - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); DBUG_RETURN(0); } else if (!dict_table_has_fts_index(user_table)) { dict_table_close(user_table, FALSE, FALSE); - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); DBUG_RETURN(0); } @@ -2898,7 +2898,7 @@ i_s_fts_deleted_generic_fill( dict_table_close(user_table, FALSE, FALSE); - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); trx_free(trx); @@ -3281,14 +3281,14 @@ i_s_fts_index_cache_fill( /* Prevent DROP of the internal tables for fulltext indexes. FIXME: acquire DDL-blocking MDL on the user table name! 
*/ - rw_lock_s_lock(&dict_operation_lock); + rw_lock_s_lock(&dict_sys.latch); user_table = dict_table_open_on_id( innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL); if (!user_table) { no_fts: - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); DBUG_RETURN(0); } @@ -3316,7 +3316,7 @@ no_fts: } dict_table_close(user_table, FALSE, FALSE); - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); DBUG_RETURN(ret); } @@ -3473,9 +3473,9 @@ i_s_fts_index_table_fill_selected( } } - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); que_graph_free(graph); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); trx_free(trx); @@ -3728,13 +3728,13 @@ i_s_fts_index_table_fill( /* Prevent DROP of the internal tables for fulltext indexes. FIXME: acquire DDL-blocking MDL on the user table name! */ - rw_lock_s_lock(&dict_operation_lock); + rw_lock_s_lock(&dict_sys.latch); user_table = dict_table_open_on_id( innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL); if (!user_table) { - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); DBUG_RETURN(0); } @@ -3754,7 +3754,7 @@ i_s_fts_index_table_fill( dict_table_close(user_table, FALSE, FALSE); - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); ut_free(conv_str.f_str); @@ -3891,14 +3891,14 @@ i_s_fts_config_fill( /* Prevent DROP of the internal tables for fulltext indexes. FIXME: acquire DDL-blocking MDL on the user table name! 
*/ - rw_lock_s_lock(&dict_operation_lock); + rw_lock_s_lock(&dict_sys.latch); user_table = dict_table_open_on_id( innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL); if (!user_table) { no_fts: - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); DBUG_RETURN(0); } @@ -3962,7 +3962,7 @@ no_fts: dict_table_close(user_table, FALSE, FALSE); - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); trx_free(trx); @@ -4854,7 +4854,7 @@ i_s_innodb_buffer_page_fill( if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { bool ret = false; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); const dict_index_t* index = dict_index_get_if_in_cache_low( @@ -4879,7 +4879,7 @@ i_s_innodb_buffer_page_fill( system_charset_info); } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); OK(ret); @@ -5577,7 +5577,7 @@ i_s_innodb_buf_page_lru_fill( if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { bool ret = false; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); const dict_index_t* index = dict_index_get_if_in_cache_low( @@ -5602,7 +5602,7 @@ i_s_innodb_buf_page_lru_fill( system_charset_info); } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); OK(ret); @@ -5971,7 +5971,7 @@ i_s_dict_fill_sys_tables( ulint compact = DICT_TF_GET_COMPACT(table->flags); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS( table->flags); - const page_size_t& page_size = dict_tf_get_page_size(table->flags); + const ulint zip_size = dict_tf_get_zip_size(table->flags); const char* row_format; if (!compact) { @@ -6000,10 +6000,7 @@ i_s_dict_fill_sys_tables( OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format)); - OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store( - page_size.is_compressed() - ? page_size.physical() - : 0, true)); + OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store(zip_size, true)); OK(field_store_string(fields[SYS_TABLES_SPACE_TYPE], table->space_id ? 
"Single" : "System")); @@ -6038,7 +6035,7 @@ i_s_sys_tables_fill_table( } heap = mem_heap_create(1000); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES); @@ -6052,7 +6049,7 @@ i_s_sys_tables_fill_table( err_msg = dict_process_sys_tables_rec_and_mtr_commit( heap, rec, &table_rec, false, &mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!err_msg) { i_s_dict_fill_sys_tables(thd, table_rec, @@ -6070,13 +6067,13 @@ i_s_sys_tables_fill_table( mem_heap_empty(heap); /* Get the next record */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); mem_heap_free(heap); DBUG_RETURN(0); @@ -6266,7 +6263,7 @@ i_s_dict_fill_sys_tablestats( OK(field_store_string(fields[SYS_TABLESTATS_NAME], table->name.m_name)); - dict_table_stats_lock(table, RW_S_LATCH); + rw_lock_s_lock(&table->stats_latch); if (table->stat_initialized) { OK(field_store_string(fields[SYS_TABLESTATS_INIT], @@ -6296,7 +6293,7 @@ i_s_dict_fill_sys_tablestats( OK(fields[SYS_TABLESTATS_MODIFIED]->store(0, true)); } - dict_table_stats_unlock(table, RW_S_LATCH); + rw_lock_s_unlock(&table->stats_latch); OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, true)); @@ -6334,8 +6331,8 @@ i_s_sys_tables_fill_table_stats( } heap = mem_heap_create(1000); - rw_lock_s_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + rw_lock_s_lock(&dict_sys.latch); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES); @@ -6350,7 +6347,7 @@ i_s_sys_tables_fill_table_stats( heap, rec, &table_rec, true, &mtr); ulint ref_count = table_rec ? 
table_rec->get_ref_count() : 0; - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); DBUG_EXECUTE_IF("test_sys_tablestats", { if (strcmp("test/t1", table_rec->name.m_name) == 0 ) { @@ -6368,20 +6365,20 @@ i_s_sys_tables_fill_table_stats( err_msg); } - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); mem_heap_empty(heap); /* Get the next record */ - rw_lock_s_lock(&dict_operation_lock); - mutex_enter(&dict_sys->mutex); + rw_lock_s_lock(&dict_sys.latch); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); - rw_lock_s_unlock(&dict_operation_lock); + mutex_exit(&dict_sys.mutex); + rw_lock_s_unlock(&dict_sys.latch); mem_heap_free(heap); DBUG_RETURN(0); @@ -6621,7 +6618,7 @@ i_s_sys_indexes_fill_table( } heap = mem_heap_create(1000); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); /* Start scan the SYS_INDEXES table */ @@ -6643,7 +6640,7 @@ i_s_sys_indexes_fill_table( space_id = space_id == 4 ? 
mach_read_from_4(field) : ULINT_UNDEFINED; mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!err_msg) { if (int err = i_s_dict_fill_sys_indexes( @@ -6661,13 +6658,13 @@ i_s_sys_indexes_fill_table( mem_heap_empty(heap); /* Get the next record */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); mem_heap_free(heap); DBUG_RETURN(0); @@ -6874,7 +6871,7 @@ i_s_sys_columns_fill_table( } heap = mem_heap_create(1000); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_startscan_system(&pcur, &mtr, SYS_COLUMNS); @@ -6892,7 +6889,7 @@ i_s_sys_columns_fill_table( &nth_v_col); mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!err_msg) { i_s_dict_fill_sys_columns(thd, table_id, col_name, @@ -6907,13 +6904,13 @@ i_s_sys_columns_fill_table( mem_heap_empty(heap); /* Get the next record */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); mem_heap_free(heap); DBUG_RETURN(0); @@ -7083,7 +7080,7 @@ i_s_sys_virtual_fill_table( DBUG_RETURN(0); } - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_startscan_system(&pcur, &mtr, SYS_VIRTUAL); @@ -7099,7 +7096,7 @@ i_s_sys_virtual_fill_table( &base_pos); mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!err_msg) { i_s_dict_fill_sys_virtual(thd, table_id, pos, base_pos, @@ -7111,13 +7108,13 @@ i_s_sys_virtual_fill_table( } /* Get the next record */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); 
DBUG_RETURN(0); } @@ -7283,7 +7280,7 @@ i_s_sys_fields_fill_table( } heap = mem_heap_create(1000); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); /* will save last index id so that we know whether we move to @@ -7304,7 +7301,7 @@ i_s_sys_fields_fill_table( &pos, &index_id, last_id); mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!err_msg) { i_s_dict_fill_sys_fields(thd, index_id, &field_rec, @@ -7319,13 +7316,13 @@ i_s_sys_fields_fill_table( mem_heap_empty(heap); /* Get the next record */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); mem_heap_free(heap); DBUG_RETURN(0); @@ -7515,7 +7512,7 @@ i_s_sys_foreign_fill_table( } heap = mem_heap_create(1000); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN); @@ -7529,7 +7526,7 @@ i_s_sys_foreign_fill_table( err_msg = dict_process_sys_foreign_rec(heap, rec, &foreign_rec); mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!err_msg) { i_s_dict_fill_sys_foreign(thd, &foreign_rec, @@ -7544,12 +7541,12 @@ i_s_sys_foreign_fill_table( /* Get the next record */ mtr_start(&mtr); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); mem_heap_free(heap); DBUG_RETURN(0); @@ -7729,7 +7726,7 @@ i_s_sys_foreign_cols_fill_table( } heap = mem_heap_create(1000); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN_COLS); @@ -7746,7 +7743,7 @@ i_s_sys_foreign_cols_fill_table( heap, rec, &name, &for_col_name, &ref_col_name, &pos); mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + 
mutex_exit(&dict_sys.mutex); if (!err_msg) { i_s_dict_fill_sys_foreign_cols( @@ -7761,13 +7758,13 @@ i_s_sys_foreign_cols_fill_table( mem_heap_empty(heap); /* Get the next record */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); mem_heap_free(heap); DBUG_RETURN(0); @@ -7960,7 +7957,9 @@ i_s_dict_fill_sys_tablespaces( DBUG_ENTER("i_s_dict_fill_sys_tablespaces"); - if (is_system_tablespace(space)) { + if (fil_space_t::full_crc32(flags)) { + row_format = NULL; + } else if (is_system_tablespace(space)) { row_format = "Compact, Redundant or Dynamic"; } else if (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { row_format = "Compressed"; @@ -7984,7 +7983,7 @@ i_s_dict_fill_sys_tablespaces( is_system_tablespace(space) ? "System" : "Single")); - ulint cflags = fsp_flags_is_valid(flags, space) + ulint cflags = fil_space_t::is_valid_flags(flags, space) ? flags : fsp_flags_convert_from_101(flags); if (cflags == ULINT_UNDEFINED) { fields[SYS_TABLESPACES_PAGE_SIZE]->set_null(); @@ -7996,13 +7995,11 @@ i_s_dict_fill_sys_tablespaces( DBUG_RETURN(0); } - const page_size_t page_size(cflags); - OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store( - page_size.logical(), true)); + fil_space_t::logical_size(cflags), true)); OK(fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->store( - page_size.physical(), true)); + fil_space_t::physical_size(cflags), true)); os_file_stat_t stat; os_file_size_t file; @@ -8085,7 +8082,7 @@ i_s_sys_tablespaces_fill_table( } heap = mem_heap_create(1000); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES); @@ -8102,7 +8099,7 @@ i_s_sys_tablespaces_fill_table( heap, rec, &space, &name, &flags); mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!err_msg) { i_s_dict_fill_sys_tablespaces( @@ -8117,12 +8114,12 @@ 
i_s_sys_tablespaces_fill_table( mem_heap_empty(heap); /* Get the next record */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); mem_heap_free(heap); DBUG_RETURN(0); @@ -8276,7 +8273,7 @@ i_s_sys_datafiles_fill_table( } heap = mem_heap_create(1000); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_startscan_system(&pcur, &mtr, SYS_DATAFILES); @@ -8291,7 +8288,7 @@ i_s_sys_datafiles_fill_table( heap, rec, &space, &path); mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); if (!err_msg) { i_s_dict_fill_sys_datafiles( @@ -8305,13 +8302,13 @@ i_s_sys_datafiles_fill_table( mem_heap_empty(heap); /* Get the next record */ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); mtr_start(&mtr); rec = dict_getnext_system(&pcur, &mtr); } mtr_commit(&mtr); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); mem_heap_free(heap); DBUG_RETURN(0); @@ -8685,7 +8682,7 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] = #define TABLESPACES_SCRUBBING_COMPRESSED 2 {STRUCT_FLD(field_name, "COMPRESSED"), - STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_length, 1), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), @@ -8737,6 +8734,15 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, +#define TABLESPACES_SCRUBBING_ON_SSD 8 + {STRUCT_FLD(field_name, "ON_SSD"), + STRUCT_FLD(field_length, 1), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + END_OF_ST_FIELD_INFO }; @@ -8808,6 +8814,8 @@ i_s_dict_fill_tablespaces_scrubbing( } } + 
OK(fields[TABLESPACES_SCRUBBING_ON_SSD]->store(!space->is_rotational(), + true)); OK(schema_table_store_record(thd, table_to_fill)); DBUG_RETURN(0); diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index 9228e377f69..317f322df0f 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -338,7 +338,7 @@ ibuf_header_page_get( block = buf_page_get( page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO), - univ_page_size, RW_X_LATCH, mtr); + 0, RW_X_LATCH, mtr); if (block) { buf_block_dbg_add_level(block, SYNC_IBUF_HEADER); @@ -368,7 +368,7 @@ ibuf_tree_root_get( /* only segment list access is exclusive each other */ block = buf_page_get( page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO), - univ_page_size, RW_SX_LATCH, mtr); + 0, RW_SX_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); @@ -483,7 +483,7 @@ ibuf_init_at_db_start(void) block = buf_page_get( page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO), - univ_page_size, RW_X_LATCH, &mtr); + 0, RW_X_LATCH, &mtr); buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); @@ -548,13 +548,8 @@ ibuf_max_size_update( } -/*********************************************************************//** -Initializes an ibuf bitmap page. */ -void -ibuf_bitmap_page_init( -/*==================*/ - buf_block_t* block, /*!< in: bitmap page */ - mtr_t* mtr) /*!< in: mtr */ +/** Apply MLOG_IBUF_BITMAP_INIT when crash-upgrading */ +ATTRIBUTE_COLD void ibuf_bitmap_init_apply(buf_block_t* block) { page_t* page; ulint byte_offset; @@ -565,65 +560,41 @@ ibuf_bitmap_page_init( /* Write all zeros to the bitmap */ compile_time_assert(!(IBUF_BITS_PER_PAGE % 2)); - byte_offset = UT_BITS_IN_BYTES(block->page.size.physical() + byte_offset = UT_BITS_IN_BYTES(block->physical_size() * IBUF_BITS_PER_PAGE); memset(page + IBUF_BITMAP, 0, byte_offset); - - /* The remaining area (up to the page trailer) is uninitialized. 
*/ - mlog_write_initial_log_record(page, MLOG_IBUF_BITMAP_INIT, mtr); -} - -/*********************************************************************//** -Parses a redo log record of an ibuf bitmap page init. -@return end of log record or NULL */ -byte* -ibuf_parse_bitmap_init( -/*===================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr MY_ATTRIBUTE((unused)), /*!< in: buffer end */ - buf_block_t* block, /*!< in: block or NULL */ - mtr_t* mtr) /*!< in: mtr or NULL */ -{ - ut_ad(ptr != NULL); - ut_ad(end_ptr != NULL); - - if (block) { - ibuf_bitmap_page_init(block, mtr); - } - - return(ptr); } # ifdef UNIV_DEBUG /** Gets the desired bits for a given page from a bitmap page. @param[in] page bitmap page @param[in] page_id page id whose bits to get -@param[in] page_size page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... @param[in,out] mtr mini-transaction holding an x-latch on the bitmap page @return value of bits */ -# define ibuf_bitmap_page_get_bits(page, page_id, page_size, bit, mtr) \ - ibuf_bitmap_page_get_bits_low(page, page_id, page_size, \ +# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, \ MTR_MEMO_PAGE_X_FIX, mtr, bit) # else /* UNIV_DEBUG */ /** Gets the desired bits for a given page from a bitmap page. @param[in] page bitmap page @param[in] page_id page id whose bits to get -@param[in] page_size page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... 
@param[in,out] mtr mini-transaction holding an x-latch on the bitmap page @return value of bits */ -# define ibuf_bitmap_page_get_bits(page, page_id, page_size, bit, mtr) \ - ibuf_bitmap_page_get_bits_low(page, page_id, page_size, bit) +# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, bit) # endif /* UNIV_DEBUG */ /** Gets the desired bits for a given page from a bitmap page. @param[in] page bitmap page @param[in] page_id page id whose bits to get -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] latch_type MTR_MEMO_PAGE_X_FIX, MTR_MEMO_BUF_FIX, ... @param[in,out] mtr mini-transaction holding latch_type on the bitmap page @@ -634,7 +605,7 @@ ulint ibuf_bitmap_page_get_bits_low( const page_t* page, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, #ifdef UNIV_DEBUG ulint latch_type, mtr_t* mtr, @@ -645,12 +616,14 @@ ibuf_bitmap_page_get_bits_low( ulint bit_offset; ulint map_byte; ulint value; + const ulint size = zip_size ? zip_size : srv_page_size; + ut_ad(ut_is_2pow(zip_size)); ut_ad(bit < IBUF_BITS_PER_PAGE); compile_time_assert(!(IBUF_BITS_PER_PAGE % 2)); ut_ad(mtr_memo_contains_page(mtr, page, latch_type)); - bit_offset = (page_id.page_no() % page_size.physical()) + bit_offset = (page_id.page_no() & (size - 1)) * IBUF_BITS_PER_PAGE + bit; byte_offset = bit_offset / 8; @@ -674,7 +647,7 @@ ibuf_bitmap_page_get_bits_low( /** Sets the desired bit for a given page in a bitmap page. @param[in,out] page bitmap page @param[in] page_id page id whose bits to set -@param[in] page_size page size +@param[in] physical_size page size @param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... 
@param[in] val value to set @param[in,out] mtr mtr containing an x-latch to the bitmap page */ @@ -683,7 +656,7 @@ void ibuf_bitmap_page_set_bits( page_t* page, const page_id_t page_id, - const page_size_t& page_size, + ulint physical_size, ulint bit, ulint val, mtr_t* mtr) @@ -697,7 +670,7 @@ ibuf_bitmap_page_set_bits( ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr->is_named_space(page_id.space())); - bit_offset = (page_id.page_no() % page_size.physical()) + bit_offset = (page_id.page_no() % physical_size) * IBUF_BITS_PER_PAGE + bit; byte_offset = bit_offset / 8; @@ -724,26 +697,20 @@ ibuf_bitmap_page_set_bits( /** Calculates the bitmap page number for a given page number. @param[in] page_id page id -@param[in] page_size page size +@param[in] size page size @return the bitmap page id where the file page is mapped */ -UNIV_INLINE -const page_id_t -ibuf_bitmap_page_no_calc( - const page_id_t page_id, - const page_size_t& page_size) +inline page_id_t ibuf_bitmap_page_no_calc(const page_id_t page_id, ulint size) { - ulint bitmap_page_no; - - bitmap_page_no = FSP_IBUF_BITMAP_OFFSET - + (page_id.page_no() & ~(page_size.physical() - 1)); + if (!size) size = srv_page_size; - return(page_id_t(page_id.space(), bitmap_page_no)); + return page_id_t(page_id.space(), FSP_IBUF_BITMAP_OFFSET + + (page_id.page_no() & ~(size - 1))); } /** Gets the ibuf bitmap page where the bits describing a given file page are stored. 
@param[in] page_id page id of the file page -@param[in] page_size page size of the file page +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] file file name @param[in] line line where called @param[in,out] mtr mini-transaction @@ -754,7 +721,7 @@ static page_t* ibuf_bitmap_get_map_page_func( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, const char* file, unsigned line, mtr_t* mtr) @@ -762,8 +729,8 @@ ibuf_bitmap_get_map_page_func( buf_block_t* block = NULL; dberr_t err = DB_SUCCESS; - block = buf_page_get_gen(ibuf_bitmap_page_no_calc(page_id, page_size), - page_size, RW_X_LATCH, NULL, BUF_GET, + block = buf_page_get_gen(ibuf_bitmap_page_no_calc(page_id, zip_size), + zip_size, RW_X_LATCH, NULL, BUF_GET, file, line, mtr, &err); if (err != DB_SUCCESS) { @@ -779,13 +746,13 @@ ibuf_bitmap_get_map_page_func( /** Gets the ibuf bitmap page where the bits describing a given file page are stored. @param[in] page_id page id of the file page -@param[in] page_size page size of the file page +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] mtr mini-transaction @return bitmap page where the file page is mapped, that is, the bitmap page containing the descriptor bits for the file page; the bitmap page is x-latched */ -#define ibuf_bitmap_get_map_page(page_id, page_size, mtr) \ - ibuf_bitmap_get_map_page_func(page_id, page_size, \ +#define ibuf_bitmap_get_map_page(page_id, zip_size, mtr) \ + ibuf_bitmap_get_map_page_func(page_id, zip_size, \ __FILE__, __LINE__, mtr) /************************************************************************//** @@ -819,14 +786,14 @@ ibuf_set_free_bits_low( } bitmap_page = ibuf_bitmap_get_map_page(block->page.id, - block->page.size, mtr); + block->zip_size(), mtr); #ifdef UNIV_IBUF_DEBUG ut_a(val <= ibuf_index_page_calc_free(block)); #endif /* UNIV_IBUF_DEBUG */ ibuf_bitmap_page_set_bits( - bitmap_page, block->page.id, block->page.size, + bitmap_page, block->page.id, 
block->physical_size(), IBUF_BITMAP_FREE, val, mtr); } @@ -863,17 +830,14 @@ ibuf_set_free_bits_func( block->page.id.space()); bitmap_page = ibuf_bitmap_get_map_page(block->page.id, - block->page.size, &mtr); + block->zip_size(), &mtr); switch (space->purpose) { case FIL_TYPE_LOG: ut_ad(0); break; case FIL_TYPE_TABLESPACE: - /* Avoid logging while fixing up truncate of table. */ - if (!srv_is_tablespace_truncated(block->page.id.space())) { - break; - } + break; /* fall through */ case FIL_TYPE_TEMPORARY: case FIL_TYPE_IMPORT: @@ -908,7 +872,7 @@ ibuf_set_free_bits_func( #endif /* UNIV_IBUF_DEBUG */ ibuf_bitmap_page_set_bits( - bitmap_page, block->page.id, block->page.size, + bitmap_page, block->page.id, block->physical_size(), IBUF_BITMAP_FREE, val, &mtr); mtr_commit(&mtr); @@ -958,7 +922,7 @@ ibuf_update_free_bits_low( ut_a(!buf_block_get_page_zip(block)); ut_ad(mtr->is_named_space(block->page.id.space())); - before = ibuf_index_page_calc_free_bits(block->page.size.logical(), + before = ibuf_index_page_calc_free_bits(srv_page_size, max_ins_size); after = ibuf_index_page_calc_free(block); @@ -993,10 +957,10 @@ ibuf_update_free_bits_zip( buf_frame_t* frame = buf_block_get_frame(block); ut_a(frame); ut_a(page_is_leaf(frame)); - ut_a(block->page.size.is_compressed()); + ut_a(block->zip_size()); bitmap_page = ibuf_bitmap_get_map_page(block->page.id, - block->page.size, mtr); + block->zip_size(), mtr); after = ibuf_index_page_calc_free_zip(block); @@ -1010,7 +974,7 @@ ibuf_update_free_bits_zip( } ibuf_bitmap_page_set_bits( - bitmap_page, block->page.id, block->page.size, + bitmap_page, block->page.id, block->physical_size(), IBUF_BITMAP_FREE, after, mtr); } @@ -1052,23 +1016,19 @@ ibuf_update_free_bits_for_two_pages_low( /** Returns TRUE if the page is one of the fixed address ibuf pages. 
@param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @return TRUE if a fixed address ibuf i/o page */ -UNIV_INLINE -ibool -ibuf_fixed_addr_page( - const page_id_t page_id, - const page_size_t& page_size) +inline bool ibuf_fixed_addr_page(const page_id_t page_id, ulint zip_size) { return((page_id.space() == IBUF_SPACE_ID && page_id.page_no() == IBUF_TREE_ROOT_PAGE_NO) - || ibuf_bitmap_page(page_id, page_size)); + || ibuf_bitmap_page(page_id, zip_size)); } /** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. Must not be called when recv_no_ibuf_operations==true. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] x_latch FALSE if relaxed check (avoid latching the bitmap page) @param[in] file file name @@ -1077,12 +1037,12 @@ bitmap page) bitmap page if the page is not one of the fixed address ibuf pages, or NULL, in which case a new transaction is created. 
@return TRUE if level 2 or level 3 page */ -ibool +bool ibuf_page_low( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, #ifdef UNIV_DEBUG - ibool x_latch, + bool x_latch, #endif /* UNIV_DEBUG */ const char* file, unsigned line, @@ -1095,12 +1055,10 @@ ibuf_page_low( ut_ad(!recv_no_ibuf_operations); ut_ad(x_latch || mtr == NULL); - if (ibuf_fixed_addr_page(page_id, page_size)) { - - return(TRUE); + if (ibuf_fixed_addr_page(page_id, zip_size)) { + return(true); } else if (page_id.space() != IBUF_SPACE_ID) { - - return(FALSE); + return(false); } compile_time_assert(IBUF_SPACE_ID == 0); @@ -1123,14 +1081,14 @@ ibuf_page_low( dberr_t err = DB_SUCCESS; buf_block_t* block = buf_page_get_gen( - ibuf_bitmap_page_no_calc(page_id, page_size), - page_size, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, - file, line, &local_mtr, &err); + ibuf_bitmap_page_no_calc(page_id, zip_size), + zip_size, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, + file, line, &local_mtr, &err); bitmap_page = buf_block_get_frame(block); ret = ibuf_bitmap_page_get_bits_low( - bitmap_page, page_id, page_size, + bitmap_page, page_id, zip_size, MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF); mtr_commit(&local_mtr); @@ -1143,10 +1101,10 @@ ibuf_page_low( mtr_start(mtr); } - bitmap_page = ibuf_bitmap_get_map_page_func(page_id, page_size, + bitmap_page = ibuf_bitmap_get_map_page_func(page_id, zip_size, file, line, mtr); - ret = ibuf_bitmap_page_get_bits(bitmap_page, page_id, page_size, + ret = ibuf_bitmap_page_get_bits(bitmap_page, page_id, zip_size, IBUF_BITMAP_IBUF, mtr); if (mtr == &local_mtr) { @@ -1389,32 +1347,26 @@ ibuf_rec_get_counter( } } -/****************************************************************//** -Add accumulated operation counts to a permanent array. Both arrays must be -of size IBUF_OP_COUNT. 
*/ -static -void -ibuf_add_ops( -/*=========*/ - ulint* arr, /*!< in/out: array to modify */ - const ulint* ops) /*!< in: operation counts */ +/** + Add accumulated operation counts to a permanent array. + Both arrays must be of size IBUF_OP_COUNT. +*/ +static void ibuf_add_ops(Atomic_counter<ulint> *out, const ulint *in) { - ulint i; - - for (i = 0; i < IBUF_OP_COUNT; i++) { - my_atomic_addlint(&arr[i], ops[i]); - } + for (auto i = 0; i < IBUF_OP_COUNT; i++) + out[i]+= in[i]; } + /****************************************************************//** Print operation counts. The array must be of size IBUF_OP_COUNT. */ static void ibuf_print_ops( /*===========*/ - const ulint* ops, /*!< in: operation counts */ - FILE* file) /*!< in: file where to print */ + const Atomic_counter<ulint>* ops, /*!< in: operation counts */ + FILE* file) /*!< in: file where to print */ { static const char* op_names[] = { "insert", @@ -1427,7 +1379,7 @@ ibuf_print_ops( for (i = 0; i < IBUF_OP_COUNT; i++) { fprintf(file, "%s " ULINTPF "%s", op_names[i], - ops[i], (i < (IBUF_OP_COUNT - 1)) ? ", " : ""); + ulint{ops[i]}, (i < (IBUF_OP_COUNT - 1)) ? 
", " : ""); } putc('\n', file); @@ -2025,11 +1977,11 @@ ibuf_add_free_page(void) (level 2 page) */ const page_id_t page_id(IBUF_SPACE_ID, block->page.id.page_no()); - bitmap_page = ibuf_bitmap_get_map_page(page_id, univ_page_size, &mtr); + bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr); mutex_exit(&ibuf_mutex); - ibuf_bitmap_page_set_bits(bitmap_page, page_id, univ_page_size, + ibuf_bitmap_page_set_bits(bitmap_page, page_id, srv_page_size, IBUF_BITMAP_IBUF, TRUE, &mtr); ibuf_mtr_commit(&mtr); @@ -2100,7 +2052,7 @@ ibuf_remove_free_page(void) compile_time_assert(IBUF_SPACE_ID == 0); fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, - fil_system.sys_space, page_no, &mtr); + fil_system.sys_space, page_no, true, &mtr); const page_id_t page_id(IBUF_SPACE_ID, page_no); @@ -2118,7 +2070,7 @@ ibuf_remove_free_page(void) { buf_block_t* block; - block = buf_page_get(page_id, univ_page_size, RW_X_LATCH, &mtr); + block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr); buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); @@ -2138,13 +2090,13 @@ ibuf_remove_free_page(void) /* Set the bit indicating that this page is no more an ibuf tree page (level 2 page) */ - bitmap_page = ibuf_bitmap_get_map_page(page_id, univ_page_size, &mtr); + bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr); mutex_exit(&ibuf_mutex); ibuf_bitmap_page_set_bits( - bitmap_page, page_id, univ_page_size, IBUF_BITMAP_IBUF, FALSE, - &mtr); + bitmap_page, page_id, srv_page_size, + IBUF_BITMAP_IBUF, FALSE, &mtr); ut_d(buf_page_set_file_page_was_freed(page_id)); @@ -2976,7 +2928,7 @@ ibuf_get_volume_buffered( block = buf_page_get( page_id_t(IBUF_SPACE_ID, prev_page_no), - univ_page_size, RW_X_LATCH, mtr); + 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); @@ -3048,7 +3000,7 @@ count_later: block = buf_page_get( page_id_t(IBUF_SPACE_ID, next_page_no), - univ_page_size, RW_X_LATCH, mtr); + 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); @@ 
-3261,6 +3213,24 @@ ibuf_get_entry_counter_func( } } + +/** Translates the ibuf free bits to the free space on a page in bytes. +@param[in] physical_size page_size +@param[in] bits value for ibuf bitmap bits +@return maximum insert size after reorganize for the page */ +inline ulint +ibuf_index_page_calc_free_from_bits(ulint physical_size, ulint bits) +{ + ut_ad(bits < 4); + ut_ad(physical_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + + if (bits == 3) { + bits = 4; + } + + return bits * physical_size / IBUF_PAGE_SIZE_PER_FREE_SPACE; +} + /** Buffer an operation in the insert/delete buffer, instead of doing it directly to the disk page, if this is possible. @param[in] mode BTR_MODIFY_PREV or BTR_MODIFY_TREE @@ -3272,7 +3242,7 @@ buffering @param[in,out] index index where to insert; must not be unique or clustered @param[in] page_id page id where to insert -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] thr query thread @return DB_SUCCESS, DB_STRONG_FAIL or other error */ static MY_ATTRIBUTE((warn_unused_result)) @@ -3285,7 +3255,7 @@ ibuf_insert_low( ulint entry_size, dict_index_t* index, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, que_thr_t* thr) { big_rec_t* dummy_big_rec; @@ -3395,6 +3365,8 @@ ibuf_insert_low( ? &min_n_recs : NULL, &mtr); + const ulint physical_size = zip_size ? 
zip_size : srv_page_size; + if (op == IBUF_OP_DELETE && (min_n_recs < 2 || buf_pool_watch_occurred(page_id))) { /* The page could become empty after the record is @@ -3436,8 +3408,7 @@ fail_exit: ibuf_mtr_start(&bitmap_mtr); index->set_modified(bitmap_mtr); - bitmap_page = ibuf_bitmap_get_map_page(page_id, page_size, - &bitmap_mtr); + bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size, &bitmap_mtr); /* We check if the index page is suitable for buffered entries */ @@ -3451,11 +3422,12 @@ fail_exit: if (op == IBUF_OP_INSERT) { ulint bits = ibuf_bitmap_page_get_bits( - bitmap_page, page_id, page_size, IBUF_BITMAP_FREE, + bitmap_page, page_id, physical_size, IBUF_BITMAP_FREE, &bitmap_mtr); if (buffered + entry_size + page_dir_calc_reserved_space(1) - > ibuf_index_page_calc_free_from_bits(page_size, bits)) { + > ibuf_index_page_calc_free_from_bits(physical_size, + bits)) { /* Release the bitmap page latch early. */ ibuf_mtr_commit(&bitmap_mtr); @@ -3498,11 +3470,11 @@ fail_exit: buffered entries for this index page, if the bit is not set yet */ old_bit_value = ibuf_bitmap_page_get_bits( - bitmap_page, page_id, page_size, + bitmap_page, page_id, physical_size, IBUF_BITMAP_BUFFERED, &bitmap_mtr); if (!old_bit_value) { - ibuf_bitmap_page_set_bits(bitmap_page, page_id, page_size, + ibuf_bitmap_page_set_bits(bitmap_page, page_id, physical_size, IBUF_BITMAP_BUFFERED, TRUE, &bitmap_mtr); } @@ -3596,23 +3568,23 @@ func_exit: return(err); } -/** Buffer an operation in the insert/delete buffer, instead of doing it -directly to the disk page, if this is possible. Does not do it if the index +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index is clustered or unique. 
@param[in] op operation type @param[in] entry index entry to insert @param[in,out] index index where to insert @param[in] page_id page id where to insert -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] thr query thread -@return TRUE if success */ -ibool +@return true if success */ +bool ibuf_insert( ibuf_op_t op, const dtuple_t* entry, dict_index_t* index, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, que_thr_t* thr) { dberr_t err; @@ -3640,7 +3612,7 @@ ibuf_insert( case IBUF_USE_NONE: case IBUF_USE_DELETE: case IBUF_USE_DELETE_MARK: - DBUG_RETURN(FALSE); + DBUG_RETURN(false); case IBUF_USE_INSERT: case IBUF_USE_INSERT_DELETE_MARK: case IBUF_USE_ALL: @@ -3651,7 +3623,7 @@ ibuf_insert( switch (use) { case IBUF_USE_NONE: case IBUF_USE_INSERT: - DBUG_RETURN(FALSE); + DBUG_RETURN(false); case IBUF_USE_DELETE_MARK: case IBUF_USE_DELETE: case IBUF_USE_INSERT_DELETE_MARK: @@ -3665,7 +3637,7 @@ ibuf_insert( case IBUF_USE_NONE: case IBUF_USE_INSERT: case IBUF_USE_INSERT_DELETE_MARK: - DBUG_RETURN(FALSE); + DBUG_RETURN(false); case IBUF_USE_DELETE_MARK: case IBUF_USE_DELETE: case IBUF_USE_ALL: @@ -3705,7 +3677,7 @@ check_watch: is being buffered, have this request executed directly on the page in the buffer pool after the buffered entries for this page have been merged. 
*/ - DBUG_RETURN(FALSE); + DBUG_RETURN(false); } } @@ -3716,30 +3688,22 @@ skip_watch: >= page_get_free_space_of_empty(dict_table_is_comp(index->table)) / 2) { - DBUG_RETURN(FALSE); + DBUG_RETURN(false); } err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter, entry, entry_size, - index, page_id, page_size, thr); + index, page_id, zip_size, thr); if (err == DB_FAIL) { err = ibuf_insert_low(BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT, op, no_counter, entry, entry_size, - index, page_id, page_size, thr); + index, page_id, zip_size, thr); } - if (err == DB_SUCCESS) { -#ifdef UNIV_IBUF_DEBUG - /* fprintf(stderr, "Ibuf insert for page no %lu of index %s\n", - page_no, index->name); */ -#endif - DBUG_RETURN(TRUE); + ut_a(err == DB_SUCCESS || err == DB_STRONG_FAIL + || err == DB_TOO_BIG_RECORD); - } else { - ut_a(err == DB_STRONG_FAIL || err == DB_TOO_BIG_RECORD); - - DBUG_RETURN(FALSE); - } + DBUG_RETURN(err == DB_SUCCESS); } /********************************************************************//** @@ -3803,13 +3767,13 @@ ibuf_insert_to_index_page_low( "InnoDB: that table.\n", stderr); bitmap_page = ibuf_bitmap_get_map_page(block->page.id, - block->page.size, mtr); + block->zip_size(), mtr); old_bits = ibuf_bitmap_page_get_bits( - bitmap_page, block->page.id, block->page.size, + bitmap_page, block->page.id, block->zip_size(), IBUF_BITMAP_FREE, mtr); ib::error() << "page " << block->page.id << ", size " - << block->page.size.physical() << ", bitmap bits " << old_bits; + << block->physical_size() << ", bitmap bits " << old_bits; ib::error() << BUG_REPORT_MSG; @@ -4371,15 +4335,16 @@ subsequently was dropped. 
@param[in,out] block if page has been read from disk, pointer to the page x-latched, else NULL @param[in] page_id page id of the index page -@param[in] update_ibuf_bitmap normally this is set to TRUE, but +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] update_ibuf_bitmap normally this is set, but if we have deleted or are deleting the tablespace, then we naturally do not want to update a non-existent bitmap page */ void ibuf_merge_or_delete_for_page( buf_block_t* block, const page_id_t page_id, - const page_size_t* page_size, - ibool update_ibuf_bitmap) + ulint zip_size, + bool update_ibuf_bitmap) { btr_pcur_t pcur; #ifdef UNIV_IBUF_DEBUG @@ -4403,38 +4368,23 @@ ibuf_merge_or_delete_for_page( return; } - /* We cannot refer to page_size in the following, because it is passed - as NULL (it is unknown) when buf_read_ibuf_merge_pages() is merging - (discarding) changes for a dropped tablespace. When block != NULL or - update_ibuf_bitmap is specified, then page_size must be known. - That is why we will repeat the check below, with page_size in - place of univ_page_size. Passing univ_page_size assumes that the - uncompressed page size always is a power-of-2 multiple of the - compressed page size. */ - - if (ibuf_fixed_addr_page(page_id, univ_page_size) - || fsp_descr_page(page_id, univ_page_size)) { + const ulint physical_size = zip_size ? 
zip_size : srv_page_size; + + if (ibuf_fixed_addr_page(page_id, physical_size) + || fsp_descr_page(page_id, physical_size)) { return; } fil_space_t* space; if (update_ibuf_bitmap) { - - ut_ad(page_size != NULL); - - if (ibuf_fixed_addr_page(page_id, *page_size) - || fsp_descr_page(page_id, *page_size)) { - return; - } - space = fil_space_acquire_silent(page_id.space()); if (UNIV_UNLIKELY(!space)) { /* Do not try to read the bitmap page from the non-existent tablespace, delete the ibuf records */ block = NULL; - update_ibuf_bitmap = FALSE; + update_ibuf_bitmap = false; } else { page_t* bitmap_page = NULL; ulint bitmap_bits = 0; @@ -4442,12 +4392,12 @@ ibuf_merge_or_delete_for_page( ibuf_mtr_start(&mtr); bitmap_page = ibuf_bitmap_get_map_page( - page_id, *page_size, &mtr); + page_id, zip_size, &mtr); if (bitmap_page && fil_page_get_type(bitmap_page) != FIL_PAGE_TYPE_ALLOCATED) { bitmap_bits = ibuf_bitmap_page_get_bits( - bitmap_page, page_id, *page_size, + bitmap_page, page_id, zip_size, IBUF_BITMAP_BUFFERED, &mtr); } @@ -4468,8 +4418,8 @@ ibuf_merge_or_delete_for_page( } } } else if (block != NULL - && (ibuf_fixed_addr_page(page_id, *page_size) - || fsp_descr_page(page_id, *page_size))) { + && (ibuf_fixed_addr_page(page_id, physical_size) + || fsp_descr_page(page_id, physical_size))) { return; } else { @@ -4702,23 +4652,23 @@ reset_bit: if (update_ibuf_bitmap) { page_t* bitmap_page; - bitmap_page = ibuf_bitmap_get_map_page(page_id, *page_size, + bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size, &mtr); ibuf_bitmap_page_set_bits( - bitmap_page, page_id, *page_size, + bitmap_page, page_id, physical_size, IBUF_BITMAP_BUFFERED, FALSE, &mtr); if (block != NULL) { ulint old_bits = ibuf_bitmap_page_get_bits( - bitmap_page, page_id, *page_size, + bitmap_page, page_id, zip_size, IBUF_BITMAP_FREE, &mtr); ulint new_bits = ibuf_index_page_calc_free(block); if (old_bits != new_bits) { ibuf_bitmap_page_set_bits( - bitmap_page, page_id, *page_size, + bitmap_page, page_id, 
physical_size, IBUF_BITMAP_FREE, new_bits, &mtr); } } @@ -4733,20 +4683,15 @@ reset_bit: btr_pcur_close(&pcur); mem_heap_free(heap); - my_atomic_addlint(&ibuf->n_merges, 1); + ibuf->n_merges++; ibuf_add_ops(ibuf->n_merged_ops, mops); ibuf_add_ops(ibuf->n_discarded_ops, dops); } -/*********************************************************************//** -Deletes all entries in the insert buffer for a given space id. This is used -in DISCARD TABLESPACE, IMPORT TABLESPACE, and 5.7 TRUNCATE TABLE recovery. -NOTE: this does not update the page free bitmaps in the space. The space will -become CORRUPT when you call this function! */ -void -ibuf_delete_for_discarded_space( -/*============================*/ - ulint space) /*!< in: space id */ +/** Delete all change buffer entries for a tablespace, +in DISCARD TABLESPACE, IMPORT TABLESPACE, or crash recovery. +@param[in] space missing or to-be-discarded tablespace */ +void ibuf_delete_for_discarded_space(ulint space) { mem_heap_t* heap; btr_pcur_t pcur; @@ -4861,7 +4806,7 @@ ibuf_print( ibuf->size, ibuf->free_list_len, ibuf->seg_size, - ibuf->n_merges); + ulint{ibuf->n_merges}); fputs("merged operations:\n ", file); ibuf_print_ops(ibuf->n_merged_ops, file); @@ -4881,7 +4826,9 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) ulint page_no; ut_ad(trx->mysql_thd); ut_ad(space->purpose == FIL_TYPE_IMPORT); - const page_size_t page_size(space->flags); + + const ulint zip_size = space->zip_size(); + const ulint physical_size = space->physical_size(); /* fil_space_t::size and fil_space_t::free_limit would still be 0 at this point. So, we will have to read page 0. 
*/ ut_ad(!space->free_limit); @@ -4890,7 +4837,8 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) mtr_t mtr; ulint size; mtr.start(); - if (buf_block_t* sp = buf_page_get(page_id_t(space->id, 0), page_size, + if (buf_block_t* sp = buf_page_get(page_id_t(space->id, 0), + zip_size, RW_S_LATCH, &mtr)) { size = std::min( mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT @@ -4914,7 +4862,7 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) below page_no is measured in number of pages since the beginning of the space, as usual. */ - for (page_no = 0; page_no < size; page_no += page_size.physical()) { + for (page_no = 0; page_no < size; page_no += physical_size) { page_t* bitmap_page; ulint i; @@ -4930,23 +4878,24 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) ibuf_enter(&mtr); bitmap_page = ibuf_bitmap_get_map_page( - page_id_t(space->id, page_no), page_size, &mtr); + page_id_t(space->id, page_no), zip_size, &mtr); if (buf_is_zeroes(span<const byte>(bitmap_page, - page_size.physical()))) { + physical_size))) { /* This means we got all-zero page instead of ibuf bitmap page. The subsequent page should be all-zero pages. 
*/ #ifdef UNIV_DEBUG for (ulint curr_page = page_no + 1; - curr_page < page_size.physical(); curr_page++) { + curr_page < physical_size; curr_page++) { buf_block_t* block = buf_page_get( page_id_t(space->id, curr_page), - page_size, RW_S_LATCH, &mtr); + zip_size, RW_S_LATCH, &mtr); page_t* page = buf_block_get_frame(block); ut_ad(buf_is_zeroes(span<const byte>( - page, page_size.physical()))); + page, + physical_size))); } #endif /* UNIV_DEBUG */ ibuf_exit(&mtr); @@ -4959,17 +4908,13 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) return DB_CORRUPTION; } - for (i = FSP_IBUF_BITMAP_OFFSET + 1; - i < page_size.physical(); - i++) { - + for (i = FSP_IBUF_BITMAP_OFFSET + 1; i < physical_size; i++) { const ulint offset = page_no + i; - const page_id_t cur_page_id(space->id, offset); if (ibuf_bitmap_page_get_bits( - bitmap_page, cur_page_id, page_size, - IBUF_BITMAP_IBUF, &mtr)) { + bitmap_page, cur_page_id, zip_size, + IBUF_BITMAP_IBUF, &mtr)) { mutex_exit(&ibuf_mutex); ibuf_exit(&mtr); @@ -4986,7 +4931,7 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) } if (ibuf_bitmap_page_get_bits( - bitmap_page, cur_page_id, page_size, + bitmap_page, cur_page_id, zip_size, IBUF_BITMAP_BUFFERED, &mtr)) { ib_errf(trx->mysql_thd, @@ -5001,7 +4946,8 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) slightly corrupted tables can be imported and dumped. Clear the bit. */ ibuf_bitmap_page_set_bits( - bitmap_page, cur_page_id, page_size, + bitmap_page, cur_page_id, + physical_size, IBUF_BITMAP_BUFFERED, FALSE, &mtr); } } @@ -5031,18 +4977,18 @@ ibuf_set_bitmap_for_bulk_load( free_val = ibuf_index_page_calc_free(block); mtr_start(&mtr); - mtr.set_named_space_id(block->page.id.space()); + fil_space_t* space = mtr.set_named_space_id(block->page.id.space()); bitmap_page = ibuf_bitmap_get_map_page(block->page.id, - block->page.size, &mtr); + space->zip_size(), &mtr); free_val = reset ? 
0 : ibuf_index_page_calc_free(block); ibuf_bitmap_page_set_bits( - bitmap_page, block->page.id, block->page.size, + bitmap_page, block->page.id, block->physical_size(), IBUF_BITMAP_FREE, free_val, &mtr); ibuf_bitmap_page_set_bits( - bitmap_page, block->page.id, block->page.size, + bitmap_page, block->page.id, block->physical_size(), IBUF_BITMAP_BUFFERED, FALSE, &mtr); mtr_commit(&mtr); diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index bcf5904cd09..29382bb033f 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -173,24 +173,19 @@ record is in spatial index */ | BTR_LATCH_FOR_DELETE \ | BTR_MODIFY_EXTERNAL))) -/**************************************************************//** -Report that an index page is corrupted. */ -void -btr_corruption_report( -/*==================*/ - const buf_block_t* block, /*!< in: corrupted block */ - const dict_index_t* index) /*!< in: index tree */ - ATTRIBUTE_COLD __attribute__((nonnull)); +/** Report that an index page is corrupted. +@param[in] buffer block +@param[in] index tree */ +ATTRIBUTE_COLD ATTRIBUTE_NORETURN __attribute__((nonnull)) +void btr_corruption_report(const buf_block_t* block,const dict_index_t* index); /** Assert that a B-tree page is not corrupted. @param block buffer block containing a B-tree page @param index the B-tree index */ -#define btr_assert_not_corrupted(block, index) \ - if ((ibool) !!page_is_comp(buf_block_get_frame(block)) \ - != dict_table_is_comp((index)->table)) { \ - btr_corruption_report(block, index); \ - ut_error; \ - } +#define btr_assert_not_corrupted(block, index) \ + if (!!page_is_comp(buf_block_get_frame(block)) \ + != index->table->not_redundant()) \ + btr_corruption_report(block, index) /**************************************************************//** Gets the root node of a tree and sx-latches it for segment access. 
@@ -225,6 +220,7 @@ btr_height_get( /** Gets a buffer page and declares its latching order level. @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] mode latch mode @param[in] file file name @param[in] line line where called @@ -236,7 +232,7 @@ UNIV_INLINE buf_block_t* btr_block_get_func( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint mode, const char* file, unsigned line, @@ -245,13 +241,13 @@ btr_block_get_func( /** Gets a buffer page and declares its latching order level. @param page_id tablespace/page identifier -@param page_size page size +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param mode latch mode @param index index tree, may be NULL if not the insert buffer tree @param mtr mini-transaction handle @return the block descriptor */ -# define btr_block_get(page_id, page_size, mode, index, mtr) \ - btr_block_get_func(page_id, page_size, mode, \ +# define btr_block_get(page_id, zip_size, mode, index, mtr) \ + btr_block_get_func(page_id, zip_size, mode, \ __FILE__, __LINE__, (dict_index_t*)index, mtr) /**************************************************************//** Gets the index id field of a page. 
@@ -327,40 +323,33 @@ btr_node_ptr_get_child_page_no( @param[in] type type of the index @param[in,out] space tablespace where created @param[in] index_id index id -@param[in] index index, or NULL when applying TRUNCATE -log record during recovery -@param[in] btr_redo_create_info used for applying TRUNCATE log -@param[in] mtr mini-transaction handle -record during recovery -@return page number of the created root, FIL_NULL if did not succeed */ +@param[in] index index +@param[in,out] mtr mini-transaction +@return page number of the created root +@retval FIL_NULL if did not succeed */ ulint btr_create( ulint type, fil_space_t* space, index_id_t index_id, dict_index_t* index, - const btr_create_t* btr_redo_create_info, mtr_t* mtr); /** Free a persistent index tree if it exists. @param[in] page_id root page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] index_id PAGE_INDEX_ID contents @param[in,out] mtr mini-transaction */ void btr_free_if_exists( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, index_id_t index_id, mtr_t* mtr); -/** Free an index tree in a temporary tablespace or during TRUNCATE TABLE. -@param[in] page_id root page id -@param[in] page_size page size */ -void -btr_free( - const page_id_t page_id, - const page_size_t& page_size); +/** Free an index tree in a temporary tablespace. +@param[in] page_id root page id */ +void btr_free(const page_id_t page_id); /** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC. @param[in,out] index clustered index @@ -390,6 +379,12 @@ void btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false) MY_ATTRIBUTE((nonnull)); +/** Write instant ALTER TABLE metadata to a root page. 
+@param[in,out] root clustered index root page +@param[in] index clustered index with instant ALTER TABLE +@param[in,out] mtr mini-transaction */ +void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr); + /*************************************************************//** Makes tree one level higher by splitting the root, and inserts the tuple. It is assumed that mtr contains an x-latch on the tree. @@ -750,21 +745,23 @@ dberr_t btr_validate_index( /*===============*/ dict_index_t* index, /*!< in: index */ - const trx_t* trx, /*!< in: transaction or 0 */ - bool lockout)/*!< in: true if X-latch index is intended */ + const trx_t* trx) /*!< in: transaction or 0 */ MY_ATTRIBUTE((warn_unused_result)); -/*************************************************************//** -Removes a page from the level list of pages. */ -UNIV_INTERN +/** Remove a page from the level list of pages. +@param[in] space space where removed +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] page page to remove +@param[in] index index tree +@param[in,out] mtr mini-transaction */ void btr_level_list_remove_func( -/*=======================*/ - ulint space, /*!< in: space where removed */ - const page_size_t& page_size,/*!< in: page size */ - page_t* page, /*!< in/out: page to remove */ - dict_index_t* index, /*!< in: index tree */ - mtr_t* mtr); /*!< in/out: mini-transaction */ + ulint space, + ulint zip_size, + page_t* page, + dict_index_t* index, + mtr_t* mtr); + /*************************************************************//** Removes a page from the level list of pages. 
@param space in: space where removed @@ -799,5 +796,6 @@ btr_lift_page_up( /**************************************************************** Global variable controlling if scrubbing should be performed */ extern my_bool srv_immediate_scrub_data_uncompressed; +extern Atomic_counter<uint32_t> btr_validate_index_running; #endif diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index 49567979c98..d3827b7dc6f 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2017, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -31,6 +31,7 @@ Created 6/2/1994 Heikki Tuuri /** Gets a buffer page and declares its latching order level. 
@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] mode latch mode @param[in] file file name @param[in] line line where called @@ -42,7 +43,7 @@ UNIV_INLINE buf_block_t* btr_block_get_func( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint mode, const char* file, unsigned line, @@ -53,7 +54,7 @@ btr_block_get_func( dberr_t err=DB_SUCCESS; block = buf_page_get_gen( - page_id, page_size, mode, NULL, BUF_GET, file, line, mtr, &err); + page_id, zip_size, mode, NULL, BUF_GET, file, line, mtr, &err); if (err == DB_DECRYPTION_FAILED) { if (index && index->table) { diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h index be4b55c1a11..46db1a73f70 100644 --- a/storage/innobase/include/btr0bulk.h +++ b/storage/innobase/include/btr0bulk.h @@ -289,8 +289,7 @@ public: ut_ad(!dict_index_is_spatial(index)); #ifdef UNIV_DEBUG if (m_flush_observer) - my_atomic_addlint(&m_index->table->space->redo_skipped_count, - 1); + m_index->table->space->redo_skipped_count++; #endif /* UNIV_DEBUG */ } @@ -299,8 +298,7 @@ public: { #ifdef UNIV_DEBUG if (m_flush_observer) - my_atomic_addlint(&m_index->table->space->redo_skipped_count, - ulint(-1)); + m_index->table->space->redo_skipped_count--; #endif /* UNIV_DEBUG */ } diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index c6f7c846c22..12aaa73ae30 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -729,11 +729,12 @@ btr_free_externally_stored_field( ignored if rec == NULL */ bool rollback, /*!< in: performing rollback? */ mtr_t* local_mtr); /*!< in: mtr containing the latch */ + /** Copies the prefix of an externally stored field of a record. The clustered index record must be protected by a lock or a page latch. 
@param[out] buf the field, or a prefix of it @param[in] len length of buf, in bytes -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] data 'internally' stored part of the field containing also the reference to the external part; must be protected by a lock or a page latch @@ -744,7 +745,7 @@ ulint btr_copy_externally_stored_field_prefix( byte* buf, ulint len, - const page_size_t& page_size, + ulint zip_size, const byte* data, ulint local_len); @@ -754,7 +755,7 @@ The clustered index record must be protected by a lock or a page latch. @param[in] data 'internally' stored part of the field containing also the reference to the external part; must be protected by a lock or a page latch -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] local_len length of data @param[in,out] heap mem heap @return the whole field copied to heap */ @@ -762,7 +763,7 @@ byte* btr_copy_externally_stored_field( ulint* len, const byte* data, - const page_size_t& page_size, + ulint zip_size, ulint local_len, mem_heap_t* heap); @@ -770,7 +771,7 @@ btr_copy_externally_stored_field( @param[in] rec record in a clustered index; must be protected by a lock or a page latch @param[in] offset array returned by rec_get_offsets() -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] no field number @param[out] len length of the field @param[in,out] heap mem heap @@ -779,7 +780,7 @@ byte* btr_rec_copy_externally_stored_field( const rec_t* rec, const rec_offs* offsets, - const page_size_t& page_size, + ulint zip_size, ulint no, ulint* len, mem_heap_t* heap); @@ -810,6 +811,7 @@ btr_rec_set_deleted_flag( /** Latches the leaf page or pages requested. @param[in] block leaf page where the search converged @param[in] page_id page id of the leaf +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] latch_mode BTR_SEARCH_LEAF, ... 
@param[in] cursor cursor @param[in] mtr mini-transaction @@ -818,7 +820,7 @@ btr_latch_leaves_t btr_cur_latch_leaves( buf_block_t* block, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint latch_mode, btr_cur_t* cursor, mtr_t* mtr); @@ -1021,7 +1023,7 @@ inherited external field. */ #define BTR_EXTERN_INHERITED_FLAG 64U /** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */ -extern ulint btr_cur_n_non_sea; +extern Atomic_counter<ulint> btr_cur_n_non_sea; /** Old value of btr_cur_n_non_sea. Copied by srv_refresh_innodb_monitor_stats(). Referenced by srv_printf_innodb_monitor(). */ diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h index 57f8c2f3811..22f29eae3a6 100644 --- a/storage/innobase/include/btr0defragment.h +++ b/storage/innobase/include/btr0defragment.h @@ -26,9 +26,9 @@ this program; if not, write to the Free Software Foundation, Inc., #define BTR_DEFRAGMENT_MAX_N_PAGES 32 /** stats in btr_defragment */ -extern ulint btr_defragment_compression_failures; -extern ulint btr_defragment_failures; -extern ulint btr_defragment_count; +extern Atomic_counter<ulint> btr_defragment_compression_failures; +extern Atomic_counter<ulint> btr_defragment_failures; +extern Atomic_counter<ulint> btr_defragment_count; /** Item in the work queue for btr_degrament_thread. */ struct btr_defragment_item_t diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h index 22e1ef11a68..83c374e2561 100644 --- a/storage/innobase/include/btr0types.h +++ b/storage/innobase/include/btr0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,7 +28,6 @@ Created 2/17/1996 Heikki Tuuri #define btr0types_h #include "page0types.h" -#include "page0size.h" #include "rem0types.h" /** Persistent cursor */ @@ -49,41 +49,11 @@ extern ulong btr_ahi_parts; /** The size of a reference to data stored on a different page. The reference is stored at the end of the prefix of the field in the index record. */ +#define FIELD_REF_SIZE 20U #define BTR_EXTERN_FIELD_REF_SIZE FIELD_REF_SIZE /** If the data don't exceed the size, the data are stored locally. */ #define BTR_EXTERN_LOCAL_STORED_MAX_SIZE \ (BTR_EXTERN_FIELD_REF_SIZE * 2) -/** The information is used for creating a new index tree when -applying TRUNCATE log record during recovery */ -struct btr_create_t { - - explicit btr_create_t(const byte* const ptr) - : - format_flags(), - n_fields(), - field_len(), - fields(ptr), - trx_id_pos(ULINT_UNDEFINED) - { - /* Do nothing */ - } - - /** Page format */ - ulint format_flags; - - /** Numbr of index fields */ - ulint n_fields; - - /** The length of the encoded meta-data */ - ulint field_len; - - /** Field meta-data, encoded. */ - const byte* const fields; - - /** Position of trx-id column. 
*/ - ulint trx_id_pos; -}; - #endif diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h index 5b1aefb4d69..5119a1c58c4 100644 --- a/storage/innobase/include/buf0buddy.h +++ b/storage/innobase/include/buf0buddy.h @@ -26,11 +26,6 @@ Created December 2006 by Marko Makela #ifndef buf0buddy_h #define buf0buddy_h -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE -#endif - #include "buf0types.h" /**********************************************************************//** diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic index dad9cb668dd..39ab46d80dd 100644 --- a/storage/innobase/include/buf0buddy.ic +++ b/storage/innobase/include/buf0buddy.ic @@ -23,11 +23,6 @@ Binary buddy allocator for compressed pages Created December 2006 by Marko Makela *******************************************************/ -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE -#endif - #include "buf0buf.h" #include "buf0buddy.h" @@ -132,8 +127,3 @@ buf_buddy_free( buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size)); } - -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 689427913cd..89d7c71b734 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -42,7 +42,6 @@ Created 11/5/1995 Heikki Tuuri #include "os0proc.h" #include "log0log.h" #include "srv0srv.h" -#include "my_atomic.h" #include <ostream> // Forward declaration @@ -424,16 +423,14 @@ be implemented at a higher level. In other words, all possible accesses to a given page through this function must be protected by the same set of mutexes or latches. 
@param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size @return pointer to the block */ -buf_page_t* -buf_page_get_zip( - const page_id_t page_id, - const page_size_t& page_size); +buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size); /** This is the general function used to get access to a database page. It does page initialization and applies the buffered redo logs. @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, @@ -446,7 +443,7 @@ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH buf_block_t* buf_page_get_gen( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint rw_latch, buf_block_t* guess, ulint mode, @@ -455,8 +452,9 @@ buf_page_get_gen( mtr_t* mtr, dberr_t* err); -/** This is the low level function used to get access to a database page. +/** Low level function used to get access to a database page. @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, @@ -469,7 +467,7 @@ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH buf_block_t* buf_page_get_low( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint rw_latch, buf_block_t* guess, ulint mode, @@ -478,18 +476,18 @@ buf_page_get_low( mtr_t* mtr, dberr_t* err); -/** Initializes a page to the buffer buf_pool. The page is usually not read +/** Initialize a page in the buffer pool. The page is usually not read from a file even if it cannot be found in the buffer buf_pool. This is one of the functions which perform to a block a state transition NOT_USED => FILE_PAGE (the other is buf_page_get_gen). 
@param[in] page_id page id -@param[in] page_size page size -@param[in] mtr mini-transaction +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction @return pointer to the block, page bufferfixed */ buf_block_t* buf_page_create( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, mtr_t* mtr); /********************************************************************//** @@ -625,33 +623,6 @@ buf_block_buf_fix_inc_func( buf_block_t* block) /*!< in/out: block to bufferfix */ MY_ATTRIBUTE((nonnull)); -/** Increments the bufferfix count. -@param[in,out] bpage block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_fix(buf_page_t* bpage); - -/** Increments the bufferfix count. -@param[in,out] block block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_fix(buf_block_t* block); - -/** Decrements the bufferfix count. -@param[in,out] bpage block to bufferunfix -@return the remaining buffer-fix count */ -UNIV_INLINE -ulint -buf_block_unfix(buf_page_t* bpage); -/** Decrements the bufferfix count. -@param[in,out] block block to bufferunfix -@return the remaining buffer-fix count */ -UNIV_INLINE -ulint -buf_block_unfix(buf_block_t* block); - # ifdef UNIV_DEBUG /** Increments the bufferfix count. @param[in,out] b block to bufferfix @@ -711,19 +682,13 @@ buf_page_is_checksum_valid_none( /** Check if a page is corrupt. 
@param[in] check_lsn whether the LSN should be checked @param[in] read_buf database page -@param[in] page_size page size -@param[in] space tablespace +@param[in] fsp_flags tablespace flags @return whether the page is corrupted */ bool buf_page_is_corrupted( bool check_lsn, const byte* read_buf, - const page_size_t& page_size, -#ifndef UNIV_INNOCHECKSUM - const fil_space_t* space = NULL) -#else - const void* space = NULL) -#endif + ulint fsp_flags) MY_ATTRIBUTE((warn_unused_result)); inline void *aligned_malloc(size_t size, size_t align) @@ -747,6 +712,63 @@ inline void aligned_free(void *ptr) #endif } +/** Read the key version from the page. In full crc32 format, +key version is stored at {0-3th} bytes. In other format, it is +stored in 26th position. +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return key version of the page. */ +inline uint32_t buf_page_get_key_version(const byte* read_buf, ulint fsp_flags) +{ + return fil_space_t::full_crc32(fsp_flags) + ? mach_read_from_4(read_buf + FIL_PAGE_FCRC32_KEY_VERSION) + : mach_read_from_4(read_buf + + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); +} + +/** Read the compression info from the page. In full crc32 format, +compression info is at MSB of page type. In other format, it is +stored in page type. +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return true if page is compressed. */ +inline bool buf_page_is_compressed(const byte* read_buf, ulint fsp_flags) +{ + ulint page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE); + return fil_space_t::full_crc32(fsp_flags) + ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) + : page_type == FIL_PAGE_PAGE_COMPRESSED; +} + +/** Get the compressed or uncompressed size of a full_crc32 page. 
+@param[in] buf page_compressed or uncompressed page +@param[out] comp whether the page could be compressed +@param[out] cr whether the page could be corrupted +@return the payload size in the file page */ +inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr) +{ + uint t = mach_read_from_2(buf + FIL_PAGE_TYPE); + uint page_size = uint(srv_page_size); + + if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) { + return page_size; + } + + t &= ~(1U << FIL_PAGE_COMPRESS_FCRC32_MARKER); + t <<= 8; + + if (t < page_size) { + page_size = t; + if (comp) { + *comp = true; + } + } else if (cr) { + *cr = true; + } + + return page_size; +} + #ifndef UNIV_INNOCHECKSUM /**********************************************************************//** Gets the space id, page offset, and byte offset within page of a @@ -808,10 +830,8 @@ buf_print(void); /** Dump a page to stderr. @param[in] read_buf database page -@param[in] page_size page size */ -UNIV_INTERN -void -buf_page_print(const byte* read_buf, const page_size_t& page_size) +@param[in] zip_size compressed page size, or 0 */ +void buf_page_print(const byte* read_buf, ulint zip_size = 0) ATTRIBUTE_COLD __attribute__((nonnull)); /********************************************************************//** Decompress a block. @@ -1170,6 +1190,7 @@ and the lock released later. @param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED @param[in] mode BUF_READ_IBUF_PAGES_ONLY, ... @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] unzip whether the uncompressed page is requested (for ROW_FORMAT=COMPRESSED) @return pointer to the block @@ -1179,7 +1200,7 @@ buf_page_init_for_read( dberr_t* err, ulint mode, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, bool unzip); /** Complete a read or write request of a file page to or from the buffer pool. 
@@ -1390,6 +1411,15 @@ ulint buf_pool_size_align( ulint size); +/** Verify that post encryption checksum match with the calculated checksum. +This function should be called only if tablespace contains crypt data metadata. +@param[in] page page frame +@param[in] fsp_flags tablespace flags +@return true if page is encrypted and OK, false otherwise */ +bool buf_page_verify_crypt_checksum( + const byte* page, + ulint fsp_flags); + /** Calculate the checksum of a page from compressed table and update the page. @param[in,out] page page to update @@ -1410,7 +1440,7 @@ a page is written to disk. (may be src_frame or an encrypted/compressed copy of it) */ UNIV_INTERN byte* -buf_page_encrypt_before_write( +buf_page_encrypt( fil_space_t* space, buf_page_t* bpage, byte* src_frame); @@ -1420,10 +1450,9 @@ buf_page_encrypt_before_write( NOTE! The definition appears here only for other modules of this directory (buf) to see it. Do not use from outside! */ -struct buf_tmp_buffer_t { -private: - int32 reserved; /*!< true if this slot is reserved - */ +class buf_tmp_buffer_t { + /** whether this slot is reserved */ + std::atomic<bool> reserved; public: byte* crypt_buf; /*!< for encryption the data needs to be copied to a separate buffer before it's @@ -1439,16 +1468,14 @@ public: /** Release the slot */ void release() { - my_atomic_store32_explicit(&reserved, false, - MY_MEMORY_ORDER_RELAXED); + reserved.store(false, std::memory_order_relaxed); } /** Acquire the slot @return whether the slot was acquired */ bool acquire() { - return !my_atomic_fas32_explicit(&reserved, true, - MY_MEMORY_ORDER_RELAXED); + return !reserved.exchange(true, std::memory_order_relaxed); } }; @@ -1474,11 +1501,8 @@ public: buf_pool->page_hash or buf_pool->zip_hash */ - /** Page size. Protected by buf_pool mutex. */ - page_size_t size; - /** Count of how manyfold this block is currently bufferfixed. 
*/ - int32 buf_fix_count; + Atomic_counter<uint32_t> buf_fix_count; /** type of pending I/O operation; also protected by buf_pool->mutex for writes only */ @@ -1623,6 +1647,27 @@ public: protected by buf_pool->zip_mutex or buf_block_t::mutex. */ # endif /* UNIV_DEBUG */ + + void fix() { buf_fix_count++; } + uint32_t unfix() + { + uint32_t count= buf_fix_count--; + ut_ad(count != 0); + return count - 1; + } + + /** @return the physical size, in bytes */ + ulint physical_size() const + { + return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size; + } + + /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes + @retval 0 if not compressed */ + ulint zip_size() const + { + return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0; + } }; /** The buffer control block structure */ @@ -1729,20 +1774,20 @@ struct buf_block_t{ /* @{ */ # if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - ulint n_pointers; /*!< used in debugging: the number of + Atomic_counter<ulint> + n_pointers; /*!< used in debugging: the number of pointers in the adaptive hash index pointing to this frame; protected by atomic memory access or btr_search_own_all(). 
*/ # define assert_block_ahi_empty(block) \ - ut_a(my_atomic_addlint(&(block)->n_pointers, 0) == 0) + ut_a((block)->n_pointers == 0) # define assert_block_ahi_empty_on_init(block) do { \ MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \ assert_block_ahi_empty(block); \ } while (0) # define assert_block_ahi_valid(block) \ - ut_a((block)->index \ - || my_atomic_loadlint(&(block)->n_pointers) == 0) + ut_a((block)->index || (block)->n_pointers == 0) # else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ # define assert_block_ahi_empty(block) /* nothing */ # define assert_block_ahi_empty_on_init(block) /* nothing */ @@ -1774,7 +1819,7 @@ struct buf_block_t{ # ifdef UNIV_DEBUG /** @name Debug fields */ /* @{ */ - rw_lock_t debug_latch; /*!< in the debug version, each thread + rw_lock_t* debug_latch; /*!< in the debug version, each thread which bufferfixes the block acquires an s-latch here; so we can use the debug utilities in sync0rw */ @@ -1786,6 +1831,16 @@ struct buf_block_t{ and accessed; we introduce this new mutex in InnoDB-5.1 to relieve contention on the buffer pool mutex */ + + void fix() { page.fix(); } + uint32_t unfix() { return page.unfix(); } + + /** @return the physical size, in bytes */ + ulint physical_size() const { return page.physical_size(); } + + /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes + @retval 0 if not compressed */ + ulint zip_size() const { return page.zip_size(); } }; /** Check if a buf_block_t object is in a valid state @@ -1877,13 +1932,13 @@ public: HazardPointer(buf_pool, mutex) {} /** Destructor */ - virtual ~FlushHp() {} + ~FlushHp() override {} /** Adjust the value of hp. This happens when some other thread working on the same list attempts to remove the hp from the list. 
@param bpage buffer block to be compared */ - void adjust(const buf_page_t* bpage); + void adjust(const buf_page_t* bpage) override; }; /** Class implementing buf_pool->LRU hazard pointer */ @@ -1898,13 +1953,13 @@ public: HazardPointer(buf_pool, mutex) {} /** Destructor */ - virtual ~LRUHp() {} + ~LRUHp() override {} /** Adjust the value of hp. This happens when some other thread working on the same list attempts to remove the hp from the list. @param bpage buffer block to be compared */ - void adjust(const buf_page_t* bpage); + void adjust(const buf_page_t* bpage) override; }; /** Special purpose iterators to be used when scanning the LRU list. @@ -1922,7 +1977,7 @@ public: LRUHp(buf_pool, mutex) {} /** Destructor */ - virtual ~LRUItr() {} + ~LRUItr() override {} /** Selects from where to start a scan. If we have scanned too deep into the LRU list it resets the value to the tail @@ -1990,17 +2045,6 @@ struct buf_buddy_stat_t { ib_uint64_t relocated_usec; }; -/** @brief The temporary memory array structure. - -NOTE! The definition appears here only for other modules of this -directory (buf) to see it. Do not use from outside! */ - -typedef struct { - ulint n_slots; /*!< Total number of slots */ - buf_tmp_buffer_t *slots; /*!< Pointer to the slots in the - array */ -} buf_tmp_array_t; - /** @brief The buffer pool structure. NOTE! 
The definition appears here only for other modules of this @@ -2060,7 +2104,8 @@ struct buf_pool_t{ indexed by block->frame */ ulint n_pend_reads; /*!< number of pending read operations */ - ulint n_pend_unzip; /*!< number of pending decompressions */ + Atomic_counter<ulint> + n_pend_unzip; /*!< number of pending decompressions */ time_t last_printout_time; /*!< when buf_print_io was last time @@ -2201,20 +2246,47 @@ struct buf_pool_t{ #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX]; /*!< buddy free lists */ +#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN +# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN" +#endif + /* @} */ buf_page_t* watch; /*!< Sentinel records for buffer pool watches. Protected by buf_pool->mutex. */ - buf_tmp_array_t* tmp_arr; - /*!< Array for temporal memory - used in compression and encryption */ - -#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN -# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN" -#endif - /* @} */ + /** Temporary memory for page_compressed and encrypted I/O */ + struct io_buf_t { + /** number of elements in slots[] */ + const ulint n_slots; + /** array of slots */ + buf_tmp_buffer_t* const slots; + + io_buf_t() = delete; + + /** Constructor */ + explicit io_buf_t(ulint n_slots) : + n_slots(n_slots), + slots(static_cast<buf_tmp_buffer_t*>( + ut_malloc_nokey(n_slots + * sizeof *slots))) + { + memset((void*) slots, 0, n_slots * sizeof *slots); + } + + ~io_buf_t(); + + /** Reserve a buffer */ + buf_tmp_buffer_t* reserve() + { + for (buf_tmp_buffer_t* s = slots, *e = slots + n_slots; + s != e; s++) { + if (s->acquire()) return s; + } + return NULL; + } + } io_buf; }; /** Print the given buf_pool_t object. 
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index f331091a1d7..7d11e2b4cc0 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -955,49 +955,6 @@ buf_block_get_modify_clock( return(block->modify_clock); } -/** Increments the bufferfix count. -@param[in,out] bpage block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_fix(buf_page_t* bpage) -{ - return uint32(my_atomic_add32_explicit( - &bpage->buf_fix_count, 1, - MY_MEMORY_ORDER_RELAXED)) + 1; -} - -/** Increments the bufferfix count. -@param[in,out] block block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_fix(buf_block_t* block) -{ - return buf_block_fix(&block->page); -} - -/** Get the bufferfix count. -@param[in] bpage block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_get_fix(buf_page_t* bpage) -{ - return my_atomic_load32_explicit(&bpage->buf_fix_count, - MY_MEMORY_ORDER_RELAXED); -} - -/** Get the bufferfix count. -@param[in] bpage block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_get_fix(buf_block_t* block) -{ - return buf_block_get_fix(&block->page); -} - /*******************************************************************//** Increments the bufferfix count. */ UNIV_INLINE @@ -1016,36 +973,12 @@ buf_block_buf_fix_inc_func( threaded. */ if (!fsp_is_system_temporary(block->page.id.space())) { ibool ret; - ret = rw_lock_s_lock_nowait(&block->debug_latch, file, line); + ret = rw_lock_s_lock_nowait(block->debug_latch, file, line); ut_a(ret); } #endif /* UNIV_DEBUG */ - buf_block_fix(block); -} - -/** Decrements the bufferfix count. 
-@param[in,out] bpage block to bufferunfix -@return the remaining buffer-fix count */ -UNIV_INLINE -ulint -buf_block_unfix(buf_page_t* bpage) -{ - uint32 count = uint32(my_atomic_add32_explicit( - &bpage->buf_fix_count, - -1, MY_MEMORY_ORDER_RELAXED)); - ut_ad(count != 0); - return count - 1; -} - -/** Decrements the bufferfix count. -@param[in,out] block block to bufferunfix -@return the remaining buffer-fix count */ -UNIV_INLINE -ulint -buf_block_unfix(buf_block_t* block) -{ - return buf_block_unfix(&block->page); + block->fix(); } /*******************************************************************//** @@ -1056,14 +989,14 @@ buf_block_buf_fix_dec( /*==================*/ buf_block_t* block) /*!< in/out: block to bufferunfix */ { - buf_block_unfix(block); + block->unfix(); #ifdef UNIV_DEBUG /* No debug latch is acquired if block belongs to system temporary. Debug latch is not of much help if access to block is single threaded. */ if (!fsp_is_system_temporary(block->page.id.space())) { - rw_lock_s_unlock(&block->debug_latch); + rw_lock_s_unlock(block->debug_latch); } #endif /* UNIV_DEBUG */ } @@ -1306,14 +1239,14 @@ buf_page_release_zip( is single threaded. */ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); if (!fsp_is_system_temporary(block->page.id.space())) { - rw_lock_s_unlock(&block->debug_latch); + rw_lock_s_unlock(block->debug_latch); } } #endif /* UNIV_DEBUG */ /* Fall through */ case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: - buf_block_unfix(reinterpret_cast<buf_block_t*>(bpage)); + reinterpret_cast<buf_block_t*>(bpage)->unfix(); return; case BUF_BLOCK_POOL_WATCH: @@ -1342,7 +1275,7 @@ buf_page_release_latch( temporary. Debug latch is not of much help if access to block is single threaded. 
*/ if (!fsp_is_system_temporary(block->page.id.space())) { - rw_lock_s_unlock(&block->debug_latch); + rw_lock_s_unlock(block->debug_latch); } #endif /* UNIV_DEBUG */ diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h index ce39e290ac7..8dc25f91d59 100644 --- a/storage/innobase/include/buf0checksum.h +++ b/storage/innobase/include/buf0checksum.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,18 +29,6 @@ Created Aug 11, 2011 Vasil Dimov #include "buf0types.h" -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Calculate the CRC32 checksum of a page. The value is stored to the page -when it is written to a file and also checked for a match when reading from -the file. Note that we must be careful to calculate the same value on all -architectures. -@param[in] page buffer page (srv_page_size bytes) -@param[in] bug_endian whether to use big endian byteorder -when converting byte strings to integers, for bug-compatibility with -big-endian architecture running MySQL 5.6, MariaDB 10.0 or MariaDB 10.1 -@return CRC-32C */ -uint32_t buf_calc_page_crc32(const byte* page, bool bug_endian = false); -#else /** Calculate the CRC32 checksum of a page. The value is stored to the page when it is written to a file and also checked for a match when reading from the file. Note that we must be careful to calculate the same value on all @@ -48,7 +36,6 @@ architectures. @param[in] page buffer page (srv_page_size bytes) @return CRC-32C */ uint32_t buf_calc_page_crc32(const byte* page); -#endif /** Calculate a checksum which is stored to the page when it is written to a file. 
Note that we must be careful to calculate the same value on diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index a0122d1c3f8..e022dd55215 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -73,17 +73,24 @@ buf_flush_relocate_on_flush_list( @param[in,out] bpage flushed page @param[in] dblwr whether the doublewrite buffer was used */ void buf_flush_write_complete(buf_page_t* bpage, bool dblwr); + +/** Assign the full crc32 checksum for non-compressed page. +@param[in,out] page page to be updated */ +void buf_flush_assign_full_crc32_checksum(byte* page); + /** Initialize a page for writing to the tablespace. -@param[in] block buffer block; NULL if bypassing the buffer pool -@param[in,out] page page frame -@param[in,out] page_zip_ compressed page, or NULL if uncompressed -@param[in] newest_lsn newest modification LSN to the page */ +@param[in] block buffer block; NULL if bypassing the buffer pool +@param[in,out] page page frame +@param[in,out] page_zip_ compressed page, or NULL if uncompressed +@param[in] newest_lsn newest modification LSN to the page +@param[in] use_full_checksum whether tablespace uses full checksum */ void buf_flush_init_for_writing( const buf_block_t* block, byte* page, void* page_zip_, - lsn_t newest_lsn); + lsn_t newest_lsn, + bool use_full_checksum); # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG /********************************************************************//** @@ -181,18 +188,6 @@ buf_flush_note_modification( lsn_t end_lsn, /*!< in: end lsn of the last mtr in the set of mtr's */ FlushObserver* observer); /*!< in: flush observer */ - -/********************************************************************//** -This function should be called when recovery has modified a buffer page. 
*/ -UNIV_INLINE -void -buf_flush_recv_note_modification( -/*=============================*/ - buf_block_t* block, /*!< in: block which is modified */ - lsn_t start_lsn, /*!< in: start lsn of the first mtr in a - set of mtr's */ - lsn_t end_lsn); /*!< in: end lsn of the last mtr in the - set of mtr's */ /********************************************************************//** Returns TRUE if the file page block is immediately suitable for replacement, i.e., transition FILE_PAGE => NOT_USED allowed. diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic index 8d06a53c547..02f3d8ced57 100644 --- a/storage/innobase/include/buf0flu.ic +++ b/storage/innobase/include/buf0flu.ic @@ -38,17 +38,6 @@ buf_flush_insert_into_flush_list( lsn_t lsn); /*!< in: oldest modification */ /********************************************************************//** -Inserts a modified block into the flush list in the right sorted position. -This function is used by recovery, because there the modifications do not -necessarily come in the order of lsn's. */ -void -buf_flush_insert_sorted_into_flush_list( -/*====================================*/ - buf_pool_t* buf_pool, /*!< buffer pool instance */ - buf_block_t* block, /*!< in/out: block which is modified */ - lsn_t lsn); /*!< in: oldest modification */ - -/********************************************************************//** This function should be called at a mini-transaction commit, if a page was modified in it. Puts the block to the list of modified blocks, if it is not already in it. */ @@ -63,24 +52,11 @@ buf_flush_note_modification( modified this block */ FlushObserver* observer) /*!< in: flush observer */ { -#ifdef UNIV_DEBUG - { - /* Allow write to proceed to shared temporary tablespace - in read-only mode. 
*/ - ut_ad(!srv_read_only_mode - || fsp_is_system_temporary(block->page.id.space())); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(block->page.buf_fix_count > 0); - - buf_pool_t* buf_pool = buf_pool_from_block(block); - - ut_ad(!buf_pool_mutex_own(buf_pool)); - ut_ad(!buf_flush_list_mutex_own(buf_pool)); - } -#endif /* UNIV_DEBUG */ - mutex_enter(&block->mutex); - + ut_ad(!srv_read_only_mode + || fsp_is_system_temporary(block->page.id.space())); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); ut_ad(block->page.newest_modification <= end_lsn); block->page.newest_modification = end_lsn; @@ -98,52 +74,7 @@ buf_flush_note_modification( ut_ad(block->page.oldest_modification <= start_lsn); } - buf_page_mutex_exit(block); + mutex_exit(&block->mutex); srv_stats.buf_pool_write_requests.inc(); } - -/********************************************************************//** -This function should be called when recovery has modified a buffer page. 
*/ -UNIV_INLINE -void -buf_flush_recv_note_modification( -/*=============================*/ - buf_block_t* block, /*!< in: block which is modified */ - lsn_t start_lsn, /*!< in: start lsn of the first mtr in a - set of mtr's */ - lsn_t end_lsn) /*!< in: end lsn of the last mtr in the - set of mtr's */ -{ -#ifdef UNIV_DEBUG - { - ut_ad(!srv_read_only_mode); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(block->page.buf_fix_count > 0); - - buf_pool_t* buf_pool = buf_pool_from_block(block); - - ut_ad(!buf_pool_mutex_own(buf_pool)); - ut_ad(!buf_flush_list_mutex_own(buf_pool)); - - ut_ad(start_lsn != 0); - ut_ad(block->page.newest_modification <= end_lsn); - } -#endif /* UNIV_DEBUG */ - - buf_page_mutex_enter(block); - - block->page.newest_modification = end_lsn; - - if (!block->page.oldest_modification) { - buf_pool_t* buf_pool = buf_pool_from_block(block); - - buf_flush_insert_sorted_into_flush_list( - buf_pool, block, start_lsn); - } else { - ut_ad(block->page.oldest_modification <= start_lsn); - } - - buf_page_mutex_exit(block); - -} diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h index e590d818334..ff0ba474bb3 100644 --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2017, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -34,30 +34,23 @@ buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. 
@param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @retval DB_SUCCESS if the page was read and is not corrupted, @retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ -dberr_t -buf_read_page( - const page_id_t page_id, - const page_size_t& page_size); +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size); -/********************************************************************//** -High-level function which reads a page asynchronously from a file to the +/** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] sync true if synchronous aio is desired */ void -buf_read_page_background( - const page_id_t page_id, - const page_size_t& page_size, - bool sync); +buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync); /** Applies a random read-ahead in buf_pool if there are at least a threshold value of accessed pages from the random read-ahead area. Does not read any @@ -70,16 +63,13 @@ performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous i/o. 
@param[in] page_id page id of a page which the current thread wants to access -@param[in] page_size page size -@param[in] inside_ibuf TRUE if we are inside ibuf routine +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine @return number of page read requests issued; NOTE that if we read ibuf pages, it may happen that the page at the given page number does not get read even if we return a positive value! */ ulint -buf_read_ahead_random( - const page_id_t page_id, - const page_size_t& page_size, - ibool inside_ibuf); +buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf); /** Applies linear read-ahead if in the buf_pool the page is a border page of a linear read-ahead area and all the pages in the area have been accessed. @@ -104,14 +94,11 @@ NOTE 3: the calling thread must want access to the page given: this rule is set to prevent unintended read-aheads performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous io. 
@param[in] page_id page id; see NOTE 3 above -@param[in] page_size page size -@param[in] inside_ibuf TRUE if we are inside ibuf routine +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine @return number of page read requests issued */ ulint -buf_read_ahead_linear( - const page_id_t page_id, - const page_size_t& page_size, - ibool inside_ibuf); +buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf); /********************************************************************//** Issues read requests for pages which the ibuf module wants to read in, in diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index bd5e26df47b..5532a524782 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -82,8 +83,16 @@ enum srv_checksum_algorithm_t { innodb when reading */ SRV_CHECKSUM_ALGORITHM_NONE, /*!< Write none, allow crc32, innodb or none when reading */ - SRV_CHECKSUM_ALGORITHM_STRICT_NONE /*!< Write none, allow none + SRV_CHECKSUM_ALGORITHM_STRICT_NONE, /*!< Write none, allow none when reading */ + + /** For new files, always compute CRC-32C for the whole page. + For old files, allow crc32, innodb or none when reading. */ + SRV_CHECKSUM_ALGORITHM_FULL_CRC32, + + /** For new files, always compute CRC-32C for the whole page. + For old files, allow crc32 when reading. 
*/ + SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32 }; inline @@ -202,6 +211,12 @@ private: const page_id_t page_id); }; +/** A field reference full of zero, for use in assertions and checks, +and dummy default values of instantly dropped columns. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). */ +extern const byte field_ref_zero[UNIV_PAGE_SIZE_MAX]; + #ifndef UNIV_INNOCHECKSUM #include "ut0mutex.h" diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h index 11a7f2e516f..04ddf5b0a42 100644 --- a/storage/innobase/include/data0data.h +++ b/storage/innobase/include/data0data.h @@ -543,6 +543,33 @@ struct dtuple_t { inserted or updated. @param[in] index index possibly with instantly added columns */ void trim(const dict_index_t& index); + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + static bool is_alter_metadata(ulint info_bits) + { + return UNIV_UNLIKELY(info_bits == REC_INFO_METADATA_ALTER); + } + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + static bool is_metadata(ulint info_bits) + { + return UNIV_UNLIKELY((info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_METADATA_ADD); + } + + /** @return whether this is a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const { return is_alter_metadata(info_bits); } + + /** @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + bool is_metadata() const { return is_metadata(info_bits); } }; inline ulint dtuple_get_n_fields(const dtuple_t* tuple) diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h index 740a1b83aca..0e496085113 100644 --- a/storage/innobase/include/data0type.h +++ b/storage/innobase/include/data0type.h @@ -262,35 
+262,31 @@ dtype_get_at_most_n_mbchars( ulint data_len, /*!< in: length of str (in bytes) */ const char* str); /*!< in: the string whose prefix length is being determined */ -/*********************************************************************//** -Checks if a data main type is a string type. Also a BLOB is considered a -string type. -@return TRUE if string type */ -ibool -dtype_is_string_type( -/*=================*/ - ulint mtype); /*!< in: InnoDB main data type code: DATA_CHAR, ... */ -/*********************************************************************//** -Checks if a type is a binary string type. Note that for tables created with -< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For -those DATA_BLOB columns this function currently returns FALSE. -@return TRUE if binary string type */ -ibool -dtype_is_binary_string_type( -/*========================*/ - ulint mtype, /*!< in: main data type */ - ulint prtype);/*!< in: precise type */ -/*********************************************************************//** -Checks if a type is a non-binary string type. That is, dtype_is_string_type is -TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created -with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. -For those DATA_BLOB columns this function currently returns TRUE. -@return TRUE if non-binary string type */ -ibool -dtype_is_non_binary_string_type( -/*============================*/ - ulint mtype, /*!< in: main data type */ - ulint prtype);/*!< in: precise type */ +/** @return whether main type is a string type */ +inline bool dtype_is_string_type(ulint mtype) +{ + return mtype <= DATA_BLOB + || mtype == DATA_MYSQL || mtype == DATA_VARMYSQL; +} + +/** @return whether a type is a binary string type */ +inline bool dtype_is_binary_string_type(ulint mtype, ulint prtype) +{ + /* Note that for tables created before MySQL 4.0.14, + we do not know if a DATA_BLOB column is a BLOB or a TEXT column. 
+ For those DATA_BLOB columns we return false. */ + + return mtype == DATA_FIXBINARY || mtype == DATA_BINARY + || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE)); +} + +/** @return whether a type is a non-binary string type */ +inline bool dtype_is_non_binary_string_type(ulint mtype, ulint prtype) +{ + return dtype_is_string_type(mtype) + && !dtype_is_binary_string_type(mtype, prtype); +} + /*********************************************************************//** Sets a data type structure. */ UNIV_INLINE @@ -338,14 +334,15 @@ dtype_get_mblen( multi-byte character */ ulint* mbmaxlen); /*!< out: maximum length of a multi-byte character */ -/*********************************************************************//** -Gets the MySQL charset-collation code for MySQL string types. -@return MySQL charset-collation code */ -UNIV_INLINE -ulint -dtype_get_charset_coll( -/*===================*/ - ulint prtype);/*!< in: precise data type */ +/** +Get the charset-collation code for string types. +@param prtype InnoDB precise type +@return charset-collation code */ +inline uint16_t dtype_get_charset_coll(ulint prtype) +{ + return static_cast<uint16_t>(prtype >> 16) & CHAR_COLL_MASK; +} + /** Form a precise type from the < 4.1.2 format precise type plus the charset-collation code. @param[in] old_prtype MySQL type code and the flags @@ -554,11 +551,55 @@ struct dtype_t{ { return (prtype & DATA_VERSIONED) == DATA_VERS_END; } + + /** Set the type of the BLOB in the hidden metadata record. 
*/ + void metadata_blob_init() + { + prtype = DATA_NOT_NULL; + mtype = DATA_BLOB; + len = 0; + mbminlen = 0; + mbmaxlen = 0; + } }; /** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; +/** Info bit denoting the predefined minimum record: this bit is set +if and only if the record is the first user record on a non-leaf +B-tree page that is the leftmost page on its level +(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */ +#define REC_INFO_MIN_REC_FLAG 0x10UL +/** The delete-mark flag in info bits */ +#define REC_INFO_DELETED_FLAG 0x20UL + +/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */ +enum rec_comp_status_t { + /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_ORDINARY = 0, + /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_NODE_PTR = 1, + /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */ + REC_STATUS_INFIMUM = 2, + /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */ + REC_STATUS_SUPREMUM = 3, + /** Clustered index record that has been inserted or updated + after instant ADD COLUMN (more than dict_index_t::n_core_fields) */ + REC_STATUS_INSTANT = 4 +}; + +/** The dtuple_t::info_bits of the hidden metadata of instant ADD COLUMN. +@see rec_is_metadata() +@see rec_is_alter_metadata() */ +static const byte REC_INFO_METADATA_ADD + = REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT; + +/** The dtuple_t::info_bits of the hidden metadata of instant ALTER TABLE. 
+@see rec_is_metadata() */ +static const byte REC_INFO_METADATA_ALTER + = REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG; + #include "data0type.ic" #endif diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic index f2c499716ce..037a71a9345 100644 --- a/storage/innobase/include/data0type.ic +++ b/storage/innobase/include/data0type.ic @@ -28,18 +28,6 @@ Created 1/16/1996 Heikki Tuuri #include "ha_prototypes.h" /*********************************************************************//** -Gets the MySQL charset-collation code for MySQL string types. -@return MySQL charset-collation code */ -UNIV_INLINE -ulint -dtype_get_charset_coll( -/*===================*/ - ulint prtype) /*!< in: precise data type */ -{ - return((prtype >> 16) & CHAR_COLL_MASK); -} - -/*********************************************************************//** Determines if a MySQL string type is a subset of UTF-8. This function may return false negatives, in case further character-set collation codes are introduced in MySQL later. diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h index f70a65890c9..6cfc63f4a9e 100644 --- a/storage/innobase/include/db0err.h +++ b/storage/innobase/include/db0err.h @@ -136,8 +136,6 @@ enum dberr_t { DB_FTS_TOO_MANY_WORDS_IN_PHRASE, /*< Too many words in a phrase */ - DB_TABLESPACE_TRUNCATED, /*!< tablespace was truncated */ - DB_DECRYPTION_FAILED, /* Tablespace encrypted and decrypt operation failed because of missing key management plugin, diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h index 4853d5ad73f..778471b77ae 100644 --- a/storage/innobase/include/dict0boot.h +++ b/storage/innobase/include/dict0boot.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -50,12 +51,8 @@ dict_hdr_get_new_id( (not assigned if NULL) */ index_id_t* index_id, /*!< out: index id (not assigned if NULL) */ - ulint* space_id, /*!< out: space id + ulint* space_id); /*!< out: space id (not assigned if NULL) */ - const dict_table_t* table, /*!< in: table */ - bool disable_redo); /*!< in: if true and table - object is NULL - then disable-redo */ /**********************************************************************//** Writes the current value of the row id counter to the dictionary header file page. */ @@ -124,13 +121,6 @@ dict_is_sys_table( /* The following is a secondary index on SYS_TABLES */ #define DICT_TABLE_IDS_ID 5 -#define DICT_HDR_FIRST_ID 10 /* the ids for tables etc. start - from this number, except for basic - system tables and their above defined - indexes; ibuf tables and indexes are - assigned as the id the number - DICT_IBUF_ID_MIN plus the space id */ - /* The offset of the dictionary header on the page */ #define DICT_HDR FSEG_PAGE_DATA diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic index dacfcd58b53..7b0a2fd0b86 100644 --- a/storage/innobase/include/dict0boot.ic +++ b/storage/innobase/include/dict0boot.ic @@ -33,18 +33,18 @@ dict_sys_get_new_row_id(void) { row_id_t id; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); - id = dict_sys->row_id; + id = dict_sys.row_id; if (0 == (id % DICT_HDR_ROW_ID_WRITE_MARGIN)) { dict_hdr_flush_row_id(); } - dict_sys->row_id++; + dict_sys.row_id++; - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return(id); } diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h index 8ab987cd39a..92f55ce4a14 100644 --- a/storage/innobase/include/dict0crea.h +++ b/storage/innobase/include/dict0crea.h @@ -67,14 +67,6 @@ dict_create_table_step( 
/*===================*/ que_thr_t* thr); /*!< in: query thread */ -/** Assign a new table ID and put it into the table cache and the transaction. -@param[in,out] table Table that needs an ID -@param[in,out] trx Transaction */ -void -dict_table_assign_new_id( - dict_table_t* table, - trx_t* trx); - /***********************************************************//** Creates an index. This is a high-level function used in SQL execution graphs. @@ -104,29 +96,12 @@ dict_create_index_tree( dict_index_t* index, /*!< in/out: index */ const trx_t* trx); /*!< in: InnoDB transaction handle */ -/*******************************************************************//** -Recreate the index tree associated with a row in SYS_INDEXES table. -@return new root page number, or FIL_NULL on failure */ -ulint -dict_recreate_index_tree( -/*======================*/ - const dict_table_t* table, /*!< in: the table the index - belongs to */ - btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing - to record in the clustered index of - SYS_INDEXES table. The cursor may be - repositioned in this call. */ - mtr_t* mtr); /*!< in: mtr having the latch - on the record page. The mtr may be - committed and restarted in this call. */ - /** Drop the index tree associated with a row in SYS_INDEXES table. 
@param[in,out] rec SYS_INDEXES record @param[in,out] pcur persistent cursor on rec @param[in,out] trx dictionary transaction -@param[in,out] mtr mini-transaction -@return whether freeing the B-tree was attempted */ -bool dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) +@param[in,out] mtr mini-transaction */ +void dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) MY_ATTRIBUTE((nonnull)); /***************************************************************//** diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 9f5485bb15c..35309fc1b54 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -32,12 +32,13 @@ Created 1/8/1996 Heikki Tuuri #include "dict0mem.h" #include "fsp0fsp.h" #include <deque> -#include "dict0pagecompress.h" extern bool innodb_table_stats_not_found; extern bool innodb_index_stats_not_found; -#include "sync0rw.h" +/** the first table or index ID for other than hard-coded system tables */ +constexpr uint8_t DICT_HDR_FIRST_ID= 10; + /********************************************************************//** Get the database name length in a table name. @return database name length */ @@ -130,19 +131,14 @@ dict_table_close( MY_ATTRIBUTE((nonnull)); /*********************************************************************//** Closes the only open handle to a table and drops a table while assuring -that dict_sys->mutex is held the whole time. This assures that the table +that dict_sys.mutex is held the whole time. This assures that the table is not evicted after the close when the count of open handles goes to zero. -Because dict_sys->mutex is held, we do not need to call -dict_table_prevent_eviction(). */ +Because dict_sys.mutex is held, we do not need to call prevent_eviction(). 
*/ void dict_table_close_and_drop( /*======================*/ trx_t* trx, /*!< in: data dictionary transaction */ dict_table_t* table); /*!< in/out: table */ -/**********************************************************************//** -Inits the data dictionary module. */ -void -dict_init(void); /*********************************************************************//** Gets the minimum number of bytes per character. @@ -287,13 +283,6 @@ dict_col_name_is_reserved( /*======================*/ const char* name) /*!< in: column name */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/********************************************************************//** -Acquire the autoinc lock. */ -void -dict_table_autoinc_lock( -/*====================*/ - dict_table_t* table) /*!< in/out: table */ - MY_ATTRIBUTE((nonnull)); /** Unconditionally set the AUTO_INCREMENT counter. @param[in,out] table table or partition @param[in] value next available AUTO_INCREMENT value */ @@ -302,7 +291,7 @@ UNIV_INLINE void dict_table_autoinc_initialize(dict_table_t* table, ib_uint64_t value) { - ut_ad(dict_table_autoinc_own(table)); + ut_ad(mutex_own(&table->autoinc_mutex)); table->autoinc = value; } @@ -315,7 +304,7 @@ UNIV_INLINE ib_uint64_t dict_table_autoinc_read(const dict_table_t* table) { - ut_ad(dict_table_autoinc_own(table)); + ut_ad(mutex_own(&table->autoinc_mutex)); return(table->autoinc); } @@ -329,7 +318,7 @@ UNIV_INLINE bool dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value) { - ut_ad(dict_table_autoinc_own(table)); + ut_ad(mutex_own(&table->autoinc_mutex)); if (value > table->autoinc) { @@ -340,13 +329,6 @@ dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value) return(false); } -/********************************************************************//** -Release the autoinc lock. 
*/ -void -dict_table_autoinc_unlock( -/*======================*/ - dict_table_t* table) /*!< in/out: table */ - MY_ATTRIBUTE((nonnull)); /**********************************************************************//** Adds system columns to a table object. */ void @@ -356,22 +338,6 @@ dict_table_add_system_columns( mem_heap_t* heap) /*!< in: temporary heap */ MY_ATTRIBUTE((nonnull)); /**********************************************************************//** -Removes a table object from the dictionary cache. */ -void -dict_table_remove_from_cache( -/*=========================*/ - dict_table_t* table) /*!< in, own: table */ - MY_ATTRIBUTE((nonnull)); -/**********************************************************************//** -Removes a table object from the dictionary cache. */ -void -dict_table_remove_from_cache_low( -/*=============================*/ - dict_table_t* table, /*!< in, own: table */ - ibool lru_evict) /*!< in: TRUE if table being evicted - to make room in the table LRU list */ - MY_ATTRIBUTE((nonnull)); -/**********************************************************************//** Renames a table object. @return TRUE if success */ dberr_t @@ -694,65 +660,14 @@ do { \ dict_table_skip_corrupt_index(index); \ } while (0) -/********************************************************************//** -Check whether the index is the clustered index. -@return nonzero for clustered index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_clust( -/*================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); - -/** Check if index is auto-generated clustered index. -@param[in] index index - -@return true if index is auto-generated clustered index. */ -UNIV_INLINE -bool -dict_index_is_auto_gen_clust( - const dict_index_t* index); - -/********************************************************************//** -Check whether the index is unique. 
-@return nonzero for unique index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_unique( -/*=================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); -/********************************************************************//** -Check whether the index is a Spatial Index. -@return nonzero for Spatial Index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_spatial( -/*==================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); - +#define dict_index_is_clust(index) (index)->is_clust() +#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust() +#define dict_index_is_unique(index) (index)->is_unique() +#define dict_index_is_spatial(index) (index)->is_spatial() +#define dict_index_is_ibuf(index) (index)->is_ibuf() +#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary() #define dict_index_has_virtual(index) (index)->has_virtual() -/********************************************************************//** -Check whether the index is the insert buffer tree. -@return nonzero for insert buffer, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_ibuf( -/*===============*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); -/********************************************************************//** -Check whether the index is a secondary index or the insert buffer tree. -@return nonzero for insert buffer, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_sec_or_ibuf( -/*======================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); - /** Get all the FTS indexes on a table. 
@param[in] table table @param[out] indexes all FTS indexes on this table @@ -908,15 +823,8 @@ dict_index_get_min_size( /*====================*/ const dict_index_t* index) /*!< in: index */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/********************************************************************//** -Check whether the table uses the compact page format. -@return TRUE if table uses the compact page format */ -UNIV_INLINE -bool -dict_table_is_comp( -/*===============*/ - const dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define dict_table_is_comp(table) (table)->not_redundant() /** Determine if a table uses atomic BLOBs (no locally stored prefix). @param[in] table InnoDB table @@ -928,6 +836,18 @@ dict_table_has_atomic_blobs(const dict_table_t* table) return(DICT_TF_HAS_ATOMIC_BLOBS(table->flags)); } +/** @return potential max length stored inline for externally stored fields */ +inline size_t dict_table_t::get_overflow_field_local_len() const +{ + if (dict_table_has_atomic_blobs(this)) { + /* ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED: do not + store any BLOB prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE; + } + /* up to MySQL 5.1: store a 768-byte prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN; +} + /** Set the various values in a dict_table_t::flags pointer. @param[in,out] flags, Pointer to a 4 byte Table Flags @param[in] format, File Format @@ -962,44 +882,34 @@ ulint dict_tf_to_fsp_flags(ulint table_flags) MY_ATTRIBUTE((const)); -/** Extract the page size from table flags. + +/** Extract the ROW_FORMAT=COMPRESSED page size from table flags. 
@param[in] flags flags -@return compressed page size, or 0 if not compressed */ -UNIV_INLINE -const page_size_t -dict_tf_get_page_size( - ulint flags) -MY_ATTRIBUTE((const)); +@return ROW_FORMAT=COMPRESSED page size +@retval 0 if not compressed */ +inline ulint dict_tf_get_zip_size(ulint flags) +{ + flags &= DICT_TF_MASK_ZIP_SSIZE; + return flags + ? (UNIV_ZIP_SIZE_MIN >> 1) + << (FSP_FLAGS_GET_ZIP_SSIZE(flags >> DICT_TF_POS_ZIP_SSIZE + << FSP_FLAGS_POS_ZIP_SSIZE)) + : 0; +} /** Determine the extent size (in pages) for the given table @param[in] table the table whose extent size is being calculated. @return extent size in pages (256, 128 or 64) */ -ulint -dict_table_extent_size( - const dict_table_t* table); +inline ulint dict_table_extent_size(const dict_table_t* table) +{ + if (ulint zip_size = table->space->zip_size()) { + return (1ULL << 20) / zip_size; + } -/** Get the table page size. */ -#define dict_table_page_size(table) page_size_t(table->space->flags) + return FSP_EXTENT_SIZE; +} -/*********************************************************************//** -Obtain exclusive locks on all index trees of the table. This is to prevent -accessing index trees while InnoDB is updating internal metadata for -operations such as truncate tables. */ -UNIV_INLINE -void -dict_table_x_lock_indexes( -/*======================*/ - dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull)); -/*********************************************************************//** -Release the exclusive locks on all index tree. */ -UNIV_INLINE -void -dict_table_x_unlock_indexes( -/*========================*/ - dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull)); /********************************************************************//** Checks if a column is in the ordering columns of the clustered index of a table. Column prefixes are treated like whole columns. 
@@ -1061,10 +971,6 @@ dict_make_room_in_cache( ulint max_tables, /*!< in: max tables allowed in cache */ ulint pct_check); /*!< in: max percent to check */ -/** Clears the virtual column's index list before index is being freed. -@param[in] index Index being freed */ -void dict_index_remove_from_v_col_list(dict_index_t* index); - /** Adds an index to the dictionary cache, with possible indexing newly added column. @param[in,out] index index; NOTE! The index memory @@ -1210,21 +1116,6 @@ dict_index_get_nth_col_or_prefix_pos( ulint* prefix_col_pos) /*!< out: col num if prefix */ __attribute__((warn_unused_result)); - -/********************************************************************//** -Returns TRUE if the index contains a column or a prefix of that column. -@param[in] index index -@param[in] n column number -@param[in] is_virtual whether it is a virtual col -@return TRUE if contains the column or its prefix */ -bool -dict_index_contains_col_or_prefix( -/*==============================*/ - const dict_index_t* index, /*!< in: index */ - ulint n, /*!< in: column number */ - bool is_virtual) - /*!< in: whether it is a virtual col */ - MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Looks for a matching field in an index. The column has to be the same. The column in index must be complete, or must contain a prefix longer than the @@ -1249,16 +1140,6 @@ dict_table_get_nth_col_pos( ulint n, /*!< in: column number */ ulint* prefix_col_pos) /*!< out: col num if prefix */ MY_ATTRIBUTE((nonnull(1), warn_unused_result)); -/********************************************************************//** -Returns the position of a system column in an index. -@return position, ULINT_UNDEFINED if not contained */ -UNIV_INLINE -ulint -dict_index_get_sys_col_pos( -/*=======================*/ - const dict_index_t* index, /*!< in: index */ - ulint type) /*!< in: DATA_ROW_ID, ... 
*/ - MY_ATTRIBUTE((nonnull, warn_unused_result)); /*******************************************************************//** Adds a column to index. */ void @@ -1292,7 +1173,7 @@ dict_field_get_col( /**********************************************************************//** Returns an index object if it is found in the dictionary cache. -Assumes that dict_sys->mutex is already being held. +Assumes that dict_sys.mutex is already being held. @return index, NULL if not found */ dict_index_t* dict_index_get_if_in_cache_low( @@ -1356,21 +1237,6 @@ dict_index_build_node_ptr( ulint level) /*!< in: level of rec in tree: 0 means leaf level */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/**********************************************************************//** -Copies an initial segment of a physical record, long enough to specify an -index entry uniquely. -@return pointer to the prefix record */ -rec_t* -dict_index_copy_rec_order_prefix( -/*=============================*/ - const dict_index_t* index, /*!< in: index */ - const rec_t* rec, /*!< in: record for which to - copy prefix */ - ulint* n_fields,/*!< out: number of fields copied */ - byte** buf, /*!< in/out: memory buffer for the - copied prefix, or NULL */ - ulint* buf_size)/*!< in/out: buffer size */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Convert a physical record into a search tuple. @param[in] rec index record (not necessarily in an index page) @param[in] index index @@ -1455,53 +1321,9 @@ dict_index_calc_min_rec_len( /*========================*/ const dict_index_t* index) /*!< in: index */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/** Reserve the dictionary system mutex. */ -void -dict_mutex_enter_for_mysql_func(const char *file, unsigned line); -#define dict_mutex_enter_for_mysql() \ - dict_mutex_enter_for_mysql_func(__FILE__, __LINE__) - -/********************************************************************//** -Releases the dictionary system mutex for MySQL. 
*/ -void -dict_mutex_exit_for_mysql(void); -/*===========================*/ - -/** Create a dict_table_t's stats latch or delay for lazy creation. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose stats latch to create -@param[in] enabled if false then the latch is disabled -and dict_table_stats_lock()/unlock() become noop on this table. */ -void -dict_table_stats_latch_create( - dict_table_t* table, - bool enabled); - -/** Destroy a dict_table_t's stats latch. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose stats latch to destroy */ -void -dict_table_stats_latch_destroy( - dict_table_t* table); - -/** Lock the appropriate latch to protect a given table's statistics. -@param[in] table table whose stats to lock -@param[in] latch_mode RW_S_LATCH or RW_X_LATCH */ -void -dict_table_stats_lock( - dict_table_t* table, - ulint latch_mode); - -/** Unlock the latch that has been locked by dict_table_stats_lock(). -@param[in] table table whose stats to unlock -@param[in] latch_mode RW_S_LATCH or RW_X_LATCH */ -void -dict_table_stats_unlock( - dict_table_t* table, - ulint latch_mode); +#define dict_mutex_enter_for_mysql() mutex_enter(&dict_sys.mutex) +#define dict_mutex_exit_for_mysql() mutex_exit(&dict_sys.mutex) /********************************************************************//** Checks if the database name in two table names is the same. @@ -1547,23 +1369,6 @@ dict_table_is_fts_column( ulint col_no, /* in: col number to search for */ bool is_virtual)/*!< in: whether it is a virtual column */ MY_ATTRIBUTE((warn_unused_result)); -/**********************************************************************//** -Prevent table eviction by moving a table to the non-LRU list from the -LRU list if it is not already there. 
*/ -UNIV_INLINE -void -dict_table_prevent_eviction( -/*========================*/ - dict_table_t* table) /*!< in: table to prevent eviction */ - MY_ATTRIBUTE((nonnull)); - -/**********************************************************************//** -Move a table to the non LRU end of the LRU list. */ -void -dict_table_move_from_lru_to_non_lru( -/*================================*/ - dict_table_t* table) /*!< in: table to move from LRU to non-LRU */ - MY_ATTRIBUTE((nonnull)); /** Looks for an index with the given id given a table instance. @param[in] table table instance @@ -1575,14 +1380,6 @@ dict_table_find_index_on_id( index_id_t id) MY_ATTRIBUTE((nonnull(1))); -/**********************************************************************//** -Move to the most recently used segment of the LRU list. */ -void -dict_move_to_mru( -/*=============*/ - dict_table_t* table) /*!< in: table to move to MRU */ - MY_ATTRIBUTE((nonnull)); - /** Maximum number of columns in a foreign key constraint. Please Note MySQL has a much lower limit on the number of columns allowed in a foreign key constraint */ @@ -1594,13 +1391,10 @@ extern FILE* dict_foreign_err_file; extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the foreign key error messages */ -/** the dictionary system */ -extern dict_sys_t* dict_sys; -/** the data dictionary rw-latch protecting dict_sys */ -extern rw_lock_t dict_operation_lock; - -/* Dictionary system struct */ -struct dict_sys_t{ +/** InnoDB data dictionary cache */ +class dict_sys_t +{ +public: DictSysMutex mutex; /*!< mutex protecting the data dictionary; protects also the disk-based dictionary system tables; @@ -1608,6 +1402,15 @@ struct dict_sys_t{ and DROP TABLE, as well as reading the dictionary data for a table from system tables */ + /** @brief the data dictionary rw-latch protecting dict_sys + + Table create, drop, etc. 
reserve this in X-mode; implicit or
+	background operations purge, rollback, foreign key checks reserve this
+	in S-mode; not all internal InnoDB operations are covered by MDL.
+
+	This latch also prevents lock waits when accessing the InnoDB
+	data dictionary tables. @see trx_t::dict_operation_lock_mode */
+	rw_lock_t	latch;
 	row_id_t	row_id;		/*!< the next row id to assign;
 					NOTE that at a checkpoint this
 					must be written to the dict system
@@ -1616,8 +1419,8 @@ struct dict_sys_t{
 					the log records */
 	hash_table_t*	table_hash;	/*!< hash table of the tables, based
 					on name */
-	hash_table_t*	table_id_hash;	/*!< hash table of the tables, based
-					on id */
+	/** hash table of persistent table IDs */
+	hash_table_t*	table_id_hash;
 	dict_table_t*	sys_tables;	/*!< SYS_TABLES table */
 	dict_table_t*	sys_columns;	/*!< SYS_COLUMNS table */
 	dict_table_t*	sys_indexes;	/*!< SYS_INDEXES table */
@@ -1631,8 +1434,145 @@ struct dict_sys_t{
 	UT_LIST_BASE_NODE_T(dict_table_t)
 			table_non_LRU;	/*!< List of tables that can't
 					be evicted from the cache */
+private:
+	bool m_initialised;
+	/** the sequence of temporary table IDs */
+	std::atomic<table_id_t> temp_table_id;
+	/** hash table of temporary table IDs */
+	hash_table_t*	temp_id_hash;
+public:
+	/** @return a new temporary table ID */
+	table_id_t get_temporary_table_id() {
+		return temp_table_id.fetch_add(1, std::memory_order_relaxed);
+	}
+
+	/** Look up a temporary table. 
+ @param id temporary table ID + @return temporary table + @retval NULL if the table does not exist + (should only happen during the rollback of CREATE...SELECT) */ + dict_table_t* get_temporary_table(table_id_t id) + { + ut_ad(mutex_own(&mutex)); + dict_table_t* table; + ulint fold = ut_fold_ull(id); + HASH_SEARCH(id_hash, temp_id_hash, fold, dict_table_t*, table, + ut_ad(table->cached), table->id == id); + if (UNIV_LIKELY(table != NULL)) { + DBUG_ASSERT(table->is_temporary()); + DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID); + table->acquire(); + } + return table; + } + + /** Look up a persistent table. + @param id table ID + @return table + @retval NULL if not cached */ + dict_table_t* get_table(table_id_t id) + { + ut_ad(mutex_own(&mutex)); + dict_table_t* table; + ulint fold = ut_fold_ull(id); + HASH_SEARCH(id_hash, table_id_hash, fold, dict_table_t*, table, + ut_ad(table->cached), table->id == id); + DBUG_ASSERT(!table || !table->is_temporary()); + return table; + } + + /** + Constructor. Further initialisation happens in create(). + */ + + dict_sys_t() : m_initialised(false), temp_table_id(DICT_HDR_FIRST_ID) {} + + bool is_initialised() const { return m_initialised; } + + /** Initialise the data dictionary cache. */ + void create(); + + /** Close the data dictionary cache on shutdown. */ + void close(); + + /** Resize the hash tables based on the current buffer pool size. */ + void resize(); + + /** Add a table definition to the data dictionary cache */ + inline void add(dict_table_t* table); + /** Remove a table definition from the data dictionary cache. 
+	@param[in,out]	table	cached table definition to be evicted
+	@param[in]	lru	whether this is part of least-recently-used eviction
+	@param[in]	keep	whether to keep (not free) the object */
+	void remove(dict_table_t* table, bool lru = false, bool keep = false);
+
+#ifdef UNIV_DEBUG
+	/** Find a table */
+	template <bool in_lru> bool find(dict_table_t* table)
+	{
+		ut_ad(table);
+		ut_ad(table->can_be_evicted == in_lru);
+		ut_ad(mutex_own(&mutex));
+		for (const dict_table_t* t = UT_LIST_GET_FIRST(in_lru
+			? table_LRU : table_non_LRU);
+		     t; t = UT_LIST_GET_NEXT(table_LRU, t))
+		{
+			if (t == table) return true;
+			ut_ad(t->can_be_evicted == in_lru);
+		}
+		return false;
+	}
+	/** Find a table */
+	bool find(dict_table_t* table)
+	{
+		return table->can_be_evicted ? find<true>(table) : find<false>(table);
+	}
+#endif
+
+	/** Move a table to the non-LRU list from the LRU list. */
+	void prevent_eviction(dict_table_t* table)
+	{
+		ut_ad(find(table));
+		if (table->can_be_evicted)
+		{
+			table->can_be_evicted = FALSE;
+			UT_LIST_REMOVE(table_LRU, table);
+			UT_LIST_ADD_LAST(table_non_LRU, table);
+		}
+	}
+	/** Acquire a reference to a cached table. */
+	inline void acquire(dict_table_t* table);
+
+#ifdef UNIV_DEBUG
+	/** Assert that the data dictionary is locked */
+	void assert_locked()
+	{
+		ut_ad(mutex_own(&mutex));
+		ut_ad(rw_lock_own(&latch, RW_LOCK_X));
+	}
+#endif
+	/** Lock the data dictionary cache. */
+	void lock(const char* file, unsigned line)
+	{
+		rw_lock_x_lock_func(&latch, 0, file, line);
+		mutex_enter_loc(&mutex, file, line);
+	}
+
+	/** Unlock the data dictionary cache. 
*/ + void unlock() + { + mutex_exit(&mutex); + rw_lock_x_unlock(&latch); + } }; +/** the data dictionary cache */ +extern dict_sys_t dict_sys; + +#define dict_table_prevent_eviction(table) dict_sys.prevent_eviction(table) +#define dict_sys_lock() dict_sys.lock(__FILE__, __LINE__) +#define dict_sys_unlock() dict_sys.unlock() + /** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */ extern dict_index_t* dict_ind_redundant; @@ -1716,16 +1656,6 @@ dict_fs2utf8( size_t table_utf8_size)/*!< in: table_utf8 size */ MY_ATTRIBUTE((nonnull)); -/** Resize the hash tables besed on the current buffer pool size. */ -void -dict_resize(); - -/**********************************************************************//** -Closes the data dictionary module. */ -void -dict_close(void); -/*============*/ - /**********************************************************************//** Check whether the table is corrupted. @return nonzero for corrupted table, zero for valid tables */ diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 4174580c918..b6d15f28a69 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -25,6 +25,7 @@ Created 1/8/1996 Heikki Tuuri ***********************************************************************/ #include "fsp0sysspace.h" +#include "dict0pagecompress.h" /*********************************************************************//** Gets the minimum number of bytes per character. @@ -241,83 +242,6 @@ dict_table_get_next_index( #endif /* UNIV_DEBUG */ /********************************************************************//** -Check whether the index is the clustered index. 
-@return nonzero for clustered index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_clust( -/*================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return(index->type & DICT_CLUSTERED); -} - -/** Check if index is auto-generated clustered index. -@param[in] index index - -@return true if index is auto-generated clustered index. */ -UNIV_INLINE -bool -dict_index_is_auto_gen_clust( - const dict_index_t* index) -{ - return(index->type == DICT_CLUSTERED); -} - -/********************************************************************//** -Check whether the index is unique. -@return nonzero for unique index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_unique( -/*=================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return(index->type & DICT_UNIQUE); -} - -/********************************************************************//** -Check whether the index is a Spatial Index. -@return nonzero for Spatial Index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_spatial( -/*==================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return ulint(UNIV_EXPECT(index->type & DICT_SPATIAL, 0)); -} - -/********************************************************************//** -Check whether the index is the insert buffer tree. -@return nonzero for insert buffer, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_ibuf( -/*===============*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return(index->type & DICT_IBUF); -} - -/********************************************************************//** -Check whether the index is a secondary index or the insert buffer tree. 
-@return nonzero for insert buffer, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_sec_or_ibuf( -/*======================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return((index->type & (DICT_CLUSTERED | DICT_IBUF)) != DICT_CLUSTERED); -} - -/********************************************************************//** Gets the number of user-defined non-virtual columns in a table in the dictionary cache. @return number of user-defined (e.g., not ROW_ID) non-virtual @@ -462,7 +386,8 @@ dict_table_get_nth_v_col( ut_ad(table); ut_ad(pos < table->n_v_def); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); - ut_ad(!table->v_cols[pos].m_col.is_instant()); + ut_ad(!table->v_cols[pos].m_col.is_added()); + ut_ad(!table->v_cols[pos].m_col.is_dropped()); return &table->v_cols[pos]; } @@ -501,19 +426,6 @@ dict_table_get_sys_col_no( return unsigned(table->n_cols) + (sys - DATA_N_SYS_COLS); } -/********************************************************************//** -Check whether the table uses the compact page format. -@return TRUE if table uses the compact page format */ -UNIV_INLINE -bool -dict_table_is_comp( -/*===============*/ - const dict_table_t* table) /*!< in: table */ -{ - ut_ad(table); - return (table->flags & DICT_TF_COMPACT) != 0; -} - /************************************************************************ Check if the table has an FTS index. */ UNIV_INLINE @@ -720,20 +632,34 @@ dict_tf_to_fsp_flags(ulint table_flags) DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); - /* Adjust bit zero. */ - fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 
1 : 0; + /* No ROW_FORMAT=COMPRESSED for innodb_checksum_algorithm=full_crc32 */ + if ((srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32 + || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_FULL_CRC32) + && !(table_flags & DICT_TF_MASK_ZIP_SSIZE)) { + + fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE(); + + if (page_compression_level) { + fsp_flags |= innodb_compression_algorithm + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + } + } else { + /* Adjust bit zero. */ + fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0; - /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */ - fsp_flags |= table_flags - & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS); + /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */ + fsp_flags |= table_flags + & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS); - fsp_flags |= FSP_FLAGS_PAGE_SSIZE(); + fsp_flags |= FSP_FLAGS_PAGE_SSIZE(); - if (page_compression_level) { - fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION; + if (page_compression_level) { + fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION; + } } - ut_a(fsp_flags_is_valid(fsp_flags, false)); + ut_a(fil_space_t::is_valid_flags(fsp_flags, false)); if (DICT_TF_HAS_DATA_DIR(table_flags)) { fsp_flags |= 1U << FSP_FLAGS_MEM_DATA_DIR; @@ -779,50 +705,6 @@ dict_tf_to_sys_tables_type( return(type); } -/** Extract the page size info from table flags. -@param[in] flags flags -@return a structure containing the compressed and uncompressed -page sizes and a boolean indicating if the page is compressed. 
*/ -UNIV_INLINE -const page_size_t -dict_tf_get_page_size( - ulint flags) -{ - const ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); - - if (zip_ssize == 0) { - return(univ_page_size); - } - - const ulint zip_size = (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize; - - ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); - - return(page_size_t(zip_size, srv_page_size, true)); -} - -/*********************************************************************//** -Obtain exclusive locks on all index trees of the table. This is to prevent -accessing index trees while InnoDB is updating internal metadata for -operations such as truncate tables. */ -UNIV_INLINE -void -dict_table_x_lock_indexes( -/*======================*/ - dict_table_t* table) /*!< in: table */ -{ - dict_index_t* index; - - ut_ad(mutex_own(&dict_sys->mutex)); - - /* Loop through each index of the table and lock them */ - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - rw_lock_x_lock(dict_index_get_lock(index)); - } -} - /*********************************************************************//** Returns true if the particular FTS index in the table is still syncing in the background, false otherwise. @@ -844,24 +726,6 @@ dict_fts_index_syncing( } return(false); } -/*********************************************************************//** -Release the exclusive locks on all index tree. 
*/ -UNIV_INLINE -void -dict_table_x_unlock_indexes( -/*========================*/ - dict_table_t* table) /*!< in: table */ -{ - dict_index_t* index; - - ut_ad(mutex_own(&dict_sys->mutex)); - - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - rw_lock_x_unlock(dict_index_get_lock(index)); - } -} /********************************************************************//** Gets the number of fields in the internal representation of an index, @@ -979,30 +843,6 @@ dict_index_get_nth_field( } #endif /* UNIV_DEBUG */ -/********************************************************************//** -Returns the position of a system column in an index. -@return position, ULINT_UNDEFINED if not contained */ -UNIV_INLINE -ulint -dict_index_get_sys_col_pos( -/*=======================*/ - const dict_index_t* index, /*!< in: index */ - ulint type) /*!< in: DATA_ROW_ID, ... */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - ut_ad(!dict_index_is_ibuf(index)); - - if (dict_index_is_clust(index)) { - - return(dict_col_get_clust_pos( - dict_table_get_sys_col(index->table, type), - index)); - } - - return(dict_index_get_nth_col_pos( - index, dict_table_get_sys_col_no(index->table, type), NULL)); -} - /*********************************************************************//** Gets the field column. @return field->col, pointer to the table column */ @@ -1233,9 +1073,7 @@ dict_table_is_fts_column( index = (dict_index_t*) ib_vector_getp(indexes, i); - if (dict_index_contains_col_or_prefix( - index, col_no, is_virtual)) { - + if (index->contains_col_or_prefix(col_no, is_virtual)) { return(i); } } @@ -1300,21 +1138,6 @@ dict_max_v_field_len_store_undo( return(max_log_len); } -/**********************************************************************//** -Prevent table eviction by moving a table to the non-LRU list from the -LRU list if it is not already there. 
*/ -UNIV_INLINE -void -dict_table_prevent_eviction( -/*========================*/ - dict_table_t* table) /*!< in: table to prevent eviction */ -{ - ut_ad(mutex_own(&dict_sys->mutex)); - if (table->can_be_evicted) { - dict_table_move_from_lru_to_non_lru(table); - } -} - /********************************************************************//** Check whether the table is corrupted. @return nonzero for corrupted table, zero for valid tables */ @@ -1358,8 +1181,8 @@ inline void dict_table_t::acquire() { - ut_ad(mutex_own(&dict_sys->mutex)); - my_atomic_add32_explicit(&n_ref_count, 1, MY_MEMORY_ORDER_RELAXED); + ut_ad(mutex_own(&dict_sys.mutex)); + n_ref_count++; } /** Release the table handle. @@ -1368,8 +1191,7 @@ inline bool dict_table_t::release() { - int32 n = my_atomic_add32_explicit( - &n_ref_count, -1, MY_MEMORY_ORDER_RELAXED); + auto n = n_ref_count--; ut_ad(n > 0); return n == 1; } diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h index aa3de6d0b17..afc017fd9d1 100644 --- a/storage/innobase/include/dict0load.h +++ b/storage/innobase/include/dict0load.h @@ -82,7 +82,7 @@ dict_get_first_table_name_in_db( /** Make sure the data_file_name is saved in dict_table_t if needed. Try to read it from the fil_system first, then from SYS_DATAFILES. 
@param[in] table Table object -@param[in] dict_mutex_own true if dict_sys->mutex is owned already */ +@param[in] dict_mutex_own true if dict_sys.mutex is owned already */ void dict_get_and_save_data_dir_path( dict_table_t* table, diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 2504b2ef61d..259da23fcd9 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -44,9 +44,9 @@ Created 1/8/1996 Heikki Tuuri #include "fts0fts.h" #include "buf0buf.h" #include "gis0type.h" -#include "os0once.h" #include "fil0fil.h" #include "fil0crypt.h" +#include <sql_const.h> #include <set> #include <algorithm> #include <iterator> @@ -573,6 +573,10 @@ struct dict_col_t{ this column. Our current max limit is 3072 (REC_VERSION_56_MAX_INDEX_COL_LEN) bytes. */ +private: + /** Special value of ind for a dropped column */ + static const unsigned DROPPED = 1023; +public: /** Detach a virtual column from an index. @param index being-freed index */ @@ -588,7 +592,7 @@ struct dict_col_t{ } def_val; /** Retrieve the column name. 
- @param[in] table the table of this column */ + @param table the table of this column */ const char *name(const dict_table_t &table) const; /** @return whether this is a virtual column */ @@ -603,7 +607,8 @@ struct dict_col_t{ ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY); return mtype == DATA_INT; } - /** @return whether this is system versioned */ + /** @return whether this user column (not row_start, row_end) + has System Versioning property */ bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } /** @return whether this is the system version start */ bool vers_sys_start() const @@ -617,29 +622,119 @@ struct dict_col_t{ } /** @return whether this is an instantly-added column */ - bool is_instant() const + bool is_added() const { DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data); return def_val.len != UNIV_SQL_DEFAULT; } + /** Flag the column instantly dropped */ + void set_dropped() { ind = DROPPED; } + /** Flag the column instantly dropped. + @param not_null whether the column was NOT NULL + @param len2 whether the length exceeds 255 bytes + @param fixed_len the fixed length in bytes, or 0 */ + void set_dropped(bool not_null, bool len2, unsigned fixed) + { + DBUG_ASSERT(!len2 || !fixed); + prtype= not_null ? DATA_NOT_NULL | DATA_BINARY_TYPE : DATA_BINARY_TYPE; + if (fixed) + { + mtype= DATA_FIXBINARY; + len= fixed; + } + else + { + mtype= DATA_BINARY; + len= len2 ? 65535 : 255; + } + mbminlen= mbmaxlen= 0; + ind= DROPPED; + ord_part= 0; + max_prefix= 0; + } + /** @return whether the column was instantly dropped */ + bool is_dropped() const { return ind == DROPPED; } + /** @return whether the column was instantly dropped + @param index the clustered index */ + inline bool is_dropped(const dict_index_t &index) const; + /** Get the default value of an instantly-added column. 
@param[out] len value length (in bytes), or UNIV_SQL_NULL @return default value @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */ const byte *instant_value(ulint *len) const { - DBUG_ASSERT(is_instant()); + DBUG_ASSERT(is_added()); *len= def_val.len; return static_cast<const byte*>(def_val.data); } /** Remove the 'instant ADD' status of the column */ - void remove_instant() + void clear_instant() { - DBUG_ASSERT(is_instant()); def_val.len= UNIV_SQL_DEFAULT; def_val.data= NULL; } + + /** @return whether two columns have compatible data type encoding */ + bool same_type(const dict_col_t &other) const + { + if (mtype != other.mtype) + { + /* For latin1_swedish_ci, DATA_CHAR and DATA_VARCHAR + will be used instead of DATA_MYSQL and DATA_VARMYSQL. + As long as mtype,prtype are being written to InnoDB + data dictionary tables, we cannot simplify this. */ + switch (mtype) { + default: + return false; + case DATA_VARCHAR: + if (other.mtype != DATA_VARMYSQL) + return false; + goto check_encoding; + case DATA_VARMYSQL: + if (other.mtype != DATA_VARCHAR) + return false; + goto check_encoding; + case DATA_CHAR: + if (other.mtype != DATA_MYSQL) + return false; + goto check_encoding; + case DATA_MYSQL: + if (other.mtype != DATA_CHAR) + return false; + goto check_encoding; + } + } + else if (dtype_is_string_type(mtype)) + { + check_encoding: + const uint16_t cset= dtype_get_charset_coll(prtype); + const uint16_t ocset= dtype_get_charset_coll(other.prtype); + return cset == ocset || dict_col_t::same_encoding(cset, ocset); + } + + return true; + } + + /** @return whether two collations codes have the same character encoding */ + static bool same_encoding(uint16_t a, uint16_t b); + + /** Determine if the columns have the same format + except for is_nullable() and is_versioned(). 
+ @param other column to compare to + @return whether the columns have the same format */ + bool same_format(const dict_col_t &other) const + { + return same_type(other) && len >= other.len && + mbminlen == other.mbminlen && mbmaxlen == other.mbmaxlen && + !((prtype ^ other.prtype) & ~(DATA_NOT_NULL | DATA_VERSIONED | + CHAR_COLL_MASK << 16 | + DATA_LONG_TRUE_VARCHAR)); + } + + /** @return whether the column values are comparable by memcmp() */ + bool is_binary() const { return prtype & DATA_BINARY_TYPE; } }; /** Index information put in a list of virtual column structure. Index @@ -656,9 +751,6 @@ struct dict_v_idx_t { : index(index), nth_field(nth_field) {} }; -/** Index list to put in dict_v_col_t */ -typedef std::list<dict_v_idx_t, ut_allocator<dict_v_idx_t> > dict_v_idx_list; - /** Data structure for a virtual column in a table */ struct dict_v_col_t{ /** column structure */ @@ -668,15 +760,42 @@ struct dict_v_col_t{ dict_col_t** base_col; /** number of base column */ - ulint num_base; + unsigned num_base:10; /** column pos in table */ - ulint v_pos; + unsigned v_pos:10; - /** Virtual index list, and column position in the index, - the allocated memory is not from table->heap */ - dict_v_idx_list* v_indexes; + /** number of indexes */ + unsigned n_v_indexes:12; + /** Virtual index list, and column position in the index */ + std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> > + v_indexes; + + /** Detach the column from an index. 
+ @param index index to be detached from */ + void detach(const dict_index_t &index) + { + if (!n_v_indexes) return; + auto i= v_indexes.before_begin(); + ut_d(unsigned n= 0); + do { + auto prev = i++; + if (i == v_indexes.end()) + { + ut_ad(n == n_v_indexes); + return; + } + ut_ad(++n <= n_v_indexes); + if (i->index == &index) + { + v_indexes.erase_after(prev); + n_v_indexes--; + return; + } + } + while (i != v_indexes.end()); + } }; /** Data structure for newly added virtual column in a table */ @@ -704,7 +823,8 @@ struct dict_s_col_t { }; /** list to put stored column for create_table_info_t */ -typedef std::list<dict_s_col_t, ut_allocator<dict_s_col_t> > dict_s_col_list; +typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> > +dict_s_col_list; /** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum indexed column length (or indexed prefix length) in @@ -812,17 +932,15 @@ extern ulong zip_pad_max; an uncompressed page should be left as padding to avoid compression failures. This estimate is based on a self-adapting heuristic. */ struct zip_pad_info_t { - SysMutex* mutex; /*!< mutex protecting the info */ - ulint pad; /*!< number of bytes used as pad */ + SysMutex mutex; /*!< mutex protecting the info */ + Atomic_relaxed<ulint> + pad; /*!< number of bytes used as pad */ ulint success;/*!< successful compression ops during current round */ ulint failure;/*!< failed compression ops during current round */ ulint n_rounds;/*!< number of currently successful rounds */ - volatile os_once::state_t - mutex_created; - /*!< Creation state of mutex member */ }; /** Number of samples of data size kept when page compression fails for @@ -835,7 +953,7 @@ const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX"; /** Data structure for an index. Most fields will be initialized to 0, NULL or FALSE in dict_mem_index_create(). 
*/ -struct dict_index_t{ +struct dict_index_t { index_id_t id; /*!< id of the index */ mem_heap_t* heap; /*!< memory heap */ id_name_t name; /*!< index name */ @@ -897,13 +1015,13 @@ struct dict_index_t{ dictionary cache */ unsigned to_be_dropped:1; /*!< TRUE if the index is to be dropped; - protected by dict_operation_lock */ + protected by dict_sys.latch */ unsigned online_status:2; /*!< enum online_index_status. Transitions from ONLINE_INDEX_COMPLETE (to ONLINE_INDEX_CREATION) are protected - by dict_operation_lock and - dict_sys->mutex. Other changes are + by dict_sys.latch and + dict_sys.mutex. Other changes are protected by index->lock. */ unsigned uncommitted:1; /*!< a flag that is set for secondary indexes @@ -913,6 +1031,8 @@ struct dict_index_t{ #ifdef UNIV_DEBUG /** whether this is a dummy index object */ bool is_dummy; + /** whether btr_cur_instant_init() is in progress */ + bool in_instant_init; uint32_t magic_n;/*!< magic number */ /** Value of dict_index_t::magic_n */ # define DICT_INDEX_MAGIC_N 76789786 @@ -987,8 +1107,14 @@ struct dict_index_t{ /* in which slot the next sample should be saved. 
*/ /* @} */ - /** R-tree split sequence number */ - volatile int32 rtr_ssn; +private: + /** R-tree split sequence number */ + Atomic_relaxed<node_seq_t> rtr_ssn; +public: + void set_ssn(node_seq_t ssn) { rtr_ssn= ssn; } + node_seq_t assign_ssn() { return rtr_ssn.fetch_add(1) + 1; } + node_seq_t ssn() const { return rtr_ssn; } + rtr_info_track_t* rtr_track;/*!< tracking all R-Tree search cursors */ trx_id_t trx_id; /*!< id of the transaction that created this @@ -1028,7 +1154,7 @@ struct dict_index_t{ page cannot be read or decrypted */ inline bool is_readable() const; - /** @return whether instant ADD COLUMN is in effect */ + /** @return whether instant ALTER TABLE is in effect */ inline bool is_instant() const; /** @return whether the index is the primary key index @@ -1038,12 +1164,38 @@ struct dict_index_t{ return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF)); } + /** @return whether this is a generated clustered index */ + bool is_gen_clust() const { return type == DICT_CLUSTERED; } + + /** @return whether this is a clustered index */ + bool is_clust() const { return type & DICT_CLUSTERED; } + + /** @return whether this is a unique index */ + bool is_unique() const { return type & DICT_UNIQUE; } + /** @return whether this is a spatial index */ bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); } + /** @return whether this is the change buffer */ + bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); } + /** @return whether the index includes virtual columns */ bool has_virtual() const { return type & DICT_VIRTUAL; } + /** @return the position of DB_TRX_ID */ + unsigned db_trx_id() const { + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(n_uniq); + DBUG_ASSERT(n_uniq <= MAX_REF_PARTS); + return n_uniq; + } + /** @return the position of DB_ROLL_PTR */ + unsigned db_roll_ptr() const { return db_trx_id() + 1; } + + /** @return the offset of the metadata BLOB field, + or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR 
*/ + unsigned first_user_field() const { return db_trx_id() + 2; } + /** @return whether the index is corrupted */ inline bool is_corrupted() const; @@ -1051,7 +1203,7 @@ struct dict_index_t{ @param whether to reset fields[].col */ void detach_columns(bool clear= false) { - if (!has_virtual()) + if (!has_virtual() || !cached) return; for (unsigned i= 0; i < n_fields; i++) { @@ -1060,7 +1212,7 @@ struct dict_index_t{ continue; col->detach(*this); if (clear) - fields[i].col= NULL; + fields[i].col= nullptr; } } @@ -1094,24 +1246,20 @@ struct dict_index_t{ return fields[n].col->instant_value(len); } - /** Adjust clustered index metadata for instant ADD COLUMN. - @param[in] clustered index definition after instant ADD COLUMN */ - void instant_add_field(const dict_index_t& instant); - - /** Remove the 'instant ADD' status of a clustered index. - Protected by index root page x-latch or table X-lock. */ - void remove_instant() - { - DBUG_ASSERT(is_primary()); - if (!is_instant()) { - return; - } - for (unsigned i = n_core_fields; i < n_fields; i++) { - fields[i].col->remove_instant(); - } - n_core_fields = n_fields; - n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(n_nullable)); - } + /** Adjust index metadata for instant ADD/DROP/reorder COLUMN. + @param[in] clustered index definition after instant ALTER TABLE */ + inline void instant_add_field(const dict_index_t& instant); + /** Remove instant ADD COLUMN metadata. */ + inline void clear_instant_add(); + /** Remove instant ALTER TABLE metadata. */ + inline void clear_instant_alter(); + + /** Construct the metadata record for instant ALTER TABLE. + @param[in] row dummy or default values for existing columns + @param[in,out] heap memory heap for allocations + @return metadata record */ + inline dtuple_t* + instant_metadata(const dtuple_t& row, mem_heap_t* heap) const; /** Check if record in clustered index is historical row. 
@param[in] rec clustered row @@ -1127,6 +1275,16 @@ struct dict_index_t{ bool vers_history_row(const rec_t* rec, bool &history_row); + /** Reconstruct the clustered index fields. */ + inline void reconstruct_fields(); + + /** Check if the index contains a column or a prefix of that column. + @param[in] n column number + @param[in] is_virtual whether it is a virtual col + @return whether the index contains the column or its prefix */ + bool contains_col_or_prefix(ulint n, bool is_virtual) const + MY_ATTRIBUTE((warn_unused_result)); + #ifdef BTR_CUR_HASH_ADAPT /** @return a clone of this */ dict_index_t* clone() const; @@ -1206,20 +1364,8 @@ struct dict_index_t{ @param index being-freed index */ inline void dict_col_t::detach(const dict_index_t &index) { - ut_ad(is_virtual()); - - if (dict_v_idx_list *v_indexes= reinterpret_cast<const dict_v_col_t*>(this) - ->v_indexes) - { - for (dict_v_idx_list::iterator i= v_indexes->begin(); - i != v_indexes->end(); i++) - { - if (i->index == &index) { - v_indexes->erase(i); - return; - } - } - } + if (is_virtual()) + reinterpret_cast<dict_v_col_t*>(this)->detach(index); } /** The status of online index creation */ @@ -1534,6 +1680,64 @@ struct dict_vcol_templ_t { dict_vcol_templ_t() : vtempl(0), mysql_table_query_id(~0ULL) {} }; +/** Metadata on clustered index fields starting from first_user_field() */ +class field_map_element_t +{ + /** Number of bits for representing a column number */ + static constexpr uint16_t IND_BITS = 10; + + /** Set if the column of the field has been instantly dropped */ + static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5); + + /** Set if the column was dropped and originally declared NOT NULL */ + static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4); + + /** Column index (if !(data & DROPPED)): table->cols[data & IND], + or field length (if (data & DROPPED)): + (data & IND) = 0 if variable-length with max_len < 256 bytes; + (data & IND) = 1 if variable-length with max_len > 255 bytes; + 
(data & IND) = 1 + L otherwise, with L=fixed length of the column */ + static constexpr uint16_t IND = (1U << IND_BITS) - 1; + + /** Field metadata */ + uint16_t data; + + void clear_not_null() { data &= ~NOT_NULL; } +public: + bool is_dropped() const { return data & DROPPED; } + void set_dropped() { data |= DROPPED; } + bool is_not_null() const { return data & NOT_NULL; } + void set_not_null() { ut_ad(is_dropped()); data |= NOT_NULL; } + uint16_t ind() const { return data & IND; } + void set_ind(uint16_t i) + { + DBUG_ASSERT(i <= IND); + DBUG_ASSERT(!ind()); + data |= i; + } + field_map_element_t& operator= (uint16_t value) + { + data = value; + return *this; + } + operator uint16_t() { return data; } +}; + +static_assert(sizeof(field_map_element_t) == 2, + "Size mismatch for a persistent data item!"); + +/** Instantly dropped or reordered columns */ +struct dict_instant_t +{ + /** Number of dropped columns */ + unsigned n_dropped; + /** Dropped columns */ + dict_col_t* dropped; + /** Map of clustered index non-PK fields[i - first_user_field()] + to table columns */ + field_map_element_t* field_map; +}; + /** These are used when MySQL FRM and InnoDB data dictionary are in inconsistent state. */ typedef enum { @@ -1552,11 +1756,7 @@ struct dict_table_t { /** Get reference count. @return current value of n_ref_count */ - inline int32 get_ref_count() - { - return my_atomic_load32_explicit(&n_ref_count, - MY_MEMORY_ORDER_RELAXED); - } + inline uint32_t get_ref_count() const { return n_ref_count; } /** Acquire the table handle. 
*/ inline void acquire(); @@ -1576,6 +1776,9 @@ struct dict_table_t { return flags2 & DICT_TF2_TEMPORARY; } + /** @return whether the table is not in ROW_FORMAT=REDUNDANT */ + bool not_redundant() const { return flags & DICT_TF_COMPACT; } + /** @return whether this table is readable @retval true normally @retval false if this is a single-table tablespace @@ -1594,35 +1797,92 @@ struct dict_table_t { return strstr(name, "/" TEMP_FILE_PREFIX) != NULL; } - /** @return whether instant ADD COLUMN is in effect */ + /** @return whether instant ALTER TABLE is in effect */ bool is_instant() const { return(UT_LIST_GET_FIRST(indexes)->is_instant()); } - /** @return whether the table supports instant ADD COLUMN */ + /** @return whether the table supports instant ALTER TABLE */ bool supports_instant() const { return(!(flags & DICT_TF_MASK_ZIP_SSIZE)); } - /** Adjust metadata for instant ADD COLUMN. - @param[in] table table definition after instant ADD COLUMN */ - void instant_add_column(const dict_table_t& table); + /** @return the number of instantly dropped columns */ + unsigned n_dropped() const { return instant ? instant->n_dropped : 0; } + + /** Look up an old column. + @param[in] cols the old columns of the table + @param[in] col_map map from old table columns to altered ones + @param[in] n_cols number of old columns + @param[in] i the number of the new column + @return old column + @retval NULL if column i was added to the table */ + static const dict_col_t* find(const dict_col_t* cols, + const ulint* col_map, ulint n_cols, + ulint i) + { + for (ulint o = n_cols; o--; ) { + if (col_map[o] == i) { + return &cols[o]; + } + } + return NULL; + } - /** Roll back instant_add_column(). - @param[in] old_n_cols original n_cols - @param[in] old_cols original cols - @param[in] old_col_names original col_names */ - void rollback_instant( + /** Serialise metadata of dropped or reordered columns. 
+ @param[in,out] heap memory heap for allocation + @param[out] field data field with the metadata */ + inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const; + + /** Reconstruct dropped or reordered columns. + @param[in] metadata data from serialise_columns() + @param[in] len length of the metadata, in bytes + @return whether parsing the metadata failed */ + bool deserialise_columns(const byte* metadata, ulint len); + + /** Set is_instant() before instant_column(). + @param[in] old previous table definition + @param[in] col_map map from old.cols[] + and old.v_cols[] to this + @param[out] first_alter_pos 0, or + 1 + first changed column position */ + inline void prepare_instant(const dict_table_t& old, + const ulint* col_map, + unsigned& first_alter_pos); + + /** Adjust table metadata for instant ADD/DROP/reorder COLUMN. + @param[in] table table on which prepare_instant() was invoked + @param[in] col_map mapping from cols[] and v_cols[] to table + @return whether the metadata record must be updated */ + inline bool instant_column(const dict_table_t& table, + const ulint* col_map); + + /** Roll back instant_column(). + @param[in] old_n_cols original n_cols + @param[in] old_cols original cols + @param[in] old_col_names original col_names + @param[in] old_instant original instant structure + @param[in] old_fields original fields + @param[in] old_n_fields original number of fields + @param[in] old_n_core_fields original number of core fields + @param[in] old_n_v_cols original n_v_cols + @param[in] old_v_cols original v_cols + @param[in] old_v_col_names original v_col_names + @param[in] col_map column map */ + inline void rollback_instant( unsigned old_n_cols, dict_col_t* old_cols, - const char* old_col_names); - - /** Trim the instantly added columns when an insert into SYS_COLUMNS - is rolled back during ALTER TABLE or recovery. 
- @param[in] n number of surviving non-system columns */ - void rollback_instant(unsigned n); + const char* old_col_names, + dict_instant_t* old_instant, + dict_field_t* old_fields, + unsigned old_n_fields, + unsigned old_n_core_fields, + unsigned old_n_v_cols, + dict_v_col_t* old_v_cols, + const char* old_v_col_names, + const ulint* col_map); /** Add the table definition to the data dictionary cache */ void add_to_cache(); @@ -1640,23 +1900,28 @@ struct dict_table_t { void inc_fk_checks() { #ifdef UNIV_DEBUG - lint fk_checks= (lint) + int32_t fk_checks= #endif - my_atomic_addlint(&n_foreign_key_checks_running, 1); + n_foreign_key_checks_running++; ut_ad(fk_checks >= 0); } void dec_fk_checks() { #ifdef UNIV_DEBUG - lint fk_checks= (lint) + int32_t fk_checks= #endif - my_atomic_addlint(&n_foreign_key_checks_running, ulint(-1)); + n_foreign_key_checks_running--; ut_ad(fk_checks > 0); } /** For overflow fields returns potential max length stored inline */ - size_t get_overflow_field_local_len() const; + inline size_t get_overflow_field_local_len() const; +private: + /** Initialize instant->field_map. + @param[in] table table definition to copy from */ + inline void init_instant(const dict_table_t& table); +public: /** Id of the table. */ table_id_t id; /** Hash chain node. */ @@ -1715,8 +1980,7 @@ struct dict_table_t { /** TRUE if the table is to be dropped, but not yet actually dropped (could in the background drop list). It is turned on at the beginning of row_drop_table_for_mysql() and turned off just before we start to - update system tables for the drop. It is protected by - dict_operation_lock. */ + update system tables for the drop. It is protected by dict_sys.latch. */ unsigned to_be_dropped:1; /** Number of non-virtual columns defined so far. 
*/ @@ -1766,6 +2030,9 @@ struct dict_table_t { reason s_cols is a part of dict_table_t */ dict_s_col_list* s_cols; + /** Instantly dropped or reordered columns, or NULL if none */ + dict_instant_t* instant; + /** Column names packed in a character string "name1\0name2\0...nameN\0". Until the string contains n_cols, it will be allocated from a temporary heap. The final string will be allocated @@ -1815,7 +2082,7 @@ struct dict_table_t { /** Count of how many foreign key check operations are currently being performed on the table. We cannot drop the table while there are foreign key checks running on it. */ - ulint n_foreign_key_checks_running; + Atomic_counter<int32_t> n_foreign_key_checks_running; /** Transactions whose view low limit is greater than this number are not allowed to store to the MySQL query cache or retrieve from it. @@ -1837,9 +2104,6 @@ struct dict_table_t { /** Statistics for query optimization. @{ */ - /** Creation state of 'stats_latch'. */ - volatile os_once::state_t stats_latch_created; - /** This latch protects: dict_table_t::stat_initialized, dict_table_t::stat_n_rows (*), @@ -1851,7 +2115,7 @@ struct dict_table_t { dict_table_t::indexes*::stat_n_leaf_pages. (*) Those are not always protected for performance reasons. */ - rw_lock_t* stats_latch; + rw_lock_t stats_latch; /** TRUE if statistics have been calculated the first time after database startup or table creation. */ @@ -1933,7 +2197,7 @@ struct dict_table_t { /** The state of the background stats thread wrt this table. See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT. - Writes are covered by dict_sys->mutex. Dirty reads are possible. */ + Writes are covered by dict_sys.mutex. Dirty reads are possible. */ #define BG_SCRUB_IN_PROGRESS ((byte)(1 << 2)) /*!< BG_SCRUB_IN_PROGRESS is set in @@ -1949,7 +2213,7 @@ struct dict_table_t { /** The state of the background stats thread wrt this table. See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT. 
- Writes are covered by dict_sys->mutex. Dirty reads are possible. */ + Writes are covered by dict_sys.mutex. Dirty reads are possible. */ byte stats_bg_flag; bool stats_error_printed; @@ -1975,11 +2239,8 @@ struct dict_table_t { from a select. */ lock_t* autoinc_lock; - /** Creation state of autoinc_mutex member */ - volatile os_once::state_t autoinc_mutex_created; - /** Mutex protecting the autoincrement counter. */ - ib_mutex_t* autoinc_mutex; + ib_mutex_t autoinc_mutex; /** Autoinc counter value to give to the next inserted row. */ ib_uint64_t autoinc; @@ -2015,7 +2276,7 @@ private: /** Count of how many handles are opened to this table. Dropping of the table is NOT allowed until this count gets to zero. MySQL does NOT itself check the number of open handles at DROP. */ - int32 n_ref_count; + Atomic_counter<uint32_t> n_ref_count; public: /** List of locks on the table. Protected by lock_sys.mutex. */ @@ -2051,12 +2312,15 @@ inline bool dict_index_t::is_readable() const { return table->is_readable(); } inline bool dict_index_t::is_instant() const { ut_ad(n_core_fields > 0); - ut_ad(n_core_fields <= n_fields); + ut_ad(n_core_fields <= n_fields || table->n_dropped()); ut_ad(n_core_fields == n_fields || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED); ut_ad(n_core_fields == n_fields || table->supports_instant()); ut_ad(n_core_fields == n_fields || !table->is_temporary()); - return(n_core_fields != n_fields); + ut_ad(!table->instant || !table->is_temporary()); + + return n_core_fields != n_fields + || (is_primary() && table->instant); } inline bool dict_index_t::is_corrupted() const @@ -2066,6 +2330,81 @@ inline bool dict_index_t::is_corrupted() const || (table && table->corrupted)); } +inline void dict_index_t::clear_instant_add() +{ + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(is_instant()); + DBUG_ASSERT(!table->instant); + for (unsigned i = n_core_fields; i < n_fields; i++) { + fields[i].col->clear_instant(); + } + n_core_fields = n_fields; + 
n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(n_nullable)); +} + +inline void dict_index_t::clear_instant_alter() +{ + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(n_fields == n_def); + + if (!table->instant) { + if (is_instant()) { + clear_instant_add(); + } + return; + } + +#ifndef DBUG_OFF + for (unsigned i = first_user_field(); i--; ) { + DBUG_ASSERT(!fields[i].col->is_dropped()); + DBUG_ASSERT(!fields[i].col->is_nullable()); + } +#endif + const dict_col_t* ai_col = table->persistent_autoinc + ? fields[table->persistent_autoinc - 1].col + : NULL; + dict_field_t* const begin = &fields[first_user_field()]; + dict_field_t* end = &fields[n_fields]; + + for (dict_field_t* d = begin; d < end; ) { + /* Move fields for dropped columns to the end. */ + if (!d->col->is_dropped()) { + d++; + } else { + if (d->col->is_nullable()) { + n_nullable--; + } + + std::swap(*d, *--end); + } + } + + DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end); + n_core_fields = n_fields = n_def = end - fields; + n_core_null_bytes = UT_BITS_IN_BYTES(n_nullable); + std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b) + { return a.col->ind < b.col->ind; }); + table->instant = NULL; + if (ai_col) { + auto a = std::find_if(begin, end, + [ai_col](const dict_field_t& f) + { return f.col == ai_col; }); + table->persistent_autoinc = (a == end) ? 0 : 1 + (a - fields); + } +} + +/** @return whether the column was instantly dropped +@param[in] index the clustered index */ +inline bool dict_col_t::is_dropped(const dict_index_t& index) const +{ + DBUG_ASSERT(index.is_primary()); + DBUG_ASSERT(!is_dropped() == !index.table->instant); + DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped + && this < index.table->instant->dropped + + index.table->instant->n_dropped)); + return is_dropped(); +} + /*******************************************************************//** Initialise the table lock list. 
*/ void @@ -2086,87 +2425,6 @@ struct dict_foreign_add_to_referenced_table { } }; -/** Destroy the autoinc latch of the given table. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose stats latch to destroy */ -inline -void -dict_table_autoinc_destroy( - dict_table_t* table) -{ - if (table->autoinc_mutex_created == os_once::DONE - && table->autoinc_mutex != NULL) { - mutex_free(table->autoinc_mutex); - UT_DELETE(table->autoinc_mutex); - } -} - -/** Request for lazy creation of the autoinc latch of a given table. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose autoinc latch is to be created. */ -inline -void -dict_table_autoinc_create_lazy( - dict_table_t* table) -{ - table->autoinc_mutex = NULL; - table->autoinc_mutex_created = os_once::NEVER_DONE; -} - -/** Request a lazy creation of dict_index_t::zip_pad::mutex. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] index index whose zip_pad mutex is to be created */ -inline -void -dict_index_zip_pad_mutex_create_lazy( - dict_index_t* index) -{ - index->zip_pad.mutex = NULL; - index->zip_pad.mutex_created = os_once::NEVER_DONE; -} - -/** Destroy the zip_pad_mutex of the given index. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. 
-@param[in,out] table table whose stats latch to destroy */ -inline -void -dict_index_zip_pad_mutex_destroy( - dict_index_t* index) -{ - if (index->zip_pad.mutex_created == os_once::DONE - && index->zip_pad.mutex != NULL) { - mutex_free(index->zip_pad.mutex); - UT_DELETE(index->zip_pad.mutex); - } -} - -/** Release the zip_pad_mutex of a given index. -@param[in,out] index index whose zip_pad_mutex is to be released */ -inline -void -dict_index_zip_pad_unlock( - dict_index_t* index) -{ - mutex_exit(index->zip_pad.mutex); -} - -#ifdef UNIV_DEBUG -/** Check if the current thread owns the autoinc_mutex of a given table. -@param[in] table the autoinc_mutex belongs to this table -@return true, if the current thread owns the autoinc_mutex, false otherwise.*/ -inline -bool -dict_table_autoinc_own( - const dict_table_t* table) -{ - return(mutex_own(table->autoinc_mutex)); -} -#endif /* UNIV_DEBUG */ - /** Check whether the col is used in spatial index or regular index. @param[in] col column to check @return spatial status */ diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h index e56848d1954..3f2792054e0 100644 --- a/storage/innobase/include/dict0priv.h +++ b/storage/innobase/include/dict0priv.h @@ -45,18 +45,6 @@ dict_table_check_if_in_cache_low( /*=============================*/ const char* table_name); /*!< in: table name */ -/**********************************************************************//** -Returns a table object based on table id. 
-@return table, NULL if does not exist */ -UNIV_INLINE -dict_table_t* -dict_table_open_on_id_low( -/*=====================*/ - table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err, /*!< in: errors to ignore - when loading the table */ - ibool open_only_if_in_cache); - #include "dict0priv.ic" #endif /* dict0priv.h */ diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic index 7b584c7e1cb..ff645378175 100644 --- a/storage/innobase/include/dict0priv.ic +++ b/storage/innobase/include/dict0priv.ic @@ -25,7 +25,6 @@ Created Wed 13 Oct 2010 16:10:14 EST Sunny Bains #include "dict0dict.h" #include "dict0load.h" -#include "dict0priv.h" /**********************************************************************//** Gets a table; loads it to the dictionary cache if necessary. A low-level @@ -40,7 +39,7 @@ dict_table_get_low( dict_table_t* table; ut_ad(table_name); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); table = dict_table_check_if_in_cache_low(table_name); @@ -64,40 +63,6 @@ dict_table_get_low( } /**********************************************************************//** -Returns a table object based on table id. 
-@return table, NULL if does not exist */ -UNIV_INLINE -dict_table_t* -dict_table_open_on_id_low( -/*======================*/ - table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err, /*!< in: errors to ignore - when loading the table */ - ibool open_only_if_in_cache) -{ - dict_table_t* table; - ulint fold; - - ut_ad(mutex_own(&dict_sys->mutex)); - - /* Look for the table name in the hash table */ - fold = ut_fold_ull(table_id); - - HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, - dict_table_t*, table, ut_ad(table->cached), - table->id == table_id); - if (table == NULL && !open_only_if_in_cache) { - table = dict_load_table_on_id(table_id, ignore_err); - } - - ut_ad(!table || table->cached); - - /* TODO: should get the type information from MySQL */ - - return(table); -} - -/**********************************************************************//** Checks if a table is in the dictionary cache. @return table, NULL if not found */ UNIV_INLINE @@ -114,12 +79,12 @@ dict_table_check_if_in_cache_low( ("table: '%s'", table_name)); ut_ad(table_name); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* Look for the table name in the hash table */ table_fold = ut_fold_string(table_name); - HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, + HASH_SEARCH(name_hash, dict_sys.table_hash, table_fold, dict_table_t*, table, ut_ad(table->cached), !strcmp(table->name.m_name, table_name)); DBUG_RETURN(table); diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 00ac6eb4745..2e001cb56e9 100644 --- a/storage/innobase/include/dict0stats.h +++ b/storage/innobase/include/dict0stats.h @@ -187,6 +187,19 @@ dict_stats_rename_table( char* errstr, /*!< out: error string if != DB_SUCCESS is returned */ size_t errstr_sz); /*!< in: errstr size */ +/*********************************************************************//** +Renames an index in InnoDB persistent stats storage. 
+This function creates its own transaction and commits it. +@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned +if the persistent stats do not exist. */ +dberr_t +dict_stats_rename_index( +/*====================*/ + const dict_table_t* table, /*!< in: table whose index + is renamed */ + const char* old_index_name, /*!< in: old index name */ + const char* new_index_name) /*!< in: new index name */ + __attribute__((warn_unused_result)); /** Save an individual index's statistic into the persistent statistics storage. diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic index 98024935e16..b30dede54f1 100644 --- a/storage/innobase/include/dict0stats.ic +++ b/storage/innobase/include/dict0stats.ic @@ -75,7 +75,7 @@ dict_stats_is_persistent_enabled(const dict_table_t* table) + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has just been PS-enabled. This is acceptable. Avoiding this would mean that we would have to - protect the ::stat_persistent with dict_table_stats_lock() like the + protect the ::stat_persistent with dict_table_t::stats_latch like the other ::stat_ members which would be too big performance penalty, especially when this function is called from dict_stats_update_if_needed(). 
*/ @@ -148,7 +148,7 @@ dict_stats_init( /*============*/ dict_table_t* table) /*!< in/out: table */ { - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); if (table->stat_initialized) { return; @@ -174,14 +174,14 @@ dict_stats_deinit( /*==============*/ dict_table_t* table) /*!< in/out: table */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_a(table->get_ref_count() == 0); - dict_table_stats_lock(table, RW_X_LATCH); + rw_lock_x_lock(&table->stats_latch); if (!table->stat_initialized) { - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); return; } @@ -222,5 +222,5 @@ dict_stats_deinit( } #endif /* HAVE_valgrind_or_MSAN */ - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); } diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h index 66b98629033..526139643d1 100644 --- a/storage/innobase/include/dict0stats_bg.h +++ b/storage/innobase/include/dict0stats_bg.h @@ -72,7 +72,7 @@ dict_stats_stop_bg( dict_table_t* table) /*!< in/out: table */ { ut_ad(!srv_read_only_mode); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) { return(true); @@ -90,7 +90,7 @@ and restore the lock before it exits. The background stats thread is guaranteed not to start using the specified table after this function returns and before the caller unlocks the data dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag -under dict_sys->mutex. */ +under dict_sys.mutex. 
*/ void dict_stats_wait_bg_to_stop_using_table( /*===================================*/ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index 1e16e501a48..d0da45ab218 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -30,7 +30,6 @@ Created 1/8/1996 Heikki Tuuri #include <ut0mutex.h> #include <rem0types.h> -struct dict_sys_t; struct dict_col_t; struct dict_field_t; struct dict_index_t; diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h index 870858b4ccd..fd0d3e12601 100644 --- a/storage/innobase/include/fil0crypt.h +++ b/storage/innobase/include/fil0crypt.h @@ -1,6 +1,6 @@ /***************************************************************************** Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. -Copyright (c) 2015, 2018, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,11 +26,9 @@ Created 04/01/2015 Jan Lindström #ifndef fil0crypt_h #define fil0crypt_h -#ifndef UNIV_INNOCHECKSUM #include "os0event.h" #include "my_crypt.h" #include "fil0fil.h" -#endif /*! UNIV_INNOCHECKSUM */ /** * Magic pattern in start of crypt data on page 0 @@ -281,13 +279,11 @@ fil_space_merge_crypt_data( const fil_space_crypt_t* src); /** Initialize encryption parameters from a tablespace header page. 
-@param[in] page_size page size of the tablespace +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] page first page of the tablespace @return crypt data from page 0 @retval NULL if not present or not valid */ -UNIV_INTERN -fil_space_crypt_t* -fil_space_read_crypt_data(const page_size_t& page_size, const byte* page) +fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** @@ -313,14 +309,16 @@ fil_parse_write_crypt_data( MY_ATTRIBUTE((warn_unused_result)); /** Encrypt a buffer. -@param[in,out] crypt_data Crypt data -@param[in] space space_id -@param[in] offset Page offset -@param[in] lsn Log sequence number -@param[in] src_frame Page to encrypt -@param[in] page_size Page size -@param[in,out] dst_frame Output buffer +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] dst_frame Output buffer +@param[in] use_full_checksum full crc32 algo is used @return encrypted buffer or NULL */ +UNIV_INTERN byte* fil_encrypt_buf( fil_space_crypt_t* crypt_data, @@ -328,8 +326,9 @@ fil_encrypt_buf( ulint offset, lsn_t lsn, const byte* src_frame, - const page_size_t& page_size, - byte* dst_frame) + ulint zip_size, + byte* dst_frame, + bool use_full_checksum) MY_ATTRIBUTE((warn_unused_result)); /** @@ -351,20 +350,24 @@ fil_space_encrypt( byte* dst_frame) MY_ATTRIBUTE((warn_unused_result)); -/** -Decrypt a page. -@param[in,out] crypt_data crypt_data + +/** Decrypt a page. 
+@param[in] space_id space id +@param[in] crypt_data crypt_data @param[in] tmp_frame Temporary buffer -@param[in] page_size Page size +@param[in] physical_size page size +@param[in] fsp_flags Tablespace flags @param[in,out] src_frame Page to decrypt -@param[out] err DB_SUCCESS or error +@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED @return true if page decrypted, false if not.*/ UNIV_INTERN bool fil_space_decrypt( + ulint space_id, fil_space_crypt_t* crypt_data, byte* tmp_frame, - const page_size_t& page_size, + ulint physical_size, + ulint fsp_flags, byte* src_frame, dberr_t* err); @@ -383,17 +386,14 @@ fil_space_decrypt( byte* src_frame) MY_ATTRIBUTE((warn_unused_result)); -/****************************************************************** +/** Calculate post encryption checksum -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] dst_frame Block where checksum is calculated -@return page checksum or BUF_NO_CHECKSUM_MAGIC +@return page checksum not needed. */ -UNIV_INTERN uint32_t -fil_crypt_calculate_checksum( - const page_size_t& page_size, - const byte* dst_frame) +fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame) MY_ATTRIBUTE((warn_unused_result)); /********************************************************************* @@ -491,10 +491,9 @@ calculated checksum as if it does page could be valid unencrypted, encrypted, or corrupted.
@param[in,out] page page frame (checksum is temporarily modified) -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @return true if page is encrypted AND OK, false otherwise */ -bool -fil_space_verify_crypt_checksum(const byte* page, const page_size_t& page_size) +bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size) MY_ATTRIBUTE((warn_unused_result)); #endif /* fil0crypt_h */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 8682474824f..e78c9587325 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -27,12 +27,16 @@ Created 10/25/1995 Heikki Tuuri #ifndef fil0fil_h #define fil0fil_h +#include "fsp0types.h" + #ifndef UNIV_INNOCHECKSUM #include "log0recv.h" #include "dict0types.h" -#include "page0size.h" #include "ilist.h" +#ifdef UNIV_LINUX +# include <set> +#endif struct unflushed_spaces_tag_t; struct rotation_list_tag_t; @@ -41,8 +45,6 @@ struct rotation_list_tag_t; extern my_bool srv_use_doublewrite_buf; extern struct buf_dblwr_t* buf_dblwr; class page_id_t; -struct trx_t; -class truncate_t; /** Structure containing encryption specification */ struct fil_space_crypt_t; @@ -76,10 +78,17 @@ fil_type_is_data( struct fil_node_t; +#endif + /** Tablespace or log data space */ +#ifndef UNIV_INNOCHECKSUM struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, ilist_node<rotation_list_tag_t> +#else +struct fil_space_t +#endif { +#ifndef UNIV_INNOCHECKSUM ulint id; /*!< space id */ hash_node_t hash; /*!< hash chain node */ char* name; /*!< Tablespace name */ @@ -93,26 +102,21 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, /** Log sequence number of the latest MLOG_INDEX_LOAD record that was found while parsing the redo log */ lsn_t enable_lsn; + /** set when an .ibd file is about to be deleted, + or an undo tablespace is about to be truncated. 
+ When this is set following new ops are not allowed: + * read IO request + * ibuf merge + * file flush + Note that we can still possibly have new write operations + because we don't check this flag when doing flush batches. */ bool stop_new_ops; - /*!< we set this true when we start - deleting a single-table tablespace. - When this is set following new ops - are not allowed: - * read IO request - * ibuf merge - * file flush - Note that we can still possibly have - new write operations because we don't - check this flag when doing flush - batches. */ /** whether undo tablespace truncation is in progress */ bool is_being_truncated; #ifdef UNIV_DEBUG - ulint redo_skipped_count; - /*!< reference count for operations who want - to skip redo log in the file space in order - to make modify_check() pass. - Uses my_atomic_loadlint() and friends. */ + /** reference count for operations who want to skip redo log in the + file space in order to make modify_check() pass. */ + Atomic_counter<ulint> redo_skipped_count; #endif fil_type_t purpose;/*!< purpose */ UT_LIST_BASE_NODE_T(fil_node_t) chain; @@ -130,10 +134,6 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, /*!< recovered tablespace size in pages; 0 if no size change was read from the redo log, or if the size change was implemented */ - ulint flags; /*!< FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags; - see fsp0types.h, - fsp_flags_is_valid(), - page_size_t(ulint) (constructor) */ ulint n_reserved_extents; /*!< number of reserved free extents for ongoing operations like B-tree page split */ @@ -141,20 +141,20 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, the tablespace to disk; dropping of the tablespace is forbidden if this is positive */ /** Number of pending buffer pool operations accessing the tablespace - without holding a table lock or dict_operation_lock S-latch + without holding a table lock or dict_sys.latch S-latch that would prevent the table (and tablespace) from being dropped. 
An example is change buffer merge. The tablespace cannot be dropped while this is nonzero, or while fil_node_t::n_pending is nonzero. - Protected by fil_system.mutex and my_atomic_loadlint() and friends. */ - ulint n_pending_ops; + Protected by fil_system.mutex and std::atomic. */ + std::atomic<ulint> n_pending_ops; /** Number of pending block read or write operations (when a write is imminent or a read has recently completed). The tablespace object cannot be freed while this is nonzero, but it can be detached from fil_system. Note that fil_node_t::n_pending tracks actual pending I/O requests. - Protected by fil_system.mutex and my_atomic_loadlint() and friends. */ - ulint n_pending_ios; + Protected by fil_system.mutex and std::atomic. */ + std::atomic<ulint> n_pending_ios; rw_lock_t latch; /*!< latch protecting the file space storage allocation */ UT_LIST_NODE_T(fil_space_t) named_spaces; @@ -248,7 +248,10 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, /** Note that the tablespace has been imported. Initially, purpose=FIL_TYPE_IMPORT so that no redo log is written while the space ID is being updated in each page. */ - void set_imported(); + inline void set_imported(); + + /** @return whether the storage device is rotational (HDD, not SSD) */ + inline bool is_rotational() const; /** Open each file. Only invoked on fil_system.temp_space. @return whether all files were opened */ @@ -257,38 +260,290 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, void close(); /** Acquire a tablespace reference. */ - void acquire() { my_atomic_addlint(&n_pending_ops, 1); } + void acquire() { n_pending_ops++; } /** Release a tablespace reference. */ - void release() + void release() { ut_ad(referenced()); n_pending_ops--; } + /** @return whether references are being held */ + bool referenced() const { return n_pending_ops; } + + /** Acquire a tablespace reference for I/O. 
*/ + void acquire_for_io() { n_pending_ios++; } + /** Release a tablespace reference for I/O. */ + void release_for_io() { ut_ad(pending_io()); n_pending_ios--; } + /** @return whether I/O is pending */ + bool pending_io() const { return n_pending_ios; } +#endif /* !UNIV_INNOCHECKSUM */ + /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags; + check fsp0types.h to more info about flags. */ + ulint flags; + + /** Determine if full_crc32 is used for a data file + @param[in] flags tablespace flags (FSP_FLAGS) + @return whether the full_crc32 algorithm is active */ + static bool full_crc32(ulint flags) { + return flags & FSP_FLAGS_FCRC32_MASK_MARKER; + } + /** @return whether innodb_checksum_algorithm=full_crc32 is active */ + bool full_crc32() const { return full_crc32(flags); } + /** Determine the logical page size. + @param flags tablespace flags (FSP_FLAGS) + @return the logical page size + @retval 0 if the flags are invalid */ + static unsigned logical_size(ulint flags) { + + ulint page_ssize = 0; + + if (full_crc32(flags)) { + page_ssize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags); + } else { + page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + } + + switch (page_ssize) { + case 3: return 4096; + case 4: return 8192; + case 5: + { ut_ad(full_crc32(flags)); return 16384; } + case 0: + { ut_ad(!full_crc32(flags)); return 16384; } + case 6: return 32768; + case 7: return 65536; + default: return 0; + } + } + /** Determine the ROW_FORMAT=COMPRESSED page size. + @param flags tablespace flags (FSP_FLAGS) + @return the ROW_FORMAT=COMPRESSED page size + @retval 0 if ROW_FORMAT=COMPRESSED is not used */ + static unsigned zip_size(ulint flags) { + + if (full_crc32(flags)) { + return 0; + } + + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + return zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize : 0; + } + /** Determine the physical page size. 
+ @param flags tablespace flags (FSP_FLAGS) + @return the physical page size */ + static unsigned physical_size(ulint flags) { + + if (full_crc32(flags)) { + return logical_size(flags); + } + + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + return zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize + : unsigned(srv_page_size); + } + /** @return the ROW_FORMAT=COMPRESSED page size + @retval 0 if ROW_FORMAT=COMPRESSED is not used */ + unsigned zip_size() const { return zip_size(flags); } + /** @return the physical page size */ + unsigned physical_size() const { return physical_size(flags); } + /** Check whether the compression enabled in tablespace. + @param[in] flags tablespace flags */ + static bool is_compressed(ulint flags) { + + if (full_crc32(flags)) { + ulint algo = FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO( + flags); + DBUG_ASSERT(algo <= PAGE_ALGORITHM_LAST); + return algo > 0; + } + + return FSP_FLAGS_HAS_PAGE_COMPRESSION(flags); + } + /** @return whether the compression enabled for the tablespace. */ + bool is_compressed() const { return is_compressed(flags); } + + /** Get the compression algorithm for full crc32 format. + @param[in] flags tablespace flags + @return algorithm type of tablespace */ + static ulint get_compression_algo(ulint flags) { - ut_ad(referenced()); - my_atomic_addlint(&n_pending_ops, ulint(-1)); + return full_crc32(flags) + ? 
FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) + : 0; } - /** @return whether references are being held */ - bool referenced() { return my_atomic_loadlint(&n_pending_ops); } - /** @return whether references are being held */ - bool referenced() const + /** @return the page_compressed algorithm + @retval 0 if not page_compressed */ + ulint get_compression_algo() const { + return fil_space_t::get_compression_algo(flags); + } + /** Determine if the page_compressed page contains an extra byte + for exact compressed stream length + @param[in] flags tablespace flags + @return whether the extra byte is needed */ + static bool full_crc32_page_compressed_len(ulint flags) { - return const_cast<fil_space_t*>(this)->referenced(); + DBUG_ASSERT(full_crc32(flags)); + switch (get_compression_algo(flags)) { + case PAGE_LZ4_ALGORITHM: + case PAGE_LZO_ALGORITHM: + case PAGE_SNAPPY_ALGORITHM: + return true; + } + return false; } - /** Acquire a tablespace reference for I/O. */ - void acquire_for_io() { my_atomic_addlint(&n_pending_ios, 1); } - /** Release a tablespace reference for I/O. */ - void release_for_io() + /** Whether the full checksum matches with non full checksum flags. + @param[in] flags flags present + @param[in] expected expected flags + @return true if it is equivalent */ + static bool is_flags_full_crc32_equal(ulint flags, ulint expected) { - ut_ad(pending_io()); - my_atomic_addlint(&n_pending_ios, ulint(-1)); + ut_ad(full_crc32(flags)); + ulint page_ssize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags); + + if (full_crc32(expected)) { + /* The data file may have been created with a + different innodb_compression_algorithm. But + we only support one innodb_page_size for all files. 
*/ + return page_ssize + == FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected); + } + + ulint space_page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(expected); + + if (page_ssize == 5) { + if (space_page_ssize) { + return false; + } + } else if (space_page_ssize != page_ssize) { + return false; + } + + return true; } - /** @return whether I/O is pending */ - bool pending_io() { return my_atomic_loadlint(&n_pending_ios); } - /** @return whether I/O is pending */ - bool pending_io() const + /** Whether old tablespace flags match full_crc32 flags. + @param[in] flags flags present + @param[in] expected expected flags + @return true if it is equivalent */ + static bool is_flags_non_full_crc32_equal(ulint flags, ulint expected) + { + ut_ad(!full_crc32(flags)); + + if (!full_crc32(expected)) { + return false; + } + + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + ulint space_page_ssize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE( + expected); + + if (page_ssize) { + if (space_page_ssize != 5) { + return false; + } + } else if (space_page_ssize != page_ssize) { + return false; + } + + return true; + } + /** Whether both fsp flags are equivalent */ + static bool is_flags_equal(ulint flags, ulint expected) + { + if (!((flags ^ expected) & ~(1U << FSP_FLAGS_POS_RESERVED))) { + return true; + } + + return full_crc32(flags) + ? is_flags_full_crc32_equal(flags, expected) + : is_flags_non_full_crc32_equal(flags, expected); + } + /** Validate the tablespace flags for full crc32 format. + @param[in] flags the content of FSP_SPACE_FLAGS + @return whether the flags are correct in full crc32 format */ + static bool is_fcrc32_valid_flags(ulint flags) + { + ut_ad(flags & FSP_FLAGS_FCRC32_MASK_MARKER); + const ulint page_ssize = physical_size(flags); + if (page_ssize < 3 || page_ssize & 8) { + return false; + } + + flags >>= FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + + return flags <= PAGE_ALGORITHM_LAST; + } + /** Validate the tablespace flags. 
+ @param[in] flags content of FSP_SPACE_FLAGS + @param[in] is_ibd whether this is an .ibd file + (not system tablespace) + @return whether the flags are correct. */ + static bool is_valid_flags(ulint flags, bool is_ibd) { - return const_cast<fil_space_t*>(this)->pending_io(); + DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", + return false;); + + if (full_crc32(flags)) { + return is_fcrc32_valid_flags(flags); + } + + if (flags == 0) { + return true; + } + + if (flags & ~FSP_FLAGS_MASK) { + return false; + } + + if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) + == FSP_FLAGS_MASK_ATOMIC_BLOBS) { + /* If the "atomic blobs" flag (indicating + ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) flag + is set, then the "post Antelope" + (ROW_FORMAT!=REDUNDANT) flag must also be set. */ + return false; + } + + /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag + of MySQL 5.6 and MariaDB 10.0, which we ignore. + In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20, + bits 10..14 would be nonzero 0bsssaa where sss is + nonzero PAGE_SSIZE (3, 4, 6, or 7) + and aa is ATOMIC_WRITES (not 0b11). */ + if (FSP_FLAGS_GET_RESERVED(flags) & ~1U) { + return false; + } + + const ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) { + /* the page_size is not between 4k and 64k; + 16k should be encoded as 0, not 5 */ + return false; + } + + const ulint zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + if (zssize == 0) { + /* not ROW_FORMAT=COMPRESSED */ + } else if (zssize > (ssize ? ssize : 5)) { + /* Invalid KEY_BLOCK_SIZE */ + return false; + } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) { + /* both these flags should be set for + ROW_FORMAT=COMPRESSED */ + return false; + } + + /* The flags do look valid. 
But, avoid misinterpreting + buggy MariaDB 10.1 format flags for + PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3} + as valid-looking PAGE_SSIZE if this is known to be + an .ibd file and we are using the default innodb_page_size=16k. */ + return(ssize == 0 || !is_ibd + || srv_page_size != UNIV_PAGE_SIZE_ORIG); } }; +#ifndef UNIV_INNOCHECKSUM /** Value of fil_space_t::magic_n */ #define FIL_SPACE_MAGIC_N 89472 @@ -302,6 +557,8 @@ struct fil_node_t { pfs_os_file_t handle; /** whether the file actually is a raw device or disk partition */ bool is_raw_disk; + /** whether the file is on non-rotational media (SSD) */ + bool on_ssd; /** size of the file in database pages (0 if not known yet); the possible last incomplete megabyte may be ignored if space->id == 0 */ @@ -344,6 +601,14 @@ struct fil_node_t { @return whether the page was found valid */ bool read_page0(bool first); + /** Determine some file metadata when creating or reading the file. + @param file the file that is being created, or OS_FILE_CLOSED */ + void find_metadata(os_file_t file = OS_FILE_CLOSED +#ifdef UNIV_LINUX + , struct stat* statbuf = NULL +#endif + ); + /** Close the file handle. 
*/ void close(); }; @@ -351,6 +616,24 @@ struct fil_node_t { /** Value of fil_node_t::magic_n */ #define FIL_NODE_MAGIC_N 89389 +inline void fil_space_t::set_imported() +{ + ut_ad(purpose == FIL_TYPE_IMPORT); + purpose = FIL_TYPE_TABLESPACE; + UT_LIST_GET_FIRST(chain)->find_metadata(); +} + +inline bool fil_space_t::is_rotational() const +{ + for (const fil_node_t* node = UT_LIST_GET_FIRST(chain); + node != NULL; node = UT_LIST_GET_NEXT(chain, node)) { + if (!node->on_ssd) { + return true; + } + } + return false; +} + /** Common InnoDB file extensions */ enum ib_extention { NO_EXT = 0, @@ -389,19 +672,12 @@ typedef byte fil_faddr_t; /*!< 'type' definition in C: an address #define FIL_ADDR_BYTE 4U /* then comes 2-byte byte offset within page*/ #define FIL_ADDR_SIZE 6U /* address size is 6 bytes */ -#ifndef UNIV_INNOCHECKSUM - /** File space address */ struct fil_addr_t { ulint page; /*!< page number within a space */ ulint boffset; /*!< byte offset within the page */ }; -/** The null file address */ -extern const fil_addr_t fil_addr_null; - -#endif /* !UNIV_INNOCHECKSUM */ - /** The byte offsets on a file page for various variables @{ */ #define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the page belongs to (== 0) but in later @@ -442,19 +718,19 @@ extern const fil_addr_t fil_addr_null; MySQL/InnoDB 5.1.7 or later, the contents of this field is valid for all uncompressed pages. 
*/ -#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U /*!< for the first page - in a system tablespace data file - (ibdata*, not *.ibd): the file has - been flushed to disk at least up - to this lsn - for other pages: a 32-bit key version - used to encrypt the page + 32-bit checksum - or 64 bits of zero if no encryption - */ + +/** For the first page in a system tablespace data file(ibdata*, not *.ibd): +the file has been flushed to disk at least up to this lsn +For other pages: 32-bit key version used to encrypt the page + 32-bit checksum +or 64 bites of zero if no encryption */ +#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U /** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */ #define FIL_RTREE_SPLIT_SEQ_NUM FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION +/** Start of the page_compressed content */ +#define FIL_PAGE_COMP_ALGO FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + /** starting from 4.1.x this contains the space id of the page */ #define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34U @@ -462,25 +738,45 @@ extern const fil_addr_t fil_addr_null; #define FIL_PAGE_DATA 38U /*!< start of the data on the page */ -/* Following are used when page compression is used */ -#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store - actual payload data size on - compressed pages. */ -#define FIL_PAGE_COMPRESSION_METHOD_SIZE 2 - /*!< Number of bytes used to store - actual compression method. */ +/** 32-bit key version used to encrypt the page in full_crc32 format. +For non-encrypted page, it contains 0. */ +#define FIL_PAGE_FCRC32_KEY_VERSION 0 + +/** page_compressed without innodb_checksum_algorithm=full_crc32 @{ */ +/** Number of bytes used to store actual payload data size on +page_compressed pages when not using full_crc32. 
*/ +#define FIL_PAGE_COMP_SIZE 0 + +/** Number of bytes for FIL_PAGE_COMP_SIZE */ +#define FIL_PAGE_COMP_METADATA_LEN 2 + +/** Number of bytes used to store actual compression method +for encrypted tables when not using full_crc32. */ +#define FIL_PAGE_ENCRYPT_COMP_ALGO 2 + +/** Extra header size for encrypted page_compressed pages when +not using full_crc32 */ +#define FIL_PAGE_ENCRYPT_COMP_METADATA_LEN 4 /* @} */ + /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used to store the page checksum, the last 4 bytes should be identical to the last 4 bytes of FIL_PAGE_LSN */ #define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */ + +/** Store the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_FCRC32_END_LSN 8 + +/** Store crc32 checksum at the end of the page */ +#define FIL_PAGE_FCRC32_CHECKSUM 4 /* @} */ /** File page types (values of FIL_PAGE_TYPE) @{ */ -#define FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED 37401 /*!< Page is compressed and - then encrypted */ +/** page_compressed, encrypted=YES (not used for full_crc32) */ +#define FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED 37401 +/** page_compressed (not used for full_crc32) */ #define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_RTREE 17854 /*!< R-tree node (SPATIAL INDEX) */ @@ -513,6 +809,12 @@ extern const fil_addr_t fil_addr_null; Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */ #define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_UNKNOWN /*!< Last page type */ +/** Set in FIL_PAGE_TYPE if for full_crc32 pages in page_compressed format. +If the flag is set, then the following holds for the remaining bits +of FIL_PAGE_TYPE: +Bits 0..7 will contain the compressed page size in bytes. +Bits 8..14 are reserved and must be 0. 
*/ +#define FIL_PAGE_COMPRESS_FCRC32_MARKER 15 /* @} */ /** @return whether the page type is B-tree or R-tree index */ @@ -597,6 +899,22 @@ struct fil_system_t { private: bool m_initialised; +#ifdef UNIV_LINUX + /** available block devices that reside on non-rotational storage */ + std::vector<dev_t> ssd; +public: + /** @return whether a file system device is on non-rotational storage */ + bool is_ssd(dev_t dev) const + { + /* Linux seems to allow up to 15 partitions per block device. + If the detected ssd carries "partition number 0" (it is the whole device), + compare the candidate file system number without the partition number. */ + for (const auto s : ssd) + if (dev == s || (dev & ~15U) == s) + return true; + return false; + } +#endif public: ib_mutex_t mutex; /*!< The mutex protecting the cache */ fil_space_t* sys_space; /*!< The innodb_system tablespace */ @@ -749,16 +1067,6 @@ fil_space_get_flags( /*================*/ ulint id); /*!< in: space id */ -/** Returns the page size of the space and whether it is compressed or not. -The tablespace must be cached in the memory cache. -@param[in] id space id -@param[out] found true if tablespace was found -@return page size */ -const page_size_t -fil_space_get_page_size( - ulint id, - bool* found); - /*******************************************************************//** Opens all log files and system tablespace data files. They stay open until the database server shutdown. This should be called at a server startup after the @@ -804,10 +1112,8 @@ for concurrency control. @param[in] id tablespace ID @param[in] silent whether to silently ignore missing tablespaces @return the tablespace -@retval NULL if missing or being deleted or truncated */ -UNIV_INTERN -fil_space_t* -fil_space_acquire_low(ulint id, bool silent) +@retval NULL if missing or being deleted */ +fil_space_t* fil_space_acquire_low(ulint id, bool silent) MY_ATTRIBUTE((warn_unused_result)); /** Acquire a tablespace when it could be dropped concurrently. 
@@ -1085,7 +1391,7 @@ fil_space_extend( @param[in] type IO context @param[in] sync true if synchronous aio is desired @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] byte_offset remainder of offset in bytes; in aio this must be divisible by the OS block size @param[in] len how many bytes to read or write; this must @@ -1097,14 +1403,14 @@ fil_space_extend( @param[in] message message for aio handler if non-sync aio used, else ignored @param[in] ignore_missing_space true=ignore missing space during read -@return DB_SUCCESS, DB_TABLESPACE_DELETED or DB_TABLESPACE_TRUNCATED +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ dberr_t fil_io( const IORequest& type, bool sync, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint byte_offset, ulint len, void* buf, diff --git a/storage/innobase/include/fil0fil.ic b/storage/innobase/include/fil0fil.ic index 31466f38546..24e4157d1f3 100644 --- a/storage/innobase/include/fil0fil.ic +++ b/storage/innobase/include/fil0fil.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2015, 2018, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -75,17 +75,25 @@ fil_get_page_type_name( } } -/****************************************************************//** -Validate page type. +#ifdef UNIV_DEBUG +/** Validate page type. 
+@param[in] space Tablespace object +@param[in] page page to validate @return true if valid, false if not */ UNIV_INLINE bool fil_page_type_validate( - const byte* page) /*!< in: page */ + fil_space_t* space, + const byte* page) { -#ifdef UNIV_DEBUG ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE); + if ((page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) + && space->full_crc32() + && space->is_compressed()) { + return true; + } + /* Validate page type */ if (!((page_type == FIL_PAGE_PAGE_COMPRESSED || page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED || @@ -106,25 +114,31 @@ fil_page_type_validate( page_type == FIL_PAGE_TYPE_ZBLOB2 || page_type == FIL_PAGE_TYPE_UNKNOWN))) { - ulint space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + ulint space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + ulint offset = mach_read_from_4(page + FIL_PAGE_OFFSET); - fil_system_enter(); - fil_space_t* rspace = fil_space_get_by_id(space); - fil_system_exit(); + + ulint key_version = mach_read_from_4( + page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + if (space && space->full_crc32()) { + key_version = mach_read_from_4( + page + FIL_PAGE_FCRC32_KEY_VERSION); + } /* Dump out the page info */ - ib::fatal() << "Page " << space << ":" << offset - << " name " << (rspace ? rspace->name : "???") + ib::fatal() << "Page " << space_id << ":" << offset + << " name " << (space ? 
space->name : "???") << " page_type " << page_type - << " key_version " - << mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) + << " key_version " << key_version << " lsn " << mach_read_from_8(page + FIL_PAGE_LSN) << " compressed_len " << mach_read_from_2(page + FIL_PAGE_DATA); return false; } -#endif /* UNIV_DEBUG */ return true; } +#endif /* UNIV_DEBUG */ #endif /* fil0fil_ic */ diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h index 545e05da769..9baf3289380 100644 --- a/storage/innobase/include/fil0pagecompress.h +++ b/storage/innobase/include/fil0pagecompress.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (C) 2013, 2018 MariaDB Corporation. +Copyright (C) 2013, 2019 MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,21 +33,29 @@ Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com /** Compress a page_compressed page before writing to a data file. @param[in] buf page to be compressed @param[out] out_buf compressed page -@param[in] level compression level +@param[in] flags tablespace flags @param[in] block_size file system block size @param[in] encrypted whether the page will be subsequently encrypted @return actual length of compressed page @retval 0 if the page was not compressed */ -ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, - ulint block_size, bool encrypted) +ulint fil_page_compress( + const byte* buf, + byte* out_buf, + ulint flags, + ulint block_size, + bool encrypted) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Decompress a page that may be subject to page_compressed compression. 
@param[in,out] tmp_buf temporary buffer (of innodb_page_size) @param[in,out] buf compressed page buffer +@param[in] flags talespace flags @return size of the compressed data @retval 0 if decompression failed @retval srv_page_size if the page was not compressed */ -ulint fil_page_decompress(byte* tmp_buf, byte* buf) +ulint fil_page_decompress( + byte* tmp_buf, + byte* buf, + ulint flags) MY_ATTRIBUTE((nonnull, warn_unused_result)); #endif diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h index 8c5b24fbadb..15485769429 100644 --- a/storage/innobase/include/fsp0file.h +++ b/storage/innobase/include/fsp0file.h @@ -504,13 +504,13 @@ public: /* No op - base constructor is called. */ } - ~RemoteDatafile() + ~RemoteDatafile() override { shutdown(); } /** Release the resources. */ - void shutdown(); + void shutdown() override; /** Get the link filepath. @return m_link_filepath */ @@ -532,7 +532,7 @@ public: in read-only mode so that it can be validated. @param[in] strict whether to issue error messages @return DB_SUCCESS or error code */ - dberr_t open_read_only(bool strict); + dberr_t open_read_only(bool strict) override; /** Opens a handle to the file linked to in an InnoDB Symbolic Link file in read-write mode so that it can be restored from doublewrite @@ -540,7 +540,7 @@ public: @param[in] read_only_mode If true, then readonly mode checks are enforced. 
@return DB_SUCCESS or error code */ - dberr_t open_read_write(bool read_only_mode) + dberr_t open_read_write(bool read_only_mode) override MY_ATTRIBUTE((warn_unused_result)); /****************************************************************** diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 2ba85803eb5..8e1acfe1805 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -28,15 +28,15 @@ Created 12/18/1995 Heikki Tuuri #define fsp0fsp_h #include "fsp0types.h" +#include "fut0lst.h" +#include "ut0byte.h" #ifndef UNIV_INNOCHECKSUM - -#include "fut0lst.h" #include "mtr0mtr.h" #include "page0types.h" #include "rem0types.h" -#include "ut0byte.h" - +#else +# include "mach0data.h" #endif /* !UNIV_INNOCHECKSUM */ /** @return the PAGE_SSIZE flags for the current innodb_page_size */ @@ -45,6 +45,12 @@ Created 12/18/1995 Heikki Tuuri 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ << FSP_FLAGS_POS_PAGE_SSIZE) +/** @return the PAGE_SSIZE flags for the current innodb_page_size in +full checksum format */ +#define FSP_FLAGS_FCRC32_PAGE_SSIZE() \ + ((srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + /* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20; see the table in fsp0types.h @{ */ /** Zero relative shift position of the PAGE_COMPRESSION field */ @@ -201,11 +207,6 @@ typedef byte fseg_inode_t; (16 + 3 * FLST_BASE_NODE_SIZE \ + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE) -#define FSP_SEG_INODES_PER_PAGE(page_size) \ - ((page_size.physical() - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE) - /* Number of segment inodes which fit on a - single page */ - #define FSEG_MAGIC_N_VALUE 97937874 #define FSEG_FILLFACTOR 8 /* If this value is x, then if @@ -290,33 +291,6 @@ the extent are free and which contain old tuple version to clean. */ #ifndef UNIV_INNOCHECKSUM /* @} */ -/** Calculate the number of pages to extend a datafile. 
-We extend single-table tablespaces first one extent at a time, -but 4 at a time for bigger tablespaces. It is not enough to extend always -by one extent, because we need to add at least one extent to FSP_FREE. -A single extent descriptor page will track many extents. And the extent -that uses its extent descriptor page is put onto the FSP_FREE_FRAG list. -Extents that do not use their extent descriptor page are added to FSP_FREE. -The physical page size is used to determine how many extents are tracked -on one extent descriptor page. See xdes_calc_descriptor_page(). -@param[in] page_size page_size of the datafile -@param[in] size current number of pages in the datafile -@return number of pages to extend the file. */ -ulint -fsp_get_pages_to_extend_ibd( - const page_size_t& page_size, - ulint size); - -/** Calculate the number of physical pages in an extent for this file. -@param[in] page_size page_size of the datafile -@return number of pages in an extent for this file. */ -UNIV_INLINE -ulint -fsp_get_extent_size_in_pages(const page_size_t& page_size) -{ - return (FSP_EXTENT_SIZE << srv_page_size_shift) / page_size.physical(); -} - /**********************************************************************//** Reads the space id from the first page of a tablespace. @return space id, ULINT UNDEFINED if error */ @@ -347,13 +321,15 @@ fsp_header_get_flags(const page_t* page) } /** Get the byte offset of encryption information in page 0. -@param[in] ps page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @return byte offset relative to FSP_HEADER_OFFSET */ inline MY_ATTRIBUTE((pure, warn_unused_result)) -ulint -fsp_header_get_encryption_offset(const page_size_t& ps) +ulint fsp_header_get_encryption_offset(ulint zip_size) { - return XDES_ARR_OFFSET + XDES_SIZE * ps.physical() / FSP_EXTENT_SIZE; + return zip_size + ? 
XDES_ARR_OFFSET + XDES_SIZE * zip_size / FSP_EXTENT_SIZE + : XDES_ARR_OFFSET + (XDES_SIZE << srv_page_size_shift) + / FSP_EXTENT_SIZE; } /** Check the encryption key from the first page of a tablespace. @@ -512,12 +488,14 @@ fsp_reserve_free_extents( @param[in,out] seg_header file segment header @param[in,out] space tablespace @param[in] offset page number +@param[in] log whether to write MLOG_INIT_FREE_PAGE record @param[in,out] mtr mini-transaction */ void fseg_free_page( fseg_header_t* seg_header, fil_space_t* space, ulint offset, + bool log, mtr_t* mtr); /** Determine whether a page is free. @param[in,out] space tablespace @@ -591,13 +569,12 @@ fil_block_check_type( /** Checks if a page address is an extent descriptor page address. @param[in] page_id page id -@param[in] page_size page size -@return TRUE if a descriptor page */ -UNIV_INLINE -ibool -fsp_descr_page( - const page_id_t page_id, - const page_size_t& page_size); +@param[in] physical_size page size +@return whether a descriptor page */ +inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size) +{ + return (page_id.page_no() & (physical_size - 1)) == FSP_XDES_OFFSET; +} /** Initialize a file page whose prior contents should be ignored. 
@param[in,out] block buffer pool block */ @@ -644,7 +621,7 @@ fsp_flags_convert_from_101(ulint flags) { DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(ULINT_UNDEFINED);); - if (flags == 0) { + if (flags == 0 || fil_space_t::full_crc32(flags)) { return(flags); } @@ -739,7 +716,7 @@ fsp_flags_convert_from_101(ulint flags) flags = ((flags & 0x3f) | ssize << FSP_FLAGS_POS_PAGE_SSIZE | FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) << FSP_FLAGS_POS_PAGE_COMPRESSION); - ut_ad(fsp_flags_is_valid(flags, false)); + ut_ad(fil_space_t::is_valid_flags(flags, false)); return(flags); } @@ -753,7 +730,7 @@ bool fsp_flags_match(ulint expected, ulint actual) { expected &= ~FSP_FLAGS_MEM_MASK; - ut_ad(fsp_flags_is_valid(expected, false)); + ut_ad(fil_space_t::is_valid_flags(expected, false)); if (actual == expected) { return(true); @@ -763,16 +740,6 @@ fsp_flags_match(ulint expected, ulint actual) return(actual == expected); } -/** Calculates the descriptor index within a descriptor page. -@param[in] page_size page size -@param[in] offset page offset -@return descriptor index */ -UNIV_INLINE -ulint -xdes_calc_descriptor_index( - const page_size_t& page_size, - ulint offset); - /**********************************************************************//** Gets a descriptor bit of a page. @return TRUE if free */ @@ -785,15 +752,42 @@ xdes_get_bit( ulint offset);/*!< in: page offset within extent: 0 ... FSP_EXTENT_SIZE - 1 */ -/** Calculates the page where the descriptor of a page resides. -@param[in] page_size page size +/** Determine the descriptor index within a descriptor page. +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] offset page offset +@return descriptor index */ +inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset) +{ + return ut_2pow_remainder<ulint>(offset, + zip_size ? zip_size : srv_page_size) + / FSP_EXTENT_SIZE; +} + +/** Determine the descriptor page number for a page. 
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] offset page offset @return descriptor page offset */ -UNIV_INLINE -ulint -xdes_calc_descriptor_page( - const page_size_t& page_size, - ulint offset); +inline ulint xdes_calc_descriptor_page(ulint zip_size, ulint offset) +{ + compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) + * XDES_SIZE_MAX); + compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN) + * XDES_SIZE_MIN); + + ut_ad(srv_page_size > XDES_ARR_OFFSET + + (srv_page_size / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(!zip_size + || zip_size > XDES_ARR_OFFSET + + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE); + return ut_2pow_round<ulint>(offset, + zip_size ? zip_size : srv_page_size); +} #endif /* UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index 9f28aacaff5..31b9d8c5dbe 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,37 +24,6 @@ File space management Created 12/18/1995 Heikki Tuuri *******************************************************/ -#ifndef UNIV_INNOCHECKSUM - -/** Checks if a page address is an extent descriptor page address. 
-@param[in] page_id page id -@param[in] page_size page size -@return TRUE if a descriptor page */ -UNIV_INLINE -ibool -fsp_descr_page( - const page_id_t page_id, - const page_size_t& page_size) -{ - return((page_id.page_no() & (page_size.physical() - 1)) - == FSP_XDES_OFFSET); -} - -/** Calculates the descriptor index within a descriptor page. -@param[in] page_size page size -@param[in] offset page offset -@return descriptor index */ -UNIV_INLINE -ulint -xdes_calc_descriptor_index( - const page_size_t& page_size, - ulint offset) -{ - return(ut_2pow_remainder(offset, page_size.physical()) - / FSP_EXTENT_SIZE); -} -#endif /*!UNIV_INNOCHECKSUM */ - /**********************************************************************//** Gets a descriptor bit of a page. @return TRUE if free */ @@ -75,44 +44,5 @@ xdes_get_bit( ulint bit_index = index % 8; ulint byte_index = index / 8; - return(ut_bit_get_nth( - mach_read_ulint(descr + XDES_BITMAP + byte_index, - MLOG_1BYTE), - bit_index)); -} - -#ifndef UNIV_INNOCHECKSUM -/** Calculates the page where the descriptor of a page resides. 
-@param[in] page_size page size -@param[in] offset page offset -@return descriptor page offset */ -UNIV_INLINE -ulint -xdes_calc_descriptor_page( - const page_size_t& page_size, - ulint offset) -{ - compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET - + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) - * XDES_SIZE_MAX); - compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET - + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN) - * XDES_SIZE_MIN); - - ut_ad(srv_page_size > XDES_ARR_OFFSET - + (srv_page_size / FSP_EXTENT_SIZE) - * XDES_SIZE); - ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET - + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) - * XDES_SIZE); - -#ifdef UNIV_DEBUG - if (page_size.is_compressed()) { - ut_a(page_size.physical() > XDES_ARR_OFFSET - + (page_size.physical() / FSP_EXTENT_SIZE) * XDES_SIZE); - } -#endif /* UNIV_DEBUG */ - - return(ut_2pow_round(offset, page_size.physical())); + return ut_bit_get_nth(descr[XDES_BITMAP + byte_index], bit_index); } -#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h index fc0b907dfa7..27423858435 100644 --- a/storage/innobase/include/fsp0pagecompress.h +++ b/storage/innobase/include/fsp0pagecompress.h @@ -27,17 +27,6 @@ Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com #ifndef fsp0pagecompress_h #define fsp0pagecompress_h -/* Supported page compression methods */ - -#define PAGE_UNCOMPRESSED 0 -#define PAGE_ZLIB_ALGORITHM 1 -#define PAGE_LZ4_ALGORITHM 2 -#define PAGE_LZO_ALGORITHM 3 -#define PAGE_LZMA_ALGORITHM 4 -#define PAGE_BZIP2_ALGORITHM 5 -#define PAGE_SNAPPY_ALGORITHM 6 -#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM - /**********************************************************************//** Reads the page compression level from the first page of a tablespace. 
@return page compression level, or 0 if uncompressed */ diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h index 5bd70e4f80d..632c65e14cc 100644 --- a/storage/innobase/include/fsp0space.h +++ b/storage/innobase/include/fsp0space.h @@ -127,7 +127,7 @@ public: @param[in] fsp_flags tablespace flags */ void set_flags(ulint fsp_flags) { - ut_ad(fsp_flags_is_valid(fsp_flags, false)); + ut_ad(fil_space_t::is_valid_flags(fsp_flags, false)); m_flags = fsp_flags; } diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h index d3a79ec23a9..bcb8dd5e5e9 100644 --- a/storage/innobase/include/fsp0sysspace.h +++ b/storage/innobase/include/fsp0sysspace.h @@ -49,7 +49,7 @@ public: /* No op */ } - ~SysTablespace() + ~SysTablespace() override { shutdown(); } diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index 5c77b62723a..69c5346a4f9 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -27,10 +27,6 @@ Created May 26, 2009 Vasil Dimov #ifndef fsp0types_h #define fsp0types_h -#include "univ.i" - -#ifndef UNIV_INNOCHECKSUM - /** The fil_space_t::id of the redo log. All persistent tablespaces have a smaller fil_space_t::id. */ #define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0U @@ -39,6 +35,16 @@ have a smaller fil_space_t::id. 
*/ #include "ut0byte.h" +/* Possible values of innodb_compression_algorithm */ +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_LZMA_ALGORITHM 4 +#define PAGE_BZIP2_ALGORITHM 5 +#define PAGE_SNAPPY_ALGORITHM 6 +#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM + /** @name Flags for inserting records in order If records are inserted in order, there are the following flags to tell this (their type is made byte for the compiler @@ -50,7 +56,6 @@ fseg_alloc_free_page) */ #define FSP_NO_DIR ((byte)113) /*!< no order */ /* @} */ -#endif /* !UNIV_INNOCHECKSUM */ /** File space extent size in pages page size | file space extent size ----------+----------------------- @@ -73,7 +78,6 @@ page size | file space extent size offset */ #define FSEG_PAGE_DATA FIL_PAGE_DATA -#ifndef UNIV_INNOCHECKSUM /** @name File segment header The file segment header points to the inode describing the file segment. */ /* @{ */ @@ -88,6 +92,7 @@ typedef byte fseg_header_t; header, in bytes */ /* @} */ +#ifndef UNIV_INNOCHECKSUM #ifdef UNIV_DEBUG struct mtr_t; @@ -224,6 +229,15 @@ to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. */ /** A mask of all the known/used bits in FSP_SPACE_FLAGS */ #define FSP_FLAGS_MASK (~(~0U << FSP_FLAGS_WIDTH)) +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE 4 + +/** Marker to indicate whether tablespace is in full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_MARKER 1 + +/** Stores the compressed algo for full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO 3 + /* FSP_SPACE_FLAGS position and name in MySQL 5.6/MariaDB 10.0 or older and MariaDB 10.1.20 or older MariaDB 10.1 and in MariaDB 10.1.21 or newer. @@ -286,6 +300,19 @@ these are only used in MySQL 5.7 and used for compatibility. 
*/ #define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_RESERVED \ + FSP_FLAGS_WIDTH_RESERVED) +/** Zero relative shift position of the PAGE_SIZE field +in full crc32 format */ +#define FSP_FLAGS_FCRC32_POS_PAGE_SSIZE 0 + +/** Zero relative shift position of the MARKER field in full crc32 format. */ +#define FSP_FLAGS_FCRC32_POS_MARKER (FSP_FLAGS_FCRC32_POS_PAGE_SSIZE \ + + FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE) + +/** Zero relative shift position of the compressed algorithm stored +in full crc32 format. */ +#define FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO (FSP_FLAGS_FCRC32_POS_MARKER \ + + FSP_FLAGS_FCRC32_WIDTH_MARKER) + /** Bit mask of the POST_ANTELOPE field */ #define FSP_FLAGS_MASK_POST_ANTELOPE \ ((~(~0U << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \ @@ -315,6 +342,21 @@ these are only used in MySQL 5.7 and used for compatibility. */ #define FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL \ (15U << FSP_FLAGS_MEM_COMPRESSION_LEVEL) +/** Bit mask of the PAGE_SIZE field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + +/** Bit mask of the MARKER field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_MARKER \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_MARKER)) \ + << FSP_FLAGS_FCRC32_POS_MARKER) + +/** Bit mask of the COMPRESSED ALGO field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO)) \ + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) + /** Return the value of the POST_ANTELOPE field */ #define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \ @@ -339,10 +381,14 @@ these are only used in MySQL 5.7 and used for compatibility. 
*/ #define FSP_FLAGS_HAS_PAGE_COMPRESSION(flags) \ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ >> FSP_FLAGS_POS_PAGE_COMPRESSION) - -/** Return the contents of the UNUSED bits */ -#define FSP_FLAGS_GET_UNUSED(flags) \ - (flags >> FSP_FLAGS_POS_UNUSED) +/** @return the PAGE_SSIZE flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) +/** @return the COMPRESSED_ALGO flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO) \ + >> FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) /** @return the value of the DATA_DIR field */ #define FSP_FLAGS_HAS_DATA_DIR(flags) \ @@ -354,67 +400,4 @@ these are only used in MySQL 5.7 and used for compatibility. */ /* @} */ -/** Validate the tablespace flags, which are stored in the -tablespace header at offset FSP_SPACE_FLAGS. -@param[in] flags the contents of FSP_SPACE_FLAGS -@param[in] is_ibd whether this is an .ibd file (not system tablespace) -@return whether the flags are correct (not in the buggy 10.1) format */ -MY_ATTRIBUTE((warn_unused_result, const)) -UNIV_INLINE -bool -fsp_flags_is_valid(ulint flags, bool is_ibd) -{ - DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", - return(false);); - if (flags == 0) { - return(true); - } - if (flags & ~FSP_FLAGS_MASK) { - return(false); - } - if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS)) - == FSP_FLAGS_MASK_ATOMIC_BLOBS) { - /* If the "atomic blobs" flag (indicating - ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) flag - is set, then the "post Antelope" (ROW_FORMAT!=REDUNDANT) flag - must also be set. */ - return(false); - } - /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag - of MySQL 5.6 and MariaDB 10.0, which we ignore. 
- In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20, - bits 10..14 would be nonzero 0bsssaa where sss is - nonzero PAGE_SSIZE (3, 4, 6, or 7) - and aa is ATOMIC_WRITES (not 0b11). */ - if (FSP_FLAGS_GET_RESERVED(flags) & ~1U) { - return(false); - } - - const ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); - if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) { - /* the page_size is not between 4k and 64k; - 16k should be encoded as 0, not 5 */ - return(false); - } - const ulint zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); - if (zssize == 0) { - /* not ROW_FORMAT=COMPRESSED */ - } else if (zssize > (ssize ? ssize : 5)) { - /* invalid KEY_BLOCK_SIZE */ - return(false); - } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE - | FSP_FLAGS_MASK_ATOMIC_BLOBS)) { - /* both these flags should be set for - ROW_FORMAT=COMPRESSED */ - return(false); - } - - /* The flags do look valid. But, avoid misinterpreting - buggy MariaDB 10.1 format flags for - PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3} - as valid-looking PAGE_SSIZE if this is known to be - an .ibd file and we are using the default innodb_page_size=16k. 
*/ - return(ssize == 0 || !is_ibd || srv_page_size != UNIV_PAGE_SIZE_ORIG); -} - #endif /* fsp0types_h */ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index 354c36aba50..950d978e073 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -323,7 +323,7 @@ public: /** Whether the ADDED table record sync-ed after crash recovery; protected by bg_threads_mutex */ unsigned added_synced:1; - /** Whether the table holds dict_sys->mutex; + /** Whether the table holds dict_sys.mutex; protected by bg_threads_mutex */ unsigned dict_locked:1; @@ -384,9 +384,9 @@ extern bool fts_need_sync; #define fts_que_graph_free(graph) \ do { \ - mutex_enter(&dict_sys->mutex); \ + mutex_enter(&dict_sys.mutex); \ que_graph_free(graph); \ - mutex_exit(&dict_sys->mutex); \ + mutex_exit(&dict_sys.mutex); \ } while (0) /******************************************************************//** @@ -584,17 +584,15 @@ fts_get_doc_id_from_row( want to extract.*/ /** Extract the doc id from the record that belongs to index. -@param[in] table table -@param[in] rec record contains FTS_DOC_ID +@param[in] rec record containing FTS_DOC_ID @param[in] index index of rec -@param[in] heap heap memory +@param[in] offsets rec_get_offsets(rec,index) @return doc id that was extracted from rec */ doc_id_t fts_get_doc_id_from_rec( - dict_table_t* table, - const rec_t* rec, - const dict_index_t* index, - mem_heap_t* heap); + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets); /** Add new fts doc id to the update vector. @param[in] table the table that contains the FTS index. @@ -751,7 +749,7 @@ FTS auxiliary INDEX table and clear the cache at the end. 
dberr_t fts_sync_table(dict_table_t* table, bool wait = true); /****************************************************************//** -Free the query graph but check whether dict_sys->mutex is already +Free the query graph but check whether dict_sys.mutex is already held */ void fts_que_graph_free_check_lock( diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h index 3c6bc0e14d7..e0b0d27bebf 100644 --- a/storage/innobase/include/fts0priv.h +++ b/storage/innobase/include/fts0priv.h @@ -135,7 +135,7 @@ fts_eval_sql( /** Construct the name of an internal FTS table for the given table. @param[in] fts_table metadata on fulltext-indexed table @param[out] table_name a name up to MAX_FULL_NAME_LEN -@param[in] dict_locked whether dict_sys->mutex is being held */ +@param[in] dict_locked whether dict_sys.mutex is being held */ void fts_get_table_name(const fts_table_t* fts_table, char* table_name, bool dict_locked = false) MY_ATTRIBUTE((nonnull)); @@ -490,7 +490,7 @@ fts_get_table_id( MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Construct the name of an internal FTS table for the given table. @param[in] fts_table metadata on fulltext-indexed table -@param[in] dict_locked whether dict_sys->mutex is being held +@param[in] dict_locked whether dict_sys.mutex is being held @return the prefix, must be freed with ut_free() */ char* fts_get_table_name_prefix(const fts_table_t* fts_table) MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); diff --git a/storage/innobase/include/fut0fut.h b/storage/innobase/include/fut0fut.h index 3c3f118bd68..a52fc256efa 100644 --- a/storage/innobase/include/fut0fut.h +++ b/storage/innobase/include/fut0fut.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,12 +28,11 @@ Created 12/13/1995 Heikki Tuuri #ifndef fut0fut_h #define fut0fut_h -#include "fil0fil.h" #include "mtr0mtr.h" /** Gets a pointer to a file address and latches the page. @param[in] space space id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] addr file address @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_SX_LATCH @param[out] ptr_block file page @@ -43,13 +43,32 @@ UNIV_INLINE byte* fut_get_ptr( ulint space, - const page_size_t& page_size, + ulint zip_size, fil_addr_t addr, rw_lock_type_t rw_latch, mtr_t* mtr, buf_block_t** ptr_block = NULL) - MY_ATTRIBUTE((warn_unused_result)); +{ + buf_block_t* block; + byte* ptr = NULL; -#include "fut0fut.ic" + ut_ad(addr.boffset < srv_page_size); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_SX_LATCH)); + + block = buf_page_get(page_id_t(space, addr.page), zip_size, + rw_latch, mtr); + + ptr = buf_block_get_frame(block) + addr.boffset; + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + if (ptr_block != NULL) { + *ptr_block = block; + } + + return(ptr); +} #endif /* fut0fut_h */ diff --git a/storage/innobase/include/fut0fut.ic b/storage/innobase/include/fut0fut.ic deleted file mode 100644 index b5c1e15e059..00000000000 --- a/storage/innobase/include/fut0fut.ic +++ /dev/null @@ -1,68 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. 
- -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/******************************************************************//** -@file include/fut0fut.ic -File-based utilities - -Created 12/13/1995 Heikki Tuuri -***********************************************************************/ - -#include "sync0rw.h" -#include "buf0buf.h" - -/** Gets a pointer to a file address and latches the page. -@param[in] space space id -@param[in] page_size page size -@param[in] addr file address -@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_SX_LATCH -@param[in,out] mtr mini-transaction -@param[out] ptr_block file page -@return pointer to a byte in (*ptr_block)->frame; the *ptr_block is -bufferfixed and latched */ -UNIV_INLINE -byte* -fut_get_ptr( - ulint space, - const page_size_t& page_size, - fil_addr_t addr, - rw_lock_type_t rw_latch, - mtr_t* mtr, - buf_block_t** ptr_block) -{ - buf_block_t* block; - byte* ptr = NULL; - - ut_ad(addr.boffset < srv_page_size); - ut_ad((rw_latch == RW_S_LATCH) - || (rw_latch == RW_X_LATCH) - || (rw_latch == RW_SX_LATCH)); - - block = buf_page_get(page_id_t(space, addr.page), page_size, - rw_latch, mtr); - - ptr = buf_block_get_frame(block) + addr.boffset; - - buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); - - if (ptr_block != NULL) { - *ptr_block = block; - } - - return(ptr); -} diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h index 187b673d2fd..9fa928eda23 100644 --- a/storage/innobase/include/fut0lst.h +++ 
b/storage/innobase/include/fut0lst.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,11 +27,11 @@ Created 11/28/1995 Heikki Tuuri #ifndef fut0lst_h #define fut0lst_h -#ifndef UNIV_INNOCHECKSUM - -#include "fil0fil.h" -#include "mtr0mtr.h" - +#ifdef UNIV_INNOCHECKSUM +# include "fil0fil.h" +#else +#include "fut0fut.h" +#include "mtr0log.h" /* The C 'types' of base node and list node: these should be used to write self-documenting code. Of course, the sizeof macro cannot be @@ -39,14 +40,59 @@ applied to these types! */ typedef byte flst_base_node_t; typedef byte flst_node_t; -/* The physical size of a list base node in bytes */ -#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE) #endif /* !UNIV_INNOCHECKSUM */ +/* The physical size of a list base node in bytes */ +#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE) /* The physical size of a list node in bytes */ #define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE) #ifndef UNIV_INNOCHECKSUM +/* We define the field offsets of a node for the list */ +#define FLST_PREV 0 /* 6-byte address of the previous list element; + the page part of address is FIL_NULL, if no + previous element */ +#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next + list element; the page part of address + is FIL_NULL, if no next element */ + +/* We define the field offsets of a base node for the list */ +#define FLST_LEN 0 /* 32-bit list length field */ +#define FLST_FIRST 4 /* 6-byte address of the first element + of the list; undefined if empty list */ +#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the + last element of the list; undefined + if empty list */ + +/** Initialize a zero-initialized list base 
node. +@param[in,out] block file page +@param[in] ofs byte offset of the list base node +@param[in,out] mtr mini-transaction */ +inline void flst_init(buf_block_t* block, uint16_t ofs, mtr_t* mtr) +{ + ut_ad(0 == mach_read_from_2(FLST_LEN + ofs + block->frame)); + ut_ad(0 == mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + + block->frame)); + ut_ad(0 == mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + + block->frame)); + compile_time_assert(FIL_NULL == 0xffU * 0x1010101U); + mlog_memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff, mtr); + mlog_memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff, mtr); +} + +/** Write a null file address. +@param[in,out] faddr file address to be zeroed out +@param[in,out] mtr mini-transaction */ +inline void flst_zero_addr(fil_faddr_t* faddr, mtr_t* mtr) +{ + if (mach_read_from_4(faddr + FIL_ADDR_PAGE) != FIL_NULL) { + mlog_memset(faddr + FIL_ADDR_PAGE, 4, 0xff, mtr); + } + if (mach_read_from_2(faddr + FIL_ADDR_BYTE)) { + mlog_write_ulint(faddr + FIL_ADDR_BYTE, 0, MLOG_2BYTES, mtr); + } +} + /********************************************************************//** Initializes a list base node. */ UNIV_INLINE @@ -83,7 +129,7 @@ flst_remove( @param[in] base base node @return length */ UNIV_INLINE -ulint +uint32_t flst_get_len( const flst_base_node_t* base); /********************************************************************//** diff --git a/storage/innobase/include/fut0lst.ic b/storage/innobase/include/fut0lst.ic index 00bb3fe8e9c..ec4181b2c93 100644 --- a/storage/innobase/include/fut0lst.ic +++ b/storage/innobase/include/fut0lst.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,26 +24,8 @@ File-based list utilities Created 11/28/1995 Heikki Tuuri ***********************************************************************/ -#include "fut0fut.h" -#include "mtr0log.h" #include "buf0buf.h" -/* We define the field offsets of a node for the list */ -#define FLST_PREV 0 /* 6-byte address of the previous list element; - the page part of address is FIL_NULL, if no - previous element */ -#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next - list element; the page part of address - is FIL_NULL, if no next element */ - -/* We define the field offsets of a base node for the list */ -#define FLST_LEN 0 /* 32-bit list length field */ -#define FLST_FIRST 4 /* 6-byte address of the first element - of the list; undefined if empty list */ -#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the - last element of the list; undefined - if empty list */ - /********************************************************************//** Writes a file address. 
*/ UNIV_INLINE @@ -79,9 +62,8 @@ flst_read_addr( ut_ad(faddr && mtr); - addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr); - addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES, - mtr); + addr.page = mach_read_from_4(faddr + FIL_ADDR_PAGE); + addr.boffset = mach_read_from_2(faddr + FIL_ADDR_BYTE); ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); return(addr); @@ -100,16 +82,18 @@ flst_init( MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr); - flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr); - flst_write_addr(base + FLST_LAST, fil_addr_null, mtr); + if (mach_read_from_4(base + FLST_LEN)) { + mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr); + } + flst_zero_addr(base + FLST_FIRST, mtr); + flst_zero_addr(base + FLST_LAST, mtr); } /** Get the length of a list. @param[in] base base node @return length */ UNIV_INLINE -ulint +uint32_t flst_get_len( const flst_base_node_t* base) { diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h index e189b6a7f28..01fcc2943d2 100644 --- a/storage/innobase/include/gis0rtree.h +++ b/storage/innobase/include/gis0rtree.h @@ -191,23 +191,8 @@ rtr_non_leaf_insert_stack_push( double mbr_inc); /*!< in: MBR needs to be enlarged */ -/*****************************************************************//** -Allocates a new Split Sequence Number. -@return new SSN id */ -UNIV_INLINE -node_seq_t -rtr_get_new_ssn_id( -/*===============*/ - dict_index_t* index); /*!< in: the index struct */ - -/*****************************************************************//** -Get the current Split Sequence Number. 
-@return current SSN id */ -UNIV_INLINE -node_seq_t -rtr_get_current_ssn_id( -/*===================*/ - dict_index_t* index); /*!< in/out: the index struct */ +#define rtr_get_new_ssn_id(index) (index)->assign_ssn() +#define rtr_get_current_ssn_id(index) (index)->ssn() /********************************************************************//** Create a RTree search info structure */ diff --git a/storage/innobase/include/gis0rtree.ic b/storage/innobase/include/gis0rtree.ic index a22164931b2..2076b24b9b1 100644 --- a/storage/innobase/include/gis0rtree.ic +++ b/storage/innobase/include/gis0rtree.ic @@ -123,31 +123,6 @@ rtr_non_leaf_stack_push( #endif /* RTR_SEARCH_DIAGNOSTIC */ } -/*****************************************************************//** -Allocates a new Split Sequence Number. -@return new SSN id */ -UNIV_INLINE -node_seq_t -rtr_get_new_ssn_id( -/*===============*/ - dict_index_t* index) /*!< in/out: the index struct */ -{ - node_seq_t ssn= my_atomic_add32_explicit(&index->rtr_ssn, 1, - MY_MEMORY_ORDER_RELAXED); - return ssn + 1; -} -/*****************************************************************//** -Get the current Split Sequence Number. -@return current SSN id */ -UNIV_INLINE -node_seq_t -rtr_get_current_ssn_id( -/*===================*/ - dict_index_t* index) /*!< in: index struct */ -{ - return my_atomic_load32_explicit(&index->rtr_ssn, MY_MEMORY_ORDER_RELAXED); -} - /*********************************************************************//** Sets pointer to the data and length in a field. */ UNIV_INLINE diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h index c5ea817c6bf..a1e0a878cb2 100644 --- a/storage/innobase/include/gis0type.h +++ b/storage/innobase/include/gis0type.h @@ -35,7 +35,7 @@ Created 2013/03/27 Jimmy Yang #include "gis0geo.h" #include <vector> -#include <list> +#include <forward_list> /* Node Sequence Number. 
Only updated when page splits */ typedef ib_uint32_t node_seq_t; @@ -133,15 +133,14 @@ typedef struct rtr_info{ /*!< current search mode */ } rtr_info_t; -typedef std::list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_info_active; - -/* Tracking structure for all onoging search for an index */ -typedef struct rtr_info_track { - rtr_info_active* rtr_active; /*!< Active search info */ - ib_mutex_t rtr_active_mutex; +/* Tracking structure for all ongoing search for an index */ +struct rtr_info_track_t { + /** Active search info */ + std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active; + ib_mutex_t rtr_active_mutex; /*!< mutex to protect rtr_active */ -} rtr_info_track_t; +}; /* This is to record the record movement between pages. Used for corresponding lock movement */ diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index 51a34b91418..28e5d1d4f56 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -232,10 +232,11 @@ innobase_casedn_str( #ifdef WITH_WSREP UNIV_INTERN int -wsrep_innobase_kill_one_trx(void * const thd_ptr, - const trx_t * const bf_trx, - trx_t *victim_trx, - ibool signal); +wsrep_innobase_kill_one_trx( + THD* bf_thd, + trx_t *victim_trx, + bool signal); + ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, unsigned char* str, unsigned int str_length, unsigned int buf_length); @@ -512,18 +513,6 @@ normalize_table_name_c_low( const char* name, /*!< in: table name string */ ibool set_lower_case); /*!< in: TRUE if we want to set name to lower case */ -/*************************************************************//** -InnoDB index push-down condition check defined in ha_innodb.cc -@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */ - -#include <my_compare.h> - -ICP_RESULT -innobase_index_cond( -/*================*/ - void* file) /*!< in/out: pointer to ha_innobase */ - MY_ATTRIBUTE((warn_unused_result)); - /** Update the 
system variable with the given value of the InnoDB buffer pool size. @param[in] buf_pool_size given value of buffer pool size.*/ diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h index e496c65e46a..ce0e911dbb4 100644 --- a/storage/innobase/include/ib0mutex.h +++ b/storage/innobase/include/ib0mutex.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,13 +29,12 @@ Created 2013-03-26 Sunny Bains. #ifndef ib0mutex_h #define ib0mutex_h -#include "my_atomic.h" #include "my_cpu.h" #include "os0event.h" #include "sync0arr.h" /** OS mutex for tracking lock/unlock for debugging */ -template <template <typename> class Policy = NoPolicy> +template <template <typename> class Policy> struct OSTrackMutex { typedef Policy<OSTrackMutex> MutexPolicy; @@ -152,7 +151,7 @@ private: #include <sys/syscall.h> /** Mutex implementation that used the Linux futex. */ -template <template <typename> class Policy = NoPolicy> +template <template <typename> class Policy> struct TTASFutexMutex { typedef Policy<TTASFutexMutex> MutexPolicy; @@ -167,21 +166,24 @@ struct TTASFutexMutex { ~TTASFutexMutex() { - ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. */ void init(latch_id_t, const char*, uint32_t) UNIV_NOTHROW { - ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Destroy the mutex. 
*/ void destroy() UNIV_NOTHROW { /* The destructor can be called at shutdown. */ - ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Acquire the mutex. @@ -202,9 +204,8 @@ struct TTASFutexMutex { } for (n_waits= 0;; n_waits++) { - if (my_atomic_fas32_explicit(&m_lock_word, - MUTEX_STATE_WAITERS, - MY_MEMORY_ORDER_ACQUIRE) + if (m_lock_word.exchange(MUTEX_STATE_WAITERS, + std::memory_order_acquire) == MUTEX_STATE_UNLOCKED) { break; } @@ -220,9 +221,8 @@ struct TTASFutexMutex { /** Release the mutex. */ void exit() UNIV_NOTHROW { - if (my_atomic_fas32_explicit(&m_lock_word, - MUTEX_STATE_UNLOCKED, - MY_MEMORY_ORDER_RELEASE) + if (m_lock_word.exchange(MUTEX_STATE_UNLOCKED, + std::memory_order_release) == MUTEX_STATE_WAITERS) { syscall(SYS_futex, &m_lock_word, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0); @@ -234,10 +234,11 @@ struct TTASFutexMutex { bool try_lock() UNIV_NOTHROW { int32 oldval = MUTEX_STATE_UNLOCKED; - return(my_atomic_cas32_strong_explicit(&m_lock_word, &oldval, - MUTEX_STATE_LOCKED, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)); + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); } /** @return non-const version of the policy */ @@ -257,12 +258,12 @@ private: /** lock_word is the target of the atomic test-and-set instruction when atomic operations are enabled. */ - int32 m_lock_word; + std::atomic<int32> m_lock_word; }; #endif /* HAVE_IB_LINUX_FUTEX */ -template <template <typename> class Policy = NoPolicy> +template <template <typename> class Policy> struct TTASMutex { typedef Policy<TTASMutex> MutexPolicy; @@ -277,40 +278,45 @@ struct TTASMutex { ~TTASMutex() { - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. 
*/ void init(latch_id_t) UNIV_NOTHROW { - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Destroy the mutex. */ void destroy() UNIV_NOTHROW { /* The destructor can be called at shutdown. */ - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Try and lock the mutex. @return true on success */ bool try_lock() UNIV_NOTHROW { - int32 oldval = MUTEX_STATE_UNLOCKED; - return(my_atomic_cas32_strong_explicit(&m_lock_word, &oldval, - MUTEX_STATE_LOCKED, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)); + uint32_t oldval = MUTEX_STATE_UNLOCKED; + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); } /** Release the mutex. */ void exit() UNIV_NOTHROW { - ut_ad(m_lock_word == MUTEX_STATE_LOCKED); - my_atomic_store32_explicit(&m_lock_word, MUTEX_STATE_UNLOCKED, - MY_MEMORY_ORDER_RELEASE); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_LOCKED); + m_lock_word.store(MUTEX_STATE_UNLOCKED, + std::memory_order_release); } /** Acquire the mutex. @@ -353,12 +359,11 @@ private: /** Policy data */ MutexPolicy m_policy; - /** lock_word is the target of the atomic test-and-set instruction - when atomic operations are enabled. */ - int32 m_lock_word; + /** mutex state */ + std::atomic<uint32_t> m_lock_word; }; -template <template <typename> class Policy = NoPolicy> +template <template <typename> class Policy> struct TTASEventMutex { typedef Policy<TTASEventMutex> MutexPolicy; @@ -376,7 +381,7 @@ struct TTASEventMutex { ~TTASEventMutex() UNIV_NOTHROW { - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(state() == MUTEX_STATE_UNLOCKED); } /** Called when the mutex is "created". 
Note: Not from the constructor @@ -385,7 +390,7 @@ struct TTASEventMutex { void init(latch_id_t id, const char*, uint32_t) UNIV_NOTHROW { ut_a(m_event == 0); - ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(state() == MUTEX_STATE_UNLOCKED); m_event = os_event_create(sync_latch_get_name(id)); } @@ -396,7 +401,7 @@ struct TTASEventMutex { void destroy() UNIV_NOTHROW { - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(state() == MUTEX_STATE_UNLOCKED); /* We have to free the event before InnoDB shuts down. */ os_event_destroy(m_event); @@ -408,20 +413,20 @@ struct TTASEventMutex { bool try_lock() UNIV_NOTHROW { - int32 oldval = MUTEX_STATE_UNLOCKED; - return(my_atomic_cas32_strong_explicit(&m_lock_word, &oldval, - MUTEX_STATE_LOCKED, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)); + uint32_t oldval = MUTEX_STATE_UNLOCKED; + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); } /** Release the mutex. 
*/ void exit() UNIV_NOTHROW { - if (my_atomic_fas32_explicit(&m_lock_word, - MUTEX_STATE_UNLOCKED, - MY_MEMORY_ORDER_RELEASE) + if (m_lock_word.exchange(MUTEX_STATE_UNLOCKED, + std::memory_order_release) == MUTEX_STATE_WAITERS) { os_event_set(m_event); sync_array_object_signalled(); @@ -459,11 +464,12 @@ struct TTASEventMutex { : SYNC_MUTEX, filename, line, &cell); - int32 oldval = MUTEX_STATE_LOCKED; - my_atomic_cas32_strong_explicit(&m_lock_word, &oldval, - MUTEX_STATE_WAITERS, - MY_MEMORY_ORDER_RELAXED, - MY_MEMORY_ORDER_RELAXED); + uint32_t oldval = MUTEX_STATE_LOCKED; + m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_WAITERS, + std::memory_order_relaxed, + std::memory_order_relaxed); if (oldval == MUTEX_STATE_UNLOCKED) { sync_array_free_cell(sync_arr, cell); @@ -482,9 +488,7 @@ struct TTASEventMutex { int32 state() const UNIV_NOTHROW { - return(my_atomic_load32_explicit(const_cast<int32*> - (&m_lock_word), - MY_MEMORY_ORDER_RELAXED)); + return m_lock_word.load(std::memory_order_relaxed); } /** The event that the mutex will wait in sync0arr.cc @@ -514,9 +518,8 @@ private: TTASEventMutex(const TTASEventMutex&); TTASEventMutex& operator=(const TTASEventMutex&); - /** lock_word is the target of the atomic test-and-set instruction - when atomic operations are enabled. */ - int32 m_lock_word; + /** mutex state */ + std::atomic<uint32_t> m_lock_word; /** Used by sync0arr.cc for the wait queue */ os_event_t m_event; @@ -530,7 +533,6 @@ with the Performance Schema instrumentation. 
*/ template <typename MutexImpl> struct PolicyMutex { - typedef MutexImpl MutexType; typedef typename MutexImpl::MutexPolicy Policy; PolicyMutex() UNIV_NOTHROW : m_impl() @@ -561,7 +563,7 @@ struct PolicyMutex pfs_exit(); #endif /* UNIV_PFS_MUTEX */ - policy().release(m_impl); + ut_d(policy().context.release(m_impl)); m_impl.exit(); } @@ -587,11 +589,11 @@ struct PolicyMutex locker = pfs_begin_lock(&state, name, line); #endif /* UNIV_PFS_MUTEX */ - policy().enter(m_impl, name, line); + ut_d(policy().context.enter(m_impl, name, line)); m_impl.enter(n_spins, n_delay, name, line); - policy().locked(m_impl, name, line); + ut_d(policy().context.locked(m_impl, name, line)); #ifdef UNIV_PFS_MUTEX pfs_end(locker, 0); #endif /* UNIV_PFS_MUTEX */ @@ -620,9 +622,9 @@ struct PolicyMutex if (ret == 0) { - policy().enter(m_impl, name, line); + ut_d(policy().context.enter(m_impl, name, line)); - policy().locked(m_impl, name, line); + ut_d(policy().context.locked(m_impl, name, line)); } #ifdef UNIV_PFS_MUTEX @@ -636,7 +638,7 @@ struct PolicyMutex /** @return true if the thread owns the mutex. 
*/ bool is_owned() const UNIV_NOTHROW { - return(policy().is_owned()); + return(policy().context.is_owned()); } #endif /* UNIV_DEBUG */ @@ -658,6 +660,7 @@ struct PolicyMutex m_impl.init(id, filename, line); policy().init(m_impl, id, filename, line); + ut_d(policy().context.init(id)); } /** Free resources (if any) */ @@ -668,6 +671,7 @@ struct PolicyMutex #endif /* UNIV_PFS_MUTEX */ m_impl.destroy(); policy().destroy(); + ut_d(policy().context.destroy()); } /** Required for os_event_t */ diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h index 00fe1d3b02a..02d38069d94 100644 --- a/storage/innobase/include/ibuf0ibuf.h +++ b/storage/innobase/include/ibuf0ibuf.h @@ -119,13 +119,6 @@ ibuf_mtr_commit( /*============*/ mtr_t* mtr) /*!< in/out: mini-transaction */ MY_ATTRIBUTE((nonnull)); -/*********************************************************************//** -Initializes an ibuf bitmap page. */ -void -ibuf_bitmap_page_init( -/*==================*/ - buf_block_t* block, /*!< in: bitmap page */ - mtr_t* mtr); /*!< in: mtr */ /************************************************************************//** Resets the free bits of the page in the ibuf bitmap. This is done in a separate mini-transaction, hence this operation does not restrict @@ -241,18 +234,19 @@ ibuf_inside( /** Checks if a page address is an ibuf bitmap page (level 3 page) address. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @return TRUE if a bitmap page */ -UNIV_INLINE -ibool -ibuf_bitmap_page( - const page_id_t page_id, - const page_size_t& page_size); +inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size) +{ + ut_ad(ut_is_2pow(zip_size)); + ulint size = zip_size ? zip_size : srv_page_size; + return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET; +} /** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. 
Must not be called when recv_no_ibuf_operations==true. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] x_latch FALSE if relaxed check (avoid latching the bitmap page) @param[in] file file name @@ -260,13 +254,13 @@ bitmap page) @param[in,out] mtr mtr which will contain an x-latch to the bitmap page if the page is not one of the fixed address ibuf pages, or NULL, in which case a new transaction is created. -@return TRUE if level 2 or level 3 page */ -ibool +@return true if level 2 or level 3 page */ +bool ibuf_page_low( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, #ifdef UNIV_DEBUG - ibool x_latch, + bool x_latch, #endif /* UNIV_DEBUG */ const char* file, unsigned line, @@ -278,22 +272,22 @@ ibuf_page_low( /** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. Must not be called when recv_no_ibuf_operations==true. @param[in] page_id tablespace/page identifier -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] mtr mini-transaction or NULL @return TRUE if level 2 or level 3 page */ -# define ibuf_page(page_id, page_size, mtr) \ - ibuf_page_low(page_id, page_size, TRUE, __FILE__, __LINE__, mtr) +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, true, __FILE__, __LINE__, mtr) #else /* UVIV_DEBUG */ /** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. Must not be called when recv_no_ibuf_operations==true. 
@param[in] page_id tablespace/page identifier -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] mtr mini-transaction or NULL @return TRUE if level 2 or level 3 page */ -# define ibuf_page(page_id, page_size, mtr) \ - ibuf_page_low(page_id, page_size, __FILE__, __LINE__, mtr) +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, __FILE__, __LINE__, mtr) #endif /* UVIV_DEBUG */ /***********************************************************************//** @@ -304,23 +298,23 @@ void ibuf_free_excess_pages(void); /*========================*/ -/** Buffer an operation in the insert/delete buffer, instead of doing it -directly to the disk page, if this is possible. Does not do it if the index +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index is clustered or unique. @param[in] op operation type @param[in] entry index entry to insert @param[in,out] index index where to insert @param[in] page_id page id where to insert -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] thr query thread -@return TRUE if success */ -ibool +@return true if success */ +bool ibuf_insert( ibuf_op_t op, const dtuple_t* entry, dict_index_t* index, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, que_thr_t* thr); /** @@ -340,25 +334,22 @@ subsequently was dropped. 
@param[in,out] block if page has been read from disk, pointer to the page x-latched, else NULL @param[in] page_id page id of the index page -@param[in] update_ibuf_bitmap normally this is set to TRUE, but +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] update_ibuf_bitmap normally this is set, but if we have deleted or are deleting the tablespace, then we naturally do not want to update a non-existent bitmap page */ void ibuf_merge_or_delete_for_page( buf_block_t* block, const page_id_t page_id, - const page_size_t* page_size, - ibool update_ibuf_bitmap); + ulint zip_size, + bool update_ibuf_bitmap); + +/** Delete all change buffer entries for a tablespace, +in DISCARD TABLESPACE, IMPORT TABLESPACE, or crash recovery. +@param[in] space missing or to-be-discarded tablespace */ +void ibuf_delete_for_discarded_space(ulint space); -/*********************************************************************//** -Deletes all entries in the insert buffer for a given space id. This is used -in DISCARD TABLESPACE and IMPORT TABLESPACE. -NOTE: this does not update the page free bitmaps in the space. The space will -become CORRUPT when you call this function! */ -void -ibuf_delete_for_discarded_space( -/*============================*/ - ulint space); /*!< in: space id */ /** Contract the change buffer by reading pages to the buffer pool. @param[in] full If true, do a full contraction based on PCT_IO(100). If false, the size of contract batch is determined @@ -378,16 +369,8 @@ ibuf_merge_space( /*=============*/ ulint space); /*!< in: space id */ -/*********************************************************************//** -Parses a redo log record of an ibuf bitmap page init. 
-@return end of log record or NULL */ -byte* -ibuf_parse_bitmap_init( -/*===================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr,/*!< in: buffer end */ - buf_block_t* block, /*!< in: block or NULL */ - mtr_t* mtr); /*!< in: mtr or NULL */ +/** Apply MLOG_IBUF_BITMAP_INIT when crash-upgrading */ +ATTRIBUTE_COLD void ibuf_bitmap_init_apply(buf_block_t* block); /******************************************************************//** Looks if the insert buffer is empty. diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic index b3e04ee1661..db8c122c0f7 100644 --- a/storage/innobase/include/ibuf0ibuf.ic +++ b/storage/innobase/include/ibuf0ibuf.ic @@ -78,11 +78,12 @@ struct ibuf_t{ ulint height; /*!< tree height */ dict_index_t* index; /*!< insert buffer index */ - ulint n_merges; /*!< number of pages merged */ - ulint n_merged_ops[IBUF_OP_COUNT]; + /** number of pages merged */ + Atomic_counter<ulint> n_merges; + Atomic_counter<ulint> n_merged_ops[IBUF_OP_COUNT]; /*!< number of operations of each type merged to index pages */ - ulint n_discarded_ops[IBUF_OP_COUNT]; + Atomic_counter<ulint> n_discarded_ops[IBUF_OP_COUNT]; /*!< number of operations of each type discarded without merging due to the tablespace being deleted or the @@ -149,20 +150,6 @@ ibuf_inside( return(mtr->is_inside_ibuf()); } -/** Checks if a page address is an ibuf bitmap page (level 3 page) address. -@param[in] page_id page id -@param[in] page_size page size -@return TRUE if a bitmap page */ -UNIV_INLINE -ibool -ibuf_bitmap_page( - const page_id_t page_id, - const page_size_t& page_size) -{ - return((page_id.page_no() & (page_size.physical() - 1)) - == FSP_IBUF_BITMAP_OFFSET); -} - /** Translates the free space on a page to a value in the ibuf bitmap. 
@param[in] page_size page size in bytes @param[in] max_ins_size maximum insert size after reorganize for @@ -191,29 +178,6 @@ ibuf_index_page_calc_free_bits( return(n); } -/** Translates the ibuf free bits to the free space on a page in bytes. -@param[in] page_size page_size -@param[in] bits value for ibuf bitmap bits -@return maximum insert size after reorganize for the page */ -UNIV_INLINE -ulint -ibuf_index_page_calc_free_from_bits( - const page_size_t& page_size, - ulint bits) -{ - ut_ad(bits < 4); - ut_ad(!page_size.is_compressed() - || page_size.physical() > IBUF_PAGE_SIZE_PER_FREE_SPACE); - - if (bits == 3) { - return(4 * page_size.physical() - / IBUF_PAGE_SIZE_PER_FREE_SPACE); - } - - return(bits * (page_size.physical() - / IBUF_PAGE_SIZE_PER_FREE_SPACE)); -} - /*********************************************************************//** Translates the free space on a compressed page to a value in the ibuf bitmap. @return value for ibuf bitmap bits */ @@ -227,7 +191,7 @@ ibuf_index_page_calc_free_zip( const page_zip_des_t* page_zip; lint zip_max_ins; - ut_ad(block->page.size.is_compressed()); + ut_ad(block->page.zip.data); /* Consider the maximum insert size on the uncompressed page without reorganizing the page. 
We must not assume anything @@ -250,7 +214,7 @@ ibuf_index_page_calc_free_zip( max_ins_size = (ulint) zip_max_ins; } - return(ibuf_index_page_calc_free_bits(block->page.size.physical(), + return(ibuf_index_page_calc_free_bits(block->physical_size(), max_ins_size)); } @@ -263,14 +227,14 @@ ibuf_index_page_calc_free( /*======================*/ const buf_block_t* block) /*!< in: buffer block */ { - if (!block->page.size.is_compressed()) { + if (!block->page.zip.data) { ulint max_ins_size; max_ins_size = page_get_max_insert_size_after_reorganize( buf_block_get_frame(block), 1); return(ibuf_index_page_calc_free_bits( - block->page.size.physical(), max_ins_size)); + block->physical_size(), max_ins_size)); } else { return(ibuf_index_page_calc_free_zip(block)); } @@ -311,12 +275,12 @@ ibuf_update_free_bits_if_full( ut_ad(buf_block_get_page_zip(block) == NULL); before = ibuf_index_page_calc_free_bits( - block->page.size.physical(), max_ins_size); + srv_page_size, max_ins_size); if (max_ins_size >= increase) { compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX); after = ibuf_index_page_calc_free_bits( - block->page.size.physical(), max_ins_size - increase); + srv_page_size, max_ins_size - increase); #ifdef UNIV_IBUF_DEBUG ut_a(after <= ibuf_index_page_calc_free(block)); #endif diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h index 7a14b022e66..8d26ccb2ba3 100644 --- a/storage/innobase/include/log0crypt.h +++ b/storage/innobase/include/log0crypt.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. -Copyright (C) 2014, 2017, MariaDB Corporation. All Rights Reserved. +Copyright (C) 2014, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -73,14 +73,23 @@ UNIV_INTERN bool log_crypt_read_checkpoint_buf(const byte* buf); +/** log_crypt() operation code */ +enum log_crypt_t { + /** encrypt a log block without rotating key */ + LOG_ENCRYPT, + /** decrypt a log block */ + LOG_DECRYPT, + /** attempt to rotate the key, and encrypt a log block */ + LOG_ENCRYPT_ROTATE_KEY +}; + /** Encrypt or decrypt log blocks. @param[in,out] buf log blocks to encrypt or decrypt @param[in] lsn log sequence number of the start of the buffer @param[in] size size of the buffer, in bytes -@param[in] decrypt whether to decrypt instead of encrypting */ -UNIV_INTERN -void -log_crypt(byte* buf, lsn_t lsn, ulint size, bool decrypt = false); +@param[in] op whether to decrypt, encrypt, or rotate key and encrypt +@return whether the operation succeeded (encrypt always does) */ +bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op = LOG_ENCRYPT); /** Encrypt or decrypt a temporary file block. @param[in] src block to encrypt or decrypt diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 133b1692d31..399319537c8 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -164,19 +164,16 @@ bool log_set_capacity(ulonglong file_size) MY_ATTRIBUTE((warn_unused_result)); -/******************************************************//** -This function is called, e.g., when a transaction wants to commit. It checks -that the log has been written to the log file up to the last log entry written -by the transaction. If there is a flush running, it waits and checks if the -flush flushed enough. If not, starts a new flush. 
*/ -void -log_write_up_to( -/*============*/ - lsn_t lsn, /*!< in: log sequence number up to which - the log should be written, LSN_MAX if not specified */ - bool flush_to_disk); - /*!< in: true if we want the written log - also to be flushed to disk */ +/** Ensure that the log has been written to the log file up to a given +log entry (such as that of a transaction commit). Start a new write, or +wait and check if an already running write is covering the request. +@param[in] lsn log sequence number that should be +included in the redo log file write +@param[in] flush_to_disk whether the written log should also +be flushed to the file system +@param[in] rotate_key whether to rotate the encryption key */ +void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false); + /** write to the log file up to the last log entry. @param[in] sync whether we want the written log also to be flushed to disk. */ @@ -406,13 +403,14 @@ extern my_bool innodb_log_checksums; #define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in bytes */ -/* Offsets of a log block trailer from the end of the block */ +#define LOG_BLOCK_KEY 4 /* encryption key version + before LOG_BLOCK_CHECKSUM; + in log_t::FORMAT_ENC_10_4 only */ #define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block contents; in InnoDB versions < 3.23.52 this did not contain the checksum but the same value as - .._HDR_NO */ -#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */ + LOG_BLOCK_HDR_NO */ /** Offsets inside the checkpoint pages (redo log format version 1) @{ */ /** Checkpoint number */ @@ -463,25 +461,6 @@ or the MySQL version that created the redo log file. */ IB_TO_STR(MYSQL_VERSION_MINOR) "." \ IB_TO_STR(MYSQL_VERSION_PATCH) -/** The original (not version-tagged) InnoDB redo log format */ -#define LOG_HEADER_FORMAT_3_23 0 -/** The MySQL 5.7.9/MariaDB 10.2.2 log format */ -#define LOG_HEADER_FORMAT_10_2 1 -/** The MariaDB 10.3.2 log format. 
-To prevent crash-downgrade to earlier 10.2 due to the inability to -roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record, -MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT -1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2 -(MDEV-13564 backup-friendly TRUNCATE). */ -#define LOG_HEADER_FORMAT_10_3 103 -/** The redo log format identifier corresponding to the current format version. -Stored in LOG_HEADER_FORMAT. */ -#define LOG_HEADER_FORMAT_CURRENT LOG_HEADER_FORMAT_10_3 -/** Future MariaDB 10.4 log format */ -#define LOG_HEADER_FORMAT_10_4 104 -/** Encrypted MariaDB redo log */ -#define LOG_HEADER_FORMAT_ENCRYPTED (1U<<31) - /* @} */ #define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE @@ -506,6 +485,24 @@ typedef ib_mutex_t FlushOrderMutex; /** Redo log buffer */ struct log_t{ + /** The original (not version-tagged) InnoDB redo log format */ + static constexpr uint32_t FORMAT_3_23 = 0; + /** The MySQL 5.7.9/MariaDB 10.2.2 log format */ + static constexpr uint32_t FORMAT_10_2 = 1; + /** The MariaDB 10.3.2 log format. + To prevent crash-downgrade to earlier 10.2 due to the inability to + roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record, + MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT + 1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2 + (MDEV-13564 backup-friendly TRUNCATE). */ + static constexpr uint32_t FORMAT_10_3 = 103; + /** The MariaDB 10.4.0 log format. 
*/ + static constexpr uint32_t FORMAT_10_4 = 104; + /** Encrypted MariaDB redo log */ + static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31; + /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */ + static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED; + MY_ALIGNED(CACHE_LINE_SIZE) lsn_t lsn; /*!< log sequence number */ ulong buf_free; /*!< first free offset within the log @@ -546,7 +543,7 @@ struct log_t{ struct files { /** number of files */ ulint n_files; - /** format of the redo log: e.g., LOG_HEADER_FORMAT_CURRENT */ + /** format of the redo log: e.g., FORMAT_10_4 */ uint32_t format; /** redo log subformat: 0 with separately logged TRUNCATE, 2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */ @@ -564,7 +561,7 @@ struct log_t{ lsn_t scanned_lsn; /** @return whether the redo log is encrypted */ - bool is_encrypted() const { return format & LOG_HEADER_FORMAT_ENCRYPTED; } + bool is_encrypted() const { return format & FORMAT_ENCRYPTED; } /** @return capacity in bytes */ lsn_t capacity() const{ return (file_size - LOG_FILE_HDR_SIZE) * n_files; } /** Calculate the offset of a log sequence number. @@ -699,11 +696,34 @@ public: /** @return whether the redo log is encrypted */ bool is_encrypted() const { return(log.is_encrypted()); } - bool is_initialised() { return m_initialised; } + bool is_initialised() const { return m_initialised; } /** Complete an asynchronous checkpoint write. */ void complete_checkpoint(); + /** @return the log block header + trailer size */ + unsigned framing_size() const + { + return log.format == FORMAT_ENC_10_4 + ? LOG_BLOCK_HDR_SIZE + LOG_BLOCK_KEY + LOG_BLOCK_CHECKSUM + : LOG_BLOCK_HDR_SIZE + LOG_BLOCK_CHECKSUM; + } + /** @return the log block payload size */ + unsigned payload_size() const + { + return log.format == FORMAT_ENC_10_4 + ? 
OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM - + LOG_BLOCK_KEY + : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM; + } + /** @return the log block trailer offset */ + unsigned trailer_offset() const + { + return log.format == FORMAT_ENC_10_4 + ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY + : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM; + } + /** Initialise the redo log subsystem. */ void create(); diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic index 722e658a24b..7dfa7c0db68 100644 --- a/storage/innobase/include/log0log.ic +++ b/storage/innobase/include/log0log.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -217,7 +217,7 @@ log_block_calc_checksum_format_0( sum = 1; sh = 0; - for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) { + for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM; i++) { ulint b = (ulint) block[i]; sum &= 0x7FFFFFFFUL; sum += b; @@ -239,7 +239,7 @@ ulint log_block_calc_checksum_crc32( const byte* block) { - return(ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE)); + return ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM); } /** Calculates the checksum for a log block using the "no-op" algorithm. 
@@ -340,7 +340,7 @@ log_reserve_and_write_fast( #endif /* UNIV_LOG_LSN_DEBUG */ + log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; - if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + if (data_len >= log_sys.trailer_offset()) { /* The string does not fit within the current log block or the log block would become full */ @@ -485,9 +485,9 @@ log_free_check(void) #ifdef UNIV_DEBUG static const latch_level_t latches[] = { - SYNC_DICT, /* dict_sys->mutex during + SYNC_DICT, /* dict_sys.mutex during commit_try_rebuild() */ - SYNC_DICT_OPERATION, /* dict_operation_lock X-latch during + SYNC_DICT_OPERATION, /* dict_sys.latch X-latch during commit_try_rebuild() */ SYNC_FTS_CACHE, /* fts_cache_t::lock */ SYNC_INDEX_TREE /* index->lock */ diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 866102e6f3d..21ddd2b0388 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -33,8 +33,7 @@ Created 9/20/1997 Heikki Tuuri #include "log0log.h" #include "mtr0types.h" -#include <list> -#include <vector> +#include <deque> /** Is recv_writer_thread active? */ extern bool recv_writer_thread_active; @@ -49,7 +48,7 @@ dberr_t recv_find_max_checkpoint(ulint* max_field) MY_ATTRIBUTE((nonnull, warn_unused_result)); -/** Reduces recv_sys->n_addrs for the corrupted page. +/** Reduces recv_sys.n_addrs for the corrupted page. This function should called when srv_force_recovery > 0. @param[in] page_id page id of the corrupted page */ void recv_recover_corrupt_page(page_id_t page_id); @@ -74,17 +73,6 @@ Initiates the rollback of active transactions. */ void recv_recovery_rollback_active(void); /*===============================*/ -/** Clean up after recv_sys_init() */ -void -recv_sys_close(); -/** Initialize the redo log recovery subsystem. */ -void -recv_sys_init(); -/********************************************************//** -Frees the recovery system. 
*/ -void -recv_sys_debug_free(void); -/*=====================*/ /********************************************************//** Reset the state of the recovery system variables. */ @@ -110,7 +98,7 @@ enum store_t { /** Adds data from a new log block to the parsing buffer of recv_sys if -recv_sys->parse_start_lsn is non-zero. +recv_sys.parse_start_lsn is non-zero. @param[in] log_block log block to add @param[in] scanned_lsn lsn of how far we were able to find data in this log block @@ -140,10 +128,6 @@ corresponding to MLOG_INDEX_LOAD. */ extern void (*log_optimized_ddl_op)(ulint space_id); -/** Report backup-unfriendly TRUNCATE operation (with separate log file), -corresponding to MLOG_TRUNCATE. */ -extern void (*log_truncate)(); - /** Report an operation to create, delete, or rename a file during backup. @param[in] space_id tablespace identifier @param[in] flags tablespace flags (NULL if not create) @@ -184,7 +168,7 @@ struct recv_t{ struct recv_dblwr_t { /** Add a page frame to the doublewrite recovery buffer. */ void add(byte* page) { - pages.push_back(page); + pages.push_front(page); } /** Find a doublewrite copy of a page. @@ -194,7 +178,7 @@ struct recv_dblwr_t { @retval NULL if no page was found */ const byte* find_page(ulint space_id, ulint page_no); - typedef std::list<byte*, ut_allocator<byte*> > list; + typedef std::deque<byte*, ut_allocator<byte*> > list; /** Recovered doublewrite buffer page frames */ list pages; @@ -215,14 +199,11 @@ struct recv_sys_t{ buf_flush_t flush_type;/*!< type of the flush request. BUF_FLUSH_LRU: flush end of LRU, keeping free blocks. BUF_FLUSH_LIST: flush all of blocks. 
*/ - ibool apply_log_recs; - /*!< this is TRUE when log rec application to - pages is allowed; this flag tells the - i/o-handler if it should do log record - application */ - ibool apply_batch_on; - /*!< this is TRUE when a log rec application - batch is running */ + /** whether recv_recover_page(), invoked from buf_page_io_complete(), + should apply log records*/ + bool apply_log_recs; + /** whether recv_apply_hashed_log_recs() is running */ + bool apply_batch_on; byte* buf; /*!< buffer for parsing log records */ size_t buf_size; /*!< size of buf */ ulint len; /*!< amount of data in buf */ @@ -276,6 +257,32 @@ struct recv_sys_t{ /** Lastly added LSN to the hash table of log records. */ lsn_t last_stored_lsn; + /** Initialize the redo log recovery subsystem. */ + void create(); + + /** Free most recovery data structures. */ + void debug_free(); + + /** Clean up after create() */ + void close(); + + bool is_initialised() const { return buf_size != 0; } + + /** Store a redo log record for applying. + @param type record type + @param space tablespace identifier + @param page_no page number + @param body record body + @param rec_end end of record + @param lsn start LSN of the mini-transaction + @param end_lsn end LSN of the mini-transaction */ + inline void add(mlog_id_t type, ulint space, ulint page_no, + byte* body, byte* rec_end, lsn_t lsn, + lsn_t end_lsn); + + /** Empty a fully processed set of stored redo log records. */ + inline void empty(); + /** Determine whether redo log recovery progress should be reported. @param[in] time the current time @return whether progress should be reported @@ -292,7 +299,7 @@ struct recv_sys_t{ }; /** The recovery system */ -extern recv_sys_t* recv_sys; +extern recv_sys_t recv_sys; /** TRUE when applying redo log records during crash recovery; FALSE otherwise. 
Note that this is FALSE while a background thread is diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h index 8141c8a91e0..3d0e48253eb 100644 --- a/storage/innobase/include/mach0data.h +++ b/storage/innobase/include/mach0data.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,11 +29,10 @@ Created 11/28/1995 Heikki Tuuri #define mach0data_h #include "univ.i" +#include "mtr0types.h" #ifndef UNIV_INNOCHECKSUM -#include "mtr0types.h" - /* The data and all fields are always stored in a database file in the same format: ascii, big-endian, ... . All data in the files MUST be accessed using the functions in this @@ -368,17 +367,6 @@ mach_write_ulonglong( #endif /* !UNIV_INNOCHECKSUM */ -/** Read 1 to 4 bytes from a file page buffered in the buffer pool. -@param[in] ptr pointer where to read -@param[in] type MLOG_1BYTE, MLOG_2BYTES, or MLOG_4BYTES -@return value read */ -UNIV_INLINE -ulint -mach_read_ulint( - const byte* ptr, - mlog_id_t type) - MY_ATTRIBUTE((warn_unused_result)); - #include "mach0data.ic" #endif diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic index 408044292a5..80bd925d70b 100644 --- a/storage/innobase/include/mach0data.ic +++ b/storage/innobase/include/mach0data.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -866,28 +866,3 @@ mach_write_ulonglong( } #endif /* !UNIV_INNOCHECKSUM */ - -/** Read 1 to 4 bytes from a file page buffered in the buffer pool. -@param[in] ptr pointer where to read -@param[in] type MLOG_1BYTE, MLOG_2BYTES, or MLOG_4BYTES -@return value read */ -UNIV_INLINE -ulint -mach_read_ulint( - const byte* ptr, - mlog_id_t type) -{ - switch (type) { - case MLOG_1BYTE: - return(mach_read_from_1(ptr)); - case MLOG_2BYTES: - return(mach_read_from_2(ptr)); - case MLOG_4BYTES: - return(mach_read_from_4(ptr)); - default: - break; - } - - ut_error; - return(0); -} diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index dc76b40a3db..0c58f524015 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -71,6 +72,23 @@ mlog_log_string( byte* ptr, /*!< in: pointer written to */ ulint len, /*!< in: string length */ mtr_t* mtr); /*!< in: mini-transaction handle */ + +/** Initialize a string of bytes. +@param[in,out] b buffer page +@param[in] ofs byte offset from block->frame +@param[in] len length of the data to write +@param[in] val the data byte to write +@param[in,out] mtr mini-transaction */ +void +mlog_memset(buf_block_t* b, ulint ofs, ulint len, byte val, mtr_t* mtr); + +/** Initialize a string of bytes. 
+@param[in,out] byte byte address +@param[in] len length of the data to write +@param[in] val the data byte to write +@param[in,out] mtr mini-transaction */ +void mlog_memset(byte* b, ulint len, byte val, mtr_t* mtr); + /********************************************************//** Writes initial part of a log record consisting of one-byte item type and four-byte space and page numbers. */ @@ -180,7 +198,7 @@ mlog_parse_initial_log_record( ulint* space, /*!< out: space id */ ulint* page_no);/*!< out: page number */ /********************************************************//** -Parses a log record written by mlog_write_ulint or mlog_write_ull. +Parses a log record written by mlog_write_ulint, mlog_write_ull, mlog_memset. @return parsed record end, NULL if not a complete record */ byte* mlog_parse_nbytes( diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 074f55971b3..f364730b21f 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -54,10 +54,6 @@ savepoint. */ @return old mode */ #define mtr_set_log_mode(m, d) (m)->set_log_mode((d)) -/** Read 1 - 4 bytes from a file page buffered in the buffer pool. -@return value read */ -#define mtr_read_ulint(p, t, m) (m)->read_ulint((p), (t)) - /** Release an object in the memo stack. @return true if released */ #define mtr_memo_release(m, o, t) \ @@ -239,13 +235,6 @@ struct mtr_t { bool is_named_space(const fil_space_t* space) const; #endif /* UNIV_DEBUG */ - /** Read 1 - 4 bytes from a file page buffered in the buffer pool. - @param ptr pointer from where to read - @param type) MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES - @return value read */ - inline ulint read_ulint(const byte* ptr, mlog_id_t type) const - MY_ATTRIBUTE((warn_unused_result)); - /** Acquire a tablespace X-latch. 
@param[in] space_id tablespace ID @param[in] file file name from where called diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index 4cc55ed13ec..0fe56f960b7 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -170,7 +170,7 @@ mtr_t::release_block_at_savepoint( ut_a(slot->object == block); - buf_block_unfix(reinterpret_cast<buf_block_t*>(block)); + reinterpret_cast<buf_block_t*>(block)->unfix(); buf_page_release_latch(block, slot->type); @@ -227,21 +227,3 @@ mtr_t::set_log_mode(mtr_log_t mode) ut_ad(0); return(old_mode); } - -/** -Reads 1 - 4 bytes from a file page buffered in the buffer pool. -@return value read */ - -ulint -mtr_t::read_ulint(const byte* ptr, mlog_id_t type) const -{ - ut_ad(is_active()); - - ut_ad(memo_contains_page_flagged( - ptr, - MTR_MEMO_PAGE_S_FIX - | MTR_MEMO_PAGE_X_FIX - | MTR_MEMO_PAGE_SX_FIX)); - - return(mach_read_ulint(ptr, type)); -} diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index da6686d77c8..bf7484b2337 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -120,7 +120,7 @@ enum mlog_id_t { /** mark an index record as the predefined minimum record */ MLOG_REC_MIN_MARK = 26, - /** initialize an ibuf bitmap page */ + /** initialize an ibuf bitmap page (used in MariaDB 10.2 and 10.3) */ MLOG_IBUF_BITMAP_INIT = 27, #ifdef UNIV_LOG_LSN_DEBUG @@ -218,7 +218,8 @@ enum mlog_id_t { /** initialize a file page */ MLOG_INIT_FILE_PAGE2 = 59, - /** Table is being truncated. (Marked only for file-per-table) */ + /** Table is being truncated. (Was used in 10.2 and 10.3; + not supported for crash-upgrade to 10.4 or later.) 
*/ MLOG_TRUNCATE = 60, /** notify that an index tree is being loaded without writing @@ -229,8 +230,14 @@ enum mlog_id_t { of a ROW_FORMAT=COMPRESSED table */ MLOG_ZIP_WRITE_TRX_ID = 62, + /** initialize a page with a string of identical bytes */ + MLOG_MEMSET = 63, + + /** Zero-fill a page that is not allocated. */ + MLOG_INIT_FREE_PAGE = 64, + /** biggest value (used in assertions) */ - MLOG_BIGGEST_TYPE = MLOG_ZIP_WRITE_TRX_ID, + MLOG_BIGGEST_TYPE = MLOG_INIT_FREE_PAGE, /** log record for writing/updating crypt data of a tablespace */ diff --git a/storage/innobase/include/os0api.h b/storage/innobase/include/os0api.h index 6f42d968c8e..3be7c0afaa4 100644 --- a/storage/innobase/include/os0api.h +++ b/storage/innobase/include/os0api.h @@ -1,6 +1,6 @@ /*********************************************************************** -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -54,22 +54,4 @@ buf_page_get_trim_length( ulint write_length) MY_ATTRIBUTE((warn_unused_result)); -/** -Get should we punch hole to tablespace. -@param[in] space Tablespace -@return true, if punch hole should be tried, false if not. */ -bool -fil_node_should_punch_hole( - const fil_node_t* node) - MY_ATTRIBUTE((warn_unused_result)); - -/** -Set punch hole to tablespace to given value. -@param[in] space Tablespace -@param[in] val value to be set. 
*/ -void -fil_space_set_punch_hole( - fil_node_t* node, - bool val); - #endif /* OS_API_H */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index c896d9da6a2..a87ce5ec07b 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -36,7 +36,7 @@ Created 10/21/1995 Heikki Tuuri #ifndef os0file_h #define os0file_h -#include "page0size.h" +#include "fsp0types.h" #include "os0api.h" #ifndef _WIN32 @@ -360,17 +360,8 @@ public: /** Set the pointer to file node for IO @param[in] node File node */ - void set_fil_node(fil_node_t* node) - { - if (node && !fil_node_should_punch_hole(node)) { - clear_punch_hole(); - } - - m_fil_node = node; - } + inline void set_fil_node(fil_node_t* node); - /** Compare two requests - @reutrn true if the are equal */ bool operator==(const IORequest& rhs) const { return(m_type == rhs.m_type); @@ -414,17 +405,7 @@ public: : 0); } - bool should_punch_hole() const { - return (m_fil_node ? - fil_node_should_punch_hole(m_fil_node) - : false); - } - - void space_no_punch_hole() const { - if (m_fil_node) { - fil_space_set_punch_hole(m_fil_node, false); - } - } + inline bool should_punch_hole() const; /** Free storage space associated with a section of the file. @param[in] fh Open file handle @@ -1585,19 +1566,6 @@ os_file_change_size_win32( #endif /*_WIN32 */ -/** Check if the file system supports sparse files. - -Warning: On POSIX systems we try and punch a hole from offset 0 to -the system configured page size. This should only be called on an empty -file. - -@param[in] fh File handle for the file - if opened -@return true if the file system supports sparse files */ -bool -os_is_sparse_file_supported( - os_file_t fh) - MY_ATTRIBUTE((warn_unused_result)); - /** Free storage space associated with a section of the file. 
@param[in] fh Open file handle @param[in] off Starting offset (SEEK_SET) @@ -1637,16 +1605,6 @@ is_absolute_path( return(false); } -/***********************************************************************//** -Try to get number of bytes per sector from file system. -@return file block size */ -UNIV_INTERN -ulint -os_file_get_block_size( -/*===================*/ - os_file_t file, /*!< in: handle to a file */ - const char* name); /*!< in: file name */ - #include "os0file.ic" #endif /* os0file_h */ diff --git a/storage/innobase/include/os0once.h b/storage/innobase/include/os0once.h deleted file mode 100644 index a818b451830..00000000000 --- a/storage/innobase/include/os0once.h +++ /dev/null @@ -1,120 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/os0once.h -A class that aids executing a given function exactly once in a multi-threaded -environment. 
- -Created Feb 20, 2014 Vasil Dimov -*******************************************************/ - -#ifndef os0once_h -#define os0once_h - -#include "univ.i" - -#include "ut0ut.h" -#include "my_cpu.h" - -/** Execute a given function exactly once in a multi-threaded environment -or wait for the function to be executed by another thread. - -Example usage: -First the user must create a control variable of type os_once::state_t and -assign it os_once::NEVER_DONE. -Then the user must pass this variable, together with a function to be -executed to os_once::do_or_wait_for_done(). - -Multiple threads can call os_once::do_or_wait_for_done() simultaneously with -the same (os_once::state_t) control variable. The provided function will be -called exactly once and when os_once::do_or_wait_for_done() returns then this -function has completed execution, by this or another thread. In other words -os_once::do_or_wait_for_done() will either execute the provided function or -will wait for its execution to complete if it is already called by another -thread or will do nothing if the function has already completed its execution -earlier. - -This mimics pthread_once(3), but unfortunatelly pthread_once(3) does not -support passing arguments to the init_routine() function. We should use -std::call_once() when we start compiling with C++11 enabled. */ -class os_once { -public: - /** Control variables' state type */ - typedef ib_uint32_t state_t; - - /** Not yet executed. */ - static const state_t NEVER_DONE = 0; - - /** Currently being executed by this or another thread. */ - static const state_t IN_PROGRESS = 1; - - /** Finished execution. */ - static const state_t DONE = 2; - - /** Call a given function or wait its execution to complete if it is - already called by another thread. - @param[in,out] state control variable - @param[in] do_func function to call - @param[in,out] do_func_arg an argument to pass to do_func(). 
*/ - static - void - do_or_wait_for_done( - volatile state_t* state, - void (*do_func)(void*), - void* do_func_arg) - { - int32 oldval = NEVER_DONE; - - /* Avoid calling my_atomic_cas32() in the most common case. */ - if (*state == DONE) { - return; - } - - if (my_atomic_cas32((int32*) state, &oldval, IN_PROGRESS)) { - /* We are the first. Call the function. */ - - do_func(do_func_arg); - - my_atomic_store32((int32*) state, DONE); - } else { - /* The state is not NEVER_DONE, so either it is - IN_PROGRESS (somebody is calling the function right - now or DONE (it has already been called and completed). - Wait for it to become DONE. */ - for (;;) { - const state_t s = *state; - - switch (s) { - case DONE: - return; - case IN_PROGRESS: - break; - case NEVER_DONE: - /* fall through */ - default: - ut_error; - } - - MY_RELAX_CPU(); - } - } - } -}; - -#endif /* os0once_h */ diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h index 9b0b3cbf628..d8952a56cc9 100644 --- a/storage/innobase/include/os0proc.h +++ b/storage/innobase/include/os0proc.h @@ -40,7 +40,7 @@ typedef unsigned long int os_process_id_t; /** The total amount of memory currently allocated from the operating system with os_mem_alloc_large(). */ -extern ulint os_total_large_mem_allocated; +extern Atomic_counter<ulint> os_total_large_mem_allocated; /** Converts the current process id to a number. @return process id as a number */ diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h index d99bc841de9..67ee3097274 100644 --- a/storage/innobase/include/os0thread.h +++ b/storage/innobase/include/os0thread.h @@ -73,7 +73,7 @@ typedef unsigned int mysql_pfs_key_t; #endif /* HAVE_PSI_INTERFACE */ /** Number of threads active. */ -extern ulint os_thread_count; +extern Atomic_counter<ulint> os_thread_count; /***************************************************************//** Compares two thread ids for equality. 
diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic index 4d7b5c3a42f..f0844ee1f73 100644 --- a/storage/innobase/include/page0cur.ic +++ b/storage/innobase/include/page0cur.ic @@ -24,12 +24,7 @@ The page cursor Created 10/4/1994 Heikki Tuuri *************************************************************************/ -#include "page0page.h" -#include "buf0types.h" - #ifdef UNIV_DEBUG -# include "rem0cmp.h" - /*********************************************************//** Gets pointer to the page frame where the cursor is positioned. @return page */ @@ -280,6 +275,7 @@ page_cur_tuple_insert( *offsets = rec_get_offsets(rec, index, *offsets, page_is_leaf(cursor->block->frame), ULINT_UNDEFINED, heap); + ut_ad(size == rec_offs_size(*offsets)); if (buf_block_get_page_zip(cursor->block)) { rec = page_cur_insert_rec_zip( diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index 22f4bd5d8c4..0de7f50f8c2 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -27,29 +27,23 @@ Created 2/2/1994 Heikki Tuuri #define page0page_h #include "page0types.h" -#ifndef UNIV_INNOCHECKSUM +#include "fsp0fsp.h" #include "fil0fil.h" #include "buf0buf.h" -#include "data0data.h" -#include "dict0dict.h" -#include "rem0types.h" #include "rem0rec.h" -#endif /* !UNIV_INNOCHECKSUM*/ -#include "fsp0fsp.h" #ifndef UNIV_INNOCHECKSUM +#include "dict0dict.h" +#include "data0data.h" #include "mtr0mtr.h" -#ifdef UNIV_MATERIALIZE -#undef UNIV_INLINE -#define UNIV_INLINE -#endif - /* PAGE HEADER =========== Index page header starts at the first offset left free by the FIL-module */ typedef byte page_header_t; +#else +# include "mach0data.h" #endif /* !UNIV_INNOCHECKSUM */ #define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this @@ -164,12 +158,12 @@ Otherwise written as 0. 
@see PAGE_ROOT_AUTO_INC */ not necessarily collation order; this record may have been deleted */ -/* Directions of cursor movement */ -#define PAGE_LEFT 1 -#define PAGE_RIGHT 2 -#define PAGE_SAME_REC 3 -#define PAGE_SAME_PAGE 4 -#define PAGE_NO_DIRECTION 5 +/* Directions of cursor movement (stored in PAGE_DIRECTION field) */ +constexpr uint16_t PAGE_LEFT= 1; +constexpr uint16_t PAGE_RIGHT= 2; +constexpr uint16_t PAGE_SAME_REC= 3; +constexpr uint16_t PAGE_SAME_PAGE= 4; +constexpr uint16_t PAGE_NO_DIRECTION= 5; #ifndef UNIV_INNOCHECKSUM @@ -1013,13 +1007,6 @@ page_get_direction(const page_t* page) inline uint16_t page_get_instant(const page_t* page); -/** Assign the PAGE_INSTANT field. -@param[in,out] page clustered index root page -@param[in] n original number of clustered index fields -@param[in,out] mtr mini-transaction */ -inline -void -page_set_instant(page_t* page, unsigned n, mtr_t* mtr); /**********************************************************//** Create an uncompressed B-tree index page. 
@@ -1041,16 +1028,10 @@ page_create_zip( buf_block_t* block, /*!< in/out: a buffer frame where the page is created */ dict_index_t* index, /*!< in: the index of the - page, or NULL when applying - TRUNCATE log - record during recovery */ + page */ ulint level, /*!< in: the B-tree level of the page */ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ - const redo_page_compress_t* page_comp_info, - /*!< in: used for applying - TRUNCATE log - record during recovery */ mtr_t* mtr); /*!< in/out: mini-transaction handle */ /**********************************************************//** @@ -1338,11 +1319,6 @@ const rec_t* page_find_rec_max_not_deleted( const page_t* page); -#ifdef UNIV_MATERIALIZE -#undef UNIV_INLINE -#define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif - #endif /* !UNIV_INNOCHECKSUM */ #include "page0page.ic" diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic index d1bf382c1d5..c0a3c86c737 100644 --- a/storage/innobase/include/page0page.ic +++ b/storage/innobase/include/page0page.ic @@ -29,18 +29,10 @@ Created 2/2/1994 Heikki Tuuri #ifndef UNIV_INNOCHECKSUM #include "mach0data.h" -#ifdef UNIV_DEBUG -# include "log0recv.h" -#endif /* !UNIV_DEBUG */ #include "rem0cmp.h" #include "mtr0log.h" #include "page0zip.h" -#ifdef UNIV_MATERIALIZE -#undef UNIV_INLINE -#define UNIV_INLINE -#endif - /*************************************************************//** Returns the max trx id field value. */ UNIV_INLINE @@ -1103,29 +1095,6 @@ page_get_instant(const page_t* page) #endif /* UNIV_DEBUG */ return(i >> 3); } - -/** Assign the PAGE_INSTANT field. 
-@param[in,out] page clustered index root page -@param[in] n original number of clustered index fields -@param[in,out] mtr mini-transaction */ -inline -void -page_set_instant(page_t* page, unsigned n, mtr_t* mtr) -{ - ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_INSTANT); - ut_ad(n > 0); - ut_ad(n < REC_MAX_N_FIELDS); - uint16_t i = page_header_get_field(page, PAGE_INSTANT); - ut_ad(i <= PAGE_NO_DIRECTION); - i |= n << 3; - mlog_write_ulint(PAGE_HEADER + PAGE_INSTANT + page, i, - MLOG_2BYTES, mtr); -} #endif /* !UNIV_INNOCHECKSUM */ -#ifdef UNIV_MATERIALIZE -#undef UNIV_INLINE -#define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif - #endif diff --git a/storage/innobase/include/page0size.h b/storage/innobase/include/page0size.h deleted file mode 100644 index ca1e704eda1..00000000000 --- a/storage/innobase/include/page0size.h +++ /dev/null @@ -1,197 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/page0size.h -A class describing a page size. 
- -Created Nov 14, 2013 Vasil Dimov -*******************************************************/ - -#ifndef page0size_t -#define page0size_t - -#include "fsp0types.h" - -#define FIELD_REF_SIZE 20U - -/** A BLOB field reference full of zero, for use in assertions and -tests.Initially, BLOB field references are set to zero, in -dtuple_convert_big_rec(). */ -extern const byte field_ref_zero[UNIV_PAGE_SIZE_MAX]; - -#define PAGE_SIZE_T_SIZE_BITS 17 - -/** Page size descriptor. Contains the physical and logical page size, as well -as whether the page is compressed or not. */ -class page_size_t { -public: - /** Constructor from (physical, logical, is_compressed). - @param[in] physical physical (on-disk/zipped) page size - @param[in] logical logical (in-memory/unzipped) page size - @param[in] is_compressed whether the page is compressed */ - page_size_t(ulint physical, ulint logical, bool is_compressed) - { - if (physical == 0) { - physical = UNIV_PAGE_SIZE_ORIG; - } - if (logical == 0) { - logical = UNIV_PAGE_SIZE_ORIG; - } - - m_physical = static_cast<unsigned>(physical); - m_logical = static_cast<unsigned>(logical); - m_is_compressed = static_cast<unsigned>(is_compressed); - - ut_ad(physical <= (1 << PAGE_SIZE_T_SIZE_BITS)); - ut_ad(logical <= (1 << PAGE_SIZE_T_SIZE_BITS)); - - ut_ad(ut_is_2pow(physical)); - ut_ad(ut_is_2pow(logical)); - - ut_ad(logical <= UNIV_PAGE_SIZE_MAX); - ut_ad(logical >= physical); - ut_ad(!is_compressed || physical <= UNIV_ZIP_SIZE_MAX); - } - - /** Constructor from (fsp_flags). - @param[in] fsp_flags filespace flags */ - explicit page_size_t(ulint fsp_flags) - { - ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags); - - /* If the logical page size is zero in fsp_flags, then use the - legacy 16k page size. */ - ssize = (0 == ssize) ? UNIV_PAGE_SSIZE_ORIG : ssize; - - /* Convert from a 'log2 minus 9' to a page size in bytes. 
*/ - const unsigned size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize); - - ut_ad(size <= UNIV_PAGE_SIZE_MAX); - ut_ad(size <= (1 << PAGE_SIZE_T_SIZE_BITS)); - - m_logical = size; - - ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); - - /* If the fsp_flags have zero in the zip_ssize field, then it means - that the tablespace does not have compressed pages and the physical - page size is the same as the logical page size. */ - if (ssize == 0) { - m_is_compressed = false; - m_physical = m_logical; - } else { - m_is_compressed = true; - - /* Convert from a 'log2 minus 9' to a page size - in bytes. */ - const unsigned phy - = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize); - - ut_ad(phy <= UNIV_ZIP_SIZE_MAX); - ut_ad(phy <= (1 << PAGE_SIZE_T_SIZE_BITS)); - - m_physical = phy; - } - } - - /** Retrieve the physical page size (on-disk). - @return physical page size in bytes */ - inline ulint physical() const - { - ut_ad(m_physical > 0); - - return(m_physical); - } - - /** Retrieve the logical page size (in-memory). - @return logical page size in bytes */ - inline ulint logical() const - { - ut_ad(m_logical > 0); - return(m_logical); - } - - /** Check whether the page is compressed on disk. - @return true if compressed */ - inline bool is_compressed() const - { - return(m_is_compressed); - } - - /** Copy the values from a given page_size_t object. - @param[in] src page size object whose values to fetch */ - inline void copy_from(const page_size_t& src) - { - *this = src; - } - - /** Check if a given page_size_t object is equal to the current one. 
- @param[in] a page_size_t object to compare - @return true if equal */ - inline bool equals_to(const page_size_t& a) const - { - return(a.physical() == m_physical - && a.logical() == m_logical - && a.is_compressed() == m_is_compressed); - } - -private: - - /* For non compressed tablespaces, physical page size is equal to - the logical page size and the data is stored in buf_page_t::frame - (and is also always equal to univ_page_size (--innodb-page-size=)). - - For compressed tablespaces, physical page size is the compressed - page size as stored on disk and in buf_page_t::zip::data. The logical - page size is the uncompressed page size in memory - the size of - buf_page_t::frame (currently also always equal to univ_page_size - (--innodb-page-size=)). */ - - /** Physical page size. */ - unsigned m_physical:PAGE_SIZE_T_SIZE_BITS; - - /** Logical page size. */ - unsigned m_logical:PAGE_SIZE_T_SIZE_BITS; - - /** Flag designating whether the physical page is compressed, which is - true IFF the whole tablespace where the page belongs is compressed. */ - unsigned m_is_compressed:1; -}; - -/* Overloading the global output operator to conveniently print an object -of type the page_size_t. 
-@param[in,out] out the output stream -@param[in] obj an object of type page_size_t to be printed -@retval the output stream */ -inline -std::ostream& -operator<<( - std::ostream& out, - const page_size_t& obj) -{ - out << "[page size: physical=" << obj.physical() - << ", logical=" << obj.logical() - << ", compressed=" << obj.is_compressed() << "]"; - return(out); -} - -extern page_size_t univ_page_size; - -#endif /* page0size_t */ diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h index 0fcaebd0e43..14ccc2eae36 100644 --- a/storage/innobase/include/page0types.h +++ b/storage/innobase/include/page0types.h @@ -84,18 +84,6 @@ enum page_cur_mode_t { PAGE_CUR_RTREE_GET_FATHER = 14 }; - -/** The information used for compressing a page when applying -TRUNCATE log record during recovery */ -struct redo_page_compress_t { - ulint type; /*!< index type */ - index_id_t index_id; /*!< index id */ - ulint n_fields; /*!< number of index fields */ - ulint field_len; /*!< the length of index field */ - const byte* fields; /*!< index field information */ - ulint trx_id_pos; /*!< position of trx-id column. 
*/ -}; - /** Compressed page descriptor */ struct page_zip_des_t { diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index bf6ad5c860f..ec205fd79bf 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -28,28 +28,11 @@ Created June 2005 by Marko Makela #ifndef page0zip_h #define page0zip_h -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE -#endif - -#ifdef UNIV_INNOCHECKSUM -#include "buf0buf.h" -#include "ut0crc32.h" -#include "buf0checksum.h" -#include "mach0data.h" -#include "zlib.h" -#endif /* UNIV_INNOCHECKSUM */ +#include "buf0types.h" #ifndef UNIV_INNOCHECKSUM #include "mtr0types.h" #include "page0types.h" -#endif /* !UNIV_INNOCHECKSUM */ - -#include "buf0types.h" -#include "rem0types.h" - -#ifndef UNIV_INNOCHECKSUM #include "dict0types.h" #include "srv0srv.h" #include "trx0types.h" @@ -103,15 +86,10 @@ page_zip_set_size( @param[in] comp nonzero=compact format @param[in] n_fields number of fields in the record; ignored if tablespace is not compressed -@param[in] page_size page size -@return FALSE if the entire record can be stored locally on the page */ -UNIV_INLINE -ibool -page_zip_rec_needs_ext( - ulint rec_size, - ulint comp, - ulint n_fields, - const page_size_t& page_size) +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return false if the entire record can be stored locally on the page */ +inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields, + ulint zip_size) MY_ATTRIBUTE((warn_unused_result)); /**********************************************************************//** @@ -164,10 +142,6 @@ page_zip_compress( dict_index_t* index, /*!< in: index of the B-tree node */ ulint level, /*!< in: commpression level */ - const redo_page_compress_t* page_comp_info, - /*!< in: used for applying - TRUNCATE log - record during recovery */ mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */ @@ -516,12 +490,7 @@ uint32_t 
page_zip_calc_checksum( const void* data, ulint size, - srv_checksum_algorithm_t algo -#ifdef INNODB_BUG_ENDIAN_CRC32 - /** for crc32, use the big-endian bug-compatible crc32 variant */ - , bool use_legacy_big_endian = false -#endif -); + srv_checksum_algorithm_t algo); /** Validate the checksum on a ROW_FORMAT=COMPRESSED page. @param data ROW_FORMAT=COMPRESSED page @@ -562,11 +531,6 @@ void page_zip_reset_stat_per_index(); /*===========================*/ -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif - #include "page0zip.ic" #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic index 4e4ccdb492f..337debd30e9 100644 --- a/storage/innobase/include/page0zip.ic +++ b/storage/innobase/include/page0zip.ic @@ -2,7 +2,7 @@ Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -25,11 +25,6 @@ Compressed page interface Created June 2005 by Marko Makela *******************************************************/ -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE -#endif - #include "page0zip.h" #include "mtr0log.h" #include "page0page.h" @@ -154,22 +149,17 @@ page_zip_set_size( @param[in] comp nonzero=compact format @param[in] n_fields number of fields in the record; ignored if tablespace is not compressed -@param[in] page_size page size -@return FALSE if the entire record can be stored locally on the page */ -UNIV_INLINE -ibool -page_zip_rec_needs_ext( - ulint rec_size, - ulint comp, - ulint n_fields, - const page_size_t& page_size) +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return false if the entire record can be stored locally on the page */ +inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields, + ulint zip_size) { /* FIXME: row size check is this function seems to be the most correct. Put it in a separate function and use in more places of InnoDB */ ut_ad(rec_size > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)); - ut_ad(comp || !page_size.is_compressed()); + ut_ad(comp || !zip_size); #if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE : @@ -178,7 +168,7 @@ page_zip_rec_needs_ext( } #endif - if (page_size.is_compressed()) { + if (zip_size) { ut_ad(comp); /* On a compressed page, there is a two-byte entry in the dense page directory for every record. But there @@ -187,7 +177,7 @@ page_zip_rec_needs_ext( the encoded heap number. Check also the available space on the uncompressed page. 
*/ return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1) - >= page_zip_empty_size(n_fields, page_size.physical()) + >= page_zip_empty_size(n_fields, zip_size) || rec_size >= page_get_free_space_of_empty(TRUE) / 2); } @@ -417,7 +407,7 @@ page_zip_parse_compress_no_data( was successful. Crash in this case. */ if (page - && !page_zip_compress(page_zip, page, index, level, NULL, NULL)) { + && !page_zip_compress(page_zip, page, index, level, NULL)) { ut_error; } @@ -440,8 +430,3 @@ page_zip_reset_stat_per_index() mutex_exit(&page_zip_stat_per_index_mutex); } - -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h index b46393e37d2..2df2b33f5c8 100644 --- a/storage/innobase/include/que0que.h +++ b/storage/innobase/include/que0que.h @@ -287,9 +287,9 @@ que_eval_sql( /*=========*/ pars_info_t* info, /*!< in: info struct, or NULL */ const char* sql, /*!< in: SQL string */ - ibool reserve_dict_mutex, - /*!< in: if TRUE, acquire/release - dict_sys->mutex around call to pars_sql. */ + bool reserve_dict_mutex, + /*!< in: whether to acquire/release + dict_sys.mutex around call to pars_sql. */ trx_t* trx); /*!< in: trx */ /**********************************************************************//** diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h index c0faf84cfbe..48575feda10 100644 --- a/storage/innobase/include/read0types.h +++ b/storage/innobase/include/read0types.h @@ -66,7 +66,14 @@ class ReadView Close view: READ_VIEW_STATE_OPEN -> READ_VIEW_STATE_CLOSED */ - int32_t m_state; + std::atomic<uint32_t> m_state; + + + /** m_state getter for ReadView owner thread */ + uint32_t state() const + { + return m_state.load(std::memory_order_relaxed); + } public: @@ -134,35 +141,36 @@ loop: Closes the view. View becomes not visible to purge thread. 
+ + This method is intended to be called by ReadView owner thread, thus + m_state cannot change. */ void close() { - ut_ad(m_state == READ_VIEW_STATE_CLOSED || - m_state == READ_VIEW_STATE_OPEN); - if (m_state == READ_VIEW_STATE_OPEN) - my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_CLOSED, - MY_MEMORY_ORDER_RELAXED); + ut_ad(state() == READ_VIEW_STATE_CLOSED || + state() == READ_VIEW_STATE_OPEN); + m_state.store(READ_VIEW_STATE_CLOSED, std::memory_order_relaxed); } /** m_state getter for trx_sys::clone_oldest_view() trx_sys::size(). */ - int32_t get_state() const + uint32_t get_state() const { - return my_atomic_load32_explicit(const_cast<int32*>(&m_state), - MY_MEMORY_ORDER_ACQUIRE); + return m_state.load(std::memory_order_acquire); } /** Returns true if view is open. - Only used by view owner thread, thus we can omit atomic operations. + This method is intended to be called by ReadView owner thread, thus + m_state cannot change. */ bool is_open() const { - ut_ad(m_state == READ_VIEW_STATE_OPEN || - m_state == READ_VIEW_STATE_CLOSED); - return m_state == READ_VIEW_STATE_OPEN; + ut_ad(state() == READ_VIEW_STATE_OPEN || + state() == READ_VIEW_STATE_CLOSED); + return state() == READ_VIEW_STATE_OPEN; } diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 2a522ae4837..23c25f76362 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -38,15 +38,6 @@ Created 5/30/1994 Heikki Tuuri #include <ostream> #include <sstream> -/* Info bit denoting the predefined minimum record: this bit is set -if and only if the record is the first user record on a non-leaf -B-tree page that is the leftmost page on its level -(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). 
*/ -#define REC_INFO_MIN_REC_FLAG 0x10UL -/* The deleted flag in info bits */ -#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the - record has been delete marked */ - /* Number of extra bytes in an old-style record, in addition to the data and the offsets */ #define REC_N_OLD_EXTRA_BYTES 6 @@ -54,26 +45,6 @@ in addition to the data and the offsets */ in addition to the data and the offsets */ #define REC_N_NEW_EXTRA_BYTES 5 -/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */ -enum rec_comp_status_t { - /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */ - REC_STATUS_ORDINARY = 0, - /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */ - REC_STATUS_NODE_PTR = 1, - /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */ - REC_STATUS_INFIMUM = 2, - /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */ - REC_STATUS_SUPREMUM = 3, - /** Clustered index record that has been inserted or updated - after instant ADD COLUMN (more than dict_index_t::n_core_fields) */ - REC_STATUS_COLUMNS_ADDED = 4 -}; - -/** The dtuple_t::info_bits of the metadata pseudo-record. -@see rec_is_metadata() */ -static const byte REC_INFO_METADATA - = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED; - #define REC_NEW_STATUS 3 /* This is single byte bit-field */ #define REC_NEW_STATUS_MASK 0x7UL #define REC_NEW_STATUS_SHIFT 0 @@ -336,7 +307,7 @@ rec_comp_status_t rec_get_status(const rec_t* rec) { byte bits = rec[-REC_NEW_STATUS] & REC_NEW_STATUS_MASK; - ut_ad(bits <= REC_STATUS_COLUMNS_ADDED); + ut_ad(bits <= REC_STATUS_INSTANT); return static_cast<rec_comp_status_t>(bits); } @@ -347,12 +318,12 @@ inline void rec_set_status(rec_t* rec, byte bits) { - ut_ad(bits <= REC_STATUS_COLUMNS_ADDED); + ut_ad(bits <= REC_STATUS_INSTANT); rec[-REC_NEW_STATUS] = (rec[-REC_NEW_STATUS] & ~REC_NEW_STATUS_MASK) | bits; } -/** Get the length of added field count in a REC_STATUS_COLUMNS_ADDED record. 
+/** Get the length of added field count in a REC_STATUS_INSTANT record. @param[in] n_add_field number of added fields, minus one @return storage size of the field count, in bytes */ inline unsigned rec_get_n_add_field_len(ulint n_add_field) @@ -361,8 +332,26 @@ inline unsigned rec_get_n_add_field_len(ulint n_add_field) return n_add_field < 0x80 ? 1 : 2; } -/** Set the added field count in a REC_STATUS_COLUMNS_ADDED record. -@param[in,out] header variable header of a REC_STATUS_COLUMNS_ADDED record +/** Get the added field count in a REC_STATUS_INSTANT record. +@param[in,out] header variable header of a REC_STATUS_INSTANT record +@return number of added fields */ +inline unsigned rec_get_n_add_field(const byte*& header) +{ + unsigned n_fields_add = *--header; + if (n_fields_add < 0x80) { + ut_ad(rec_get_n_add_field_len(n_fields_add) == 1); + return n_fields_add; + } + + n_fields_add &= 0x7f; + n_fields_add |= unsigned(*--header) << 7; + ut_ad(n_fields_add < REC_MAX_N_FIELDS); + ut_ad(rec_get_n_add_field_len(n_fields_add) == 2); + return n_fields_add; +} + +/** Set the added field count in a REC_STATUS_INSTANT record. +@param[in,out] header variable header of a REC_STATUS_INSTANT record @param[in] n_add number of added fields, minus 1 @return record header before the number of added fields */ inline void rec_set_n_add_field(byte*& header, ulint n_add) @@ -799,20 +788,89 @@ inline ulint rec_offs_comp(const rec_offs *offsets) } /** Determine if the record is the metadata pseudo-record -in the clustered index. +in the clustered index for instant ADD COLUMN or ALTER TABLE. 
+@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, ulint comp) +{ + bool is = !!(rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG); + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN or ALTER TABLE. +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_instant()); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_add_metadata(const rec_t* rec, ulint comp) +{ + bool is = rec_get_info_bits(rec, comp) == REC_INFO_MIN_REC_FLAG; + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). @param[in] rec leaf page record @param[in] index index of the record @return whether the record is the metadata pseudo-record */ -inline bool rec_is_metadata(const rec_t* rec, const dict_index_t* index) +inline bool rec_is_add_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_add_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_instant()); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). 
+@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, ulint comp) +{ + bool is = !(~rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)); + ut_ad(!is || rec_is_metadata(rec, comp)); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, const dict_index_t& index) { - bool is = rec_get_info_bits(rec, dict_table_is_comp(index->table)) - & REC_INFO_MIN_REC_FLAG; - ut_ad(!is || index->is_instant()); - ut_ad(!is || !dict_table_is_comp(index->table) - || rec_get_status(rec) == REC_STATUS_COLUMNS_ADDED); + bool is = rec_is_alter_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_dummy || index.is_instant()); return is; } +/** Determine if a record is delete-marked (not a metadata pseudo-record). +@param[in] rec record +@param[in] comp nonzero if ROW_FORMAT!=REDUNDANT +@return whether the record is a delete-marked user record */ +inline bool rec_is_delete_marked(const rec_t* rec, ulint comp) +{ + return (rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) + == REC_INFO_DELETED_FLAG; +} + /** Get the nth field from an index. 
@param[in] rec index record @param[in] index index @@ -830,6 +888,7 @@ rec_get_nth_cfield( ulint* len) { ut_ad(rec_offs_validate(rec, index, offsets)); + if (!rec_offs_nth_default(offsets, n)) { return rec_get_nth_field(rec, offsets, n, len); } @@ -976,7 +1035,7 @@ rec_copy( @param[in] fields data fields @param[in] n_fields number of data fields @param[out] extra record header size -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT @return total size, in bytes */ ulint rec_get_converted_size_temp( @@ -993,7 +1052,7 @@ rec_get_converted_size_temp( @param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) @param[in] n_core number of core fields (index->n_core_fields) @param[in] def_val default values for non-core fields -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED */ +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */ void rec_init_offsets_temp( const rec_t* rec, @@ -1020,8 +1079,7 @@ rec_init_offsets_temp( @param[in] index clustered or secondary index @param[in] fields data fields @param[in] n_fields number of data fields -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED -*/ +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */ void rec_convert_dtuple_to_temp( rec_t* rec, @@ -1084,21 +1142,20 @@ rec_get_converted_size_comp_prefix( ulint n_fields,/*!< in: number of data fields */ ulint* extra) /*!< out: extra size */ MY_ATTRIBUTE((warn_unused_result, nonnull(1,2))); -/**********************************************************//** -Determines the size of a data tuple in ROW_FORMAT=COMPACT. + +/** Determine the size of a record in ROW_FORMAT=COMPACT. +@param[in] index record descriptor. 
dict_table_is_comp() + is assumed to hold, even if it doesn't +@param[in] tuple logical record +@param[out] extra extra size @return total size */ ulint rec_get_converted_size_comp( -/*========================*/ - const dict_index_t* index, /*!< in: record descriptor; - dict_table_is_comp() is - assumed to hold, even if - it does not */ - rec_comp_status_t status, /*!< in: status bits of the record */ - const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields,/*!< in: number of data fields */ - ulint* extra) /*!< out: extra size */ - MY_ATTRIBUTE((nonnull(1,3))); + const dict_index_t* index, + const dtuple_t* tuple, + ulint* extra) + MY_ATTRIBUTE((nonnull(1,2))); + /**********************************************************//** The following function returns the size of a data tuple when converted to a physical record. @@ -1273,7 +1330,7 @@ public: } /** Destructor */ - virtual ~rec_printer() {} + ~rec_printer() override {} private: /** Copy constructor */ diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic index 48898b1f916..6cecd9f1f08 100644 --- a/storage/innobase/include/rem0rec.ic +++ b/storage/innobase/include/rem0rec.ic @@ -67,7 +67,7 @@ most significant bytes and bits are written below less significant. 001=REC_STATUS_NODE_PTR 010=REC_STATUS_INFIMUM 011=REC_STATUS_SUPREMUM - 100=REC_STATUS_COLUMNS_ADDED + 100=REC_STATUS_INSTANT 1xx=reserved 5 bits heap number 4 8 bits heap number @@ -451,7 +451,7 @@ rec_get_n_fields( } switch (rec_get_status(rec)) { - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: case REC_STATUS_ORDINARY: return(dict_index_get_n_fields(index)); case REC_STATUS_NODE_PTR: @@ -547,19 +547,6 @@ rec_set_n_owned_new( } } -#ifdef UNIV_DEBUG -/** Check if the info bits are valid. 
-@param[in] bits info bits to check -@return true if valid */ -inline -bool -rec_info_bits_valid( - ulint bits) -{ - return(0 == (bits & ~(REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG))); -} -#endif /* UNIV_DEBUG */ - /******************************************************//** The following function is used to retrieve the info bits of a record. @return info bits */ @@ -573,7 +560,6 @@ rec_get_info_bits( const ulint val = rec_get_bit_field_1( rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); - ut_ad(rec_info_bits_valid(val)); return(val); } @@ -586,7 +572,6 @@ rec_set_info_bits_old( rec_t* rec, /*!< in: old-style physical record */ ulint bits) /*!< in: info bits */ { - ut_ad(rec_info_bits_valid(bits)); rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS, REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); } @@ -599,7 +584,6 @@ rec_set_info_bits_new( rec_t* rec, /*!< in/out: new-style physical record */ ulint bits) /*!< in: info bits */ { - ut_ad(rec_info_bits_valid(bits)); rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS, REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); } @@ -1388,24 +1372,20 @@ rec_get_converted_size( } else if (index->table->id == DICT_INDEXES_ID) { /* The column SYS_INDEXES.MERGE_THRESHOLD was instantly added in MariaDB 10.2.2 (MySQL 5.7). 
*/ + ut_ad(!index->table->is_temporary()); ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES); ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES || dtuple->n_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD); } else { ut_ad(dtuple->n_fields >= index->n_core_fields); - ut_ad(dtuple->n_fields <= index->n_fields); + ut_ad(dtuple->n_fields <= index->n_fields + || dtuple->is_alter_metadata()); } #endif if (dict_table_is_comp(index->table)) { - return(rec_get_converted_size_comp( - index, - static_cast<rec_comp_status_t>( - dtuple->info_bits - & REC_NEW_STATUS_MASK), - dtuple->fields, - dtuple->n_fields, NULL)); + return rec_get_converted_size_comp(index, dtuple, NULL); } data_size = dtuple_get_data_size(dtuple, 0); diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h index 11a6bfa4667..251f3125667 100644 --- a/storage/innobase/include/row0ext.h +++ b/storage/innobase/include/row0ext.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,7 +30,7 @@ Created September 2006 Marko Makela #include "data0types.h" #include "mem0mem.h" #include "dict0types.h" -#include "page0size.h" +#include "fsp0types.h" #include "row0types.h" /********************************************************************//** @@ -43,7 +44,7 @@ row_ext_create( in the InnoDB table object, as reported by dict_col_get_no(); NOT relative to the records in the clustered index */ - ulint flags, /*!< in: table->flags */ + const dict_table_t& table, /*!< in: table */ const dtuple_t* tuple, /*!< in: data tuple containing the field references of the externally stored columns; must be indexed by col_no; @@ -91,9 +92,7 @@ struct row_ext_t{ REC_ANTELOPE_MAX_INDEX_COL_LEN or REC_VERSION_56_MAX_INDEX_COL_LEN depending on row format */ - page_size_t page_size; - /*!< page size of the externally stored - columns */ + ulint zip_size;/*!< ROW_FORMAT=COMPRESSED page size, or 0 */ ulint len[1]; /*!< prefix lengths; 0 if not cached */ }; diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h index 3a65e1c58da..0189bb7a4ff 100644 --- a/storage/innobase/include/row0ftsort.h +++ b/storage/innobase/include/row0ftsort.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2018, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -60,8 +60,8 @@ struct fts_psort_t; struct fts_psort_common_t { row_merge_dup_t* dup; /*!< descriptor of FTS index */ dict_table_t* new_table; /*!< source table */ - /* Old table page size */ - page_size_t old_page_size; + /** Old table page size */ + ulint old_zip_size; trx_t* trx; /*!< transaction */ fts_psort_t* all_info; /*!< all parallel sort info */ os_event_t sort_event; /*!< sort event */ @@ -199,19 +199,19 @@ row_merge_create_fts_sort_index( @param[in] new_table table where indexes are created @param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes integer to store Doc ID during sort -@param[in] old_page_size page size of the old table during alter +@param[in] old_zip_size page size of the old table during alter @param[out] psort parallel sort info to be instantiated @param[out] merge parallel merge info to be instantiated -@return TRUE if all successful */ -ibool +@return true if all successful */ +bool row_fts_psort_info_init( - trx_t* trx, - row_merge_dup_t* dup, - const dict_table_t* new_table, - ibool opt_doc_id_size, - const page_size_t old_page_size, - fts_psort_t** psort, - fts_psort_t** merge) + trx_t* trx, + row_merge_dup_t*dup, + dict_table_t* new_table, + bool opt_doc_id_size, + ulint old_zip_size, + fts_psort_t** psort, + fts_psort_t** merge) MY_ATTRIBUTE((nonnull)); /********************************************************************//** diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h index fac1b950e2e..63fd877691c 100644 --- a/storage/innobase/include/row0log.h +++ b/storage/innobase/include/row0log.h @@ -36,7 +36,7 @@ Created 2011-05-26 Marko Makela class ut_stage_alter_t; -extern ulint onlineddl_rowlog_rows; +extern Atomic_counter<ulint> onlineddl_rowlog_rows; extern ulint onlineddl_rowlog_pct_used; extern ulint 
onlineddl_pct_progress; diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h index 8c9b5325c5f..e5798f1f673 100644 --- a/storage/innobase/include/row0mysql.h +++ b/storage/innobase/include/row0mysql.h @@ -43,6 +43,7 @@ Created 9/17/2000 Heikki Tuuri extern ibool row_rollback_on_timeout; struct row_prebuilt_t; +class ha_innobase; /*******************************************************************//** Frees the blob heap in prebuilt when no longer needed. */ @@ -417,7 +418,7 @@ will remain locked. @param[in] create_failed true=create table failed because e.g. foreign key column @param[in] nonatomic Whether it is permitted to release - and reacquire dict_operation_lock + and reacquire dict_sys.latch @return error code */ dberr_t row_drop_table_for_mysql( @@ -777,10 +778,14 @@ struct row_prebuilt_t { store it here so that we can return it to MySQL */ /*----------------------*/ - void* idx_cond; /*!< In ICP, pointer to a ha_innobase, - passed to innobase_index_cond(). - NULL if index condition pushdown is - not used. */ + + /** Argument of handler_rowid_filter_check(), + or NULL if no PRIMARY KEY filter is pushed */ + ha_innobase* pk_filter; + + /** Argument to handler_index_cond_check(), + or NULL if no index condition pushdown (ICP) is used. */ + ha_innobase* idx_cond; ulint idx_cond_n_cols;/*!< Number of fields in idx_cond_cols. 0 if and only if idx_cond == NULL. */ /*----------------------*/ diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h index 5268d684529..b4dab3c2f1b 100644 --- a/storage/innobase/include/row0row.h +++ b/storage/innobase/include/row0row.h @@ -74,6 +74,7 @@ row_get_rec_roll_ptr( #define ROW_BUILD_FOR_PURGE 1 /*!< build row for purge. */ #define ROW_BUILD_FOR_UNDO 2 /*!< build row for undo. */ #define ROW_BUILD_FOR_INSERT 3 /*!< build row for insert. 
*/ + /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. @@ -223,6 +224,24 @@ row_rec_to_index_entry( mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ MY_ATTRIBUTE((warn_unused_result)); + +/** Convert a metadata record to a data tuple. +@param[in] rec metadata record +@param[in] index clustered index after instant ALTER TABLE +@param[in] offsets rec_get_offsets(rec) +@param[in,out] heap memory heap for allocations +@param[in] info_bits the info_bits after an update +@param[in] pad whether to pad to index->n_fields */ +dtuple_t* +row_metadata_to_tuple( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits, + bool pad) + MY_ATTRIBUTE((nonnull,warn_unused_result)); + /*******************************************************************//** Builds from a secondary index record a row reference with which we can search the clustered index record. diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic index 18e6959e6f3..e89adb581f4 100644 --- a/storage/innobase/include/row0row.ic +++ b/storage/innobase/include/row0row.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,16 +39,12 @@ row_get_trx_id_offset( const dict_index_t* index, /*!< in: clustered index */ const rec_offs* offsets)/*!< in: record offsets */ { - ulint pos; ulint offset; ulint len; - ut_ad(dict_index_is_clust(index)); ut_ad(rec_offs_validate(NULL, index, offsets)); - pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); - - offset = rec_get_nth_field_offs(offsets, pos, &len); + offset = rec_get_nth_field_offs(offsets, index->db_trx_id(), &len); ut_ad(len == DATA_TRX_ID_LEN); diff --git a/storage/innobase/include/row0trunc.h b/storage/innobase/include/row0trunc.h deleted file mode 100644 index c5f89f7cfdb..00000000000 --- a/storage/innobase/include/row0trunc.h +++ /dev/null @@ -1,416 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/row0trunc.h -TRUNCATE implementation - -Created 2013-04-25 Krunal Bauskar -*******************************************************/ - -#ifndef row0trunc_h -#define row0trunc_h - -#include "row0mysql.h" -#include "dict0boot.h" -#include "fil0fil.h" -#include "srv0start.h" - -#include <vector> - -/** The information of TRUNCATE log record. -This class handles the recovery stage of TRUNCATE table. */ -class truncate_t { - -public: - /** - Constructor - - @param old_table_id old table id assigned to table before truncate - @param new_table_id new table id that will be assigned to table - after truncate - @param dir_path directory path */ - truncate_t( - table_id_t old_table_id, - table_id_t new_table_id, - const char* dir_path); - - /** - Constructor - - @param log_file_name parse the log file during recovery to populate - information related to table to truncate */ - truncate_t(const char* log_file_name); - - /** - Consturctor - - @param space_id space in which table reisde - @param name table name - @param tablespace_flags tablespace flags use for recreating tablespace - @param log_flags page format flag - @param recv_lsn lsn of redo log record. */ - truncate_t( - ulint space_id, - const char* name, - ulint tablespace_flags, - ulint log_flags, - lsn_t recv_lsn); - - /** Destructor */ - ~truncate_t(); - - /** The index information of MLOG_FILE_TRUNCATE redo record */ - struct index_t { - - /* Default copy constructor and destructor should be OK. */ - - index_t(); - - /** - Set the truncate log values for a compressed table. 
- @return DB_CORRUPTION or error code */ - dberr_t set(const dict_index_t* index); - - typedef std::vector<byte, ut_allocator<byte> > fields_t; - - /** Index id */ - index_id_t m_id; - - /** Index type */ - ulint m_type; - - /** Root Page Number */ - ulint m_root_page_no; - - /** New Root Page Number. - Note: This field is not persisted to TRUNCATE log but used - during truncate table fix-up for updating SYS_XXXX tables. */ - ulint m_new_root_page_no; - - /** Number of index fields */ - ulint m_n_fields; - - /** DATA_TRX_ID column position. */ - ulint m_trx_id_pos; - - /** Compressed table field meta data, encode by - page_zip_fields_encode. Empty for non-compressed tables. - Should be NUL terminated. */ - fields_t m_fields; - }; - - /** - @return the directory path, can be NULL */ - const char* get_dir_path() const - { - return(m_dir_path); - } - - /** - Register index information - - @param index index information logged as part of truncate log. */ - void add(index_t& index) - { - m_indexes.push_back(index); - } - - /** - Add table to truncate post recovery. - - @param ptr table information need to complete truncate of table. */ - static void add(truncate_t* ptr) - { - s_tables.push_back(ptr); - } - - /** - Clear registered index vector */ - void clear() - { - m_indexes.clear(); - } - - /** - @return old table id of the table to truncate */ - table_id_t old_table_id() const - { - return(m_old_table_id); - } - - /** - @return new table id of the table to truncate */ - table_id_t new_table_id() const - { - return(m_new_table_id); - } - - /** - Update root page number in SYS_XXXX tables. - - @param trx transaction object - @param table_id table id for which information needs to - be updated. - @param reserve_dict_mutex if TRUE, acquire/release - dict_sys->mutex around call to pars_sql. 
- @param mark_index_corrupted if true, then mark index corrupted - @return DB_SUCCESS or error code */ - dberr_t update_root_page_no( - trx_t* trx, - table_id_t table_id, - ibool reserve_dict_mutex, - bool mark_index_corrupted) const; - - /** Create an index for a table. - @param[in] table_name table name, for which to create - the index - @param[in,out] space tablespace - @param[in] index_type type of index to truncate - @param[in] index_id id of index to truncate - @param[in] btr_redo_create_info control info for ::btr_create() - @param[in,out] mtr mini-transaction covering the - create index - @return root page no or FIL_NULL on failure */ - inline ulint create_index( - const char* table_name, - fil_space_t* space, - ulint index_type, - index_id_t index_id, - const btr_create_t& btr_redo_create_info, - mtr_t* mtr) const; - - /** Create the indexes for a table - @param[in] table_name table name, for which to create the - indexes - @param[in,out] space tablespace - @param[in] format_flags page format flags - @return DB_SUCCESS or error code. */ - inline dberr_t create_indexes( - const char* table_name, - fil_space_t* space, - ulint format_flags); - - /** Check if index has been modified since TRUNCATE log snapshot - was recorded. - @param[in] space tablespace - @param[in] root_page_no index root page number - @return true if modified else false */ - inline bool is_index_modified_since_logged( - const fil_space_t* space, - ulint root_page_no) const; - - /** Drop indexes for a table. - @param[in,out] space tablespace - @return DB_SUCCESS or error code. */ - void drop_indexes(fil_space_t* space) const; - - /** - Parses log record during recovery - @param start_ptr buffer containing log body to parse - @param end_ptr buffer end - - @return DB_SUCCESS or error code */ - dberr_t parse( - byte* start_ptr, - const byte* end_ptr); - - /** Parse MLOG_TRUNCATE log record from REDO log file during recovery. 
- @param[in,out] start_ptr buffer containing log body to parse - @param[in] end_ptr buffer end - @param[in] space_id tablespace identifier - @return parsed upto or NULL. */ - static byte* parse_redo_entry( - byte* start_ptr, - const byte* end_ptr, - ulint space_id); - - /** - Write a log record for truncating a single-table tablespace. - - @param start_ptr buffer to write log record - @param end_ptr buffer end - @param space_id space id - @param tablename the table name in the usual - databasename/tablename format of InnoDB - @param flags tablespace flags - @param format_flags page format - @param lsn lsn while logging */ - dberr_t write( - byte* start_ptr, - byte* end_ptr, - ulint space_id, - const char* tablename, - ulint flags, - ulint format_flags, - lsn_t lsn) const; - - /** - @return number of indexes parsed from the truncate log record */ - size_t indexes() const; - - /** - Truncate a single-table tablespace. The tablespace must be cached - in the memory cache. - - Note: This is defined in fil0fil.cc because it needs to access some - types that are local to that file. - - @param space_id space id - @param dir_path directory path - @param tablename the table name in the usual - databasename/tablename format of InnoDB - @param flags tablespace flags - @param default_size if true, truncate to default size if tablespace - is being newly re-initialized. - @return DB_SUCCESS or error */ - static dberr_t truncate( - ulint space_id, - const char* dir_path, - const char* tablename, - ulint flags, - bool default_size); - - /** - Fix the table truncate by applying information parsed from TRUNCATE log. - Fix-up includes re-creating table (drop and re-create indexes) - @return error code or DB_SUCCESS */ - static dberr_t fixup_tables_in_system_tablespace(); - - /** - Fix the table truncate by applying information parsed from TRUNCATE log. - Fix-up includes re-creating tablespace. 
- @return error code or DB_SUCCESS */ - static dberr_t fixup_tables_in_non_system_tablespace(); - - /** - Check whether a tablespace was truncated during recovery - @param space_id tablespace id to check - @return true if the tablespace was truncated */ - static bool is_tablespace_truncated(ulint space_id); - - /** Was tablespace truncated (on crash before checkpoint). - If the MLOG_TRUNCATE redo-record is still available then tablespace - was truncated and checkpoint is yet to happen. - @param[in] space_id tablespace id to check. - @return true if tablespace was truncated. */ - static bool was_tablespace_truncated(ulint space_id); - - /** Get the lsn associated with space. - @param[in] space_id tablespace id to check. - @return associated lsn. */ - static lsn_t get_truncated_tablespace_init_lsn(ulint space_id); - -private: - typedef std::vector<index_t, ut_allocator<index_t> > indexes_t; - - /** Space ID of tablespace */ - ulint m_space_id; - - /** ID of table that is being truncated. */ - table_id_t m_old_table_id; - - /** New ID that will be assigned to table on truncation. */ - table_id_t m_new_table_id; - - /** Data dir path of tablespace */ - char* m_dir_path; - - /** Table name */ - char* m_tablename; - - /** Tablespace Flags */ - ulint m_tablespace_flags; - - /** Format flags (log flags; stored in page-no field of header) */ - ulint m_format_flags; - - /** Index meta-data */ - indexes_t m_indexes; - - /** LSN of TRUNCATE log record. */ - lsn_t m_log_lsn; - - /** Log file name. */ - char* m_log_file_name; - - /** Encryption information of the table */ - fil_encryption_t m_encryption; - uint32_t m_key_id; - - /** Vector of tables to truncate. */ - typedef std::vector<truncate_t*, ut_allocator<truncate_t*> > - tables_t; - - /** Information about tables to truncate post recovery */ - static tables_t s_tables; - - /** Information about truncated table - This is case when truncate is complete but checkpoint hasn't. 
*/ - typedef std::map<ulint, lsn_t> truncated_tables_t; - static truncated_tables_t s_truncated_tables; - -public: - /** If true then fix-up of table is active and so while creating - index instead of grabbing information from dict_index_t, grab it - from parsed truncate log record. */ - static bool s_fix_up_active; -}; - -/** -Parse truncate log file. */ -class TruncateLogParser { - -public: - - /** - Scan and Parse truncate log files. - - @param dir_path look for log directory in following path - @return DB_SUCCESS or error code. */ - static dberr_t scan_and_parse( - const char* dir_path); - -private: - typedef std::vector<char*, ut_allocator<char*> > - trunc_log_files_t; - -private: - /** - Scan to find out truncate log file from the given directory path. - - @param dir_path look for log directory in following path. - @param log_files cache to hold truncate log file name found. - @return DB_SUCCESS or error code. */ - static dberr_t scan( - const char* dir_path, - trunc_log_files_t& log_files); - - /** - Parse the log file and populate table to truncate information. - (Add this table to truncate information to central vector that is then - used by truncate fix-up routine to fix-up truncate action of the table.) - - @param log_file_name log file to parse - @return DB_SUCCESS or error code. */ - static dberr_t parse( - const char* log_file_name); -}; - -#endif /* row0trunc_h */ diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h index 5f1e46c6a4d..048b161b884 100644 --- a/storage/innobase/include/row0types.h +++ b/storage/innobase/include/row0types.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, MariaDB Corporation. +Copyright (c) 2018, 2020, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,8 +24,8 @@ Row operation global types Created 12/27/1996 Heikki Tuuri *******************************************************/ -#ifndef row0types_h -#define row0types_h +#pragma once +#include "buf0types.h" struct plan_t; @@ -146,5 +146,3 @@ public: return first_use; } }; - -#endif diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h index a18d154c132..4357a908ca3 100644 --- a/storage/innobase/include/row0undo.h +++ b/storage/innobase/include/row0undo.h @@ -82,17 +82,20 @@ that index record. */ enum undo_exec { UNDO_NODE_FETCH_NEXT = 1, /*!< we should fetch the next undo log record */ - UNDO_NODE_INSERT, /*!< undo a fresh insert of a - row to a table */ - UNDO_NODE_MODIFY /*!< undo a modify operation - (DELETE or UPDATE) on a row - of a table */ + /** rollback an insert into persistent table */ + UNDO_INSERT_PERSISTENT, + /** rollback an update (or delete) in a persistent table */ + UNDO_UPDATE_PERSISTENT, + /** rollback an insert into temporary table */ + UNDO_INSERT_TEMPORARY, + /** rollback an update (or delete) in a temporary table */ + UNDO_UPDATE_TEMPORARY, }; /** Undo node structure */ struct undo_node_t{ que_common_t common; /*!< node type: QUE_NODE_UNDO */ - enum undo_exec state; /*!< node execution state */ + undo_exec state; /*!< rollback execution state */ trx_t* trx; /*!< trx for which undo is done */ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */ trx_undo_rec_t* undo_rec;/*!< undo log record */ diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h index b34acfd8dc1..677af76c561 100644 --- a/storage/innobase/include/row0upd.h +++ b/storage/innobase/include/row0upd.h @@ -101,19 +101,6 @@ upd_get_field_by_field_no( bool is_virtual) /*!< in: if it is a virtual column */ MY_ATTRIBUTE((warn_unused_result)); 
/*********************************************************************//** -Writes into the redo log the values of trx id and roll ptr and enough info -to determine their positions within a clustered index record. -@return new pointer to mlog */ -byte* -row_upd_write_sys_vals_to_log( -/*==========================*/ - dict_index_t* index, /*!< in: clustered index */ - trx_id_t trx_id, /*!< in: transaction id */ - roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ - byte* log_ptr,/*!< pointer to a buffer of size > 20 opened - in mlog */ - mtr_t* mtr); /*!< in: mtr */ -/*********************************************************************//** Updates the trx id and roll ptr field in a clustered index record when a row is updated or marked deleted. */ UNIV_INLINE @@ -128,18 +115,6 @@ row_upd_rec_sys_fields( const trx_t* trx, /*!< in: transaction */ roll_ptr_t roll_ptr);/*!< in: DB_ROLL_PTR to the undo log */ /*********************************************************************//** -Sets the trx id or roll ptr field of a clustered index entry. */ -void -row_upd_index_entry_sys_field( -/*==========================*/ - dtuple_t* entry, /*!< in/out: index entry, where the memory - buffers for sys fields are already allocated: - the function just copies the new values to - them */ - dict_index_t* index, /*!< in: clustered index */ - ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */ - ib_uint64_t val); /*!< in: value to write */ -/*********************************************************************//** Creates an update node for a query graph. 
@return own: update node */ upd_node_t* @@ -482,6 +457,14 @@ struct upd_t{ return false; } + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE */ + bool is_metadata() const { return dtuple_t::is_metadata(info_bits); } + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const + { return dtuple_t::is_alter_metadata(info_bits); } + #ifdef UNIV_DEBUG bool validate() const { @@ -495,7 +478,6 @@ struct upd_t{ return(true); } #endif // UNIV_DEBUG - }; /** Kinds of update operation */ diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic index e1368a14e63..fffb7650da3 100644 --- a/storage/innobase/include/row0upd.ic +++ b/storage/innobase/include/row0upd.ic @@ -167,13 +167,13 @@ row_upd_rec_sys_fields( const trx_t* trx, /*!< in: transaction */ roll_ptr_t roll_ptr)/*!< in: DB_ROLL_PTR to the undo log */ { - ut_ad(dict_index_is_clust(index)); + ut_ad(index->is_primary()); ut_ad(rec_offs_validate(rec, index, offsets)); - if (page_zip) { - ulint pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + if (UNIV_LIKELY_NULL(page_zip)) { page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets, - pos, trx->id, roll_ptr); + index->db_trx_id(), + trx->id, roll_ptr); } else { ulint offset = index->trx_id_offset; diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 84e8ece2d77..10730366401 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -37,6 +37,8 @@ Created 12/15/2009 Jimmy Yang #endif /* __STDC_LIMIT_MACROS */ #include <stdint.h> +#include "my_atomic.h" +#include "my_atomic_wrapper.h" /** Possible status values for "mon_status" in "struct monitor_value" */ enum monitor_running_status { @@ -177,7 +179,6 @@ enum monitor_id_t { MONITOR_OVLD_INDEX_PAGES_WRITTEN, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN, MONITOR_OVLD_PAGES_READ, - MONITOR_OVLD_PAGES0_READ, 
MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED, MONITOR_OVLD_BYTE_READ, @@ -300,7 +301,6 @@ enum monitor_id_t { MONITOR_TRX_COMMIT_UNDO, MONITOR_TRX_ROLLBACK, MONITOR_TRX_ROLLBACK_SAVEPOINT, - MONITOR_TRX_ROLLBACK_ACTIVE, MONITOR_TRX_ACTIVE, MONITOR_RSEG_HISTORY_LEN, MONITOR_NUM_UNDO_SLOT_USED, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 6c575733710..e1d37613dc9 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -47,7 +47,6 @@ Created 10/10/1995 Heikki Tuuri #include "que0types.h" #include "trx0types.h" #include "srv0conc.h" -#include "buf0checksum.h" #include "fil0fil.h" #include "mysql/psi/mysql_stage.h" @@ -144,7 +143,8 @@ struct srv_stats_t ulint_ctr_1_t n_lock_wait_count; /** Number of threads currently waiting on database locks */ - simple_atomic_counter<> n_lock_wait_current_count; + MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<ulint> + n_lock_wait_current_count; /** Number of rows read. 
*/ ulint_ctr_64_t n_rows_read; @@ -176,9 +176,6 @@ struct srv_stats_t /** Number of times prefix optimization avoided triggering cluster lookup */ ulint_ctr_64_t n_sec_rec_cluster_reads_avoided; - /** Number of times page 0 is read from tablespace */ - ulint_ctr_64_t page0_read; - /** Number of encryption_get_latest_key_version calls */ ulint_ctr_64_t n_key_requests; @@ -457,7 +454,7 @@ extern uint srv_fast_shutdown; /*!< If this is 1, do not do a /** Signal to shut down InnoDB (NULL if shutdown was signaled, or if running in innodb_read_only mode, srv_read_only_mode) */ -extern st_my_thread_var *srv_running; +extern std::atomic<st_my_thread_var *> srv_running; extern ibool srv_innodb_status; @@ -565,7 +562,6 @@ extern uint srv_sys_space_size_debug; extern bool srv_log_files_created; #endif /* UNIV_DEBUG */ -#define SRV_SEMAPHORE_WAIT_EXTENSION 7200 extern ulint srv_dml_needed_delay; #define SRV_MAX_N_IO_THREADS 130 @@ -929,23 +925,6 @@ srv_purge_wakeup(); /** Shut down the purge threads. */ void srv_purge_shutdown(); -/** Check if tablespace is being truncated. -(Ignore system-tablespace as we don't re-create the tablespace -and so some of the action that are suppressed by this function -for independent tablespace are not applicable to system-tablespace). -@param space_id space_id to check for truncate action -@return true if being truncated, false if not being - truncated or tablespace is system-tablespace. */ -bool -srv_is_tablespace_truncated(ulint space_id); - -/** Check if tablespace was truncated. -@param[in] space space object to check for truncate action -@return true if tablespace was truncated and we still have an active -MLOG_TRUNCATE REDO log record. */ -bool -srv_was_tablespace_truncated(const fil_space_t* space); - #ifdef UNIV_DEBUG /** Disables master thread. It's used by: SET GLOBAL innodb_master_thread_disabled_debug = 1 (0). 
@@ -1000,7 +979,6 @@ struct export_var_t{ ulint innodb_page_size; /*!< srv_page_size */ ulint innodb_pages_created; /*!< buf_pool->stat.n_pages_created */ ulint innodb_pages_read; /*!< buf_pool->stat.n_pages_read*/ - ulint innodb_page0_read; /*!< srv_stats.page0_read */ ulint innodb_pages_written; /*!< buf_pool->stat.n_pages_written */ ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */ diff --git a/storage/innobase/include/sync0arr.ic b/storage/innobase/include/sync0arr.ic index 9163d5b6614..962226b4934 100644 --- a/storage/innobase/include/sync0arr.ic +++ b/storage/innobase/include/sync0arr.ic @@ -44,8 +44,7 @@ sync_array_get() return(sync_wait_array[0]); } - return(sync_wait_array[default_indexer_t<>::get_rnd_index() - % sync_array_size]); + return(sync_wait_array[get_rnd_value() % sync_array_size]); } /******************************************************************//** diff --git a/storage/innobase/include/sync0policy.h b/storage/innobase/include/sync0policy.h index 4e48f1e2720..94f49ff628c 100644 --- a/storage/innobase/include/sync0policy.h +++ b/storage/innobase/include/sync0policy.h @@ -30,247 +30,176 @@ Created 2012-08-21 Sunny Bains. 
#include "ut0rnd.h" #include "os0thread.h" #include "srv0mon.h" +#include "sync0debug.h" #ifdef UNIV_DEBUG -# define MUTEX_MAGIC_N 979585UL - -template <typename Mutex> -class MutexDebug { -public: - - /** For passing context to SyncDebug */ - struct Context : public latch_t { - - /** Constructor */ - Context() - : - m_mutex(), - m_filename(), - m_line(), - m_thread_id(ULINT_UNDEFINED) - { - /* No op */ - } - - /** Create the context for SyncDebug - @param[in] id ID of the latch to track */ - Context(latch_id_t id) - : - latch_t(id) - { - ut_ad(id != LATCH_ID_NONE); - } - - /** Set to locked state - @param[in] mutex The mutex to acquire - @param[in] filename File name from where to acquire - @param[in] line Line number in filename */ - void locked( - const Mutex* mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - m_mutex = mutex; - - my_atomic_storelint(&m_thread_id, - ulint(os_thread_get_curr_id())); - - m_filename = filename; - - m_line = line; - } - - /** Reset to unlock state */ - void release() - UNIV_NOTHROW - { - m_mutex = NULL; - - my_atomic_storelint(&m_thread_id, ULINT_UNDEFINED); - - m_filename = NULL; - - m_line = 0; - } - - /** Print information about the latch - @return the string representation */ - virtual std::string to_string() const - UNIV_NOTHROW - { - std::ostringstream msg; - - msg << m_mutex->policy().to_string(); - - if (m_thread_id != ULINT_UNDEFINED) { - - msg << " addr: " << m_mutex - << " acquired: " << locked_from().c_str(); - - } else { - msg << "Not locked"; - } - - return(msg.str()); - } - - /** @return the name of the file and line number in the file - from where the mutex was acquired "filename:line" */ - virtual std::string locked_from() const - { - std::ostringstream msg; - - msg << sync_basename(m_filename) << ":" << m_line; - - return(std::string(msg.str())); - } - - /** Mutex to check for lock order violation */ - const Mutex* m_mutex; - - /** Filename from where enter was called */ - const char* m_filename; 
- - /** Line mumber in filename */ - unsigned m_line; - - /** Thread ID of the thread that own(ed) the mutex */ - ulint m_thread_id; - }; - - /** Constructor. */ - MutexDebug() - : - m_magic_n(), - m_context() - UNIV_NOTHROW - { - /* No op */ - } - - /* Destructor */ - virtual ~MutexDebug() { } - - /** Mutex is being destroyed. */ - void destroy() UNIV_NOTHROW - { - ut_ad((ulint)my_atomic_loadlint(&m_context.m_thread_id) == ULINT_UNDEFINED); - - m_magic_n = 0; - - m_context.m_thread_id = 0; - } - - /** Called when the mutex is "created". Note: Not from the constructor - but when the mutex is initialised. - @param[in] id Mutex ID */ - void init(latch_id_t id) UNIV_NOTHROW; - - /** Called when an attempt is made to lock the mutex - @param[in] mutex Mutex instance to be locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void enter( - const Mutex* mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW; - - /** Called when the mutex is locked - @param[in] mutex Mutex instance that was locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void locked( - const Mutex* mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW; - - /** Called when the mutex is released - @param[in] mutx Mutex that was released */ - void release(const Mutex* mutex) - UNIV_NOTHROW; - - /** @return true if thread owns the mutex */ - bool is_owned() const UNIV_NOTHROW - { - return(os_thread_eq( - (os_thread_id_t)my_atomic_loadlint(&m_context.m_thread_id), - os_thread_get_curr_id())); - } - - /** @return the name of the file from the mutex was acquired */ - const char* get_enter_filename() const - UNIV_NOTHROW - { - return(m_context.m_filename); - } - - /** @return the name of the file from the mutex was acquired */ - unsigned get_enter_line() const - UNIV_NOTHROW - { - return(m_context.m_line); - } - - /** @return id of the thread that was 
trying to acquire the mutex */ - os_thread_id_t get_thread_id() const - UNIV_NOTHROW - { - return((os_thread_id_t)my_atomic_loadlint(&m_context.m_thread_id)); - } +template <typename Mutex> class MutexDebug: public latch_t +{ + /** Mutex to check for lock order violation */ + const Mutex *m_mutex; + /** Filename from where enter was called */ + const char *m_filename; + /** Line mumber in filename */ + unsigned m_line; + /** Thread ID of the thread that owns the mutex */ + os_thread_id_t m_thread_id; + /** Mutex protecting the above members */ + mutable OSMutex m_debug_mutex; + + + void set(const Mutex *mutex, const char *filename, unsigned line, + os_thread_id_t thread_id) + { + m_debug_mutex.enter(); + m_mutex= mutex; + m_filename= filename; + m_line= line; + m_thread_id= thread_id; + m_debug_mutex.exit(); + } + + + const MutexDebug get() const + { + MutexDebug ret; + m_debug_mutex.enter(); + ret.m_mutex= m_mutex; + ret.m_filename= m_filename; + ret.m_line= m_line; + ret.m_thread_id= m_thread_id; + m_debug_mutex.exit(); + return ret; + } + + + /** + Called either when mutex is locked or destroyed. Thus members are protected + from concurrent modification. + */ + void assert_clean_context() + { + ut_ad(!m_mutex); + ut_ad(!m_filename); + ut_ad(!m_line); + ut_ad(m_thread_id == os_thread_id_t(ULINT_UNDEFINED)); + } - /** Magic number to check for memory corruption. */ - ulint m_magic_n; - /** Latch state of the mutex owner */ - Context m_context; +public: + /** + Called when the mutex is "created". Note: Not from the constructor + but when the mutex is initialised. + @param[in] id Mutex ID + */ + void init(latch_id_t id) + { + ut_ad(id != LATCH_ID_NONE); + m_id= id; + m_debug_mutex.init(); + set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + } + + + /** Mutex is being destroyed. 
*/ + void destroy() + { + assert_clean_context(); + m_debug_mutex.destroy(); + } + + + /** + Called when an attempt is made to lock the mutex + @param[in] mutex Mutex instance to be locked + @param[in] filename Filename from where it was called + @param[in] line Line number from where it was called + */ + void enter(const Mutex &mutex, const char *filename, unsigned line) + { + MutexDebug context; + ut_ad(!is_owned()); + context.init(m_id); + context.set(&mutex, filename, line, os_thread_get_curr_id()); + /* Check for latch order violation. */ + sync_check_lock_validate(&context); + context.set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + context.destroy(); + } + + + /** + Called when the mutex is locked + @param[in] mutex Mutex instance that was locked + @param[in] filename Filename from where it was called + @param[in] line Line number from where it was called + */ + void locked(const Mutex &mutex, const char *filename, unsigned line) + { + assert_clean_context(); + set(&mutex, filename, line, os_thread_get_curr_id()); + sync_check_lock_granted(this); + } + + + /** + Called when the mutex is released + @param[in] mutex Mutex that was released + */ + void release(const Mutex &mutex) + { + ut_ad(is_owned()); + set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + sync_check_unlock(this); + } + + + /** @return true if thread owns the mutex */ + bool is_owned() const + { + return os_thread_eq(get_thread_id(), os_thread_get_curr_id()); + } + + + /** @return the name of the file from the mutex was acquired */ + const char* get_enter_filename() const { return get().m_filename; } + + + /** @return the name of the file from the mutex was acquired */ + unsigned get_enter_line() const { return get().m_line; } + + + /** @return id of the thread that was trying to acquire the mutex */ + os_thread_id_t get_thread_id() const { return get().m_thread_id; } + + + /** + Print information about the latch + @return the string representation + */ + virtual std::string to_string() const + { 
+ std::ostringstream msg; + const MutexDebug ctx= get(); + + msg << m_mutex->policy().to_string(); + if (ctx.m_mutex) + msg << " addr: " << ctx.m_mutex << " acquired: " + << sync_basename(ctx.get_enter_filename()) << ":" + << ctx.get_enter_line(); + else + msg << "Not locked"; + + return(msg.str()); + } }; #endif /* UNIV_DEBUG */ -/* Do nothing */ -template <typename Mutex> -struct NoPolicy { - /** Default constructor. */ - NoPolicy() { } - - void init(const Mutex&, latch_id_t, const char*, uint32_t) - UNIV_NOTHROW { } - void destroy() UNIV_NOTHROW { } - void enter(const Mutex&, const char*, unsigned) UNIV_NOTHROW { } - void add(uint32_t, uint32_t) UNIV_NOTHROW { } - void locked(const Mutex&, const char*, ulint) UNIV_NOTHROW { } - void release(const Mutex&) UNIV_NOTHROW { } - std::string to_string() const { return(""); }; - latch_id_t get_id() const; -}; - /** Collect the metrics per mutex instance, no aggregation. */ template <typename Mutex> struct GenericPolicy -#ifdef UNIV_DEBUG -: public MutexDebug<Mutex> -#endif /* UNIV_DEBUG */ { public: - typedef Mutex MutexType; - - /** Constructor. */ - GenericPolicy() - UNIV_NOTHROW - : -#ifdef UNIV_DEBUG - MutexDebug<MutexType>(), -#endif /* UNIV_DEBUG */ - m_count(), - m_id() - { } - - /** Destructor */ - ~GenericPolicy() { } - /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. @param[in] id Mutex ID @@ -292,8 +221,6 @@ public: meta.get_counter()->single_register(&m_count); sync_file_created_register(this, filename, uint16_t(line)); - - ut_d(MutexDebug<MutexType>::init(m_id)); } /** Called when the mutex is destroyed. */ @@ -305,8 +232,6 @@ public: meta.get_counter()->single_deregister(&m_count); sync_file_created_deregister(this); - - ut_d(MutexDebug<MutexType>::destroy()); } /** Called after a successful mutex acquire. 
@@ -332,40 +257,6 @@ public: ++m_count.m_calls; } - /** Called when an attempt is made to lock the mutex - @param[in] mutex Mutex instance to be locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void enter( - const MutexType& mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::enter(&mutex, filename, line)); - } - - /** Called when the mutex is locked - @param[in] mutex Mutex instance that is locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void locked( - const MutexType& mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::locked(&mutex, filename, line)); - } - - /** Called when the mutex is released - @param[in] mutex Mutex instance that is released */ - void release(const MutexType& mutex) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::release(&mutex)); - } - /** Print the information about the latch @return the string representation */ std::string print() const @@ -378,14 +269,18 @@ public: return(m_id); } - /** @return the string representation */ - std::string to_string() const; -private: - typedef latch_meta_t::CounterType Counter; + /** @return the string representation */ + std::string to_string() const + { return sync_mutex_to_string(get_id(), sync_file_created_get(this)); } - /** The user visible counters, registered with the meta-data. */ - Counter::Count m_count; +#ifdef UNIV_DEBUG + MutexDebug<Mutex> context; +#endif + +private: + /** The user visible counters, registered with the meta-data. */ + latch_meta_t::CounterType::Count m_count; /** Latch meta data ID */ latch_id_t m_id; @@ -395,29 +290,8 @@ private: too many of them to count individually. 
*/ template <typename Mutex> class BlockMutexPolicy -#ifdef UNIV_DEBUG -: public MutexDebug<Mutex> -#endif /* UNIV_DEBUG */ { public: - typedef Mutex MutexType; - typedef typename latch_meta_t::CounterType::Count Count; - - /** Default constructor. */ - BlockMutexPolicy() - : -#ifdef UNIV_DEBUG - MutexDebug<MutexType>(), -#endif /* UNIV_DEBUG */ - m_count(), - m_id() - { - /* Do nothing */ - } - - /** Destructor */ - ~BlockMutexPolicy() { } - /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. @param[in] id Mutex ID */ @@ -436,8 +310,6 @@ public: ut_ad(meta.get_id() == id); m_count = meta.get_counter()->sum_register(); - - ut_d(MutexDebug<MutexType>::init(m_id)); } /** Called when the mutex is destroyed. */ @@ -445,7 +317,6 @@ public: UNIV_NOTHROW { m_count = NULL; - ut_d(MutexDebug<MutexType>::destroy()); } /** Called after a successful mutex acquire. @@ -469,40 +340,6 @@ public: ++m_count->m_calls; } - /** Called when the mutex is locked - @param[in] mutex Mutex instance that is locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void locked( - const MutexType& mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::locked(&mutex, filename, line)); - } - - /** Called when the mutex is released - @param[in] mutex Mutex instance that is released */ - void release(const MutexType& mutex) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::release(&mutex)); - } - - /** Called when an attempt is made to lock the mutex - @param[in] mutex Mutex instance to be locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void enter( - const MutexType& mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::enter(&mutex, filename, line)); - } - /** Print the information about the latch @return the string 
representation */ std::string print() const @@ -514,19 +351,26 @@ public: return(m_id); } - /** @return the string representation */ - std::string to_string() const; -private: - typedef latch_meta_t::CounterType Counter; + /** + I don't think it makes sense to keep track of the file name + and line number for each block mutex. Too much of overhead. Use the + latch id to figure out the location from the source. + + @return the string representation + */ + std::string to_string() const + { return(sync_mutex_to_string(get_id(), "buf0buf.cc:0")); } + +#ifdef UNIV_DEBUG + MutexDebug<Mutex> context; +#endif - /** The user visible counters, registered with the meta-data. */ - Counter::Count* m_count; +private: + /** The user visible counters, registered with the meta-data. */ + latch_meta_t::CounterType::Count *m_count; /** Latch meta data ID */ latch_id_t m_id; }; - -#include "sync0policy.ic" - #endif /* sync0policy_h */ diff --git a/storage/innobase/include/sync0policy.ic b/storage/innobase/include/sync0policy.ic deleted file mode 100644 index e7aeb2e16bb..00000000000 --- a/storage/innobase/include/sync0policy.ic +++ /dev/null @@ -1,101 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/******************************************************************//** -@file include/sync0policy.ic -Policy for mutexes. - -Created 2012-08-21 Sunny Bains. -***********************************************************************/ - -#include "sync0debug.h" - -template <typename Mutex> -std::string GenericPolicy<Mutex>::to_string() const -{ - return(sync_mutex_to_string(get_id(), sync_file_created_get(this))); -} - -template <typename Mutex> -std::string BlockMutexPolicy<Mutex>::to_string() const -{ - /* I don't think it makes sense to keep track of the file name - and line number for each block mutex. Too much of overhead. Use the - latch id to figure out the location from the source. */ - return(sync_mutex_to_string(get_id(), "buf0buf.cc:0")); -} - -#ifdef UNIV_DEBUG - -template <typename Mutex> -void MutexDebug<Mutex>::init(latch_id_t id) - UNIV_NOTHROW -{ - m_context.m_id = id; - - m_context.release(); - - m_magic_n = MUTEX_MAGIC_N; -} - -template <typename Mutex> -void MutexDebug<Mutex>::enter( - const Mutex* mutex, - const char* name, - unsigned line) - UNIV_NOTHROW -{ - ut_ad(!is_owned()); - - Context context(m_context.get_id()); - - context.locked(mutex, name, line); - - /* Check for latch order violation. 
*/ - - sync_check_lock_validate(&context); -} - -template <typename Mutex> -void MutexDebug<Mutex>::locked( - const Mutex* mutex, - const char* name, - unsigned line) - UNIV_NOTHROW -{ - ut_ad(!is_owned()); - ut_ad(m_context.m_thread_id == ULINT_UNDEFINED); - - m_context.locked(mutex, name, line); - - sync_check_lock_granted(&m_context); -} - -template <typename Mutex> -void MutexDebug<Mutex>::release(const Mutex*) - UNIV_NOTHROW -{ - ut_ad(is_owned()); - - m_context.release(); - - sync_check_unlock(&m_context); -} - -#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h index 5de22c74fa1..48528eb4d30 100644 --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -568,11 +568,11 @@ struct rw_lock_t : public latch_t #endif /* UNIV_DEBUG */ { - /** Holds the state of the lock. */ - int32_t lock_word; + /** Holds the state of the lock. */ + Atomic_relaxed<int32_t> lock_word; - /** 1: there are waiters */ - int32_t waiters; + /** 0=no waiters, 1=waiters for X or SX lock exist */ + Atomic_relaxed<uint32_t> waiters; /** number of granted SX locks. */ volatile ulint sx_recursive; @@ -625,8 +625,7 @@ struct rw_lock_t #endif /* UNIV_PFS_RWLOCK */ #ifdef UNIV_DEBUG - virtual std::string to_string() const; - virtual std::string locked_from() const; + std::string to_string() const override; /** In the debug version: pointer to the debug info list of the lock */ UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list; @@ -634,7 +633,6 @@ struct rw_lock_t /** Level in the global latching order. 
*/ latch_level_t level; #endif /* UNIV_DEBUG */ - }; #ifdef UNIV_DEBUG /** The structure for storing debug info of an rw-lock. All access to this diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic index 15f8ff3fe62..603e902d01c 100644 --- a/storage/innobase/include/sync0rw.ic +++ b/storage/innobase/include/sync0rw.ic @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -77,8 +77,7 @@ rw_lock_get_writer( /*===============*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - int32_t lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = lock->lock_word; ut_ad(lock_word <= X_LOCK_DECR); if (lock_word > X_LOCK_HALF_DECR) { @@ -110,8 +109,7 @@ rw_lock_get_reader_count( /*=====================*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - int32_t lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = lock->lock_word; ut_ad(lock_word <= X_LOCK_DECR); if (lock_word > X_LOCK_HALF_DECR) { @@ -147,8 +145,7 @@ rw_lock_get_x_lock_count( /*=====================*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - int32_t lock_copy = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t lock_copy = lock->lock_word; ut_ad(lock_copy <= X_LOCK_DECR); if (lock_copy == 0 || lock_copy == -X_LOCK_HALF_DECR) { @@ -181,8 +178,7 @@ rw_lock_get_sx_lock_count( const rw_lock_t* lock) /*!< in: rw-lock */ { #ifdef UNIV_DEBUG - int32_t lock_copy = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t 
lock_copy = lock->lock_word; ut_ad(lock_copy <= X_LOCK_DECR); @@ -213,14 +209,15 @@ rw_lock_lock_word_decr( int32_t amount, /*!< in: amount to decrement */ int32_t threshold) /*!< in: threshold of judgement */ { - int32_t lock_copy = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); + int32_t lock_copy = lock->lock_word; + while (lock_copy > threshold) { - if (my_atomic_cas32_strong_explicit(&lock->lock_word, - &lock_copy, - lock_copy - amount, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)) { + if (lock->lock_word.compare_exchange_strong( + lock_copy, + lock_copy - amount, + std::memory_order_acquire, + std::memory_order_relaxed)) { + return(true); } } @@ -304,9 +301,9 @@ rw_lock_x_lock_func_nowait( { int32_t oldval = X_LOCK_DECR; - if (my_atomic_cas32_strong_explicit(&lock->lock_word, &oldval, 0, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)) { + if (lock->lock_word.compare_exchange_strong(oldval, 0, + std::memory_order_acquire, + std::memory_order_relaxed)) { lock->writer_thread = os_thread_get_curr_id(); } else if (os_thread_eq(lock->writer_thread, os_thread_get_curr_id())) { @@ -316,12 +313,12 @@ rw_lock_x_lock_func_nowait( observe consistent values. 
*/ if (oldval == 0 || oldval == -X_LOCK_HALF_DECR) { /* There are 1 x-locks */ - my_atomic_add32_explicit(&lock->lock_word, -X_LOCK_DECR, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_sub(X_LOCK_DECR, + std::memory_order_relaxed); } else if (oldval <= -X_LOCK_DECR) { /* There are 2 or more x-locks */ - my_atomic_add32_explicit(&lock->lock_word, -1, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_sub(1, + std::memory_order_relaxed); /* Watch for too many recursive locks */ ut_ad(oldval < 1); } else { @@ -355,27 +352,21 @@ rw_lock_s_unlock_func( #endif /* UNIV_DEBUG */ rw_lock_t* lock) /*!< in/out: rw-lock */ { -#ifdef UNIV_DEBUG - int32_t dbg_lock_word = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); - ut_ad(dbg_lock_word > -X_LOCK_DECR); - ut_ad(dbg_lock_word != 0); - ut_ad(dbg_lock_word < X_LOCK_DECR); -#endif - ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_S)); /* Increment lock_word to indicate 1 less reader */ - int32_t lock_word = my_atomic_add32_explicit(&lock->lock_word, 1, - MY_MEMORY_ORDER_RELEASE) + 1; - if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) { + int32_t lock_word = lock->lock_word.fetch_add( + 1, std::memory_order_release); + if (lock_word == -1 || lock_word == -X_LOCK_HALF_DECR - 1) { /* wait_ex waiter exists. It may not be asleep, but we signal anyway. 
We do not wake other waiters, because they can't exist without wait_ex waiter and wait_ex waiter goes first.*/ os_event_set(lock->wait_ex_event); sync_array_object_signalled(); - + } else { + ut_ad(lock_word > -X_LOCK_DECR); + ut_ad(lock_word < X_LOCK_DECR); } ut_ad(rw_lock_validate(lock)); @@ -393,11 +384,7 @@ rw_lock_x_unlock_func( #endif /* UNIV_DEBUG */ rw_lock_t* lock) /*!< in/out: rw-lock */ { - int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); - - ut_ad(lock_word == 0 || lock_word == -X_LOCK_HALF_DECR - || lock_word <= -X_LOCK_DECR); + int32_t lock_word = lock->lock_word; if (lock_word == 0) { /* Last caller in a possible recursive chain. */ @@ -411,31 +398,27 @@ rw_lock_x_unlock_func( ACQ_REL due to... RELEASE: we release rw-lock ACQUIRE: we want waiters to be loaded after lock_word is stored */ - my_atomic_add32_explicit(&lock->lock_word, X_LOCK_DECR, - MY_MEMORY_ORDER_ACQ_REL); + lock->lock_word.fetch_add(X_LOCK_DECR, + std::memory_order_acq_rel); /* This no longer has an X-lock but it may still have an SX-lock. So it is now free for S-locks by other threads. We need to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is a writer. */ - if (my_atomic_load32_explicit(&lock->waiters, - MY_MEMORY_ORDER_RELAXED)) { - my_atomic_store32_explicit(&lock->waiters, 0, - MY_MEMORY_ORDER_RELAXED); + if (lock->waiters) { + lock->waiters = 0; os_event_set(lock->event); sync_array_object_signalled(); } } else if (lock_word == -X_LOCK_DECR || lock_word == -(X_LOCK_DECR + X_LOCK_HALF_DECR)) { /* There are 2 x-locks */ - my_atomic_add32_explicit(&lock->lock_word, X_LOCK_DECR, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_add(X_LOCK_DECR); } else { /* There are more than 2 x-locks. 
*/ ut_ad(lock_word < -X_LOCK_DECR); - my_atomic_add32_explicit(&lock->lock_word, 1, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_add(1); } ut_ad(rw_lock_validate(lock)); @@ -461,8 +444,7 @@ rw_lock_sx_unlock_func( ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_SX)); if (lock->sx_recursive == 0) { - int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = lock->lock_word; /* Last caller in a possible recursive chain. */ if (lock_word > 0) { lock->writer_thread = 0; @@ -472,17 +454,15 @@ rw_lock_sx_unlock_func( ACQ_REL due to... RELEASE: we release rw-lock ACQUIRE: we want waiters to be loaded after lock_word is stored */ - my_atomic_add32_explicit(&lock->lock_word, X_LOCK_HALF_DECR, - MY_MEMORY_ORDER_ACQ_REL); + lock->lock_word.fetch_add(X_LOCK_HALF_DECR, + std::memory_order_acq_rel); /* Lock is now free. May have to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is an sx-lock holder. 
*/ - if (my_atomic_load32_explicit(&lock->waiters, - MY_MEMORY_ORDER_RELAXED)) { - my_atomic_store32_explicit(&lock->waiters, 0, - MY_MEMORY_ORDER_RELAXED); + if (lock->waiters) { + lock->waiters = 0; os_event_set(lock->event); sync_array_object_signalled(); } @@ -490,8 +470,7 @@ rw_lock_sx_unlock_func( /* still has x-lock */ ut_ad(lock_word == -X_LOCK_HALF_DECR || lock_word <= -(X_LOCK_DECR + X_LOCK_HALF_DECR)); - my_atomic_add32_explicit(&lock->lock_word, X_LOCK_HALF_DECR, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_add(X_LOCK_HALF_DECR); } } diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h index 8fcb1abb0e2..4d2a7c8ff28 100644 --- a/storage/innobase/include/sync0types.h +++ b/storage/innobase/include/sync0types.h @@ -28,7 +28,6 @@ Created 9/5/1995 Heikki Tuuri #define sync0types_h #include <vector> -#include <my_atomic.h> #include "ut0new.h" @@ -998,9 +997,6 @@ struct latch_t { @return the string representation */ virtual std::string to_string() const = 0; - /** @return "filename:line" from where the latch was last locked */ - virtual std::string locked_from() const = 0; - /** @return the latch level */ latch_level_t get_level() const UNIV_NOTHROW @@ -1041,7 +1037,7 @@ struct sync_checker : public sync_check_functor_t /** Check the latching constraints @param[in] level The level held by the thread @return whether a latch violation was detected */ - bool operator()(const latch_level_t level) const + bool operator()(const latch_level_t level) const override { if (some_allowed) { switch (level) { @@ -1085,7 +1081,7 @@ struct sync_allowed_latches : public sync_check_functor_t { @param[in] latch The latch level to check @return true if there is a latch violation */ - bool operator()(const latch_level_t level) const + bool operator()(const latch_level_t level) const override { return(std::find(begin, end, level) == end); } @@ -1116,51 +1112,6 @@ enum rw_lock_flag_t { #endif /* UNIV_INNOCHECKSUM */ -static inline ulint 
my_atomic_addlint(ulint *A, ulint B) -{ -#ifdef _WIN64 - return ulint(my_atomic_add64((volatile int64*)A, B)); -#else - return ulint(my_atomic_addlong(A, B)); -#endif -} - -static inline ulint my_atomic_loadlint(const ulint *A) -{ -#ifdef _WIN64 - return ulint(my_atomic_load64((volatile int64*)A)); -#else - return ulint(my_atomic_loadlong(A)); -#endif -} - -static inline lint my_atomic_addlint(volatile lint *A, lint B) -{ -#ifdef _WIN64 - return my_atomic_add64((volatile int64*)A, B); -#else - return my_atomic_addlong(A, B); -#endif -} - -static inline lint my_atomic_loadlint(const lint *A) -{ -#ifdef _WIN64 - return lint(my_atomic_load64((volatile int64*)A)); -#else - return my_atomic_loadlong(A); -#endif -} - -static inline void my_atomic_storelint(ulint *A, ulint B) -{ -#ifdef _WIN64 - my_atomic_store64((volatile int64*)A, B); -#else - my_atomic_storelong(A, B); -#endif -} - /** Simple non-atomic counter aligned to CACHE_LINE_SIZE @tparam Type the integer type of the counter */ template <typename Type> @@ -1183,28 +1134,4 @@ private: /** The counter */ Type m_counter; }; - -/** Simple atomic counter aligned to CACHE_LINE_SIZE -@tparam Type lint or ulint */ -template <typename Type = ulint> -struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) simple_atomic_counter -{ - /** Increment the counter */ - Type inc() { return add(1); } - /** Decrement the counter */ - Type dec() { return add(Type(~0)); } - - /** Add to the counter - @param[in] i amount to be added - @return the value of the counter before adding */ - Type add(Type i) { return my_atomic_addlint(&m_counter, i); } - - /** @return the value of the counter (non-atomic access)! 
*/ - operator Type() const { return m_counter; } - -private: - /** The counter */ - Type m_counter; -}; - #endif /* sync0types_h */ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 4bc5aded341..7c3343ce7d2 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -140,202 +140,6 @@ private: TrxUndoRsegs::const_iterator m_iter; }; -/* Namespace to hold all the related functions and variables need for truncate -of undo tablespace. */ -namespace undo { - - typedef std::vector<ulint> undo_spaces_t; - typedef std::vector<trx_rseg_t*> rseg_for_trunc_t; - - /** Mark completion of undo truncate action by writing magic number to - the log file and then removing it from the disk. - If we are going to remove it from disk then why write magic number ? - This is to safeguard from unlink (file-system) anomalies that will keep - the link to the file even after unlink action is successfull and - ref-count = 0. - @param[in] space_id id of the undo tablespace to truncate.*/ - void done(ulint space_id); - - /** Check if TRUNCATE_DDL_LOG file exist. - @param[in] space_id id of the undo tablespace. - @return true if exist else false. */ - bool is_log_present(ulint space_id); - - /** Track UNDO tablespace mark for truncate. */ - class Truncate { - public: - void create() - { - m_undo_for_trunc = ULINT_UNDEFINED; - m_scan_start = 1; - m_purge_rseg_truncate_frequency = - ulint(srv_purge_rseg_truncate_frequency); - } - - /** Clear the cached rollback segment. Normally done - when purge is about to shutdown. */ - void clear() - { - reset(); - rseg_for_trunc_t temp; - m_rseg_for_trunc.swap(temp); - } - - /** Is tablespace selected for truncate. - @return true if undo tablespace is marked for truncate */ - bool is_marked() const - { - return(!(m_undo_for_trunc == ULINT_UNDEFINED)); - } - - /** Mark the tablespace for truncate. - @param[in] undo_id tablespace for truncate. 
*/ - void mark(ulint undo_id) - { - m_undo_for_trunc = undo_id; - - m_scan_start = (undo_id + 1) - % (srv_undo_tablespaces_active + 1); - if (m_scan_start == 0) { - /* Note: UNDO tablespace ids starts from 1. */ - m_scan_start = 1; - } - - /* We found an UNDO-tablespace to truncate so set the - local purge rseg truncate frequency to 1. This will help - accelerate the purge action and in turn truncate. */ - m_purge_rseg_truncate_frequency = 1; - } - - /** Get the tablespace marked for truncate. - @return tablespace id marked for truncate. */ - ulint get_marked_space_id() const - { - return(m_undo_for_trunc); - } - - /** Add rseg to truncate vector. - @param[in,out] rseg rseg for truncate */ - void add_rseg_to_trunc(trx_rseg_t* rseg) - { - m_rseg_for_trunc.push_back(rseg); - } - - /** Get number of rsegs registered for truncate. - @return return number of rseg that belongs to tablespace mark - for truncate. */ - ulint rsegs_size() const - { - return(m_rseg_for_trunc.size()); - } - - /** Get ith registered rseg. - @param[in] id index of rseg to get. - @return reference to registered rseg. */ - trx_rseg_t* get_ith_rseg(ulint id) - { - ut_ad(id < m_rseg_for_trunc.size()); - return(m_rseg_for_trunc.at(id)); - } - - /** Reset for next rseg truncate. */ - void reset() - { - m_undo_for_trunc = ULINT_UNDEFINED; - m_rseg_for_trunc.clear(); - - /* Sync with global value as we are done with - truncate now. */ - m_purge_rseg_truncate_frequency = static_cast<ulint>( - srv_purge_rseg_truncate_frequency); - } - - /** Get the tablespace id to start scanning from. - @return id of UNDO tablespace to start scanning from. 
*/ - ulint get_scan_start() const - { - return(m_scan_start); - } - - /** Check if the tablespace needs fix-up (based on presence of - DDL truncate log) - @param space_id space id of the undo tablespace to check - @return true if fix up is needed else false */ - bool needs_fix_up(ulint space_id) const - { - return(is_log_present(space_id)); - } - - /** Add undo tablespace to truncate vector. - @param[in] space_id space id of tablespace to - truncate */ - static void add_space_to_trunc_list(ulint space_id) - { - s_spaces_to_truncate.push_back(space_id); - } - - /** Clear the truncate vector. */ - static void clear_trunc_list() - { - s_spaces_to_truncate.clear(); - } - - /** Is tablespace marked for truncate. - @param[in] space_id space id to check - @return true if marked for truncate, else false. */ - static bool is_tablespace_truncated(ulint space_id) - { - return(std::find(s_spaces_to_truncate.begin(), - s_spaces_to_truncate.end(), space_id) - != s_spaces_to_truncate.end()); - } - - /** Was a tablespace truncated at startup - @param[in] space_id space id to check - @return whether space_id was truncated at startup */ - static bool was_tablespace_truncated(ulint space_id) - { - return(std::find(s_fix_up_spaces.begin(), - s_fix_up_spaces.end(), - space_id) - != s_fix_up_spaces.end()); - } - - /** Get local rseg purge truncate frequency - @return rseg purge truncate frequency. */ - ulint get_rseg_truncate_frequency() const - { - return(m_purge_rseg_truncate_frequency); - } - - private: - /** UNDO tablespace is mark for truncate. */ - ulint m_undo_for_trunc; - - /** rseg that resides in UNDO tablespace is marked for - truncate. */ - rseg_for_trunc_t m_rseg_for_trunc; - - /** Start scanning for UNDO tablespace from this space_id. - This is to avoid bias selection of one tablespace always. */ - ulint m_scan_start; - - /** Rollback segment(s) purge frequency. This is local - value maintained along with global value. 
It is set to global - value on start but when tablespace is marked for truncate it - is updated to 1 and then minimum value among 2 is used by - purge action. */ - ulint m_purge_rseg_truncate_frequency; - - /** List of UNDO tablespace(s) to truncate. */ - static undo_spaces_t s_spaces_to_truncate; - public: - /** Undo tablespaces that were truncated at startup */ - static undo_spaces_t s_fix_up_spaces; - }; /* class Truncate */ - -}; /* namespace undo */ - /** The control structure used in the purge operation */ class purge_sys_t { @@ -348,22 +152,19 @@ public: MY_ALIGNED(CACHE_LINE_SIZE) rw_lock_t latch; private: - /** whether purge is enabled; protected by latch and my_atomic */ - int32_t m_enabled; + /** whether purge is enabled; protected by latch and std::atomic */ + std::atomic<bool> m_enabled; /** number of pending stop() calls without resume() */ - int32_t m_paused; + Atomic_counter<int32_t> m_paused; public: que_t* query; /*!< The query graph which will do the parallelized purge operation */ MY_ALIGNED(CACHE_LINE_SIZE) ReadView view; /*!< The purge will not remove undo logs which are >= this view (purge view) */ - /** Total number of tasks submitted by srv_purge_coordinator_thread. - Not accessed by other threads. */ - ulint n_submitted; - /** Number of completed tasks. Accessed by srv_purge_coordinator - and srv_worker_thread by my_atomic. */ - ulint n_completed; + /** Number of not completed tasks. Accessed by srv_purge_coordinator + and srv_worker_thread by std::atomic. */ + std::atomic<ulint> n_tasks; /** Iterator to the undo log records of committed transactions */ struct iterator @@ -417,9 +218,14 @@ public: by the pq_mutex */ PQMutex pq_mutex; /*!< Mutex protecting purge_queue */ - undo::Truncate undo_trunc; /*!< Track UNDO tablespace marked - for truncate. 
*/ - + /** Undo tablespace file truncation (only accessed by the + srv_purge_coordinator_thread) */ + struct { + /** The undo tablespace that is currently being truncated */ + fil_space_t* current; + /** The undo tablespace that was last truncated */ + fil_space_t* last; + } truncate; /** Constructor. @@ -428,7 +234,7 @@ public: uninitialised. Real initialisation happens in create(). */ - purge_sys_t() : event(NULL), m_enabled(false) {} + purge_sys_t() : event(NULL), m_enabled(false), n_tasks(0) {} /** Create the instance */ @@ -438,39 +244,24 @@ public: void close(); /** @return whether purge is enabled */ - bool enabled() - { - return my_atomic_load32_explicit(&m_enabled, MY_MEMORY_ORDER_RELAXED); - } - /** @return whether purge is enabled */ - bool enabled_latched() - { - ut_ad(rw_lock_own_flagged(&latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - return bool(m_enabled); - } + bool enabled() { return m_enabled.load(std::memory_order_relaxed); } /** @return whether the purge coordinator is paused */ bool paused() - { return my_atomic_load32_explicit(&m_paused, MY_MEMORY_ORDER_RELAXED); } - /** @return whether the purge coordinator is paused */ - bool paused_latched() - { - ut_ad(rw_lock_own_flagged(&latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - return m_paused != 0; - } + { return m_paused != 0; } /** Enable purge at startup. 
Not protected by latch; the main thread will wait for purge_sys.enabled() in srv_start() */ void coordinator_startup() { ut_ad(!enabled()); - my_atomic_store32_explicit(&m_enabled, true, MY_MEMORY_ORDER_RELAXED); + m_enabled.store(true, std::memory_order_relaxed); } /** Disable purge at shutdown */ void coordinator_shutdown() { ut_ad(enabled()); - my_atomic_store32_explicit(&m_enabled, false, MY_MEMORY_ORDER_RELAXED); + m_enabled.store(false, std::memory_order_relaxed); } /** @return whether the purge coordinator thread is active */ diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h index a23b57ccc3e..d9ea6c19d11 100644 --- a/storage/innobase/include/trx0roll.h +++ b/storage/innobase/include/trx0roll.h @@ -42,16 +42,6 @@ trx_savept_take( /*============*/ trx_t* trx); /*!< in: transaction */ -/** Get the last undo log record of a transaction (for rollback). -@param[in,out] trx transaction -@param[out] roll_ptr DB_ROLL_PTR to the undo record -@param[in,out] heap memory heap for allocation -@return undo log record copied to heap -@retval NULL if none left or the roll_limit (savepoint) was reached */ -trx_undo_rec_t* -trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - /** Report progress when rolling back a row of a recovered transaction. 
*/ void trx_roll_report_progress(); /*******************************************************************//** diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic index 687a1d5b8d8..0cff8fa1f5c 100644 --- a/storage/innobase/include/trx0rseg.ic +++ b/storage/innobase/include/trx0rseg.ic @@ -41,7 +41,7 @@ trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr) || !srv_was_started); buf_block_t* block = buf_page_get(page_id_t(space->id, page_no), - univ_page_size, RW_X_LATCH, mtr); + 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_RSEG_HEADER); @@ -67,8 +67,7 @@ trx_rsegf_get_new( || !srv_was_started); ut_ad(space <= TRX_SYS_MAX_UNDO_SPACES || space == SRV_TMP_SPACE_ID); - block = buf_page_get( - page_id_t(space, page_no), univ_page_size, RW_X_LATCH, mtr); + block = buf_page_get(page_id_t(space, page_no), 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 913e2d25172..73f05eb5d48 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -68,17 +68,12 @@ trx_sys_rseg_find_free(const buf_block_t* sys_header); @param[in] rw whether to lock the page for writing @return the TRX_SYS page @retval NULL if the page cannot be read */ -inline -buf_block_t* -trx_sysf_get(mtr_t* mtr, bool rw = true) +inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true) { - buf_block_t* block = buf_page_get( - page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), - univ_page_size, rw ? RW_X_LATCH : RW_S_LATCH, mtr); - if (block) { - buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); - } - return block; + buf_block_t* block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), + 0, rw ? 
RW_X_LATCH : RW_S_LATCH, mtr); + ut_d(if (block) buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);) + return block; } #ifdef UNIV_DEBUG @@ -200,14 +195,13 @@ trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id) @param[in] sys_header TRX_SYS page @param[in] rseg_id rollback segment identifier @return undo page number */ -inline -uint32_t -trx_sysf_rseg_get_page_no(const buf_block_t* sys_header, ulint rseg_id) +inline uint32_t +trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id) { - ut_ad(rseg_id < TRX_SYS_N_RSEGS); - return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO - + rseg_id * TRX_SYS_RSEG_SLOT_SIZE - + sys_header->frame); + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame); } /** Maximum length of MySQL binlog file name, in bytes. @@ -344,9 +338,9 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ /*-------------------------------------------------------------*/ /** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */ -#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855 +constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855; /** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */ -#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386 +constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386; /** Size of the doublewrite block in pages */ #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE @@ -369,7 +363,7 @@ struct rw_trx_hash_element_t trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */ - trx_id_t no; + Atomic_counter<trx_id_t> no; trx_t *trx; ib_mutex_t mutex; }; @@ -716,11 +710,7 @@ public: because it may change even before this method returns. 
*/ - uint32_t size() - { - return uint32_t(my_atomic_load32_explicit(&hash.count, - MY_MEMORY_ORDER_RELAXED)); - } + uint32_t size() { return uint32_t(lf_hash_size(&hash)); } /** @@ -802,7 +792,7 @@ class trx_sys_t The smallest number not yet assigned as a transaction id or transaction number. Accessed and updated with atomic operations. */ - MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_max_trx_id; + MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<trx_id_t> m_max_trx_id; /** @@ -813,17 +803,17 @@ class trx_sys_t @sa assign_new_trx_no() @sa snapshot_ids() */ - MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_rw_trx_hash_version; + MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<trx_id_t> m_rw_trx_hash_version; + bool m_initialised; + +public: /** TRX_RSEG_HISTORY list length (number of committed transactions to purge) */ - MY_ALIGNED(CACHE_LINE_SIZE) int32 rseg_history_len; - - bool m_initialised; + MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<uint32_t> rseg_history_len; -public: /** Mutex protecting trx_list. */ MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex; @@ -899,9 +889,7 @@ public: trx_id_t get_max_trx_id() { - return static_cast<trx_id_t> - (my_atomic_load64_explicit(reinterpret_cast<int64*>(&m_max_trx_id), - MY_MEMORY_ORDER_RELAXED)); + return m_max_trx_id; } @@ -943,9 +931,7 @@ public: void assign_new_trx_no(trx_t *trx) { trx->no= get_new_trx_id_no_refresh(); - my_atomic_store64_explicit(reinterpret_cast<int64*> - (&trx->rw_trx_hash_element->no), - trx->no, MY_MEMORY_ORDER_RELAXED); + trx->rw_trx_hash_element->no= trx->no; refresh_rw_trx_hash_version(); } @@ -996,7 +982,8 @@ public: /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. 
*/ void init_max_trx_id(trx_id_t value) { - m_max_trx_id= m_rw_trx_hash_version= value; + m_max_trx_id= value; + m_rw_trx_hash_version.store(value, std::memory_order_relaxed); } @@ -1118,22 +1105,6 @@ public: return count; } - /** @return number of committed transactions waiting for purge */ - ulint history_size() const - { - return uint32(my_atomic_load32(&const_cast<trx_sys_t*>(this) - ->rseg_history_len)); - } - /** Add to the TRX_RSEG_HISTORY length (on database startup). */ - void history_add(int32 len) - { - my_atomic_add32(&rseg_history_len, len); - } - /** Register a committed transaction. */ - void history_insert() { history_add(1); } - /** Note that a committed transaction was purged. */ - void history_remove() { history_add(-1); } - private: static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element, trx_id_t *id) @@ -1164,8 +1135,7 @@ private: { if (element->id < arg->m_id) { - trx_id_t no= static_cast<trx_id_t>(my_atomic_load64_explicit( - reinterpret_cast<int64*>(&element->no), MY_MEMORY_ORDER_RELAXED)); + trx_id_t no= element->no; arg->m_ids->push_back(element->id); if (no < arg->m_no) arg->m_no= no; @@ -1177,18 +1147,14 @@ private: /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */ trx_id_t get_rw_trx_hash_version() { - return static_cast<trx_id_t> - (my_atomic_load64_explicit(reinterpret_cast<int64*> - (&m_rw_trx_hash_version), - MY_MEMORY_ORDER_ACQUIRE)); + return m_rw_trx_hash_version.load(std::memory_order_acquire); } /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. 
*/ void refresh_rw_trx_hash_version() { - my_atomic_add64_explicit(reinterpret_cast<int64*>(&m_rw_trx_hash_version), - 1, MY_MEMORY_ORDER_RELEASE); + m_rw_trx_hash_version.fetch_add(1, std::memory_order_release); } @@ -1207,8 +1173,7 @@ private: trx_id_t get_new_trx_id_no_refresh() { - return static_cast<trx_id_t>(my_atomic_add64_explicit( - reinterpret_cast<int64*>(&m_max_trx_id), 1, MY_MEMORY_ORDER_RELAXED)); + return m_max_trx_id++; } }; diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index feb27e56115..70df62d0d03 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -181,17 +181,6 @@ trx_start_for_ddl_low( trx_start_for_ddl_low((t), (o)) #endif /* UNIV_DEBUG */ -/****************************************************************//** -Commits a transaction. */ -void -trx_commit( -/*=======*/ - trx_t* trx); /*!< in/out: transaction */ - -/** Commit a transaction and a mini-transaction. -@param[in,out] trx transaction -@param[in,out] mtr mini-transaction (NULL if no modifications) */ -void trx_commit_low(trx_t* trx, mtr_t* mtr); /**********************************************************************//** Does the transaction commit for MySQL. @return DB_SUCCESS or error number */ @@ -447,31 +436,6 @@ Check transaction state */ ut_error; \ } while (0) -/** Check if transaction is free so that it can be re-initialized. 
-@param t transaction handle */ -#define assert_trx_is_free(t) do { \ - ut_ad(trx_state_eq((t), TRX_STATE_NOT_STARTED)); \ - ut_ad(!(t)->id); \ - ut_ad(!(t)->has_logged()); \ - ut_ad(!(t)->is_referenced()); \ - ut_ad(!(t)->is_wsrep()); \ - ut_ad(!(t)->read_view.is_open()); \ - ut_ad((t)->lock.wait_thr == NULL); \ - ut_ad(UT_LIST_GET_LEN((t)->lock.trx_locks) == 0); \ - ut_ad((t)->lock.table_locks.empty()); \ - ut_ad(!(t)->autoinc_locks \ - || ib_vector_is_empty((t)->autoinc_locks)); \ - ut_ad((t)->dict_operation == TRX_DICT_OP_NONE); \ -} while(0) - -/** Check if transaction is in-active so that it can be freed and put back to -transaction pool. -@param t transaction handle */ -#define assert_trx_is_inactive(t) do { \ - assert_trx_is_free((t)); \ - ut_ad((t)->dict_operation_lock_mode == 0); \ -} while(0) - #ifdef UNIV_DEBUG /*******************************************************************//** Assert that an autocommit non-locking select cannot be in the @@ -559,6 +523,11 @@ struct trx_lock_t { lock_sys.mutex. Otherwise, this may only be modified by the thread that is serving the running transaction. */ +#ifdef WITH_WSREP + bool was_chosen_as_wsrep_victim; + /*!< high priority wsrep thread has + marked this trx to abort */ +#endif /* WITH_WSREP */ /** Pre-allocated record locks */ struct { @@ -585,6 +554,9 @@ struct trx_lock_t { lock_list table_locks; /*!< All table locks requested by this transaction, including AUTOINC locks */ + /** List of pending trx_t::evict_table() */ + UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables; + bool cancel; /*!< true if the transaction is being rolled back either via deadlock detection or due to lock timeout. The @@ -675,7 +647,7 @@ with exactly one user transaction. There are some exceptions to this: * For DDL operations, a subtransaction is allocated that modifies the data dictionary tables. 
Lock waits and deadlocks are prevented by -acquiring the dict_operation_lock before starting the subtransaction +acquiring the dict_sys.latch before starting the subtransaction and releasing it after committing the subtransaction. * The purge system uses a special transaction that is not associated @@ -751,7 +723,7 @@ private: that it is no longer "active". */ - int32_t n_ref; + Atomic_counter<int32_t> n_ref; public: @@ -890,10 +862,10 @@ public: defer flush of the logs to disk until after we release the mutex. */ - bool must_flush_log_later;/*!< this flag is set to TRUE in - trx_commit() if flush_log_later was - TRUE, and there were modifications by - the transaction; in that case we must + bool must_flush_log_later;/*!< set in commit() + if flush_log_later was + set and redo log was written; + in that case we will flush the log in trx_commit_complete_for_mysql() */ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ @@ -913,8 +885,8 @@ public: ib_uint32_t dict_operation_lock_mode; /*!< 0, RW_S_LATCH, or RW_X_LATCH: the latch mode trx currently holds - on dict_operation_lock. Protected - by dict_operation_lock. */ + on dict_sys.latch. Protected + by dict_sys.latch. */ /** wall-clock time of the latest transition to TRX_STATE_ACTIVE; used for diagnostic purposes only */ @@ -1120,19 +1092,32 @@ public: /** Release any explicit locks of a committing transaction. */ inline void release_locks(); + /** Evict a table definition due to the rollback of ALTER TABLE. + @param[in] table_id table identifier */ + void evict_table(table_id_t table_id); + +private: + /** Mark a transaction committed in the main memory data structures. */ + inline void commit_in_memory(const mtr_t *mtr); +public: + /** Commit the transaction. */ + void commit(); + + /** Commit the transaction in a mini-transaction. 
+ @param mtr mini-transaction (if there are any persistent modifications) */ + void commit_low(mtr_t *mtr= nullptr); - bool is_referenced() - { - return my_atomic_load32_explicit(&n_ref, MY_MEMORY_ORDER_RELAXED) > 0; - } + + + bool is_referenced() const { return n_ref > 0; } void reference() { #ifdef UNIV_DEBUG - int32_t old_n_ref= + auto old_n_ref= #endif - my_atomic_add32_explicit(&n_ref, 1, MY_MEMORY_ORDER_RELAXED); + n_ref++; ut_ad(old_n_ref >= 0); } @@ -1140,13 +1125,33 @@ public: void release_reference() { #ifdef UNIV_DEBUG - int32_t old_n_ref= + auto old_n_ref= #endif - my_atomic_add32_explicit(&n_ref, -1, MY_MEMORY_ORDER_RELAXED); + n_ref--; ut_ad(old_n_ref > 0); } + void assert_freed() const + { + ut_ad(state == TRX_STATE_NOT_STARTED); + ut_ad(!id); + ut_ad(!has_logged()); + ut_ad(!is_referenced()); + ut_ad(!is_wsrep()); +#ifdef WITH_WSREP + ut_ad(!lock.was_chosen_as_wsrep_victim); +#endif + ut_ad(!read_view.is_open()); + ut_ad(!lock.wait_thr); + ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0); + ut_ad(lock.table_locks.empty()); + ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks)); + ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0); + ut_ad(dict_operation == TRX_DICT_OP_NONE); + } + + private: /** Assign a rollback segment for modifying temporary tables. @return the assigned rollback segment */ diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h index 7be4314ecbc..ce92e5de5e1 100644 --- a/storage/innobase/include/trx0undo.h +++ b/storage/innobase/include/trx0undo.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -79,27 +79,22 @@ trx_undo_trx_id_is_insert( /*======================*/ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ MY_ATTRIBUTE((warn_unused_result)); -/*****************************************************************//** -Writes a roll ptr to an index page. In case that the size changes in -some future version, this function should be used instead of -mach_write_... */ -UNIV_INLINE -void -trx_write_roll_ptr( -/*===============*/ - byte* ptr, /*!< in: pointer to memory where - written */ - roll_ptr_t roll_ptr); /*!< in: roll ptr */ -/*****************************************************************//** -Reads a roll ptr from an index page. In case that the roll ptr size -changes in some future version, this function should be used instead of -mach_read_... +/** Write DB_ROLL_PTR. +@param[out] ptr buffer +@param[in] roll_ptr DB_ROLL_PTR value */ +inline void trx_write_roll_ptr(byte* ptr, roll_ptr_t roll_ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + mach_write_to_7(ptr, roll_ptr); +} +/** Read DB_ROLL_PTR. +@param[in] ptr buffer @return roll ptr */ -UNIV_INLINE -roll_ptr_t -trx_read_roll_ptr( -/*==============*/ - const byte* ptr); /*!< in: pointer to memory from where to read */ +inline roll_ptr_t trx_read_roll_ptr(const byte* ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + return mach_read_from_7(ptr); +} /** Gets an undo log page and x-latches it. 
@param[in] page_id page id @@ -185,9 +180,7 @@ trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr) @param[in,out] undo undo log @param[in] limit all undo logs after this limit will be discarded @param[in] is_temp whether this is temporary undo log */ -void -trx_undo_truncate_end(trx_undo_t* undo, undo_no_t limit, bool is_temp) - MY_ATTRIBUTE((nonnull)); +void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp); /** Truncate the head of an undo log. NOTE that only whole pages are freed; the header page is not @@ -315,16 +308,17 @@ trx_undo_mem_create_at_db_start(trx_rseg_t* rseg, ulint id, ulint page_no, and delete markings: in short, modifys (the name 'UPDATE' is a historical relic) */ -/* States of an undo log segment */ -#define TRX_UNDO_ACTIVE 1 /* contains an undo log of an active - transaction */ -#define TRX_UNDO_CACHED 2 /* cached for quick reuse */ -#define TRX_UNDO_TO_FREE 3 /* insert undo segment can be freed */ -#define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be - reused: it can be freed in purge when - all undo data in it is removed */ -#define TRX_UNDO_PREPARED 5 /* contains an undo log of an - prepared transaction */ +/* TRX_UNDO_STATE values of an undo log segment */ +/** contains an undo log of an active transaction */ +constexpr uint16_t TRX_UNDO_ACTIVE = 1; +/** cached for quick reuse */ +constexpr uint16_t TRX_UNDO_CACHED = 2; +/** old_insert undo segment that can be freed */ +constexpr uint16_t TRX_UNDO_TO_FREE = 3; +/** can be freed in purge when all undo data in it is removed */ +constexpr uint16_t TRX_UNDO_TO_PURGE = 4; +/** contains an undo log of a prepared transaction */ +constexpr uint16_t TRX_UNDO_PREPARED = 5; #ifndef UNIV_INNOCHECKSUM diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic index 19697c6054c..6d1ec16869e 100644 --- a/storage/innobase/include/trx0undo.ic +++ b/storage/innobase/include/trx0undo.ic @@ -1,7 +1,7 @@ 
/***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -103,37 +103,6 @@ trx_undo_trx_id_is_insert( return bool(trx_id[DATA_TRX_ID_LEN] >> 7); } -/*****************************************************************//** -Writes a roll ptr to an index page. In case that the size changes in -some future version, this function should be used instead of -mach_write_... */ -UNIV_INLINE -void -trx_write_roll_ptr( -/*===============*/ - byte* ptr, /*!< in: pointer to memory where - written */ - roll_ptr_t roll_ptr) /*!< in: roll ptr */ -{ - compile_time_assert(DATA_ROLL_PTR_LEN == 7); - mach_write_to_7(ptr, roll_ptr); -} - -/*****************************************************************//** -Reads a roll ptr from an index page. In case that the roll ptr size -changes in some future version, this function should be used instead of -mach_read_... -@return roll ptr */ -UNIV_INLINE -roll_ptr_t -trx_read_roll_ptr( -/*==============*/ - const byte* ptr) /*!< in: pointer to memory from where to read */ -{ - compile_time_assert(DATA_ROLL_PTR_LEN == 7); - return(mach_read_from_7(ptr)); -} - /** Gets an undo log page and x-latches it. 
@param[in] page_id page id @param[in,out] mtr mini-transaction @@ -142,8 +111,7 @@ UNIV_INLINE page_t* trx_undo_page_get(const page_id_t page_id, mtr_t* mtr) { - buf_block_t* block = buf_page_get(page_id, univ_page_size, - RW_X_LATCH, mtr); + buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); @@ -158,8 +126,7 @@ UNIV_INLINE page_t* trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr) { - buf_block_t* block = buf_page_get(page_id, univ_page_size, - RW_S_LATCH, mtr); + buf_block_t* block = buf_page_get(page_id, 0, RW_S_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 001690a47a1..99e493acfb4 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -77,6 +77,7 @@ used throughout InnoDB but do not include too much themselves. They support cross-platform development and expose comonly used SQL names. */ #include <my_global.h> +#include "my_counter.h" /* JAN: TODO: missing 5.7 header */ #ifdef HAVE_MY_THREAD_H diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h index a04a674751c..646a5f367c2 100644 --- a/storage/innobase/include/ut0counter.h +++ b/storage/innobase/include/ut0counter.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,7 +30,6 @@ Created 2012/04/12 by Sunny Bains #include "os0thread.h" #include "my_rdtsc.h" -#include "my_atomic.h" /** CPU cache line size */ #ifdef CPU_LEVEL1_DCACHE_LINESIZE @@ -42,120 +41,85 @@ Created 2012/04/12 by Sunny Bains /** Default number of slots to use in ib_counter_t */ #define IB_N_SLOTS 64 -/** Get the offset into the counter array. */ -template <typename Type, int N> -struct generic_indexer_t { - /** @return offset within m_counter */ - static size_t offset(size_t index) UNIV_NOTHROW - { - return(((index % N) + 1) * (CACHE_LINE_SIZE / sizeof(Type))); - } -}; +/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles +as a random value. See the comments for my_timer_cycles() */ +/** @return result from RDTSC or similar functions. */ +static inline size_t +get_rnd_value() +{ + size_t c = static_cast<size_t>(my_timer_cycles()); + + if (c != 0) { + return c; + } -/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles, -to index into the counter array. See the comments for my_timer_cycles() */ -template <typename Type=ulint, int N=1> -struct counter_indexer_t : public generic_indexer_t<Type, N> { - /** @return result from RDTSC or similar functions. */ - static size_t get_rnd_index() UNIV_NOTHROW - { - size_t c = static_cast<size_t>(my_timer_cycles()); - - if (c != 0) { - return(c); - } else { - /* We may go here if my_timer_cycles() returns 0, - so we have to have the plan B for the counter. */ + /* We may go here if my_timer_cycles() returns 0, + so we have to have the plan B for the counter. 
*/ #if !defined(_WIN32) - return(size_t(os_thread_get_curr_id())); + return (size_t)os_thread_get_curr_id(); #else - LARGE_INTEGER cnt; - QueryPerformanceCounter(&cnt); + LARGE_INTEGER cnt; + QueryPerformanceCounter(&cnt); - return(static_cast<size_t>(cnt.QuadPart)); + return static_cast<size_t>(cnt.QuadPart); #endif /* !_WIN32 */ - } - } +} - /** @return a random offset to the array */ - static size_t get_rnd_offset() UNIV_NOTHROW - { - return(generic_indexer_t<Type, N>::offset(get_rnd_index())); - } -}; - -#define default_indexer_t counter_indexer_t - -/** Class for using fuzzy counters. The counter is relaxed atomic +/** Class for using fuzzy counters. The counter is multi-instance relaxed atomic so the results are not guaranteed to be 100% accurate but close enough. Creates an array of counters and separates each element by the CACHE_LINE_SIZE bytes */ -template < - typename Type, - int N = IB_N_SLOTS, - template<typename, int> class Indexer = default_indexer_t> -struct MY_ALIGNED(CACHE_LINE_SIZE) ib_counter_t -{ +template <typename Type, int N = IB_N_SLOTS> +struct ib_counter_t { /** Increment the counter by 1. */ - void inc() UNIV_NOTHROW { add(1); } + void inc() { add(1); } /** Increment the counter by 1. @param[in] index a reasonably thread-unique identifier */ - void inc(size_t index) UNIV_NOTHROW { add(index, 1); } + void inc(size_t index) { add(index, 1); } /** Add to the counter. @param[in] n amount to be added */ - void add(Type n) UNIV_NOTHROW { add(m_policy.get_rnd_offset(), n); } + void add(Type n) { add(get_rnd_value(), n); } /** Add to the counter. 
@param[in] index a reasonably thread-unique identifier @param[in] n amount to be added */ - void add(size_t index, Type n) UNIV_NOTHROW { - size_t i = m_policy.offset(index); - - ut_ad(i < UT_ARR_SIZE(m_counter)); - - if (sizeof(Type) == 8) { - my_atomic_add64_explicit( - reinterpret_cast<int64*>(&m_counter[i]), - static_cast<int64>(n), MY_MEMORY_ORDER_RELAXED); - } else if (sizeof(Type) == 4) { - my_atomic_add32_explicit( - reinterpret_cast<int32*>(&m_counter[i]), - static_cast<int32>(n), MY_MEMORY_ORDER_RELAXED); - } - compile_time_assert(sizeof(Type) == 8 || sizeof(Type) == 4); + void add(size_t index, Type n) { + index = index % N; + + ut_ad(index < UT_ARR_SIZE(m_counter)); + + m_counter[index].value.fetch_add(n, std::memory_order_relaxed); } - /* @return total value - not 100% accurate, since it is relaxed atomic. */ - operator Type() const UNIV_NOTHROW { + /* @return total value - not 100% accurate, since it is relaxed atomic*/ + operator Type() const { Type total = 0; - for (size_t i = 0; i < N; ++i) { - if (sizeof(Type) == 8) { - total += static_cast< - Type>(my_atomic_load64_explicit( - reinterpret_cast<int64*>(const_cast<Type*>( - &m_counter[m_policy.offset(i)])), - MY_MEMORY_ORDER_RELAXED)); - } else if (sizeof(Type) == 4) { - total += static_cast< - Type>(my_atomic_load32_explicit( - reinterpret_cast<int32*>(const_cast<Type*>( - &m_counter[m_policy.offset(i)])), - MY_MEMORY_ORDER_RELAXED)); - } + for (const auto &counter : m_counter) { + total += counter.value.load(std::memory_order_relaxed); } return(total); } private: - /** Indexer into the array */ - Indexer<Type, N>m_policy; - - /** Slot 0 is unused. */ - Type m_counter[(N + 1) * (CACHE_LINE_SIZE / sizeof(Type))]; + /** Atomic which occupies whole CPU cache line. + Note: We rely on the default constructor of std::atomic and + do not explicitly initialize the contents. 
This works for us, + because ib_counter_t is only intended for usage with global + memory that is allocated from the .bss and thus guaranteed to + be zero-initialized by the run-time environment. + @see srv_stats + @see rw_lock_stats */ + struct ib_counter_element_t { + MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<Type> value; + }; + static_assert(sizeof(ib_counter_element_t) == CACHE_LINE_SIZE, ""); + + /** Array of counter elements */ + MY_ALIGNED(CACHE_LINE_SIZE) ib_counter_element_t m_counter[N]; }; #endif /* ut0counter_h */ diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h index 68af6882155..f2c1b7e82b6 100644 --- a/storage/innobase/include/ut0crc32.h +++ b/storage/innobase/include/ut0crc32.h @@ -47,12 +47,6 @@ typedef uint32_t (*ut_crc32_func_t)(const byte* ptr, ulint len); /** Pointer to CRC32 calculation function. */ extern ut_crc32_func_t ut_crc32; -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Pointer to CRC32 calculation function, which uses big-endian byte order -when converting byte strings to integers internally. */ -extern uint32_t ut_crc32_legacy_big_endian(const byte* buf, ulint len); -#endif /* INNODB_BUG_ENDIAN_CRC32 */ - /** Text description of CRC32 implementation */ extern const char* ut_crc32_implementation; diff --git a/storage/innobase/include/ut0mutex.h b/storage/innobase/include/ut0mutex.h index 1f99ee17a24..d7d48cd1f28 100644 --- a/storage/innobase/include/ut0mutex.h +++ b/storage/innobase/include/ut0mutex.h @@ -38,8 +38,6 @@ Created 2012-03-24 Sunny Bains. 
@param[in] T The resulting typedef alias */ #define UT_MUTEX_TYPE(M, P, T) typedef PolicyMutex<M<P> > T; -typedef OSMutex EventMutex; - # ifdef HAVE_IB_LINUX_FUTEX UT_MUTEX_TYPE(TTASFutexMutex, GenericPolicy, FutexMutex); UT_MUTEX_TYPE(TTASFutexMutex, BlockMutexPolicy, BlockFutexMutex); diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h index 9af8687bfd0..5b1ae5bc0da 100644 --- a/storage/innobase/include/ut0rnd.h +++ b/storage/innobase/include/ut0rnd.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -32,7 +32,7 @@ Created 1/20/1994 Heikki Tuuri #ifndef UNIV_INNOCHECKSUM /** Seed value of ut_rnd_gen() */ -extern int32 ut_rnd_current; +extern std::atomic<uint32_t> ut_rnd_current; /** @return a pseudo-random 32-bit number */ inline uint32_t ut_rnd_gen() @@ -45,8 +45,7 @@ inline uint32_t ut_rnd_gen() x^19+x^18+x^14+x^13+x^11+x^10+x^9+x^8+x^6+1 */ const uint32_t crc32c= 0x1edc6f41; - uint32_t rnd= my_atomic_load32_explicit(&ut_rnd_current, - MY_MEMORY_ORDER_RELAXED); + uint32_t rnd= ut_rnd_current.load(std::memory_order_relaxed); if (UNIV_UNLIKELY(rnd == 0)) { @@ -61,7 +60,7 @@ inline uint32_t ut_rnd_gen() rnd^= crc32c; } - my_atomic_store32_explicit(&ut_rnd_current, rnd, MY_MEMORY_ORDER_RELAXED); + ut_rnd_current.store(rnd, std::memory_order_relaxed); return rnd; } diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index a6a70c99ecf..430b99d7667 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -46,7 +46,6 @@ Created 1/20/1994 Heikki Tuuri #include <stdarg.h> #include <string> -#include <my_atomic.h> /** 
Index name prefix in fast index creation, as a string constant */ #define TEMP_INDEX_PREFIX_STR "\377" @@ -146,12 +145,6 @@ ut_2_power_up( ulint n) /*!< in: number != 0 */ MY_ATTRIBUTE((const)); -/** Determine how many bytes (groups of 8 bits) are needed to -store the given number of bits. -@param b in: bits -@return number of bytes (octets) needed to represent b */ -#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8) - /**********************************************************//** Returns the number of milliseconds since some epoch. The value may wrap around. It should only be used for heuristic @@ -162,6 +155,12 @@ ut_time_ms(void); /*============*/ #endif /* !UNIV_INNOCHECKSUM */ +/** Determine how many bytes (groups of 8 bits) are needed to +store the given number of bits. +@param b in: bits +@return number of bytes (octets) needed to represent b */ +#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8) + /** Determines if a number is zero or a power of two. @param[in] n number @return nonzero if n is zero or a power of two; zero otherwise */ diff --git a/storage/innobase/innodb.cmake b/storage/innobase/innodb.cmake index 8bfca3a614b..07ed7ed67ab 100644 --- a/storage/innobase/innodb.cmake +++ b/storage/innobase/innodb.cmake @@ -33,7 +33,6 @@ MYSQL_CHECK_LZMA() MYSQL_CHECK_BZIP2() MYSQL_CHECK_SNAPPY() MYSQL_CHECK_NUMA() -TEST_BIG_ENDIAN(IS_BIG_ENDIAN) INCLUDE(${MYSQL_CMAKE_SCRIPT_DIR}/compile_flags.cmake) @@ -122,11 +121,6 @@ ELSEIF(WITH_INNODB_ROOT_GUESS) ADD_DEFINITIONS(-DBTR_CUR_ADAPT) ENDIF() -OPTION(WITH_INNODB_BUG_ENDIAN_CRC32 "Weaken innodb_checksum_algorithm=crc32 by supporting upgrade from big-endian systems running 5.6/10.0/10.1" ${IS_BIG_ENDIAN}) -IF(WITH_INNODB_BUG_ENDIAN_CRC32) - ADD_DEFINITIONS(-DINNODB_BUG_ENDIAN_CRC32) -ENDIF() - OPTION(WITH_INNODB_EXTRA_DEBUG "Enable extra InnoDB debug checks" OFF) IF(WITH_INNODB_EXTRA_DEBUG) ADD_DEFINITIONS(-DUNIV_ZIP_DEBUG) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 
1c7407a0c23..86708818b36 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -329,7 +329,7 @@ lock_report_trx_id_insanity( trx_id_t max_trx_id) /*!< in: trx_sys.get_max_trx_id() */ { ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); ib::error() << "Transaction id " << ib::hex(trx_id) @@ -352,7 +352,7 @@ lock_check_trx_id_sanity( const rec_offs* offsets) /*!< in: rec_get_offsets(rec, index) */ { ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); trx_id_t max_trx_id= trx_sys.get_max_trx_id(); ut_ad(max_trx_id || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN); @@ -381,7 +381,7 @@ lock_clust_rec_cons_read_sees( ut_ad(dict_index_is_clust(index)); ut_ad(page_rec_is_user_rec(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); /* Temp-tables are not shared across connections and multiple transactions from different connections cannot simultaneously @@ -420,7 +420,7 @@ lock_sec_rec_cons_read_sees( { ut_ad(page_rec_is_user_rec(rec)); ut_ad(!index->is_primary()); - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); /* NOTE that we might call this function while holding the search system latch. 
*/ @@ -759,9 +759,7 @@ lock_rec_has_to_wait( << wsrep_thd_query(lock2->trx->mysql_thd); } - if (wsrep_trx_order_before(trx->mysql_thd, - lock2->trx->mysql_thd) - && (type_mode & LOCK_MODE_MASK) == LOCK_X + if ((type_mode & LOCK_MODE_MASK) == LOCK_X && (lock2->type_mode & LOCK_MODE_MASK) == LOCK_X) { if (for_locking || UNIV_UNLIKELY(wsrep_debug)) { /* exclusive lock conflicts are not @@ -771,12 +769,11 @@ lock_rec_has_to_wait( << type_mode << " supremum: " << lock_is_on_supremum << "conflicts states: my " - << wsrep_thd_conflict_state( - trx->mysql_thd, FALSE) + << wsrep_thd_transaction_state_str( + trx->mysql_thd) << " locked " - << wsrep_thd_conflict_state( - lock2->trx->mysql_thd, - FALSE); + << wsrep_thd_transaction_state_str( + lock2->trx->mysql_thd); lock_rec_print(stderr, lock2, mtr); ib::info() << " SQL1: " << wsrep_thd_query(trx->mysql_thd) @@ -1093,12 +1090,15 @@ wsrep_kill_victim( /* quit for native mysql */ if (!trx->is_wsrep()) return; - my_bool bf_this = wsrep_thd_is_BF(trx->mysql_thd, FALSE); + if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + return; + } + my_bool bf_other = wsrep_thd_is_BF(lock->trx->mysql_thd, TRUE); mtr_t mtr; - if ((bf_this && !bf_other) || - (bf_this && bf_other && wsrep_trx_order_before( + if ((!bf_other) || + (wsrep_thd_order_before( trx->mysql_thd, lock->trx->mysql_thd))) { if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { @@ -1109,11 +1109,7 @@ wsrep_kill_victim( is in the queue*/ } else if (lock->trx != trx) { if (wsrep_log_conflicts) { - if (bf_this) { - ib::info() << "*** Priority TRANSACTION:"; - } else { - ib::info() << "*** Victim TRANSACTION:"; - } + ib::info() << "*** Priority TRANSACTION:"; trx_print_latched(stderr, trx, 3000); @@ -1139,7 +1135,7 @@ wsrep_kill_victim( } wsrep_innobase_kill_one_trx(trx->mysql_thd, - trx, lock->trx, TRUE); + lock->trx, true); } } } @@ -1214,7 +1210,7 @@ lock_sec_rec_some_has_impl( ut_ad(!dict_index_is_clust(index)); ut_ad(page_rec_is_user_rec(rec)); ut_ad(rec_offs_validate(rec, 
index, offsets)); - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); max_trx_id = page_get_max_trx_id(page); @@ -1423,7 +1419,7 @@ lock_rec_create_low( lock_t *prev = NULL; while (hash && wsrep_thd_is_BF(hash->trx->mysql_thd, TRUE) - && wsrep_trx_order_before(hash->trx->mysql_thd, + && wsrep_thd_order_before(hash->trx->mysql_thd, trx->mysql_thd)) { prev = hash; hash = (lock_t *)hash->hash; @@ -1829,15 +1825,15 @@ lock_rec_add_to_queue( ib::info() << "WSREP BF lock conflict for my lock:\n BF:" << ((wsrep_thd_is_BF(trx->mysql_thd, FALSE)) ? "BF" : "normal") << " exec: " << - wsrep_thd_exec_mode(trx->mysql_thd) << " conflict: " << - wsrep_thd_conflict_state(trx->mysql_thd, false) << " seqno: " << + wsrep_thd_client_state_str(trx->mysql_thd) << " conflict: " << + wsrep_thd_transaction_state_str(trx->mysql_thd) << " seqno: " << wsrep_thd_trx_seqno(trx->mysql_thd) << " SQL: " << wsrep_thd_query(trx->mysql_thd); trx_t* otrx = other_lock->trx; ib::info() << "WSREP other lock:\n BF:" << ((wsrep_thd_is_BF(otrx->mysql_thd, FALSE)) ? "BF" : "normal") << " exec: " << - wsrep_thd_exec_mode(otrx->mysql_thd) << " conflict: " << - wsrep_thd_conflict_state(otrx->mysql_thd, false) << " seqno: " << + wsrep_thd_client_state_str(otrx->mysql_thd) << " conflict: " << + wsrep_thd_transaction_state_str(otrx->mysql_thd) << " seqno: " << wsrep_thd_trx_seqno(otrx->mysql_thd) << " SQL: " << wsrep_thd_query(otrx->mysql_thd); } @@ -4243,6 +4239,7 @@ lock_check_dict_lock( const lock_t* lock) /*!< in: lock to check */ { if (lock_get_type_low(lock) == LOCK_REC) { + ut_ad(!lock->index->table->is_temporary()); /* Check if the transcation locked a record in a system table in X mode. 
It should have set @@ -4256,9 +4253,8 @@ lock_check_dict_lock( } else { ut_ad(lock_get_type_low(lock) & LOCK_TABLE); - const dict_table_t* table; - - table = lock->un_member.tab_lock.table; + const dict_table_t* table = lock->un_member.tab_lock.table; + ut_ad(!table->is_temporary()); /* Check if the transcation locked a system table in IX mode. It should have set the dict_op code @@ -4590,14 +4586,14 @@ lock_print_info_summary( fprintf(file, "Purge done for trx's n:o < " TRX_ID_FMT " undo n:o < " TRX_ID_FMT " state: %s\n" - "History list length " ULINTPF "\n", + "History list length %u\n", purge_sys.tail.trx_no(), purge_sys.tail.undo_no, purge_sys.enabled() ? (purge_sys.running() ? "running" : purge_sys.paused() ? "stopped" : "running but idle") : "disabled", - trx_sys.history_size()); + uint32_t{trx_sys.rseg_history_len}); #ifdef PRINT_NUM_OF_LOCK_STRUCTS fprintf(file, @@ -4895,8 +4891,8 @@ func_exit: if (!lock_get_wait(other_lock) ) { ib::info() << "WSREP impl BF lock conflict for my impl lock:\n BF:" << ((wsrep_thd_is_BF(impl_trx->mysql_thd, FALSE)) ? "BF" : "normal") << " exec: " << - wsrep_thd_exec_mode(impl_trx->mysql_thd) << " conflict: " << - wsrep_thd_conflict_state(impl_trx->mysql_thd, false) << " seqno: " << + wsrep_thd_client_state_str(impl_trx->mysql_thd) << " conflict: " << + wsrep_thd_transaction_state_str(impl_trx->mysql_thd) << " seqno: " << wsrep_thd_trx_seqno(impl_trx->mysql_thd) << " SQL: " << wsrep_thd_query(impl_trx->mysql_thd); @@ -4904,8 +4900,8 @@ func_exit: ib::info() << "WSREP other lock:\n BF:" << ((wsrep_thd_is_BF(otrx->mysql_thd, FALSE)) ? 
"BF" : "normal") << " exec: " << - wsrep_thd_exec_mode(otrx->mysql_thd) << " conflict: " << - wsrep_thd_conflict_state(otrx->mysql_thd, false) << " seqno: " << + wsrep_thd_client_state_str(otrx->mysql_thd) << " conflict: " << + wsrep_thd_transaction_state_str(otrx->mysql_thd) << " seqno: " << wsrep_thd_trx_seqno(otrx->mysql_thd) << " SQL: " << wsrep_thd_query(otrx->mysql_thd); } @@ -5123,7 +5119,7 @@ lock_rec_block_validate( block = buf_page_get_gen( page_id_t(space_id, page_no), - page_size_t(space->flags), + space->zip_size(), RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, __FILE__, __LINE__, &mtr, &err); @@ -5263,7 +5259,7 @@ lock_rec_insert_check_and_lock( trx_t* trx = thr_get_trx(thr); const rec_t* next_rec = page_rec_get_next_const(rec); ulint heap_no = page_rec_get_heap_no(next_rec); - ut_ad(!rec_is_metadata(next_rec, index)); + ut_ad(!rec_is_metadata(next_rec, *index)); lock_mutex_enter(); /* Because this code is invoked for a running transaction by @@ -5391,7 +5387,7 @@ lock_rec_convert_impl_to_expl_for_trx( { ut_ad(trx->is_referenced()); ut_ad(page_rec_is_leaf(rec)); - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx"); lock_mutex_enter(); @@ -5524,7 +5520,7 @@ lock_rec_convert_impl_to_expl( ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); ut_ad(page_rec_is_leaf(rec)); - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); if (dict_index_is_clust(index)) { trx_id_t trx_id; @@ -5601,7 +5597,7 @@ lock_clust_rec_modify_check_and_lock( return(DB_SUCCESS); } - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); ut_ad(!index->table->is_temporary()); heap_no = rec_offs_comp(offsets) @@ -5657,7 +5653,7 @@ lock_sec_rec_modify_check_and_lock( ut_ad(block->frame == page_align(rec)); ut_ad(mtr->is_named_space(index->table->space)); ut_ad(page_rec_is_leaf(rec)); - 
ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); if (flags & BTR_NO_LOCKING_FLAG) { @@ -5751,7 +5747,7 @@ lock_sec_rec_read_check_and_lock( return(DB_SUCCESS); } - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); heap_no = page_rec_get_heap_no(rec); /* Some transaction may have an implicit x-lock on the record only @@ -5813,7 +5809,7 @@ lock_clust_rec_read_check_and_lock( || gap_mode == LOCK_REC_NOT_GAP); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(page_rec_is_leaf(rec)); - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); if ((flags & BTR_NO_LOCKING_FLAG) || srv_read_only_mode @@ -6084,10 +6080,8 @@ lock_get_table_id( /*==============*/ const lock_t* lock) /*!< in: lock */ { - dict_table_t* table; - - table = lock_get_table(lock); - + dict_table_t* table = lock_get_table(lock); + ut_ad(!table->is_temporary()); return(table->id); } @@ -6263,6 +6257,12 @@ lock_trx_handle_wait( /*=================*/ trx_t* trx) /*!< in/out: trx lock state */ { +#ifdef WITH_WSREP + /* We already own mutexes */ + if (trx->lock.was_chosen_as_wsrep_victim) { + return lock_trx_handle_wait_low(trx); + } +#endif /* WITH_WSREP */ lock_mutex_enter(); trx_mutex_enter(trx); dberr_t err = lock_trx_handle_wait_low(trx); @@ -6866,6 +6866,11 @@ DeadlockChecker::trx_rollback() trx_t* trx = m_wait_lock->trx; print("*** WE ROLL BACK TRANSACTION (1)\n"); +#ifdef WITH_WSREP + if (trx->is_wsrep() && wsrep_thd_is_SR(trx->mysql_thd)) { + wsrep_handle_SR_rollback(m_start->mysql_thd, trx->mysql_thd); + } +#endif trx_mutex_enter(trx); @@ -6952,6 +6957,12 @@ DeadlockChecker::check_and_resolve(const lock_t* lock, trx_t* trx) if (victim_trx != NULL) { print("*** WE ROLL BACK TRANSACTION (2)\n"); +#ifdef WITH_WSREP + if (trx->is_wsrep() && wsrep_thd_is_SR(trx->mysql_thd)) { + wsrep_handle_SR_rollback(trx->mysql_thd, + victim_trx->mysql_thd); + } +#endif lock_deadlock_found = true; } diff --git 
a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc index 90fe1ccd626..b1f9b3c7d9b 100644 --- a/storage/innobase/lock/lock0wait.cc +++ b/storage/innobase/lock/lock0wait.cc @@ -281,7 +281,7 @@ lock_wait_suspend_thread( if (thr->lock_state == QUE_THR_LOCK_ROW) { srv_stats.n_lock_wait_count.inc(); - srv_stats.n_lock_wait_current_count.inc(); + srv_stats.n_lock_wait_current_count++; start_time = my_interval_timer(); } @@ -384,7 +384,7 @@ lock_wait_suspend_thread( thd_storage_lock_wait(trx->mysql_thd, diff_time); } - srv_stats.n_lock_wait_current_count.dec(); + srv_stats.n_lock_wait_current_count--; DBUG_EXECUTE_IF("lock_instrument_slow_query_log", os_thread_sleep(1000);); diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc index a2e4ac1dd40..ff055131a6f 100644 --- a/storage/innobase/log/log0crypt.cc +++ b/storage/innobase/log/log0crypt.cc @@ -39,10 +39,9 @@ my_bool srv_encrypt_log; /** Redo log encryption key ID */ #define LOG_DEFAULT_ENCRYPTION_KEY 1 -typedef union { - uint32_t words[MY_AES_BLOCK_SIZE / sizeof(uint32_t)]; +struct aes_block_t { byte bytes[MY_AES_BLOCK_SIZE]; -} aes_block_t; +}; struct crypt_info_t { ulint checkpoint_no; /*!< checkpoint no; 32 bits */ @@ -85,19 +84,63 @@ log_block_get_start_lsn( return start_lsn; } +/** Generate crypt key from crypt msg. +@param[in,out] info encryption key +@param[in] upgrade whether to use the key in MariaDB 10.1 format +@return whether the operation was successful */ +static bool init_crypt_key(crypt_info_t* info, bool upgrade = false) +{ + byte mysqld_key[MY_AES_MAX_KEY_LENGTH]; + uint keylen = sizeof mysqld_key; + + compile_time_assert(16 == sizeof info->crypt_key.bytes); + compile_time_assert(16 == MY_AES_BLOCK_SIZE); + + if (uint rc = encryption_key_get(LOG_DEFAULT_ENCRYPTION_KEY, + info->key_version, mysqld_key, + &keylen)) { + ib::error() + << "Obtaining redo log encryption key version " + << info->key_version << " failed (" << rc + << "). 
Maybe the key or the required encryption " + "key management plugin was not found."; + return false; + } + + if (upgrade) { + while (keylen < sizeof mysqld_key) { + mysqld_key[keylen++] = 0; + } + } + + uint dst_len; + int err= my_aes_crypt(MY_AES_ECB, + ENCRYPTION_FLAG_NOPAD | ENCRYPTION_FLAG_ENCRYPT, + info->crypt_msg.bytes, MY_AES_BLOCK_SIZE, + info->crypt_key.bytes, &dst_len, + mysqld_key, keylen, NULL, 0); + + if (err != MY_AES_OK || dst_len != MY_AES_BLOCK_SIZE) { + ib::error() << "Getting redo log crypto key failed: err = " + << err << ", len = " << dst_len; + return false; + } + + return true; +} + /** Encrypt or decrypt log blocks. @param[in,out] buf log blocks to encrypt or decrypt @param[in] lsn log sequence number of the start of the buffer @param[in] size size of the buffer, in bytes -@param[in] decrypt whether to decrypt instead of encrypting */ -UNIV_INTERN -void -log_crypt(byte* buf, lsn_t lsn, ulint size, bool decrypt) +@param[in] op whether to decrypt, encrypt, or rotate key and encrypt +@return whether the operation succeeded (encrypt always does) */ +bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op) { ut_ad(size % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(ulint(buf) % OS_FILE_LOG_BLOCK_SIZE == 0); ut_a(info.key_version); - uint dst_len; uint32_t aes_ctr_iv[MY_AES_BLOCK_SIZE / sizeof(uint32_t)]; compile_time_assert(sizeof(uint32_t) == 4); @@ -106,7 +149,8 @@ log_crypt(byte* buf, lsn_t lsn, ulint size, bool decrypt) for (const byte* const end = buf + size; buf != end; buf += OS_FILE_LOG_BLOCK_SIZE, lsn += OS_FILE_LOG_BLOCK_SIZE) { - uint32_t dst[(OS_FILE_LOG_BLOCK_SIZE - LOG_CRYPT_HDR_SIZE) + uint32_t dst[(OS_FILE_LOG_BLOCK_SIZE - LOG_CRYPT_HDR_SIZE + - LOG_BLOCK_CHECKSUM) / sizeof(uint32_t)]; /* The log block number is not encrypted. 
*/ @@ -126,64 +170,61 @@ log_crypt(byte* buf, lsn_t lsn, ulint size, bool decrypt) ut_ad(log_block_get_start_lsn(lsn, log_block_get_hdr_no(buf)) == lsn); + byte* key_ver = &buf[OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_KEY + - LOG_BLOCK_CHECKSUM]; + const uint dst_size + = log_sys.log.format == log_t::FORMAT_ENC_10_4 + ? sizeof dst - LOG_BLOCK_KEY + : sizeof dst; + if (log_sys.log.format == log_t::FORMAT_ENC_10_4) { + const uint key_version = info.key_version; + switch (op) { + case LOG_ENCRYPT_ROTATE_KEY: + info.key_version + = encryption_key_get_latest_version( + LOG_DEFAULT_ENCRYPTION_KEY); + if (key_version != info.key_version + && !init_crypt_key(&info)) { + info.key_version = key_version; + } + /* fall through */ + case LOG_ENCRYPT: + mach_write_to_4(key_ver, info.key_version); + break; + case LOG_DECRYPT: + info.key_version = mach_read_from_4(key_ver); + if (key_version != info.key_version + && !init_crypt_key(&info)) { + return false; + } + } +#ifndef DBUG_OFF + if (key_version != info.key_version) { + DBUG_PRINT("ib_log", ("key_version: %x -> %x", + key_version, + info.key_version)); + } +#endif /* !DBUG_OFF */ + } + ut_ad(LOG_CRYPT_HDR_SIZE + dst_size + == log_sys.trailer_offset()); + + uint dst_len; int rc = encryption_crypt( - buf + LOG_CRYPT_HDR_SIZE, sizeof dst, + buf + LOG_CRYPT_HDR_SIZE, dst_size, reinterpret_cast<byte*>(dst), &dst_len, const_cast<byte*>(info.crypt_key.bytes), - sizeof info.crypt_key, + MY_AES_BLOCK_SIZE, reinterpret_cast<byte*>(aes_ctr_iv), sizeof aes_ctr_iv, - decrypt + op == LOG_DECRYPT ? ENCRYPTION_FLAG_DECRYPT | ENCRYPTION_FLAG_NOPAD : ENCRYPTION_FLAG_ENCRYPT | ENCRYPTION_FLAG_NOPAD, LOG_DEFAULT_ENCRYPTION_KEY, info.key_version); - ut_a(rc == MY_AES_OK); - ut_a(dst_len == sizeof dst); - memcpy(buf + LOG_CRYPT_HDR_SIZE, dst, sizeof dst); - } -} - -/** Generate crypt key from crypt msg. 
-@param[in,out] info encryption key -@param[in] upgrade whether to use the key in MariaDB 10.1 format -@return whether the operation was successful */ -static bool init_crypt_key(crypt_info_t* info, bool upgrade = false) -{ - byte mysqld_key[MY_AES_MAX_KEY_LENGTH]; - uint keylen = sizeof mysqld_key; - - compile_time_assert(16 == sizeof info->crypt_key); - - if (uint rc = encryption_key_get(LOG_DEFAULT_ENCRYPTION_KEY, - info->key_version, mysqld_key, - &keylen)) { - ib::error() - << "Obtaining redo log encryption key version " - << info->key_version << " failed (" << rc - << "). Maybe the key or the required encryption " - "key management plugin was not found."; - return false; - } - - if (upgrade) { - while (keylen < sizeof mysqld_key) { - mysqld_key[keylen++] = 0; - } - } - - uint dst_len; - int err= my_aes_crypt(MY_AES_ECB, - ENCRYPTION_FLAG_NOPAD | ENCRYPTION_FLAG_ENCRYPT, - info->crypt_msg.bytes, sizeof info->crypt_msg, - info->crypt_key.bytes, &dst_len, - mysqld_key, keylen, NULL, 0); - - if (err != MY_AES_OK || dst_len != MY_AES_BLOCK_SIZE) { - ib::error() << "Getting redo log crypto key failed: err = " - << err << ", len = " << dst_len; - return false; + ut_a(dst_len == dst_size); + memcpy(buf + LOG_CRYPT_HDR_SIZE, dst, dst_size); } return true; @@ -247,7 +288,7 @@ log_crypt_101_read_checkpoint(const byte* buf) infos_used++; info.checkpoint_no = checkpoint_no; info.key_version = mach_read_from_4(buf + 4); - memcpy(info.crypt_msg.bytes, buf + 8, sizeof info.crypt_msg); + memcpy(info.crypt_msg.bytes, buf + 8, MY_AES_BLOCK_SIZE); memcpy(info.crypt_nonce.bytes, buf + 24, sizeof info.crypt_nonce); @@ -331,13 +372,14 @@ void log_crypt_write_checkpoint_buf(byte* buf) { ut_ad(info.key_version); - compile_time_assert(16 == sizeof info.crypt_msg); + compile_time_assert(16 == sizeof info.crypt_msg.bytes); + compile_time_assert(16 == MY_AES_BLOCK_SIZE); compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE - LOG_CHECKPOINT_CRYPT_NONCE == sizeof info.crypt_nonce); memcpy(buf 
+ LOG_CHECKPOINT_CRYPT_MESSAGE, info.crypt_msg.bytes, - sizeof info.crypt_msg); + MY_AES_BLOCK_SIZE); memcpy(buf + LOG_CHECKPOINT_CRYPT_NONCE, info.crypt_nonce.bytes, sizeof info.crypt_nonce); mach_write_to_4(buf + LOG_CHECKPOINT_CRYPT_KEY, info.key_version); @@ -356,13 +398,14 @@ log_crypt_read_checkpoint_buf(const byte* buf) #if MY_AES_BLOCK_SIZE != 16 # error "MY_AES_BLOCK_SIZE != 16; redo log checkpoint format affected" #endif - compile_time_assert(16 == sizeof info.crypt_msg); + compile_time_assert(16 == sizeof info.crypt_msg.bytes); + compile_time_assert(16 == MY_AES_BLOCK_SIZE); compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE - LOG_CHECKPOINT_CRYPT_NONCE == sizeof info.crypt_nonce); memcpy(info.crypt_msg.bytes, buf + LOG_CHECKPOINT_CRYPT_MESSAGE, - sizeof info.crypt_msg); + MY_AES_BLOCK_SIZE); memcpy(info.crypt_nonce.bytes, buf + LOG_CHECKPOINT_CRYPT_NONCE, sizeof info.crypt_nonce); @@ -392,8 +435,7 @@ log_tmp_block_encrypt( int rc = encryption_crypt( src, uint(size), dst, &dst_len, - const_cast<byte*>(info.crypt_key.bytes), - uint(sizeof info.crypt_key), + const_cast<byte*>(info.crypt_key.bytes), MY_AES_BLOCK_SIZE, reinterpret_cast<byte*>(iv), uint(sizeof iv), encrypt ? 
ENCRYPTION_FLAG_ENCRYPT|ENCRYPTION_FLAG_NOPAD diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index cf70a35f380..54ee05fba26 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -214,9 +214,9 @@ log_calculate_actual_len( { ut_ad(log_mutex_own()); + const ulint framing_size = log_sys.framing_size(); /* actual length stored per block */ - const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE - - (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE); + const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE - framing_size; /* actual data length in last block already written */ ulint extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE); @@ -225,8 +225,7 @@ log_calculate_actual_len( extra_len -= LOG_BLOCK_HDR_SIZE; /* total extra length for block header and trailer */ - extra_len = ((len + extra_len) / len_per_blk) - * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE); + extra_len = ((len + extra_len) / len_per_blk) * framing_size; return(len + extra_len); } @@ -344,26 +343,24 @@ log_write_low( ulint str_len) /*!< in: string length */ { ulint len; - ulint data_len; - byte* log_block; ut_ad(log_mutex_own()); + const ulint trailer_offset = log_sys.trailer_offset(); part_loop: /* Calculate a part length */ - data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len; + ulint data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len; - if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + if (data_len <= trailer_offset) { /* The string fits within the current log block */ len = str_len; } else { - data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; + data_len = trailer_offset; - len = OS_FILE_LOG_BLOCK_SIZE - - (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) - - LOG_BLOCK_TRL_SIZE; + len = trailer_offset + - log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; } memcpy(log_sys.buf + log_sys.buf_free, str, len); @@ -371,18 +368,18 @@ part_loop: str_len -= len; str = str + len; - log_block = static_cast<byte*>( + byte* 
log_block = static_cast<byte*>( ut_align_down(log_sys.buf + log_sys.buf_free, OS_FILE_LOG_BLOCK_SIZE)); log_block_set_data_len(log_block, data_len); - if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + if (data_len == trailer_offset) { /* This block became full */ log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE); log_block_set_checkpoint_no(log_block, log_sys.next_checkpoint_no); - len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE; + len += log_sys.framing_size(); log_sys.lsn += len; @@ -608,9 +605,7 @@ void log_t::files::create(ulint n_files) ut_ad(log_sys.is_initialised()); this->n_files= n_files; - format= srv_encrypt_log - ? LOG_HEADER_FORMAT_CURRENT | LOG_HEADER_FORMAT_ENCRYPTED - : LOG_HEADER_FORMAT_CURRENT; + format= srv_encrypt_log ? log_t::FORMAT_ENC_10_4 : log_t::FORMAT_10_4; subformat= 2; file_size= srv_log_file_size; lsn= LOG_START_LSN; @@ -632,8 +627,8 @@ log_file_header_flush( ut_ad(log_write_mutex_own()); ut_ad(!recv_no_log_write); ut_a(nth_file < log_sys.log.n_files); - ut_ad((log_sys.log.format & ~LOG_HEADER_FORMAT_ENCRYPTED) - == LOG_HEADER_FORMAT_CURRENT); + ut_ad(log_sys.log.format == log_t::FORMAT_10_4 + || log_sys.log.format == log_t::FORMAT_ENC_10_4); // man 2 open suggests this buffer to be aligned by 512 for O_DIRECT MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE) @@ -662,7 +657,7 @@ log_file_header_flush( fil_io(IORequestLogWrite, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), - univ_page_size, + 0, ulint(dest_offset & (srv_page_size - 1)), OS_FILE_LOG_BLOCK_SIZE, buf, NULL); @@ -782,7 +777,7 @@ loop: fil_io(IORequestLogWrite, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), - univ_page_size, + 0, ulint(next_offset & (srv_page_size - 1)), write_len, buf, NULL); srv_stats.os_log_pending_writes.dec(); @@ -858,11 +853,9 @@ wait and check if an already running write is covering the request. 
@param[in] lsn log sequence number that should be included in the redo log file write @param[in] flush_to_disk whether the written log should also -be flushed to the file system */ -void -log_write_up_to( - lsn_t lsn, - bool flush_to_disk) +be flushed to the file system +@param[in] rotate_key whether to rotate the encryption key */ +void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key) { #ifdef UNIV_DEBUG ulint loop_count = 0; @@ -871,6 +864,7 @@ log_write_up_to( lsn_t write_lsn; ut_ad(!srv_read_only_mode); + ut_ad(!rotate_key || flush_to_disk); if (recv_no_ibuf_operations) { /* Recovery is running and no operations on the log files are @@ -1015,7 +1009,8 @@ loop: if (log_sys.is_encrypted()) { log_crypt(write_buf + area_start, log_sys.write_lsn, - area_end - area_start); + area_end - area_start, + rotate_key ? LOG_ENCRYPT_ROTATE_KEY : LOG_ENCRYPT); } /* Do the write to the log files */ @@ -1270,7 +1265,7 @@ log_group_checkpoint(lsn_t end_lsn) fil_io(IORequestLogWrite, false, page_id_t(SRV_LOG_SPACE_FIRST_ID, 0), - univ_page_size, + 0, (log_sys.next_checkpoint_no & 1) ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1, OS_FILE_LOG_BLOCK_SIZE, @@ -1290,7 +1285,7 @@ void log_header_read(ulint header) fil_io(IORequestLogRead, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, header >> srv_page_size_shift), - univ_page_size, header & (srv_page_size - 1), + 0, header & (srv_page_size - 1), OS_FILE_LOG_BLOCK_SIZE, log_sys.checkpoint_buf, NULL); } @@ -1421,7 +1416,7 @@ bool log_checkpoint(bool sync) log_mutex_exit(); - log_write_up_to(flush_lsn, true); + log_write_up_to(flush_lsn, true, true); log_mutex_enter(); @@ -1607,11 +1602,11 @@ loop: } else { ut_ad(!srv_dict_stats_thread_active); } - if (recv_sys && recv_sys->flush_start) { + if (recv_sys.flush_start) { /* This is in case recv_writer_thread was never started, or buf_flush_page_cleaner_coordinator failed to notice its termination. 
*/ - os_event_set(recv_sys->flush_start); + os_event_set(recv_sys.flush_start); } } #define COUNT_INTERVAL 600U @@ -1949,7 +1944,7 @@ void log_t::close() if (!srv_read_only_mode && srv_scrub_log) os_event_destroy(log_scrub_event); - recv_sys_close(); + recv_sys.close(); } /******************************************************//** @@ -1969,13 +1964,9 @@ log_pad_current_log_block(void) /* We retrieve lsn only because otherwise gcc crashed on HP-UX */ lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE); - pad_length = OS_FILE_LOG_BLOCK_SIZE - - (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) - - LOG_BLOCK_TRL_SIZE; - if (pad_length - == (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - - LOG_BLOCK_TRL_SIZE)) { - + pad_length = log_sys.trailer_offset() + - log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; + if (pad_length == log_sys.payload_size()) { pad_length = 0; } diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 93adfdd7587..705e39c40dd 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -51,7 +51,6 @@ Created 9/20/1997 Heikki Tuuri #include "trx0undo.h" #include "trx0rec.h" #include "fil0fil.h" -#include "row0trunc.h" #include "buf0rea.h" #include "srv0srv.h" #include "srv0start.h" @@ -66,7 +65,7 @@ this must be less than srv_page_size as it is stored in the buffer pool */ #define RECV_READ_AHEAD_AREA 32 /** The recovery system */ -recv_sys_t* recv_sys; +recv_sys_t recv_sys; /** TRUE when applying redo log records during crash recovery; FALSE otherwise. Note that this is FALSE while a background thread is rolling back incomplete transactions. */ @@ -151,8 +150,9 @@ struct file_name_t { lsn_t enable_lsn; /** Constructor */ - file_name_t(std::string name_, bool deleted) : - name(name_), space(NULL), status(deleted ? DELETED: NORMAL), + file_name_t(std::string name_, bool deleted) + : name(std::move(name_)), space(NULL), + status(deleted ? 
DELETED: NORMAL), size(0), enable_lsn(0) {} /** Report a MLOG_INDEX_LOAD operation, meaning that @@ -210,10 +210,6 @@ corresponding to MLOG_INDEX_LOAD. */ void (*log_optimized_ddl_op)(ulint space_id); -/** Report backup-unfriendly TRUNCATE operation (with separate log file), -corresponding to MLOG_TRUNCATE. */ -void (*log_truncate)(); - /** Report an operation to create, delete, or rename a file during backup. @param[in] space_id tablespace identifier @param[in] flags tablespace flags (NULL if not create) @@ -249,7 +245,7 @@ private: ut_allocator<std::pair<const page_id_t, init> > > map; /** Map of page initialization operations. - FIXME: Merge this to recv_sys->addr_hash! */ + FIXME: Merge this to recv_sys.addr_hash! */ map inits; public: /** Record that a page will be initialized by the redo log. @@ -258,7 +254,7 @@ public: @param[in] lsn log sequence number */ void add(ulint space, ulint page_no, lsn_t lsn) { - ut_ad(mutex_own(&recv_sys->mutex)); + ut_ad(mutex_own(&recv_sys.mutex)); const init init = { lsn, false }; std::pair<map::iterator, bool> p = inits.insert( map::value_type(page_id_t(space, page_no), init)); @@ -273,20 +269,20 @@ public: @param[in] page_id page id @param[in,out] init initialize log or load log @return the latest page initialization; - not valid after releasing recv_sys->mutex. */ + not valid after releasing recv_sys.mutex. */ init& last(page_id_t page_id) { - ut_ad(mutex_own(&recv_sys->mutex)); + ut_ad(mutex_own(&recv_sys.mutex)); return inits.find(page_id)->second; } /** At the end of each recovery batch, reset the 'created' flags. 
*/ void reset() { - ut_ad(mutex_own(&recv_sys->mutex)); + ut_ad(mutex_own(&recv_sys.mutex)); ut_ad(recv_no_ibuf_operations); - for (map::iterator i= inits.begin(); i != inits.end(); i++) { - i->second.created = false; + for (map::value_type& i : inits) { + i.second.created = false; } } @@ -304,26 +300,25 @@ public: @param[in,out] mtr dummy mini-transaction */ void ibuf_merge(mtr_t& mtr) { - ut_ad(mutex_own(&recv_sys->mutex)); + ut_ad(mutex_own(&recv_sys.mutex)); ut_ad(!recv_no_ibuf_operations); mtr.start(); - for (map::const_iterator i= inits.begin(); i != inits.end(); - i++) { - if (!i->second.created) { + for (const map::value_type& i : inits) { + if (!i.second.created) { continue; } if (buf_block_t* block = buf_page_get_low( - i->first, univ_page_size, RW_X_LATCH, NULL, + i.first, 0, RW_X_LATCH, NULL, BUF_GET_IF_IN_POOL, __FILE__, __LINE__, &mtr, NULL)) { - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); ibuf_merge_or_delete_for_page( - block, i->first, - &block->page.size, true); + block, i.first, + block->zip_size(), true); mtr.commit(); mtr.start(); - mutex_enter(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); } } @@ -347,10 +342,10 @@ static void recv_addr_trim(ulint space_id, unsigned pages, lsn_t lsn) DBUG_LOG("ib_log", "discarding log beyond end of tablespace " << page_id_t(space_id, pages) << " before LSN " << lsn); - ut_ad(mutex_own(&recv_sys->mutex)); - for (ulint i = recv_sys->addr_hash->n_cells; i--; ) { + ut_ad(mutex_own(&recv_sys.mutex)); + for (ulint i = recv_sys.addr_hash->n_cells; i--; ) { hash_cell_t* const cell = hash_get_nth_cell( - recv_sys->addr_hash, i); + recv_sys.addr_hash, i); for (recv_addr_t* addr = static_cast<recv_addr_t*>(cell->node), *next; addr; addr = next) { @@ -458,7 +453,7 @@ fil_name_process( << " has been found in two places: '" << f.name << "' and '" << name << "'." 
" You must delete one of them."; - recv_sys->found_corrupt_fs = true; + recv_sys.found_corrupt_fs = true; } break; @@ -481,7 +476,7 @@ fil_name_process( forcing recovery. */ ib::info() - << "At LSN: " << recv_sys->recovered_lsn + << "At LSN: " << recv_sys.recovered_lsn << ": unable to open file " << name << " for tablespace " << space_id; } @@ -510,7 +505,7 @@ fil_name_process( " disk is broken, and you cannot" " remove the .ibd file, you can set" " --innodb_force_recovery."; - recv_sys->found_corrupt_fs = true; + recv_sys.found_corrupt_fs = true; break; } @@ -526,8 +521,7 @@ fil_name_process( /** Parse or process a MLOG_FILE_* record. @param[in] ptr redo log record @param[in] end end of the redo log buffer -@param[in] space_id the tablespace ID -@param[in] first_page_no first page number in the file +@param[in] page_id first page number in the file @param[in] type MLOG_FILE_NAME or MLOG_FILE_DELETE or MLOG_FILE_CREATE2 or MLOG_FILE_RENAME2 @param[in] apply whether to apply the record @@ -538,8 +532,7 @@ byte* fil_name_parse( byte* ptr, const byte* end, - ulint space_id, - ulint first_page_no, + const page_id_t page_id, mlog_id_t type, bool apply) { @@ -563,9 +556,9 @@ fil_name_parse( /* MLOG_FILE_* records should only be written for user-created tablespaces. The name must be long enough and end in .ibd. 
*/ - bool corrupt = is_predefined_tablespace(space_id) + bool corrupt = is_predefined_tablespace(page_id.space()) || len < sizeof "/a.ibd\0" - || (!first_page_no != !memcmp(ptr + len - 5, DOT_IBD, 5)); + || (!page_id.page_no() != !memcmp(ptr + len - 5, DOT_IBD, 5)); if (!corrupt && !memchr(ptr, OS_PATH_SEPARATOR, len)) { if (byte* c = static_cast<byte*> @@ -591,37 +584,38 @@ fil_name_parse( case MLOG_FILE_NAME: if (UNIV_UNLIKELY(corrupt)) { ib::error() << "MLOG_FILE_NAME incorrect:" << ptr; - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; break; } fil_name_process( - reinterpret_cast<char*>(ptr), len, space_id, false); + reinterpret_cast<char*>(ptr), len, page_id.space(), + false); break; case MLOG_FILE_DELETE: if (UNIV_UNLIKELY(corrupt)) { ib::error() << "MLOG_FILE_DELETE incorrect:" << ptr; - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; break; } - fil_name_process( - reinterpret_cast<char*>(ptr), len, space_id, true); + fil_name_process(reinterpret_cast<char*>(ptr), len, + page_id.space(), true); /* fall through */ case MLOG_FILE_CREATE2: - if (first_page_no) { - ut_ad(first_page_no + if (page_id.page_no()) { + ut_ad(page_id.page_no() == SRV_UNDO_TABLESPACE_SIZE_IN_PAGES); - ut_a(srv_is_undo_tablespace(space_id)); + ut_a(srv_is_undo_tablespace(page_id.space())); compile_time_assert( - UT_ARR_SIZE(recv_sys->truncated_undo_spaces) + UT_ARR_SIZE(recv_sys.truncated_undo_spaces) == TRX_SYS_MAX_UNDO_SPACES); - recv_sys_t::trunc& t = recv_sys->truncated_undo_spaces[ - space_id - srv_undo_space_id_start]; - t.lsn = recv_sys->recovered_lsn; - t.pages = uint32_t(first_page_no); + recv_sys_t::trunc& t = recv_sys.truncated_undo_spaces[ + page_id.space() - srv_undo_space_id_start]; + t.lsn = recv_sys.recovered_lsn; + t.pages = uint32_t(page_id.page_no()); } else if (log_file_op) { - log_file_op(space_id, + log_file_op(page_id.space(), type == MLOG_FILE_CREATE2 ? 
ptr - 4 : NULL, ptr, len, NULL, 0); } @@ -629,7 +623,7 @@ fil_name_parse( case MLOG_FILE_RENAME2: if (UNIV_UNLIKELY(corrupt)) { ib::error() << "MLOG_FILE_RENAME2 incorrect:" << ptr; - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; } /* The new name follows the old name. */ @@ -671,19 +665,19 @@ fil_name_parse( if (UNIV_UNLIKELY(corrupt)) { ib::error() << "MLOG_FILE_RENAME2 new_name incorrect:" << ptr << " new_name: " << new_name; - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; break; } fil_name_process( reinterpret_cast<char*>(ptr), len, - space_id, false); + page_id.space(), false); fil_name_process( reinterpret_cast<char*>(new_name), new_len, - space_id, false); + page_id.space(), false); if (log_file_op) { - log_file_op(space_id, NULL, + log_file_op(page_id.space(), NULL, ptr, len, new_name, new_len); } @@ -691,50 +685,51 @@ fil_name_parse( break; } if (!fil_op_replay_rename( - space_id, first_page_no, + page_id.space(), page_id.page_no(), reinterpret_cast<const char*>(ptr), reinterpret_cast<const char*>(new_name))) { - recv_sys->found_corrupt_fs = true; + recv_sys.found_corrupt_fs = true; } } return(end_ptr); } -/** Clean up after recv_sys_init() */ -void -recv_sys_close() +/** Clean up after recv_sys_t::create() */ +void recv_sys_t::close() { - if (recv_sys != NULL) { - recv_sys->dblwr.pages.clear(); + ut_ad(this == &recv_sys); + ut_ad(!recv_writer_thread_active); - if (recv_sys->addr_hash != NULL) { - hash_table_free(recv_sys->addr_hash); - } + if (is_initialised()) { + dblwr.pages.clear(); - if (recv_sys->heap != NULL) { - mem_heap_free(recv_sys->heap); + if (addr_hash) { + hash_table_free(addr_hash); + addr_hash = NULL; } - if (recv_sys->flush_start != NULL) { - os_event_destroy(recv_sys->flush_start); + if (heap) { + mem_heap_free(heap); + heap = NULL; } - if (recv_sys->flush_end != NULL) { - os_event_destroy(recv_sys->flush_end); + if (flush_start) { + os_event_destroy(flush_start); } - if 
(recv_sys->buf != NULL) { - ut_free_dodump(recv_sys->buf, recv_sys->buf_size); + if (flush_end) { + os_event_destroy(flush_end); } - ut_ad(!recv_writer_thread_active); - mutex_free(&recv_sys->writer_mutex); - - mutex_free(&recv_sys->mutex); + if (buf) { + ut_free_dodump(buf, buf_size); + buf = NULL; + } - ut_free(recv_sys); - recv_sys = NULL; + buf_size = 0; + mutex_free(&writer_mutex); + mutex_free(&mutex); } recv_spaces.clear(); @@ -788,20 +783,20 @@ DECLARE_THREAD(recv_writer_thread)( int64_t sig_count = os_event_reset(buf_flush_event); os_event_wait_time_low(buf_flush_event, 100000, sig_count); - mutex_enter(&recv_sys->writer_mutex); + mutex_enter(&recv_sys.writer_mutex); if (!recv_recovery_is_on()) { - mutex_exit(&recv_sys->writer_mutex); + mutex_exit(&recv_sys.writer_mutex); break; } /* Flush pages from end of LRU if required */ - os_event_reset(recv_sys->flush_end); - recv_sys->flush_type = BUF_FLUSH_LRU; - os_event_set(recv_sys->flush_start); - os_event_wait(recv_sys->flush_end); + os_event_reset(recv_sys.flush_end); + recv_sys.flush_type = BUF_FLUSH_LRU; + os_event_set(recv_sys.flush_start); + os_event_wait(recv_sys.flush_end); - mutex_exit(&recv_sys->writer_mutex); + mutex_exit(&recv_sys.writer_mutex); } recv_writer_thread_active = false; @@ -816,75 +811,83 @@ DECLARE_THREAD(recv_writer_thread)( } /** Initialize the redo log recovery subsystem. 
*/ -void -recv_sys_init() +void recv_sys_t::create() { - ut_ad(recv_sys == NULL); - - recv_sys = static_cast<recv_sys_t*>(ut_zalloc_nokey(sizeof(*recv_sys))); + ut_ad(this == &recv_sys); + ut_ad(!is_initialised()); + ut_ad(!flush_start); + ut_ad(!flush_end); + mutex_create(LATCH_ID_RECV_SYS, &mutex); + mutex_create(LATCH_ID_RECV_WRITER, &writer_mutex); - mutex_create(LATCH_ID_RECV_SYS, &recv_sys->mutex); - mutex_create(LATCH_ID_RECV_WRITER, &recv_sys->writer_mutex); - - recv_sys->heap = mem_heap_create_typed(256, MEM_HEAP_FOR_RECV_SYS); + heap = mem_heap_create_typed(256, MEM_HEAP_FOR_RECV_SYS); if (!srv_read_only_mode) { - recv_sys->flush_start = os_event_create(0); - recv_sys->flush_end = os_event_create(0); + flush_start = os_event_create(0); + flush_end = os_event_create(0); } - recv_sys->buf = static_cast<byte*>( - ut_malloc_dontdump(RECV_PARSING_BUF_SIZE)); - recv_sys->buf_size = RECV_PARSING_BUF_SIZE; - - recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512); - recv_sys->progress_time = time(NULL); + flush_type = BUF_FLUSH_LRU; + apply_log_recs = false; + apply_batch_on = false; + + buf = static_cast<byte*>(ut_malloc_dontdump(RECV_PARSING_BUF_SIZE)); + buf_size = RECV_PARSING_BUF_SIZE; + len = 0; + parse_start_lsn = 0; + scanned_lsn = 0; + scanned_checkpoint_no = 0; + recovered_offset = 0; + recovered_lsn = 0; + found_corrupt_log = false; + found_corrupt_fs = false; + mlog_checkpoint_lsn = 0; + + addr_hash = hash_create(buf_pool_get_curr_size() / 512); + n_addrs = 0; + progress_time = time(NULL); recv_max_page_lsn = 0; - /* Call the constructor for recv_sys_t::dblwr member */ - new (&recv_sys->dblwr) recv_dblwr_t(); + memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces); + last_stored_lsn = 0; } -/** Empty a fully processed hash table. */ -static -void -recv_sys_empty_hash() +/** Empty a fully processed set of stored redo log records. 
*/ +inline void recv_sys_t::empty() { - ut_ad(mutex_own(&(recv_sys->mutex))); - ut_a(recv_sys->n_addrs == 0); + ut_ad(mutex_own(&mutex)); + ut_a(n_addrs == 0); - hash_table_free(recv_sys->addr_hash); - mem_heap_empty(recv_sys->heap); + hash_table_free(addr_hash); + mem_heap_empty(heap); - recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512); + addr_hash = hash_create(buf_pool_get_curr_size() / 512); } -/********************************************************//** -Frees the recovery system. */ -void -recv_sys_debug_free(void) -/*=====================*/ +/** Free most recovery data structures. */ +void recv_sys_t::debug_free() { - mutex_enter(&(recv_sys->mutex)); + ut_ad(this == &recv_sys); + ut_ad(is_initialised()); + mutex_enter(&mutex); - hash_table_free(recv_sys->addr_hash); - mem_heap_free(recv_sys->heap); - ut_free_dodump(recv_sys->buf, recv_sys->buf_size); + hash_table_free(addr_hash); + mem_heap_free(heap); + ut_free_dodump(buf, buf_size); - recv_sys->buf_size = 0; - recv_sys->buf = NULL; - recv_sys->heap = NULL; - recv_sys->addr_hash = NULL; + buf = NULL; + heap = NULL; + addr_hash = NULL; /* wake page cleaner up to progress */ if (!srv_read_only_mode) { ut_ad(!recv_recovery_is_on()); ut_ad(!recv_writer_thread_active); os_event_reset(buf_flush_event); - os_event_set(recv_sys->flush_start); + os_event_set(flush_start); } - mutex_exit(&(recv_sys->mutex)); + mutex_exit(&mutex); } /** Read a log segment to log_sys.buf. 
@@ -925,7 +928,7 @@ loop: fil_io(IORequestLogRead, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), - univ_page_size, + 0, ulint(source_offset & (srv_page_size - 1)), len, buf, NULL); @@ -967,22 +970,24 @@ fail: goto fail; } - if (is_encrypted()) { - log_crypt(buf, *start_lsn, - OS_FILE_LOG_BLOCK_SIZE, true); + if (is_encrypted() + && !log_crypt(buf, *start_lsn, + OS_FILE_LOG_BLOCK_SIZE, + LOG_DECRYPT)) { + goto fail; } } ulint dl = log_block_get_data_len(buf); if (dl < LOG_BLOCK_HDR_SIZE - || (dl > OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE - && dl != OS_FILE_LOG_BLOCK_SIZE)) { - recv_sys->found_corrupt_log = true; + || (dl != OS_FILE_LOG_BLOCK_SIZE + && dl > log_sys.trailer_offset())) { + recv_sys.found_corrupt_log = true; goto fail; } } - if (recv_sys->report(time(NULL))) { + if (recv_sys.report(time(NULL))) { ib::info() << "Read redo log up to LSN=" << *start_lsn; service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, "Read redo log up to LSN=" LSN_PF, @@ -1007,7 +1012,7 @@ static void recv_synchronize_groups() { - const lsn_t recovered_lsn = recv_sys->recovered_lsn; + const lsn_t recovered_lsn = recv_sys.recovered_lsn; /* Read the last recovered log block to the recovery system buffer: the block is always incomplete */ @@ -1139,7 +1144,7 @@ static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt) fil_io(IORequestLogRead, true, page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), - univ_page_size, + 0, ulint((source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1)) & (srv_page_size - 1)), OS_FILE_LOG_BLOCK_SIZE, buf, NULL); @@ -1166,9 +1171,9 @@ static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt) /* Mark the redo log for upgrading. 
*/ srv_log_file_size = 0; - recv_sys->parse_start_lsn = recv_sys->recovered_lsn - = recv_sys->scanned_lsn - = recv_sys->mlog_checkpoint_lsn = lsn; + recv_sys.parse_start_lsn = recv_sys.recovered_lsn + = recv_sys.scanned_lsn + = recv_sys.mlog_checkpoint_lsn = lsn; log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn = log_sys.lsn = log_sys.write_lsn = log_sys.current_flush_lsn = log_sys.flushed_to_disk_lsn @@ -1177,52 +1182,6 @@ static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt) return(DB_SUCCESS); } -/** Determine if a redo log from MariaDB 10.4 is clean. -@return error code -@retval DB_SUCCESS if the redo log is clean -@retval DB_CORRUPTION if the redo log is corrupted -@retval DB_ERROR if the redo log is not empty */ -static dberr_t recv_log_recover_10_4() -{ - ut_ad(!log_sys.is_encrypted()); - const lsn_t lsn = log_sys.log.get_lsn(); - const lsn_t source_offset = log_sys.log.calc_lsn_offset(lsn); - const ulint page_no - = (ulint) (source_offset / univ_page_size.physical()); - byte* buf = log_sys.buf; - - fil_io(IORequestLogRead, true, - page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), - univ_page_size, - (ulint) ((source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1)) - % univ_page_size.physical()), - OS_FILE_LOG_BLOCK_SIZE, buf, NULL); - - if (log_block_calc_checksum(buf) != log_block_get_checksum(buf)) { - return DB_CORRUPTION; - } - - /* On a clean shutdown, the redo log will be logically empty - after the checkpoint lsn. */ - - if (log_block_get_data_len(buf) - != (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) { - return DB_ERROR; - } - - /* Mark the redo log for downgrading. 
*/ - srv_log_file_size = 0; - recv_sys->parse_start_lsn = recv_sys->recovered_lsn - = recv_sys->scanned_lsn - = recv_sys->mlog_checkpoint_lsn = lsn; - log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn - = log_sys.lsn = log_sys.write_lsn - = log_sys.current_flush_lsn = log_sys.flushed_to_disk_lsn - = lsn; - log_sys.next_checkpoint_no = 0; - return DB_SUCCESS; -} - /** Find the latest checkpoint in the log header. @param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 @return error code or DB_SUCCESS */ @@ -1243,10 +1202,10 @@ recv_find_max_checkpoint(ulint* max_field) /* Check the header page checksum. There was no checksum in the first redo log format (version 0). */ log_sys.log.format = mach_read_from_4(buf + LOG_HEADER_FORMAT); - log_sys.log.subformat = log_sys.log.format != LOG_HEADER_FORMAT_3_23 + log_sys.log.subformat = log_sys.log.format != log_t::FORMAT_3_23 ? mach_read_from_4(buf + LOG_HEADER_SUBFORMAT) : 0; - if (log_sys.log.format != LOG_HEADER_FORMAT_3_23 + if (log_sys.log.format != log_t::FORMAT_3_23 && !recv_check_log_header_checksum(buf)) { ib::error() << "Invalid redo log header checksum."; return(DB_CORRUPTION); @@ -1259,15 +1218,14 @@ recv_find_max_checkpoint(ulint* max_field) creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR] = 0; switch (log_sys.log.format) { - case LOG_HEADER_FORMAT_3_23: + case log_t::FORMAT_3_23: return(recv_find_max_checkpoint_0(max_field)); - case LOG_HEADER_FORMAT_10_2: - case LOG_HEADER_FORMAT_10_2 | LOG_HEADER_FORMAT_ENCRYPTED: - case LOG_HEADER_FORMAT_CURRENT: - case LOG_HEADER_FORMAT_CURRENT | LOG_HEADER_FORMAT_ENCRYPTED: - case LOG_HEADER_FORMAT_10_4: - /* We can only parse the unencrypted LOG_HEADER_FORMAT_10_4. - The encrypted format uses a larger redo log block trailer. 
*/ + case log_t::FORMAT_10_2: + case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_3: + case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_4: + case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED: break; default: ib::error() << "Unsupported redo log format." @@ -1332,19 +1290,7 @@ recv_find_max_checkpoint(ulint* max_field) return(DB_ERROR); } - if (log_sys.log.format == LOG_HEADER_FORMAT_10_4) { - dberr_t err = recv_log_recover_10_4(); - if (err != DB_SUCCESS) { - ib::error() - << "Downgrade after a crash is not supported." - " The redo log was created with " << creator - << (err == DB_ERROR - ? "." : ", and it appears corrupted."); - } - return err; - } - - return DB_SUCCESS; + return(DB_SUCCESS); } /** Try to parse a single log record body and also applies it if @@ -1352,8 +1298,7 @@ specified. @param[in] type redo log entry type @param[in] ptr redo log record body @param[in] end_ptr end of buffer -@param[in] space_id tablespace identifier -@param[in] page_no page number +@param[in] page_id page identifier @param[in] apply whether to apply the record @param[in,out] block buffer block, or NULL if a page log record should not be applied @@ -1367,14 +1312,13 @@ recv_parse_or_apply_log_rec_body( mlog_id_t type, byte* ptr, byte* end_ptr, - ulint space_id, - ulint page_no, + const page_id_t page_id, bool apply, buf_block_t* block, mtr_t* mtr) { ut_ad(!block == !mtr); - ut_ad(!apply || recv_sys->mlog_checkpoint_lsn != 0); + ut_ad(!apply || recv_sys.mlog_checkpoint_lsn); switch (type) { case MLOG_FILE_NAME: @@ -1384,22 +1328,17 @@ recv_parse_or_apply_log_rec_body( ut_ad(block == NULL); /* Collect the file names when parsing the log, before applying any log records. 
*/ - return(fil_name_parse(ptr, end_ptr, space_id, page_no, type, - apply)); + return fil_name_parse(ptr, end_ptr, page_id, type, apply); case MLOG_INDEX_LOAD: if (end_ptr < ptr + 8) { return(NULL); } return(ptr + 8); case MLOG_TRUNCATE: - if (log_truncate) { - ut_ad(srv_operation != SRV_OPERATION_NORMAL); - log_truncate(); - recv_sys->found_corrupt_fs = true; - return NULL; - } - return(truncate_t::parse_redo_entry(ptr, end_ptr, space_id)); - + ib::error() << "Cannot crash-upgrade from " + "old-style TRUNCATE TABLE"; + recv_sys.found_corrupt_log = true; + return NULL; default: break; } @@ -1418,23 +1357,22 @@ recv_parse_or_apply_log_rec_body( page_zip = buf_block_get_page_zip(block); ut_d(page_type = fil_page_get_type(page)); } else if (apply - && !is_predefined_tablespace(space_id) - && recv_spaces.find(space_id) == recv_spaces.end()) { - if (recv_sys->recovered_lsn < recv_sys->mlog_checkpoint_lsn) { + && !is_predefined_tablespace(page_id.space()) + && recv_spaces.find(page_id.space()) == recv_spaces.end()) { + if (recv_sys.recovered_lsn < recv_sys.mlog_checkpoint_lsn) { /* We have not seen all records between the checkpoint and MLOG_CHECKPOINT. There should be a MLOG_FILE_DELETE for this tablespace later. 
*/ recv_spaces.insert( - std::make_pair(space_id, + std::make_pair(page_id.space(), file_name_t("", false))); goto parse_log; } ib::error() << "Missing MLOG_FILE_NAME or MLOG_FILE_DELETE" - " for redo log record " << type << " (page " - << space_id << ":" << page_no << ") at " - << recv_sys->recovered_lsn << "."; - recv_sys->found_corrupt_log = true; + " for redo log record " << type << page_id << " at " + << recv_sys.recovered_lsn << "."; + recv_sys.found_corrupt_log = true; return(NULL); } else { parse_log: @@ -1453,6 +1391,7 @@ parse_log: break; #endif /* UNIV_LOG_LSN_DEBUG */ case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES: + case MLOG_MEMSET: #ifdef UNIV_DEBUG if (page && page_type == FIL_PAGE_TYPE_ALLOCATED && end_ptr >= ptr + 2) { @@ -1475,7 +1414,8 @@ parse_log: redo log been written with something older than InnoDB Plugin 1.0.4. */ ut_ad(offs == FIL_PAGE_TYPE - || srv_is_undo_tablespace(space_id) + || srv_is_undo_tablespace( + page_id.space()) || offs == IBUF_TREE_SEG_HEADER + IBUF_HEADER + FSEG_HDR_OFFSET || offs == PAGE_BTR_IBUF_FREE_LIST @@ -1501,7 +1441,8 @@ parse_log: ut_ad(0 /* fil_crypt_rotate_page() writes this */ || offs == FIL_PAGE_SPACE_ID - || srv_is_undo_tablespace(space_id) + || srv_is_undo_tablespace( + page_id.space()) || offs == IBUF_TREE_SEG_HEADER + IBUF_HEADER + FSEG_HDR_SPACE || offs == IBUF_TREE_SEG_HEADER @@ -1533,7 +1474,7 @@ parse_log: #endif /* UNIV_DEBUG */ ptr = mlog_parse_nbytes(type, ptr, end_ptr, page, page_zip); if (ptr != NULL && page != NULL - && page_no == 0 && type == MLOG_4BYTES) { + && page_id.page_no() == 0 && type == MLOG_4BYTES) { ulint offs = mach_read_from_2(old_ptr); switch (offs) { fil_space_t* space; @@ -1544,7 +1485,7 @@ parse_log: case FSP_HEADER_OFFSET + FSP_SIZE: case FSP_HEADER_OFFSET + FSP_FREE_LIMIT: case FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN: - space = fil_space_get(space_id); + space = fil_space_get(page_id.space()); ut_a(space != NULL); val = mach_read_from_4(page + offs); @@ 
-1720,12 +1661,17 @@ parse_log: break; case MLOG_IBUF_BITMAP_INIT: /* Allow anything in page_type when creating a page. */ - ptr = ibuf_parse_bitmap_init(ptr, end_ptr, block, mtr); + if (block) ibuf_bitmap_init_apply(block); break; case MLOG_INIT_FILE_PAGE2: /* Allow anything in page_type when creating a page. */ if (block) fsp_apply_init_file_page(block); break; + case MLOG_INIT_FREE_PAGE: + /* The page can be zero-filled and its previous + contents can be ignored. We do not write or apply + this record yet. */ + break; case MLOG_WRITE_STRING: ptr = mlog_parse_string(ptr, end_ptr, page, page_zip); break; @@ -1769,7 +1715,7 @@ parse_log: ptr = const_cast<byte*>(fil_parse_write_crypt_data(ptr, end_ptr, &err)); if (err != DB_SUCCESS) { - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; } break; default: @@ -1777,7 +1723,7 @@ parse_log: ib::error() << "Incorrect log record type " << ib::hex(unsigned(type)); - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; } if (index) { @@ -1815,7 +1761,7 @@ recv_hash( ulint space, /*!< in: space */ ulint page_no)/*!< in: page number */ { - return(hash_calc_hash(recv_fold(space, page_no), recv_sys->addr_hash)); + return(hash_calc_hash(recv_fold(space, page_no), recv_sys.addr_hash)); } /*********************************************************************//** @@ -1828,12 +1774,12 @@ recv_get_fil_addr_struct( ulint space, /*!< in: space id */ ulint page_no)/*!< in: page number */ { - ut_ad(mutex_own(&recv_sys->mutex)); + ut_ad(mutex_own(&recv_sys.mutex)); recv_addr_t* recv_addr; for (recv_addr = static_cast<recv_addr_t*>( - HASH_GET_FIRST(recv_sys->addr_hash, + HASH_GET_FIRST(recv_sys.addr_hash, recv_hash(space, page_no))); recv_addr != 0; recv_addr = static_cast<recv_addr_t*>( @@ -1849,26 +1795,18 @@ recv_get_fil_addr_struct( return(NULL); } -/*******************************************************************//** -Adds a new log record to the hash table of log records. 
*/ -static -void -recv_add_to_hash_table( -/*===================*/ - mlog_id_t type, /*!< in: log record type */ - ulint space, /*!< in: space id */ - ulint page_no, /*!< in: page number */ - byte* body, /*!< in: log record body */ - byte* rec_end, /*!< in: log record end */ - lsn_t start_lsn, /*!< in: start lsn of the mtr */ - lsn_t end_lsn) /*!< in: end lsn of the mtr */ +/** Store a redo log record for applying. +@param type record type +@param space tablespace identifier +@param page_no page number +@param body record body +@param rec_end end of record +@param lsn start LSN of the mini-transaction +@param end_lsn end LSN of the mini-transaction */ +inline void recv_sys_t::add(mlog_id_t type, ulint space, ulint page_no, + byte* body, byte* rec_end, lsn_t lsn, + lsn_t end_lsn) { - recv_t* recv; - ulint len; - recv_data_t* recv_data; - recv_data_t** prev_field; - recv_addr_t* recv_addr; - ut_ad(type != MLOG_FILE_DELETE); ut_ad(type != MLOG_FILE_CREATE2); ut_ad(type != MLOG_FILE_RENAME2); @@ -1878,21 +1816,18 @@ recv_add_to_hash_table( ut_ad(type != MLOG_INDEX_LOAD); ut_ad(type != MLOG_TRUNCATE); - len = ulint(rec_end - body); - - recv = static_cast<recv_t*>( - mem_heap_alloc(recv_sys->heap, sizeof(recv_t))); + recv_t* recv= static_cast<recv_t*>(mem_heap_alloc(heap, sizeof *recv)); recv->type = type; recv->len = ulint(rec_end - body); - recv->start_lsn = start_lsn; + recv->start_lsn = lsn; recv->end_lsn = end_lsn; - recv_addr = recv_get_fil_addr_struct(space, page_no); + recv_addr_t* recv_addr = recv_get_fil_addr_struct(space, page_no); if (recv_addr == NULL) { recv_addr = static_cast<recv_addr_t*>( - mem_heap_alloc(recv_sys->heap, sizeof(recv_addr_t))); + mem_heap_alloc(heap, sizeof(recv_addr_t))); recv_addr->space = space; recv_addr->page_no = page_no; @@ -1900,50 +1835,49 @@ recv_add_to_hash_table( UT_LIST_INIT(recv_addr->rec_list, &recv_t::rec_list); - HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash, + HASH_INSERT(recv_addr_t, addr_hash, addr_hash, 
recv_fold(space, page_no), recv_addr); - recv_sys->n_addrs++; + n_addrs++; } switch (type) { case MLOG_INIT_FILE_PAGE2: case MLOG_ZIP_PAGE_COMPRESS: + case MLOG_INIT_FREE_PAGE: /* Ignore any earlier redo log records for this page. */ ut_ad(recv_addr->state == RECV_NOT_PROCESSED || recv_addr->state == RECV_WILL_NOT_READ); recv_addr->state = RECV_WILL_NOT_READ; - mlog_init.add(space, page_no, start_lsn); + mlog_init.add(space, page_no, lsn); default: break; } UT_LIST_ADD_LAST(recv_addr->rec_list, recv); - prev_field = &(recv->data); + recv_data_t** prev_field = &recv->data; /* Store the log record body in chunks of less than srv_page_size: - recv_sys->heap grows into the buffer pool, and bigger chunks could not + heap grows into the buffer pool, and bigger chunks could not be allocated */ while (rec_end > body) { + ulint rec_len = ulint(rec_end - body); - len = ulint(rec_end - body); - - if (len > RECV_DATA_BLOCK_SIZE) { - len = RECV_DATA_BLOCK_SIZE; + if (rec_len > RECV_DATA_BLOCK_SIZE) { + rec_len = RECV_DATA_BLOCK_SIZE; } - recv_data = static_cast<recv_data_t*>( - mem_heap_alloc(recv_sys->heap, - sizeof(recv_data_t) + len)); + recv_data_t* recv_data = static_cast<recv_data_t*>( + mem_heap_alloc(heap, sizeof(recv_data_t) + rec_len)); *prev_field = recv_data; - memcpy(recv_data + 1, body, len); + memcpy(recv_data + 1, body, rec_len); - prev_field = &(recv_data->next); + prev_field = &recv_data->next; - body += len; + body += rec_len; } *prev_field = NULL; @@ -1986,18 +1920,21 @@ lsn of a log record. 
@param[in,out] block buffer pool page @param[in,out] mtr mini-transaction @param[in,out] recv_addr recovery address -@param[in] init_lsn the initial LSN where to start recovery */ +@param[in,out] init page initialization operation, or NULL */ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, - recv_addr_t* recv_addr, lsn_t init_lsn = 0) + recv_addr_t* recv_addr, + mlog_init_t::init* init = NULL) { page_t* page; page_zip_des_t* page_zip; - ut_ad(mutex_own(&recv_sys->mutex)); - ut_ad(recv_sys->apply_log_recs); + ut_ad(mutex_own(&recv_sys.mutex)); + ut_ad(recv_sys.apply_log_recs); ut_ad(recv_needed_recovery); ut_ad(recv_addr->state != RECV_BEING_PROCESSED); ut_ad(recv_addr->state != RECV_PROCESSED); + ut_ad(!init || init->created); + ut_ad(!init || init->lsn); if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) { fprintf(stderr, "Applying log to page %u:%u\n", @@ -2007,7 +1944,7 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, DBUG_LOG("ib_log", "Applying log to page " << block->page.id); recv_addr->state = RECV_BEING_PROCESSED; - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); page = block->frame; page_zip = buf_block_get_page_zip(block); @@ -2019,19 +1956,9 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, page_lsn = mach_read_from_8(page + FIL_PAGE_LSN); } + bool free_page = false; lsn_t start_lsn = 0, end_lsn = 0; - fil_space_t* space; - - if (srv_is_tablespace_truncated(recv_addr->space)) { - /* The table will be truncated after applying - normal redo log records. */ - goto skip_log; - } - - space = fil_space_acquire(recv_addr->space); - if (!space) { - goto skip_log; - } + const lsn_t init_lsn = init ? 
init->lsn : 0; for (recv_t* recv = UT_LIST_GET_FIRST(recv_addr->rec_list); recv; recv = UT_LIST_GET_NEXT(rec_list, recv)) { @@ -2051,18 +1978,11 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, << get_mlog_string(recv->type) << " LSN " << recv->start_lsn << " < " << init_lsn); - } else if (srv_was_tablespace_truncated(space) - && recv->start_lsn - < truncate_t::get_truncated_tablespace_init_lsn( - recv_addr->space)) { - /* If per-table tablespace was truncated and - there exist REDO records before truncate that - are to be applied as part of recovery - (checkpoint didn't happen since truncate was - done) skip such records using lsn check as - they may not stand valid post truncate. */ } else { - if (!start_lsn) { + if (recv->type == MLOG_INIT_FREE_PAGE) { + /* This does not really modify the page. */ + free_page = true; + } else if (!start_lsn) { start_lsn = recv->start_lsn; } @@ -2093,8 +2013,7 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, recv_parse_or_apply_log_rec_body( recv->type, buf, buf + recv->len, - block->page.id.space(), - block->page.id.page_no(), true, block, &mtr); + block->page.id, true, block, &mtr); end_lsn = recv->start_lsn + recv->len; mach_write_to_8(FIL_PAGE_LSN + page, end_lsn); @@ -2113,9 +2032,6 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, } } - space->release(); - -skip_log: #ifdef UNIV_ZIP_DEBUG ut_ad(!fil_page_index_page_check(page) || !page_zip @@ -2124,8 +2040,15 @@ skip_log: if (start_lsn) { log_flush_order_mutex_enter(); - buf_flush_recv_note_modification(block, start_lsn, end_lsn); + buf_flush_note_modification(block, start_lsn, end_lsn, NULL); log_flush_order_mutex_exit(); + } else if (free_page && init) { + /* There have been no operations than MLOG_INIT_FREE_PAGE. + Any buffered changes must not be merged. A subsequent + buf_page_create() from a user thread should discard + any buffered changes. 
*/ + init->created = false; + ut_ad(!mtr.has_modifications()); } /* Make sure that committing mtr does not change the modification @@ -2136,7 +2059,7 @@ skip_log: time_t now = time(NULL); - mutex_enter(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); if (recv_max_page_lsn < page_lsn) { recv_max_page_lsn = page_lsn; @@ -2145,9 +2068,9 @@ skip_log: ut_ad(recv_addr->state == RECV_BEING_PROCESSED); recv_addr->state = RECV_PROCESSED; - ut_a(recv_sys->n_addrs > 0); - if (ulint n = --recv_sys->n_addrs) { - if (recv_sys->report(now)) { + ut_a(recv_sys.n_addrs > 0); + if (ulint n = --recv_sys.n_addrs) { + if (recv_sys.report(now)) { ib::info() << "To recover: " << n << " pages from log"; service_manager_extend_timeout( INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n); @@ -2155,15 +2078,15 @@ skip_log: } } -/** Reduces recv_sys->n_addrs for the corrupted page. +/** Reduces recv_sys.n_addrs for the corrupted page. This function should called when srv_force_recovery > 0. @param[in] page_id page id of the corrupted page */ void recv_recover_corrupt_page(page_id_t page_id) { ut_ad(srv_force_recovery); - mutex_enter(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); - if (!recv_sys->apply_log_recs) { + if (!recv_sys.apply_log_recs) { } else if (recv_addr_t* recv_addr = recv_get_fil_addr_struct( page_id.space(), page_id.page_no())) { switch (recv_addr->state) { @@ -2175,12 +2098,12 @@ void recv_recover_corrupt_page(page_id_t page_id) break; default: recv_addr->state = RECV_PROCESSED; - ut_ad(recv_sys->n_addrs); - recv_sys->n_addrs--; + ut_ad(recv_sys.n_addrs); + recv_sys.n_addrs--; } } - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); } /** Apply any buffered redo log to a page that was just read from a data file. 
@@ -2205,8 +2128,8 @@ void recv_recover_page(buf_page_t* bpage) __FILE__, __LINE__, &mtr); ut_a(success); - mutex_enter(&recv_sys->mutex); - if (!recv_sys->apply_log_recs) { + mutex_enter(&recv_sys.mutex); + if (!recv_sys.apply_log_recs) { } else if (recv_addr_t* recv_addr = recv_get_fil_addr_struct( bpage->id.space(), bpage->id.page_no())) { switch (recv_addr->state) { @@ -2221,7 +2144,7 @@ void recv_recover_page(buf_page_t* bpage) mtr.commit(); func_exit: - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); ut_ad(mtr.has_committed()); } @@ -2247,10 +2170,10 @@ static void recv_read_in_area(const page_id_t page_id) } } - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); buf_read_recv_pages(FALSE, page_id.space(), page_nos, ulint(p - page_nos)); - mutex_enter(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); } /** This is another low level function for the recovery system @@ -2273,12 +2196,12 @@ static buf_block_t* recv_recovery_create_page_low(const page_id_t page_id, << " < " << i.lsn); recv_addr->state= RECV_PROCESSED; ignore: - ut_a(recv_sys->n_addrs); - recv_sys->n_addrs--; + ut_a(recv_sys.n_addrs); + recv_sys.n_addrs--; return NULL; } - fil_space_t *space= fil_space_acquire(recv_addr->space); + fil_space_t *space= fil_space_acquire_for_io(recv_addr->space); if (!space) { recv_addr->state= RECV_PROCESSED; @@ -2288,7 +2211,7 @@ ignore: if (space->enable_lsn) { init_fail: - space->release(); + space->release_for_io(); recv_addr->state= RECV_NOT_PROCESSED; return NULL; } @@ -2308,8 +2231,7 @@ init_fail: mtr.start(); mtr.set_log_mode(MTR_LOG_NONE); - buf_block_t *block= buf_page_create(page_id, page_size_t(space->flags), - &mtr); + buf_block_t *block= buf_page_create(page_id, space->zip_size(), &mtr); if (recv_addr->state == RECV_PROCESSED) /* The page happened to exist in the buffer pool, or it was just being read in. 
Before buf_page_get_with_no_latch() returned, @@ -2320,11 +2242,11 @@ init_fail: i.created= true; buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); mtr.x_latch_at_savepoint(0, block); - recv_recover_page(block, mtr, recv_addr, i.lsn); + recv_recover_page(block, mtr, recv_addr, &i); ut_ad(mtr.has_committed()); } - space->release(); + space->release_for_io(); return block; } @@ -2334,15 +2256,13 @@ to create a page which has buffered intialized redo log records. @return whether the page creation successfully */ buf_block_t* recv_recovery_create_page_low(const page_id_t page_id) { - buf_block_t* block= NULL; - mutex_enter(&recv_sys->mutex); + buf_block_t* block= nullptr; + mutex_enter(&recv_sys.mutex); recv_addr_t* recv_addr= recv_get_fil_addr_struct(page_id.space(), page_id.page_no()); if (recv_addr && recv_addr->state == RECV_WILL_NOT_READ) - { block= recv_recovery_create_page_low(page_id, recv_addr); - } - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); return block; } @@ -2354,18 +2274,18 @@ void recv_apply_hashed_log_recs(bool last_batch) ut_ad(srv_operation == SRV_OPERATION_NORMAL || is_mariabackup_restore_or_export()); - mutex_enter(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); - while (recv_sys->apply_batch_on) { - bool abort = recv_sys->found_corrupt_log; - mutex_exit(&recv_sys->mutex); + while (recv_sys.apply_batch_on) { + bool abort = recv_sys.found_corrupt_log; + mutex_exit(&recv_sys.mutex); if (abort) { return; } os_thread_sleep(500000); - mutex_enter(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); } ut_ad(!last_batch == log_mutex_own()); @@ -2375,7 +2295,18 @@ void recv_apply_hashed_log_recs(bool last_batch) ut_d(recv_no_log_write = recv_no_ibuf_operations); - if (ulint n = recv_sys->n_addrs) { + if (ulint n = recv_sys.n_addrs) { + if (!log_sys.log.subformat && !srv_force_recovery + && srv_undo_tablespaces_open) { + ib::error() << "Recovery of separately logged" + " TRUNCATE operations is no longer supported." 
+ " Set innodb_force_recovery=1" + " if no *trunc.log files exist"; + recv_sys.found_corrupt_log = true; + mutex_exit(&recv_sys.mutex); + return; + } + const char* msg = last_batch ? "Starting final batch to recover " : "Starting a batch to recover "; @@ -2383,11 +2314,11 @@ void recv_apply_hashed_log_recs(bool last_batch) sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log", msg, n); } - recv_sys->apply_log_recs = TRUE; - recv_sys->apply_batch_on = TRUE; + recv_sys.apply_log_recs = true; + recv_sys.apply_batch_on = true; for (ulint id = srv_undo_tablespaces_open; id--; ) { - recv_sys_t::trunc& t = recv_sys->truncated_undo_spaces[id]; + recv_sys_t::trunc& t = recv_sys.truncated_undo_spaces[id]; if (t.lsn) { recv_addr_trim(id + srv_undo_space_id_start, t.pages, t.lsn); @@ -2396,16 +2327,16 @@ void recv_apply_hashed_log_recs(bool last_batch) mtr_t mtr; - for (ulint i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) { + for (ulint i = 0; i < hash_get_n_cells(recv_sys.addr_hash); i++) { for (recv_addr_t* recv_addr = static_cast<recv_addr_t*>( - HASH_GET_FIRST(recv_sys->addr_hash, i)); + HASH_GET_FIRST(recv_sys.addr_hash, i)); recv_addr; recv_addr = static_cast<recv_addr_t*>( HASH_GET_NEXT(addr_hash, recv_addr))) { if (!UT_LIST_GET_LEN(recv_addr->rec_list)) { ignore: - ut_a(recv_sys->n_addrs); - recv_sys->n_addrs--; + ut_a(recv_sys.n_addrs); + recv_sys.n_addrs--; continue; } @@ -2421,13 +2352,6 @@ ignore: break; } - if (srv_is_tablespace_truncated(recv_addr->space)) { - /* Avoid applying REDO log for the tablespace - that is schedule for TRUNCATE. 
*/ - recv_addr->state = RECV_DISCARDED; - goto ignore; - } - const page_id_t page_id(recv_addr->space, recv_addr->page_no); @@ -2436,8 +2360,7 @@ apply: mtr.start(); mtr.set_log_mode(MTR_LOG_NONE); if (buf_block_t* block = buf_page_get_low( - page_id, univ_page_size, - RW_X_LATCH, NULL, + page_id, 0, RW_X_LATCH, NULL, BUF_GET_IF_IN_POOL, __FILE__, __LINE__, &mtr, NULL)) { buf_block_dbg_add_level( @@ -2458,16 +2381,16 @@ apply: /* Wait until all the pages have been processed */ - while (recv_sys->n_addrs != 0) { - const bool abort = recv_sys->found_corrupt_log - || recv_sys->found_corrupt_fs; + while (recv_sys.n_addrs != 0) { + const bool abort = recv_sys.found_corrupt_log + || recv_sys.found_corrupt_fs; - if (recv_sys->found_corrupt_fs && !srv_force_recovery) { + if (recv_sys.found_corrupt_fs && !srv_force_recovery) { ib::info() << "Set innodb_force_recovery=1" " to ignore corrupted pages."; } - mutex_exit(&(recv_sys->mutex)); + mutex_exit(&(recv_sys.mutex)); if (abort) { return; @@ -2475,47 +2398,47 @@ apply: os_thread_sleep(500000); - mutex_enter(&(recv_sys->mutex)); + mutex_enter(&(recv_sys.mutex)); } if (!last_batch) { /* Flush all the file pages to disk and invalidate them in the buffer pool */ - mutex_exit(&(recv_sys->mutex)); + mutex_exit(&(recv_sys.mutex)); log_mutex_exit(); /* Stop the recv_writer thread from issuing any LRU flush batches. */ - mutex_enter(&recv_sys->writer_mutex); + mutex_enter(&recv_sys.writer_mutex); /* Wait for any currently run batch to end. */ buf_flush_wait_LRU_batch_end(); - os_event_reset(recv_sys->flush_end); - recv_sys->flush_type = BUF_FLUSH_LIST; - os_event_set(recv_sys->flush_start); - os_event_wait(recv_sys->flush_end); + os_event_reset(recv_sys.flush_end); + recv_sys.flush_type = BUF_FLUSH_LIST; + os_event_set(recv_sys.flush_start); + os_event_wait(recv_sys.flush_end); buf_pool_invalidate(); /* Allow batches from recv_writer thread. 
*/ - mutex_exit(&recv_sys->writer_mutex); + mutex_exit(&recv_sys.writer_mutex); log_mutex_enter(); - mutex_enter(&(recv_sys->mutex)); + mutex_enter(&(recv_sys.mutex)); mlog_init.reset(); } else if (!recv_no_ibuf_operations) { /* We skipped this in buf_page_create(). */ mlog_init.ibuf_merge(mtr); } - recv_sys->apply_log_recs = FALSE; - recv_sys->apply_batch_on = FALSE; + recv_sys.apply_log_recs = false; + recv_sys.apply_batch_on = false; - recv_sys_empty_hash(); + recv_sys.empty(); - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); } /** Tries to parse a single log record. @@ -2561,7 +2484,7 @@ recv_parse_log_rec( if (new_ptr != NULL) { const lsn_t lsn = static_cast<lsn_t>( *space) << 32 | *page_no; - ut_a(lsn == recv_sys->recovered_lsn); + ut_a(lsn == recv_sys.recovered_lsn); } *type = MLOG_LSN; @@ -2582,7 +2505,7 @@ recv_parse_log_rec( case MLOG_CHECKPOINT | MLOG_SINGLE_REC_FLAG: ib::error() << "Incorrect log record type " << ib::hex(unsigned(*ptr)); - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; return(0); } @@ -2597,7 +2520,8 @@ recv_parse_log_rec( const byte* old_ptr = new_ptr; new_ptr = recv_parse_or_apply_log_rec_body( - *type, new_ptr, end_ptr, *space, *page_no, apply, NULL, NULL); + *type, new_ptr, end_ptr, page_id_t(*space, *page_no), apply, + NULL, NULL); if (UNIV_UNLIKELY(new_ptr == NULL)) { return(0); @@ -2612,7 +2536,7 @@ recv_parse_log_rec( recv_spaces_t::iterator it = recv_spaces.find(*space); - ut_ad(!recv_sys->mlog_checkpoint_lsn + ut_ad(!recv_sys.mlog_checkpoint_lsn || *space == TRX_SYS_SPACE || srv_is_undo_tablespace(*space) || it != recv_spaces.end()); @@ -2637,17 +2561,12 @@ recv_calc_lsn_on_data_add( ib_uint64_t len) /*!< in: this many bytes of data is added, log block headers not included */ { - ulint frag_len; - ib_uint64_t lsn_len; - - frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE; - ut_ad(frag_len < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - - LOG_BLOCK_TRL_SIZE); - lsn_len = len; 
- lsn_len += (lsn_len + frag_len) - / (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - - LOG_BLOCK_TRL_SIZE) - * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE); + unsigned frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE; + unsigned payload_size = log_sys.payload_size(); + ut_ad(frag_len < payload_size); + lsn_t lsn_len = len; + lsn_len += (lsn_len + frag_len) / payload_size + * (OS_FILE_LOG_BLOCK_SIZE - payload_size); return(lsn + lsn_len); } @@ -2670,29 +2589,28 @@ recv_report_corrupt_log( ib::error() << "############### CORRUPT LOG RECORD FOUND ##################"; - const ulint ptr_offset = ulint(ptr - recv_sys->buf); + const ulint ptr_offset = ulint(ptr - recv_sys.buf); ib::info() << "Log record type " << type << ", page " << space << ":" << page_no << ". Log parsing proceeded successfully up to " - << recv_sys->recovered_lsn << ". Previous log record type " - << recv_previous_parsed_rec_type - << ", is multi " + << recv_sys.recovered_lsn << ". Previous log record type " + << recv_previous_parsed_rec_type << ", is multi " << recv_previous_parsed_rec_is_multi << " Recv offset " << ptr_offset << ", prev " << recv_previous_parsed_rec_offset; - ut_ad(ptr <= recv_sys->buf + recv_sys->len); + ut_ad(ptr <= recv_sys.buf + recv_sys.len); const ulint limit = 100; const ulint prev_offset = std::min(recv_previous_parsed_rec_offset, ptr_offset); const ulint before = std::min(prev_offset, limit); - const ulint after = std::min(recv_sys->len - ptr_offset, limit); + const ulint after = std::min(recv_sys.len - ptr_offset, limit); ib::info() << "Hex dump starting " << before << " bytes before and" " ending " << after << " bytes after the corrupted record:"; - const byte* start = recv_sys->buf + prev_offset - before; + const byte* start = recv_sys.buf + prev_offset - before; ut_print_buf(stderr, start, ulint(ptr - start) + after); putc('\n', stderr); @@ -2734,17 +2652,16 @@ of buffer pool. Store last_stored_lsn if it is not in last phase read redo logs. 
*/ static bool recv_sys_heap_check(store_t* store, ulint available_mem) { - if (*store != STORE_NO - && mem_heap_get_size(recv_sys->heap) >= available_mem) + if (*store != STORE_NO && mem_heap_get_size(recv_sys.heap) >= available_mem) { if (*store == STORE_YES) - recv_sys->last_stored_lsn= recv_sys->recovered_lsn; + recv_sys.last_stored_lsn= recv_sys.recovered_lsn; *store= STORE_NO; DBUG_PRINT("ib_log",("Ran out of memory and last " "stored lsn " LSN_PF " last stored offset " - ULINTPF "\n",recv_sys->recovered_lsn, - recv_sys->recovered_offset)); + ULINTPF "\n", + recv_sys.recovered_lsn, recv_sys.recovered_offset)); return true; } @@ -2775,12 +2692,12 @@ bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t* store, const bool last_phase = (*store == STORE_IF_EXISTS); ut_ad(log_mutex_own()); - ut_ad(mutex_own(&recv_sys->mutex)); - ut_ad(recv_sys->parse_start_lsn != 0); + ut_ad(mutex_own(&recv_sys.mutex)); + ut_ad(recv_sys.parse_start_lsn != 0); loop: - ptr = recv_sys->buf + recv_sys->recovered_offset; + ptr = recv_sys.buf + recv_sys.recovered_offset; - end_ptr = recv_sys->buf + recv_sys->len; + end_ptr = recv_sys.buf + recv_sys.len; if (ptr == end_ptr) { @@ -2808,7 +2725,7 @@ loop: if (single_rec) { /* The mtr did not modify multiple pages */ - old_lsn = recv_sys->recovered_lsn; + old_lsn = recv_sys.recovered_lsn; /* Try to parse a log record, fetching its type, space id, page no, and a pointer to the body of the log record */ @@ -2816,12 +2733,12 @@ loop: len = recv_parse_log_rec(&type, ptr, end_ptr, &space, &page_no, apply, &body); - if (UNIV_UNLIKELY(recv_sys->found_corrupt_log)) { + if (UNIV_UNLIKELY(recv_sys.found_corrupt_log)) { recv_report_corrupt_log(ptr, type, space, page_no); return(true); } - if (UNIV_UNLIKELY(recv_sys->found_corrupt_fs)) { + if (UNIV_UNLIKELY(recv_sys.found_corrupt_fs)) { return(true); } @@ -2831,7 +2748,7 @@ loop: new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len); - if (new_recovered_lsn > recv_sys->scanned_lsn) { + if 
(new_recovered_lsn > recv_sys.scanned_lsn) { /* The log record filled a log block, and we require that also the next log block should have been scanned in */ @@ -2840,11 +2757,11 @@ loop: } recv_previous_parsed_rec_type = type; - recv_previous_parsed_rec_offset = recv_sys->recovered_offset; + recv_previous_parsed_rec_offset = recv_sys.recovered_offset; recv_previous_parsed_rec_is_multi = 0; - recv_sys->recovered_offset += len; - recv_sys->recovered_lsn = new_recovered_lsn; + recv_sys.recovered_offset += len; + recv_sys.recovered_lsn = new_recovered_lsn; switch (type) { lsn_t lsn; @@ -2860,9 +2777,9 @@ loop: "MLOG_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n", lsn, lsn != checkpoint_lsn ? "ignored" - : recv_sys->mlog_checkpoint_lsn + : recv_sys.mlog_checkpoint_lsn ? "reread" : "read", - recv_sys->recovered_lsn); + recv_sys.recovered_lsn); } DBUG_PRINT("ib_log", @@ -2870,18 +2787,18 @@ loop: LSN_PF, lsn, lsn != checkpoint_lsn ? "ignored" - : recv_sys->mlog_checkpoint_lsn + : recv_sys.mlog_checkpoint_lsn ? 
"reread" : "read", - recv_sys->recovered_lsn)); + recv_sys.recovered_lsn)); if (lsn == checkpoint_lsn) { - if (recv_sys->mlog_checkpoint_lsn) { - ut_ad(recv_sys->mlog_checkpoint_lsn - <= recv_sys->recovered_lsn); + if (recv_sys.mlog_checkpoint_lsn) { + ut_ad(recv_sys.mlog_checkpoint_lsn + <= recv_sys.recovered_lsn); break; } - recv_sys->mlog_checkpoint_lsn - = recv_sys->recovered_lsn; + recv_sys.mlog_checkpoint_lsn + = recv_sys.recovered_lsn; return(true); } break; @@ -2903,10 +2820,10 @@ loop: } /* fall through */ case STORE_YES: - recv_add_to_hash_table( + recv_sys.add( type, space, page_no, body, ptr + len, old_lsn, - recv_sys->recovered_lsn); + recv_sys.recovered_lsn); } /* fall through */ case MLOG_INDEX_LOAD: @@ -2943,7 +2860,7 @@ loop: &type, ptr, end_ptr, &space, &page_no, false, &body); - if (UNIV_UNLIKELY(recv_sys->found_corrupt_log)) { + if (UNIV_UNLIKELY(recv_sys.found_corrupt_log)) { corrupted_log: recv_report_corrupt_log( ptr, type, space, page_no); @@ -2953,11 +2870,11 @@ corrupted_log: if (ptr == end_ptr) { } else if (type == MLOG_CHECKPOINT || (*ptr & MLOG_SINGLE_REC_FLAG)) { - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; goto corrupted_log; } - if (recv_sys->found_corrupt_fs) { + if (recv_sys.found_corrupt_fs) { return(true); } @@ -2967,7 +2884,7 @@ corrupted_log: recv_previous_parsed_rec_type = type; recv_previous_parsed_rec_offset - = recv_sys->recovered_offset + total_len; + = recv_sys.recovered_offset + total_len; recv_previous_parsed_rec_is_multi = 1; /* MLOG_FILE_NAME redo log records doesn't make changes @@ -2980,10 +2897,10 @@ corrupted_log: if (only_mlog_file) { new_recovered_lsn = recv_calc_lsn_on_data_add( - recv_sys->recovered_lsn, len); + recv_sys.recovered_lsn, len); mlog_rec_len += len; - recv_sys->recovered_offset += len; - recv_sys->recovered_lsn = new_recovered_lsn; + recv_sys.recovered_offset += len; + recv_sys.recovered_lsn = new_recovered_lsn; } total_len += len; @@ -2997,7 +2914,7 @@ corrupted_log: 
": multi-log end" " total_len " ULINTPF " n=" ULINTPF, - recv_sys->recovered_lsn, + recv_sys.recovered_lsn, total_len, n_recs)); total_len -= mlog_rec_len; break; @@ -3007,14 +2924,14 @@ corrupted_log: ("scan " LSN_PF ": multi-log rec %s" " len " ULINTPF " page " ULINTPF ":" ULINTPF, - recv_sys->recovered_lsn, + recv_sys.recovered_lsn, get_mlog_string(type), len, space, page_no)); } new_recovered_lsn = recv_calc_lsn_on_data_add( - recv_sys->recovered_lsn, total_len); + recv_sys.recovered_lsn, total_len); - if (new_recovered_lsn > recv_sys->scanned_lsn) { + if (new_recovered_lsn > recv_sys.scanned_lsn) { /* The log record filled a log block, and we require that also the next log block should have been scanned in */ @@ -3024,10 +2941,10 @@ corrupted_log: /* Add all the records to the hash table */ - ptr = recv_sys->buf + recv_sys->recovered_offset; + ptr = recv_sys.buf + recv_sys.recovered_offset; for (;;) { - old_lsn = recv_sys->recovered_lsn; + old_lsn = recv_sys.recovered_lsn; /* This will apply MLOG_FILE_ records. 
We had to skip them in the first scan, because we did not know if the mini-transaction was @@ -3036,21 +2953,21 @@ corrupted_log: &type, ptr, end_ptr, &space, &page_no, apply, &body); - if (UNIV_UNLIKELY(recv_sys->found_corrupt_log) + if (UNIV_UNLIKELY(recv_sys.found_corrupt_log) && !recv_report_corrupt_log( ptr, type, space, page_no)) { return(true); } - if (UNIV_UNLIKELY(recv_sys->found_corrupt_fs)) { + if (UNIV_UNLIKELY(recv_sys.found_corrupt_fs)) { return(true); } ut_a(len != 0); ut_a(!(*ptr & MLOG_SINGLE_REC_FLAG)); - recv_sys->recovered_offset += len; - recv_sys->recovered_lsn + recv_sys.recovered_offset += len; + recv_sys.recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len); switch (type) { @@ -3087,7 +3004,7 @@ corrupted_log: } /* fall through */ case STORE_YES: - recv_add_to_hash_table( + recv_sys.add( type, space, page_no, body, ptr + len, old_lsn, @@ -3103,7 +3020,7 @@ corrupted_log: } /** Adds data from a new log block to the parsing buffer of recv_sys if -recv_sys->parse_start_lsn is non-zero. +recv_sys.parse_start_lsn is non-zero. 
@param[in] log_block log block to add @param[in] scanned_lsn lsn of how far we were able to find data in this log block @@ -3115,9 +3032,9 @@ bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn) ulint start_offset; ulint end_offset; - ut_ad(scanned_lsn >= recv_sys->scanned_lsn); + ut_ad(scanned_lsn >= recv_sys.scanned_lsn); - if (!recv_sys->parse_start_lsn) { + if (!recv_sys.parse_start_lsn) { /* Cannot start parsing yet because no start point for it found */ return(false); @@ -3125,18 +3042,18 @@ bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn) data_len = log_block_get_data_len(log_block); - if (recv_sys->parse_start_lsn >= scanned_lsn) { + if (recv_sys.parse_start_lsn >= scanned_lsn) { return(false); - } else if (recv_sys->scanned_lsn >= scanned_lsn) { + } else if (recv_sys.scanned_lsn >= scanned_lsn) { return(false); - } else if (recv_sys->parse_start_lsn > recv_sys->scanned_lsn) { - more_len = (ulint) (scanned_lsn - recv_sys->parse_start_lsn); + } else if (recv_sys.parse_start_lsn > recv_sys.scanned_lsn) { + more_len = (ulint) (scanned_lsn - recv_sys.parse_start_lsn); } else { - more_len = (ulint) (scanned_lsn - recv_sys->scanned_lsn); + more_len = (ulint) (scanned_lsn - recv_sys.scanned_lsn); } if (more_len == 0) { @@ -3151,21 +3068,17 @@ bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn) start_offset = LOG_BLOCK_HDR_SIZE; } - end_offset = data_len; - - if (end_offset > OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { - end_offset = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; - } + end_offset = std::min<ulint>(data_len, log_sys.trailer_offset()); ut_ad(start_offset <= end_offset); if (start_offset < end_offset) { - ut_memcpy(recv_sys->buf + recv_sys->len, + ut_memcpy(recv_sys.buf + recv_sys.len, log_block + start_offset, end_offset - start_offset); - recv_sys->len += end_offset - start_offset; + recv_sys.len += end_offset - start_offset; - ut_a(recv_sys->len <= RECV_PARSING_BUF_SIZE); + 
ut_a(recv_sys.len <= RECV_PARSING_BUF_SIZE); } return(true); @@ -3174,13 +3087,12 @@ bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn) /** Moves the parsing buffer data left to the buffer start. */ void recv_sys_justify_left_parsing_buf() { - memmove(recv_sys->buf, - recv_sys->buf + recv_sys->recovered_offset, - recv_sys->len - recv_sys->recovered_offset); + memmove(recv_sys.buf, recv_sys.buf + recv_sys.recovered_offset, + recv_sys.len - recv_sys.recovered_offset); - recv_sys->len -= recv_sys->recovered_offset; + recv_sys.len -= recv_sys.recovered_offset; - recv_sys->recovered_offset = 0; + recv_sys.recovered_offset = 0; } /** Scan redo log from a buffer and stores new log data to the parsing buffer. @@ -3215,7 +3127,7 @@ static bool recv_scan_log_recs( bool finished = false; ulint data_len; bool more_data = false; - bool apply = recv_sys->mlog_checkpoint_lsn != 0; + bool apply = recv_sys.mlog_checkpoint_lsn != 0; ulint recv_parsing_buf_size = RECV_PARSING_BUF_SIZE; const bool last_phase = (*store_to_hash == STORE_IF_EXISTS); ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); @@ -3242,10 +3154,10 @@ static bool recv_scan_log_recs( data_len = log_block_get_data_len(log_block); - if (scanned_lsn + data_len > recv_sys->scanned_lsn + if (scanned_lsn + data_len > recv_sys.scanned_lsn && log_block_get_checkpoint_no(log_block) - < recv_sys->scanned_checkpoint_no - && (recv_sys->scanned_checkpoint_no + < recv_sys.scanned_checkpoint_no + && (recv_sys.scanned_checkpoint_no - log_block_get_checkpoint_no(log_block) > 0x80000000UL)) { @@ -3255,16 +3167,16 @@ static bool recv_scan_log_recs( break; } - if (!recv_sys->parse_start_lsn + if (!recv_sys.parse_start_lsn && (log_block_get_first_rec_group(log_block) > 0)) { /* We found a point from which to start the parsing of log records */ - recv_sys->parse_start_lsn = scanned_lsn + recv_sys.parse_start_lsn = scanned_lsn + log_block_get_first_rec_group(log_block); - recv_sys->scanned_lsn = 
recv_sys->parse_start_lsn; - recv_sys->recovered_lsn = recv_sys->parse_start_lsn; + recv_sys.scanned_lsn = recv_sys.parse_start_lsn; + recv_sys.recovered_lsn = recv_sys.parse_start_lsn; } scanned_lsn += data_len; @@ -3275,17 +3187,17 @@ static bool recv_scan_log_recs( && checkpoint_lsn == mach_read_from_8(LOG_BLOCK_HDR_SIZE + 1 + log_block)) { /* The redo log is logically empty. */ - ut_ad(recv_sys->mlog_checkpoint_lsn == 0 - || recv_sys->mlog_checkpoint_lsn + ut_ad(recv_sys.mlog_checkpoint_lsn == 0 + || recv_sys.mlog_checkpoint_lsn == checkpoint_lsn); - recv_sys->mlog_checkpoint_lsn = checkpoint_lsn; + recv_sys.mlog_checkpoint_lsn = checkpoint_lsn; DBUG_PRINT("ib_log", ("found empty log; LSN=" LSN_PF, scanned_lsn)); finished = true; break; } - if (scanned_lsn > recv_sys->scanned_lsn) { + if (scanned_lsn > recv_sys.scanned_lsn) { ut_ad(!srv_log_files_created); if (!recv_needed_recovery) { recv_needed_recovery = true; @@ -3298,7 +3210,7 @@ static bool recv_scan_log_recs( ib::info() << "Starting crash recovery from" " checkpoint LSN=" - << recv_sys->scanned_lsn; + << recv_sys.scanned_lsn; } /* We were able to find more log data: add it to the @@ -3311,12 +3223,12 @@ static bool recv_scan_log_recs( = (70 * 1024); ); - if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE + if (recv_sys.len + 4 * OS_FILE_LOG_BLOCK_SIZE >= recv_parsing_buf_size) { ib::error() << "Log parsing buffer overflow." 
" Recovery may have failed!"; - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; if (!srv_force_recovery) { ib::error() @@ -3324,20 +3236,20 @@ static bool recv_scan_log_recs( " to ignore this error."; return(true); } - } else if (!recv_sys->found_corrupt_log) { + } else if (!recv_sys.found_corrupt_log) { more_data = recv_sys_add_to_parsing_buf( log_block, scanned_lsn); } - recv_sys->scanned_lsn = scanned_lsn; - recv_sys->scanned_checkpoint_no + recv_sys.scanned_lsn = scanned_lsn; + recv_sys.scanned_checkpoint_no = log_block_get_checkpoint_no(log_block); } /* During last phase of scanning, there can be redo logs - left in recv_sys->buf to parse & store it in recv_sys->heap */ + left in recv_sys.buf to parse & store it in recv_sys.heap */ if (last_phase - && recv_sys->recovered_lsn < recv_sys->scanned_lsn) { + && recv_sys.recovered_lsn < recv_sys.scanned_lsn) { more_data = true; } @@ -3352,38 +3264,38 @@ static bool recv_scan_log_recs( *group_scanned_lsn = scanned_lsn; - mutex_enter(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); - if (more_data && !recv_sys->found_corrupt_log) { + if (more_data && !recv_sys.found_corrupt_log) { /* Try to parse more log records */ if (recv_parse_log_recs(checkpoint_lsn, store_to_hash, available_mem, apply)) { - ut_ad(recv_sys->found_corrupt_log - || recv_sys->found_corrupt_fs - || recv_sys->mlog_checkpoint_lsn - == recv_sys->recovered_lsn); + ut_ad(recv_sys.found_corrupt_log + || recv_sys.found_corrupt_fs + || recv_sys.mlog_checkpoint_lsn + == recv_sys.recovered_lsn); finished = true; goto func_exit; } recv_sys_heap_check(store_to_hash, available_mem); - if (recv_sys->recovered_offset > recv_parsing_buf_size / 4) { + if (recv_sys.recovered_offset > recv_parsing_buf_size / 4) { /* Move parsing buffer data to the buffer start */ recv_sys_justify_left_parsing_buf(); } /* Need to re-parse the redo log which're stored - in recv_sys->buf */ + in recv_sys.buf */ if (last_phase && *store_to_hash == STORE_NO) { 
finished = false; } } func_exit: - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); return(finished); } @@ -3403,28 +3315,28 @@ recv_group_scan_log_recs( bool last_phase) { DBUG_ENTER("recv_group_scan_log_recs"); - DBUG_ASSERT(!last_phase || recv_sys->mlog_checkpoint_lsn > 0); + DBUG_ASSERT(!last_phase || recv_sys.mlog_checkpoint_lsn > 0); - mutex_enter(&recv_sys->mutex); - recv_sys->len = 0; - recv_sys->recovered_offset = 0; - recv_sys->n_addrs = 0; - recv_sys_empty_hash(); + mutex_enter(&recv_sys.mutex); + recv_sys.len = 0; + recv_sys.recovered_offset = 0; + recv_sys.n_addrs = 0; + recv_sys.empty(); srv_start_lsn = *contiguous_lsn; - recv_sys->parse_start_lsn = *contiguous_lsn; - recv_sys->scanned_lsn = *contiguous_lsn; - recv_sys->recovered_lsn = *contiguous_lsn; - recv_sys->scanned_checkpoint_no = 0; + recv_sys.parse_start_lsn = *contiguous_lsn; + recv_sys.scanned_lsn = *contiguous_lsn; + recv_sys.recovered_lsn = *contiguous_lsn; + recv_sys.scanned_checkpoint_no = 0; recv_previous_parsed_rec_type = MLOG_SINGLE_REC_FLAG; recv_previous_parsed_rec_offset = 0; recv_previous_parsed_rec_is_multi = 0; ut_ad(recv_max_page_lsn == 0); ut_ad(last_phase || !recv_writer_thread_active); - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); lsn_t start_lsn; lsn_t end_lsn; - store_t store_to_hash = recv_sys->mlog_checkpoint_lsn == 0 + store_t store_to_hash = recv_sys.mlog_checkpoint_lsn == 0 ? STORE_NO : (last_phase ? STORE_IF_EXISTS : STORE_YES); ulint available_mem = (buf_pool_get_n_pages() * 2 / 3) << srv_page_size_shift; @@ -3441,7 +3353,7 @@ recv_group_scan_log_recs( finished the redo log scan. 
*/ recv_apply_hashed_log_recs(false); /* Rescan the redo logs from last stored lsn */ - end_lsn = recv_sys->recovered_lsn; + end_lsn = recv_sys.recovered_lsn; } start_lsn = ut_uint64_align_down(end_lsn, @@ -3455,7 +3367,7 @@ recv_group_scan_log_recs( start_lsn, end_lsn, contiguous_lsn, &log_sys.log.scanned_lsn)); - if (recv_sys->found_corrupt_log || recv_sys->found_corrupt_fs) { + if (recv_sys.found_corrupt_log || recv_sys.found_corrupt_fs) { DBUG_RETURN(false); } @@ -3475,10 +3387,13 @@ dberr_t recv_init_missing_space(dberr_t err, const recv_spaces_t::const_iterator& i) { if (is_mariabackup_restore_or_export()) { - ib::warn() << "Tablespace " << i->first << " was not" - " found at " << i->second.name << " when" - " restoring a (partial?) backup. All redo log" - " for this file will be ignored!"; + if (i->second.name.find(TEMP_TABLE_PATH_PREFIX) + != std::string::npos) { + ib::warn() << "Tablespace " << i->first << " was not" + " found at " << i->second.name << " when" + " restoring a (partial?) backup. All redo log" + " for this file will be ignored!"; + } return(err); } @@ -3514,9 +3429,9 @@ recv_validate_tablespace(bool rescan, bool& missing_tablespace) { dberr_t err = DB_SUCCESS; - for (ulint h = 0; h < hash_get_n_cells(recv_sys->addr_hash); h++) { + for (ulint h = 0; h < hash_get_n_cells(recv_sys.addr_hash); h++) { for (recv_addr_t* recv_addr = static_cast<recv_addr_t*>( - HASH_GET_FIRST(recv_sys->addr_hash, h)); + HASH_GET_FIRST(recv_sys.addr_hash, h)); recv_addr != 0; recv_addr = static_cast<recv_addr_t*>( HASH_GET_NEXT(addr_hash, recv_addr))) { @@ -3549,21 +3464,19 @@ recv_validate_tablespace(bool rescan, bool& missing_tablespace) return(err); } - /* When rescan is not needed then recv_sys->addr_hash will have - all space id belongs to redo log. If rescan is needed and - innodb_force_recovery > 0 then InnoDB can ignore missing tablespace. 
*/ - for (recv_spaces_t::iterator i = recv_spaces.begin(); - i != recv_spaces.end(); i++) { - - if (UNIV_LIKELY(i->second.status != file_name_t::MISSING)) { + /* When rescan is not needed, recv_sys.addr_hash will contain the + entire redo log. If rescan is needed or innodb_force_recovery + is set, we can ignore missing tablespaces. */ + for (const recv_spaces_t::value_type& rs : recv_spaces) { + if (UNIV_LIKELY(rs.second.status != file_name_t::MISSING)) { continue; } missing_tablespace = true; if (srv_force_recovery > 0) { - ib::warn() << "Tablespace " << i->first - <<" was not found at " << i->second.name + ib::warn() << "Tablespace " << rs.first + <<" was not found at " << rs.second.name <<", and innodb_force_recovery was set." <<" All redo log for this tablespace" <<" will be ignored!"; @@ -3571,9 +3484,9 @@ recv_validate_tablespace(bool rescan, bool& missing_tablespace) } if (!rescan) { - ib::info() << "Tablespace " << i->first + ib::info() << "Tablespace " << rs.first << " was not found at '" - << i->second.name << "', but there" + << rs.second.name << "', but there" <<" were no modifications either."; } } @@ -3598,33 +3511,34 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace) ut_ad(!srv_read_only_mode); ut_ad(recv_needed_recovery); - for (recv_spaces_t::iterator i = recv_spaces.begin(); - i != recv_spaces.end(); i++) { - ut_ad(!is_predefined_tablespace(i->first)); - ut_ad(i->second.status != file_name_t::DELETED || !i->second.space); + for (recv_spaces_t::value_type& rs : recv_spaces) { + ut_ad(!is_predefined_tablespace(rs.first)); + ut_ad(rs.second.status != file_name_t::DELETED + || !rs.second.space); - if (i->second.status == file_name_t::DELETED) { + if (rs.second.status == file_name_t::DELETED) { /* The tablespace was deleted, so we can ignore any redo log for it. 
*/ flag_deleted = true; - } else if (i->second.space != NULL) { + } else if (rs.second.space != NULL) { /* The tablespace was found, and there are some redo log records for it. */ - fil_names_dirty(i->second.space); - i->second.space->enable_lsn = i->second.enable_lsn; - } else if (i->second.name == "") { + fil_names_dirty(rs.second.space); + rs.second.space->enable_lsn = rs.second.enable_lsn; + } else if (rs.second.name == "") { ib::error() << "Missing MLOG_FILE_NAME" " or MLOG_FILE_DELETE" " before MLOG_CHECKPOINT for tablespace " - << i->first; - recv_sys->found_corrupt_log = true; + << rs.first; + recv_sys.found_corrupt_log = true; return(DB_CORRUPTION); } else { - i->second.status = file_name_t::MISSING; + rs.second.status = file_name_t::MISSING; flag_deleted = true; } - ut_ad(i->second.status == file_name_t::DELETED || i->second.name != ""); + ut_ad(rs.second.status == file_name_t::DELETED + || rs.second.name != ""); } if (flag_deleted) { @@ -3672,7 +3586,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) if (err != DB_SUCCESS) { - srv_start_lsn = recv_sys->recovered_lsn = log_sys.lsn; + srv_start_lsn = recv_sys.recovered_lsn = log_sys.lsn; log_mutex_exit(); return(err); } @@ -3687,14 +3601,14 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) /* Start reading the log from the checkpoint lsn. The variable contiguous_lsn contains an lsn up to which the log is known to be contiguously written. 
*/ - recv_sys->mlog_checkpoint_lsn = 0; + recv_sys.mlog_checkpoint_lsn = 0; ut_ad(RECV_SCAN_SIZE <= srv_log_buffer_size); const lsn_t end_lsn = mach_read_from_8( buf + LOG_CHECKPOINT_END_LSN); - ut_ad(recv_sys->n_addrs == 0); + ut_ad(recv_sys.n_addrs == 0); contiguous_lsn = checkpoint_lsn; switch (log_sys.log.format) { case 0: @@ -3709,7 +3623,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) contiguous_lsn = end_lsn; break; } - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; log_mutex_exit(); return(DB_ERROR); } @@ -3717,21 +3631,21 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) /* Look for MLOG_CHECKPOINT. */ recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false); /* The first scan should not have stored or applied any records. */ - ut_ad(recv_sys->n_addrs == 0); - ut_ad(!recv_sys->found_corrupt_fs); + ut_ad(recv_sys.n_addrs == 0); + ut_ad(!recv_sys.found_corrupt_fs); if (srv_read_only_mode && recv_needed_recovery) { log_mutex_exit(); return(DB_READ_ONLY); } - if (recv_sys->found_corrupt_log && !srv_force_recovery) { + if (recv_sys.found_corrupt_log && !srv_force_recovery) { log_mutex_exit(); ib::warn() << "Log scan aborted at LSN " << contiguous_lsn; return(DB_ERROR); } - if (recv_sys->mlog_checkpoint_lsn == 0) { + if (recv_sys.mlog_checkpoint_lsn == 0) { lsn_t scan_lsn = log_sys.log.scanned_lsn; if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) { log_mutex_exit(); @@ -3752,8 +3666,8 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) rescan = recv_group_scan_log_recs( checkpoint_lsn, &contiguous_lsn, false); - if ((recv_sys->found_corrupt_log && !srv_force_recovery) - || recv_sys->found_corrupt_fs) { + if ((recv_sys.found_corrupt_log && !srv_force_recovery) + || recv_sys.found_corrupt_fs) { log_mutex_exit(); return(DB_ERROR); } @@ -3764,7 +3678,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) user about recovery: */ if (flush_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT - && 
recv_sys->mlog_checkpoint_lsn == checkpoint_lsn) { + && recv_sys.mlog_checkpoint_lsn == checkpoint_lsn) { /* The redo log is logically empty. */ } else if (checkpoint_lsn != flush_lsn) { ut_ad(!srv_log_files_created); @@ -3796,7 +3710,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) } } - log_sys.lsn = recv_sys->recovered_lsn; + log_sys.lsn = recv_sys.recovered_lsn; if (recv_needed_recovery) { bool missing_tablespace = false; @@ -3819,17 +3733,17 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) DBUG_PRINT("ib_log", ("Rescan of redo log to validate " "the missing tablespace. Scan " "from last stored LSN " LSN_PF, - recv_sys->last_stored_lsn)); + recv_sys.last_stored_lsn)); - lsn_t recent_stored_lsn = recv_sys->last_stored_lsn; + lsn_t recent_stored_lsn = recv_sys.last_stored_lsn; rescan = recv_group_scan_log_recs( checkpoint_lsn, &recent_stored_lsn, false); - ut_ad(!recv_sys->found_corrupt_fs); + ut_ad(!recv_sys.found_corrupt_fs); missing_tablespace = false; - err = recv_sys->found_corrupt_log + err = recv_sys.found_corrupt_log ? 
DB_ERROR : recv_validate_tablespace( rescan, missing_tablespace); @@ -3859,15 +3773,15 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) recv_group_scan_log_recs( checkpoint_lsn, &contiguous_lsn, true); - if ((recv_sys->found_corrupt_log + if ((recv_sys.found_corrupt_log && !srv_force_recovery) - || recv_sys->found_corrupt_fs) { + || recv_sys.found_corrupt_fs) { log_mutex_exit(); return(DB_ERROR); } } } else { - ut_ad(!rescan || recv_sys->n_addrs == 0); + ut_ad(!rescan || recv_sys.n_addrs == 0); } if (log_sys.log.scanned_lsn < checkpoint_lsn @@ -3881,11 +3795,11 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) " database is now corrupt!"; } - if (recv_sys->recovered_lsn < checkpoint_lsn) { + if (recv_sys.recovered_lsn < checkpoint_lsn) { log_mutex_exit(); ib::error() << "Recovered only to lsn:" - << recv_sys->recovered_lsn << " checkpoint_lsn: " << checkpoint_lsn; + << recv_sys.recovered_lsn << " checkpoint_lsn: " << checkpoint_lsn; return(DB_ERROR); } @@ -3896,9 +3810,9 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) recv_synchronize_groups(); if (!recv_needed_recovery) { - ut_a(checkpoint_lsn == recv_sys->recovered_lsn); + ut_a(checkpoint_lsn == recv_sys.recovered_lsn); } else { - srv_start_lsn = recv_sys->recovered_lsn; + srv_start_lsn = recv_sys.recovered_lsn; } log_sys.buf_free = ulong(log_sys.lsn % OS_FILE_LOG_BLOCK_SIZE); @@ -3920,11 +3834,11 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) log_sys.next_checkpoint_no = ++checkpoint_no; - mutex_enter(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); - recv_sys->apply_log_recs = TRUE; + recv_sys.apply_log_recs = true; - mutex_exit(&recv_sys->mutex); + mutex_exit(&recv_sys.mutex); log_mutex_exit(); @@ -3945,7 +3859,7 @@ recv_recovery_from_checkpoint_finish(void) required because it grabs various mutexes and we want to ensure that when we enable sync_order_checks there is no mutex currently held by any thread. 
*/ - mutex_enter(&recv_sys->writer_mutex); + mutex_enter(&recv_sys.writer_mutex); /* Free the resources of the recovery system */ recv_recovery_on = false; @@ -3955,7 +3869,7 @@ recv_recovery_from_checkpoint_finish(void) in progress batches to finish. */ buf_flush_wait_LRU_batch_end(); - mutex_exit(&recv_sys->writer_mutex); + mutex_exit(&recv_sys.writer_mutex); ulint count = 0; while (recv_writer_thread_active) { @@ -3968,7 +3882,7 @@ recv_recovery_from_checkpoint_finish(void) } } - recv_sys_debug_free(); + recv_sys.debug_free(); /* Free up the flush_rbt. */ buf_flush_free_flush_rbt(); @@ -4022,9 +3936,8 @@ recv_dblwr_t::find_page(ulint space_id, ulint page_no) const byte *result= NULL; lsn_t max_lsn= 0; - for (list::const_iterator i = pages.begin(); i != pages.end(); ++i) + for (const byte *page : pages) { - const byte *page= *i; if (page_get_page_no(page) != page_no || page_get_space_id(page) != space_id) continue; @@ -4206,6 +4119,12 @@ static const char* get_mlog_string(mlog_id_t type) case MLOG_TRUNCATE: return("MLOG_TRUNCATE"); + case MLOG_MEMSET: + return("MLOG_MEMSET"); + + case MLOG_INIT_FREE_PAGE: + return("MLOG_INIT_FREE_PAGE"); + case MLOG_FILE_WRITE_CRYPT_DATA: return("MLOG_FILE_WRITE_CRYPT_DATA"); } diff --git a/storage/innobase/mtr/mtr0log.cc b/storage/innobase/mtr/mtr0log.cc index 714bd4435dc..da7088dd7d9 100644 --- a/storage/innobase/mtr/mtr0log.cc +++ b/storage/innobase/mtr/mtr0log.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -100,7 +100,7 @@ mlog_parse_initial_log_record( *type = mlog_id_t(*ptr & ~MLOG_SINGLE_REC_FLAG); if (UNIV_UNLIKELY(*type > MLOG_BIGGEST_TYPE && !EXTRA_CHECK_MLOG_NUMBER(*type))) { - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; return NULL; } @@ -121,7 +121,7 @@ mlog_parse_initial_log_record( } /********************************************************//** -Parses a log record written by mlog_write_ulint or mlog_write_ull. +Parses a log record written by mlog_write_ulint, mlog_write_ull, mlog_memset. @return parsed record end, NULL if not a complete record or a corrupt record */ byte* mlog_parse_nbytes( @@ -137,29 +137,46 @@ mlog_parse_nbytes( ulint val; ib_uint64_t dval; - ut_a(type <= MLOG_8BYTES); + ut_ad(type <= MLOG_8BYTES || type == MLOG_MEMSET); ut_a(!page || !page_zip + || type == MLOG_MEMSET || !fil_page_index_page_check(page)); if (end_ptr < ptr + 2) { - - return(NULL); + return NULL; } offset = mach_read_from_2(ptr); ptr += 2; - if (offset >= srv_page_size) { - recv_sys->found_corrupt_log = TRUE; - - return(NULL); + if (UNIV_UNLIKELY(offset >= srv_page_size)) { + goto corrupt; } - if (type == MLOG_8BYTES) { + switch (type) { + case MLOG_MEMSET: + if (end_ptr < ptr + 3) { + return NULL; + } + val = mach_read_from_2(ptr); + ptr += 2; + if (UNIV_UNLIKELY(offset + val > srv_page_size)) { + goto corrupt; + } + if (page) { + memset(page + offset, *ptr, val); + if (page_zip) { + ut_ad(offset + val <= PAGE_DATA + || !fil_page_index_page_check(page)); + memset(static_cast<page_zip_des_t*>(page_zip) + ->data + offset, *ptr, val); + } + } + return const_cast<byte*>(++ptr); + case MLOG_8BYTES: dval = mach_u64_parse_compressed(&ptr, end_ptr); if (ptr == NULL) { - - return(NULL); + return NULL; } if (page) { @@ -171,14 +188,13 @@ mlog_parse_nbytes( mach_write_to_8(page + offset, dval); 
} - return(const_cast<byte*>(ptr)); + return const_cast<byte*>(ptr); + default: + val = mach_parse_compressed(&ptr, end_ptr); } - val = mach_parse_compressed(&ptr, end_ptr); - if (ptr == NULL) { - - return(NULL); + return NULL; } switch (type) { @@ -221,11 +237,11 @@ mlog_parse_nbytes( break; default: corrupt: - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = true; ptr = NULL; } - return(const_cast<byte*>(ptr)); + return const_cast<byte*>(ptr); } /********************************************************//** @@ -388,7 +404,7 @@ mlog_parse_string( ptr += 2; if (offset >= srv_page_size || len + offset > srv_page_size) { - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } @@ -409,6 +425,72 @@ mlog_parse_string( return(ptr + len); } +/** Initialize a string of bytes. +@param[in,out] b buffer page +@param[in] ofs byte offset from block->frame +@param[in] len length of the data to write +@param[in] val the data byte to write +@param[in,out] mtr mini-transaction */ +void +mlog_memset(buf_block_t* b, ulint ofs, ulint len, byte val, mtr_t* mtr) +{ + ut_ad(len); + ut_ad(ofs <= ulint(srv_page_size)); + ut_ad(ofs + len <= ulint(srv_page_size)); + memset(ofs + b->frame, val, len); + + mtr->set_modified(); + switch (mtr->get_log_mode()) { + case MTR_LOG_NONE: + case MTR_LOG_NO_REDO: + return; + case MTR_LOG_SHORT_INSERTS: + ut_ad(0); + /* fall through */ + case MTR_LOG_ALL: + break; + } + + byte* l = mtr->get_log()->open(11 + 2 + 2 + 1); + l = mlog_write_initial_log_record_low( + MLOG_MEMSET, b->page.id.space(), b->page.id.page_no(), l, mtr); + mach_write_to_2(l, ofs); + mach_write_to_2(l + 2, len); + l[4] = val; + mlog_close(mtr, l + 5); +} + +/** Initialize a string of bytes. 
+@param[in,out] byte byte address +@param[in] len length of the data to write +@param[in] val the data byte to write +@param[in,out] mtr mini-transaction */ +void mlog_memset(byte* b, ulint len, byte val, mtr_t* mtr) +{ + ut_ad(len); + ut_ad(page_offset(b) + len <= ulint(srv_page_size)); + memset(b, val, len); + + mtr->set_modified(); + switch (mtr->get_log_mode()) { + case MTR_LOG_NONE: + case MTR_LOG_NO_REDO: + return; + case MTR_LOG_SHORT_INSERTS: + ut_ad(0); + /* fall through */ + case MTR_LOG_ALL: + break; + } + + byte* l = mtr->get_log()->open(11 + 2 + 2 + 1); + l = mlog_write_initial_log_record_fast(b, MLOG_MEMSET, l, mtr); + mach_write_to_2(l, page_offset(b)); + mach_write_to_2(l + 2, len); + l[4] = val; + mlog_close(mtr, l + 5); +} + /********************************************************//** Opens a buffer for mlog, writes the initial log record and, if needed, the field lengths of an index. @@ -562,7 +644,7 @@ mlog_parse_index( n_core_fields = mach_read_from_2(ptr); if (!n_core_fields || n_core_fields > n) { - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 03395b136b6..cb861c03b2d 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -28,9 +28,9 @@ Created 11/26/1995 Heikki Tuuri #include "buf0buf.h" #include "buf0flu.h" +#include "fsp0sysspace.h" #include "page0types.h" #include "mtr0log.h" -#include "row0trunc.h" #include "log0recv.h" /** Iterate over a memo block in reverse. 
*/ @@ -167,7 +167,7 @@ struct FindPage slot->object); if (m_ptr < block->frame - || m_ptr >= block->frame + block->page.size.logical()) { + || m_ptr >= block->frame + srv_page_size) { return(true); } @@ -226,7 +226,7 @@ static void memo_slot_release(mtr_memo_slot_t *slot) case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_X_FIX: buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object); - buf_block_unfix(block); + block->unfix(); buf_page_release_latch(block, slot->type); break; } @@ -262,7 +262,7 @@ struct ReleaseLatches { case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_X_FIX: buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object); - buf_block_unfix(block); + block->unfix(); buf_page_release_latch(block, slot->type); break; } @@ -564,8 +564,7 @@ mtr_t::x_lock_space(ulint space_id, const char* file, unsigned line) ut_ad(get_log_mode() != MTR_LOG_NO_REDO || space->purpose == FIL_TYPE_TEMPORARY || space->purpose == FIL_TYPE_IMPORT - || my_atomic_loadlint(&space->redo_skipped_count) > 0 - || srv_is_tablespace_truncated(space->id)); + || space->redo_skipped_count > 0); } ut_ad(space); diff --git a/storage/innobase/os/os0event.cc b/storage/innobase/os/os0event.cc index f1d7b2ed337..0676ba5f6c1 100644 --- a/storage/innobase/os/os0event.cc +++ b/storage/innobase/os/os0event.cc @@ -218,7 +218,7 @@ private: int64_t signal_count; /*!< this is incremented each time the event becomes signaled */ - mutable EventMutex mutex; /*!< this mutex protects + mutable OSMutex mutex; /*!< this mutex protects the next fields */ diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 0dc8ce6f1d3..4c6b063d372 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -38,14 +38,14 @@ Created 10/21/1995 Heikki Tuuri #include "sql_const.h" #ifdef UNIV_LINUX -#include <sys/types.h> -#include <sys/stat.h> +# include <sys/types.h> +# include <sys/stat.h> #endif #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" -#include 
"srv0srv.h" +#include "fsp0fsp.h" #ifdef HAVE_LINUX_UNISTD_H #include "unistd.h" #endif @@ -70,18 +70,14 @@ Created 10/21/1995 Heikki Tuuri # endif #endif -#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) -#include <sys/statvfs.h> -#endif - -#if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H) -#include <linux/falloc.h> -#endif - #ifdef _WIN32 #include <winioctl.h> +#else +// my_test_if_atomic_write() +#include <my_sys.h> #endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -821,108 +817,6 @@ os_win32_device_io_control( #endif -/***********************************************************************//** -Try to get number of bytes per sector from file system. -@return file block size */ -UNIV_INTERN -ulint -os_file_get_block_size( -/*===================*/ - os_file_t file, /*!< in: handle to a file */ - const char* name) /*!< in: file name */ -{ - ulint fblock_size = 512; - -#if defined(UNIV_LINUX) - struct stat local_stat; - int err; - - err = fstat((int)file, &local_stat); - - if (err != 0) { - os_file_handle_error_no_exit(name, "fstat()", FALSE); - } else { - fblock_size = local_stat.st_blksize; - } -#endif /* UNIV_LINUX */ -#ifdef _WIN32 - - fblock_size = 0; - BOOL result = false; - size_t len = 0; - // Open volume for this file, find out it "physical bytes per sector" - - HANDLE volume_handle = INVALID_HANDLE_VALUE; - char volume[MAX_PATH + 4]="\\\\.\\"; // Special prefix required for volume names. - if (!GetVolumePathName(name , volume + 4, MAX_PATH)) { - os_file_handle_error_no_exit(name, - "GetVolumePathName()", FALSE); - goto end; - } - - len = strlen(volume); - if (volume[len - 1] == '\\') { - // Trim trailing backslash from volume name. 
- volume[len - 1] = 0; - } - - volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - 0, OPEN_EXISTING, 0, 0); - - if (volume_handle == INVALID_HANDLE_VALUE) { - if (GetLastError() != ERROR_ACCESS_DENIED) { - os_file_handle_error_no_exit(volume, - "CreateFile()", FALSE); - } - goto end; - } - - DWORD tmp; - STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment; - - STORAGE_PROPERTY_QUERY storage_query; - memset(&storage_query, 0, sizeof(storage_query)); - storage_query.PropertyId = StorageAccessAlignmentProperty; - storage_query.QueryType = PropertyStandardQuery; - - result = os_win32_device_io_control(volume_handle, - IOCTL_STORAGE_QUERY_PROPERTY, - &storage_query, - sizeof(storage_query), - &disk_alignment, - sizeof(disk_alignment), - &tmp); - - if (!result) { - DWORD err = GetLastError(); - if (err != ERROR_INVALID_FUNCTION && err != ERROR_NOT_SUPPORTED) { - os_file_handle_error_no_exit(volume, - "DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)", FALSE); - } - goto end; - } - - fblock_size = disk_alignment.BytesPerPhysicalSector; - -end: - if (volume_handle != INVALID_HANDLE_VALUE) { - CloseHandle(volume_handle); - } -#endif /* _WIN32 */ - - /* Currently we support file block size up to 4Kb */ - if (fblock_size > 4096 || fblock_size < 512) { - if (fblock_size < 512) { - fblock_size = 512; - } else { - fblock_size = 4096; - } - } - - return fblock_size; -} - #ifdef WIN_ASYNC_IO /** This function is only used in Windows asynchronous i/o. Waits for an aio operation to complete. 
This function is used to wait the @@ -1084,15 +978,8 @@ os_aio_validate_skip() /** Try os_aio_validate() every this many times */ # define OS_AIO_VALIDATE_SKIP 13 - static int os_aio_validate_count; - - if (my_atomic_add32_explicit(&os_aio_validate_count, -1, - MY_MEMORY_ORDER_RELAXED) - % OS_AIO_VALIDATE_SKIP) { - return true; - } - - return(os_aio_validate()); + static Atomic_counter<uint32_t> os_aio_validate_count; + return (os_aio_validate_count++ % OS_AIO_VALIDATE_SKIP) || os_aio_validate(); } #endif /* UNIV_DEBUG */ @@ -3563,8 +3450,6 @@ static WinIoInit win_io_init; /** Free storage space associated with a section of the file. @param[in] fh Open file handle -@param[in] page_size Tablespace page size -@param[in] block_size File system block size @param[in] off Starting offset (SEEK_SET) @param[in] len Size of the hole @return 0 on success or errno */ @@ -5322,6 +5207,34 @@ short_warning: #endif /* _WIN32 */ +/** Check if the file system supports sparse files. +@param fh file handle +@return true if the file system supports sparse files */ +IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh) +{ + /* In this debugging mode, we act as if punch hole is supported, + then we skip any calls to actually punch a hole. In this way, + Transparent Page Compression is still being tested. */ + DBUG_EXECUTE_IF("ignore_punch_hole", + return(true); + ); + +#ifdef _WIN32 + FILE_ATTRIBUTE_TAG_INFO info; + if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo, + &info, (DWORD)sizeof(info))) { + if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) { + return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0; + } + } + return false; +#else + /* We don't know the FS block size, use the sector size. The FS + will do the magic. */ + return DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size); +#endif /* _WIN32 */ +} + /** Extend a file. 
On Windows, extending a file allocates blocks for the file, @@ -5550,15 +5463,16 @@ os_file_punch_hole( os_offset_t off, os_offset_t len) { - dberr_t err; - #ifdef _WIN32 - err = os_file_punch_hole_win32(fh, off, len); + return os_file_punch_hole_win32(fh, off, len); #else - err = os_file_punch_hole_posix(fh, off, len); + return os_file_punch_hole_posix(fh, off, len); #endif /* _WIN32 */ +} - return (err); +inline bool IORequest::should_punch_hole() const +{ + return m_fil_node && m_fil_node->space->punch_hole; } /** Free storage space associated with a section of the file. @@ -5598,7 +5512,9 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) /* If punch hole is not supported, set space so that it is not used. */ if (err == DB_IO_NO_PUNCH_HOLE) { - space_no_punch_hole(); + if (m_fil_node) { + m_fil_node->space->punch_hole = false; + } err = DB_SUCCESS; } } @@ -5606,43 +5522,6 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) return (err); } -/** Check if the file system supports sparse files. - -Warning: On POSIX systems we try and punch a hole from offset 0 to -the system configured page size. This should only be called on an empty -file. -@param[in] fh File handle for the file - if opened -@return true if the file system supports sparse files */ -bool -os_is_sparse_file_supported(os_file_t fh) -{ - /* In this debugging mode, we act as if punch hole is supported, - then we skip any calls to actually punch a hole. In this way, - Transparent Page Compression is still being tested. */ - DBUG_EXECUTE_IF("ignore_punch_hole", - return(true); - ); - -#ifdef _WIN32 - FILE_ATTRIBUTE_TAG_INFO info; - if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo, - &info, (DWORD)sizeof(info))) { - if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) { - return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0; - } - } - return false; -#else - dberr_t err; - - /* We don't know the FS block size, use the sector size. 
The FS - will do the magic. */ - err = os_file_punch_hole_posix(fh, 0, srv_page_size); - - return(err == DB_SUCCESS); -#endif /* _WIN32 */ -} - /** This function returns information about the specified file @param[in] path pathname of the file @param[out] stat_info information of a file in a directory @@ -7673,6 +7552,310 @@ os_file_set_umask(ulint umask) os_innodb_umask = umask; } +#ifdef _WIN32 +static int win32_get_block_size(HANDLE volume_handle, const char *volume_name) +{ + STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment; + STORAGE_PROPERTY_QUERY storage_query; + DWORD tmp; + + memset(&storage_query, 0, sizeof(storage_query)); + storage_query.PropertyId = StorageAccessAlignmentProperty; + storage_query.QueryType = PropertyStandardQuery; + + if (os_win32_device_io_control(volume_handle, + IOCTL_STORAGE_QUERY_PROPERTY, + &storage_query, + sizeof storage_query, + &disk_alignment, + sizeof disk_alignment, + &tmp) && tmp == sizeof disk_alignment) { + return disk_alignment.BytesPerPhysicalSector; + } + + switch (GetLastError()) { + case ERROR_INVALID_FUNCTION: + case ERROR_NOT_SUPPORTED: + break; + default: + os_file_handle_error_no_exit( + volume_name, + "DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY / StorageAccessAlignmentProperty)", + FALSE); + } + return 512; +} + +static bool win32_is_ssd(HANDLE volume_handle) +{ + DWORD tmp; + DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty; + STORAGE_PROPERTY_QUERY storage_query; + memset(&storage_query, 0, sizeof(storage_query)); + + storage_query.PropertyId = StorageDeviceSeekPenaltyProperty; + storage_query.QueryType = PropertyStandardQuery; + + if (os_win32_device_io_control(volume_handle, + IOCTL_STORAGE_QUERY_PROPERTY, + &storage_query, + sizeof storage_query, + &seek_penalty, + sizeof seek_penalty, + &tmp) && tmp == sizeof(seek_penalty)){ + return !seek_penalty.IncursSeekPenalty; + } + + DEVICE_TRIM_DESCRIPTOR trim; + storage_query.PropertyId = StorageDeviceTrimProperty; + if (os_win32_device_io_control(volume_handle, 
+ IOCTL_STORAGE_QUERY_PROPERTY, + &storage_query, + sizeof storage_query, + &trim, + sizeof trim, + &tmp) && tmp == sizeof trim) { + return trim.TrimEnabled; + } + return false; +} +#endif + +/** Determine some file metadata when creating or reading the file. +@param file the file that is being created, or OS_FILE_CLOSED */ +void fil_node_t::find_metadata(os_file_t file +#ifdef UNIV_LINUX + , struct stat* statbuf +#endif + ) +{ + if (file == OS_FILE_CLOSED) { + file = handle; + ut_ad(is_open()); + } + +#ifdef _WIN32 /* FIXME: make this unconditional */ + if (space->punch_hole) { + space->punch_hole = os_is_sparse_file_supported(file); + } +#endif + + /* + For the temporary tablespace and during the + non-redo-logged adjustments in + IMPORT TABLESPACE, we do not care about + the atomicity of writes. + + Atomic writes is supported if the file can be used + with atomic_writes (not log file), O_DIRECT is + used (tested in ha_innodb.cc) and the file is + device and file system that supports atomic writes + for the given block size. + */ + space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT; +#ifdef _WIN32 + block_size = 512; + on_ssd = false; + // Open volume for this file, find out it "physical bytes per sector" + char volume[MAX_PATH + 4]; + if (!GetVolumePathName(name, volume + 4, MAX_PATH)) { + os_file_handle_error_no_exit(name, + "GetVolumePathName()", FALSE); + return; + } + // Special prefix required for volume names. + memcpy(volume, "\\\\.\\", 4); + + size_t len = strlen(volume); + if (volume[len - 1] == '\\') { + // Trim trailing backslash from volume name. 
+ volume[len - 1] = 0; + } + + HANDLE volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + 0, OPEN_EXISTING, 0, 0); + + if (volume_handle != INVALID_HANDLE_VALUE) { + block_size = win32_get_block_size(volume_handle, volume); + on_ssd = win32_is_ssd(volume_handle); + CloseHandle(volume_handle); + } else { + /* + Report error, unless it is expected, e.g + missing permissions, or error when trying to + open volume for UNC share. + */ + if (GetLastError() != ERROR_ACCESS_DENIED + && GetDriveType(volume) == DRIVE_FIXED) { + os_file_handle_error_no_exit(volume, "CreateFile()", FALSE); + } + } + + /* Currently we support file block size up to 4KiB */ + if (block_size > 4096) { + block_size = 4096; + } else if (block_size < 512) { + block_size = 512; + } +#else + on_ssd = space->atomic_write_supported; +# ifdef UNIV_LINUX + if (!on_ssd) { + struct stat sbuf; + if (!statbuf && !fstat(file, &sbuf)) { + statbuf = &sbuf; + } + if (statbuf && fil_system.is_ssd(statbuf->st_dev)) { + on_ssd = true; + } + } +# endif +#endif + if (!space->atomic_write_supported) { + space->atomic_write_supported = atomic_write + && srv_use_atomic_writes +#ifndef _WIN32 + && my_test_if_atomic_write(file, + space->physical_size()) +#else + /* On Windows, all single sector writes are atomic, + as per WriteFile() documentation on MSDN. + We also require SSD for atomic writes, eventhough + technically it is not necessary- the reason is that + on hard disks, we still want the benefit from + (non-atomic) neighbor page flushing in the buffer + pool code. */ + && srv_page_size == block_size + && on_ssd +#endif + ; + } +} + +/** Read the first page of a data file. 
+@param[in] first whether this is the very first read +@return whether the page was found valid */ +bool fil_node_t::read_page0(bool first) +{ + ut_ad(mutex_own(&fil_system.mutex)); + ut_a(space->purpose != FIL_TYPE_LOG); + const ulint psize = space->physical_size(); +#ifndef _WIN32 + struct stat statbuf; + if (fstat(handle, &statbuf)) { + return false; + } + block_size = statbuf.st_blksize; + os_offset_t size_bytes = statbuf.st_size; +#else + os_offset_t size_bytes = os_file_get_size(handle); + ut_a(size_bytes != (os_offset_t) -1); +#endif + const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize; + + if (size_bytes < min_size) { + ib::error() << "The size of the file " << name + << " is only " << size_bytes + << " bytes, should be at least " << min_size; + return false; + } + + byte* buf2 = static_cast<byte*>(ut_malloc_nokey(2 * psize)); + + /* Align the memory for file i/o if we might have O_DIRECT set */ + byte* page = static_cast<byte*>(ut_align(buf2, psize)); + IORequest request(IORequest::READ); + if (os_file_read(request, handle, page, 0, psize) != DB_SUCCESS) { + ib::error() << "Unable to read first page of file " << name; + ut_free(buf2); + return false; + } + const ulint space_id = fsp_header_get_space_id(page); + ulint flags = fsp_header_get_flags(page); + const ulint size = fsp_header_get_field(page, FSP_SIZE); + const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT); + const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + + page); + if (!fil_space_t::is_valid_flags(flags, space->id)) { + ulint cflags = fsp_flags_convert_from_101(flags); + if (cflags == ULINT_UNDEFINED) { +invalid: + ib::error() + << "Expected tablespace flags " + << ib::hex(space->flags) + << " but found " << ib::hex(flags) + << " in the file " << name; + ut_free(buf2); + return false; + } + + ulint cf = cflags & ~FSP_FLAGS_MEM_MASK; + ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK; + + if (!fil_space_t::is_flags_equal(cf, sf) + && 
!fil_space_t::is_flags_equal(sf, cf)) { + goto invalid; + } + + flags = cflags; + } + + ut_ad(!(flags & FSP_FLAGS_MEM_MASK)); + + /* Try to read crypt_data from page 0 if it is not yet read. */ + if (!space->crypt_data) { + space->crypt_data = fil_space_read_crypt_data( + fil_space_t::zip_size(flags), page); + } + ut_free(buf2); + + if (UNIV_UNLIKELY(space_id != space->id)) { + ib::error() << "Expected tablespace id " << space->id + << " but found " << space_id + << " in the file " << name; + return false; + } + + if (first) { + ut_ad(space->id != TRX_SYS_SPACE); +#ifdef UNIV_LINUX + find_metadata(handle, &statbuf); +#else + find_metadata(); +#endif + + /* Truncate the size to a multiple of extent size. */ + ulint mask = psize * FSP_EXTENT_SIZE - 1; + + if (size_bytes <= mask) { + /* .ibd files start smaller than an + extent size. Do not truncate valid data. */ + } else { + size_bytes &= ~os_offset_t(mask); + } + + space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags; + + this->size = ulint(size_bytes / psize); + space->size += this->size; + } else if (space->id != TRX_SYS_SPACE || space->size_in_header) { + /* If this is not the first-time open, do nothing. + For the system tablespace, we always get invoked as + first=false, so we detect the true first-time-open based + on size_in_header and proceed to initiailze the data. 
*/ + return true; + } + + ut_ad(space->free_limit == 0 || space->free_limit == free_limit); + ut_ad(space->free_len == 0 || space->free_len == free_len); + space->size_in_header = size; + space->free_limit = free_limit; + space->free_len = free_len; + return true; +} + #else #include "univ.i" #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/os/os0proc.cc b/storage/innobase/os/os0proc.cc index 508a13de2ca..7e654c77c1a 100644 --- a/storage/innobase/os/os0proc.cc +++ b/storage/innobase/os/os0proc.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,7 +40,7 @@ MAP_ANON but MAP_ANON is marked as deprecated */ /** The total amount of memory currently allocated from the operating system with os_mem_alloc_large(). */ -ulint os_total_large_mem_allocated = 0; +Atomic_counter<ulint> os_total_large_mem_allocated; /** Converts the current process id to a number. 
@return process id as a number */ @@ -98,9 +98,7 @@ os_mem_alloc_large( if (ptr) { *n = size; - my_atomic_addlint( - &os_total_large_mem_allocated, size); - + os_total_large_mem_allocated += size; MEM_UNDEFINED(ptr, size); return(ptr); } @@ -123,8 +121,7 @@ skip: ib::info() << "VirtualAlloc(" << size << " bytes) failed;" " Windows error " << GetLastError(); } else { - my_atomic_addlint( - &os_total_large_mem_allocated, size); + os_total_large_mem_allocated += size; MEM_UNDEFINED(ptr, size); } #else @@ -139,8 +136,7 @@ skip: " errno " << errno; ptr = NULL; } else { - my_atomic_addlint( - &os_total_large_mem_allocated, size); + os_total_large_mem_allocated += size; MEM_UNDEFINED(ptr, size); } #endif @@ -167,8 +163,7 @@ os_mem_free_large( #ifdef HAVE_LINUX_LARGE_PAGES if (my_use_large_pages && opt_large_page_size && !shmdt(ptr)) { - my_atomic_addlint( - &os_total_large_mem_allocated, -size); + os_total_large_mem_allocated -= size; return; } #endif /* HAVE_LINUX_LARGE_PAGES */ @@ -179,8 +174,7 @@ os_mem_free_large( ib::error() << "VirtualFree(" << ptr << ", " << size << ") failed; Windows error " << GetLastError(); } else { - my_atomic_addlint( - &os_total_large_mem_allocated, -lint(size)); + os_total_large_mem_allocated -= size; } #elif !defined OS_MAP_ANON ut_free(ptr); @@ -193,8 +187,7 @@ os_mem_free_large( ib::error() << "munmap(" << ptr << ", " << size << ") failed;" " errno " << errno; } else { - my_atomic_addlint( - &os_total_large_mem_allocated, -size); + os_total_large_mem_allocated -= size; } #endif } diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc index 0c97ebccc1b..bb5da7df7c4 100644 --- a/storage/innobase/os/os0thread.cc +++ b/storage/innobase/os/os0thread.cc @@ -28,7 +28,7 @@ Created 9/8/1995 Heikki Tuuri #include "srv0srv.h" /** Number of threads active. 
*/ -ulint os_thread_count; +Atomic_counter<ulint> os_thread_count; /***************************************************************//** Compares two thread ids for equality. @@ -118,7 +118,7 @@ os_thread_create_func( CloseHandle(handle); - my_atomic_addlint(&os_thread_count, 1); + os_thread_count++; return((os_thread_t)new_thread_id); #else /* _WIN32 else */ @@ -133,7 +133,7 @@ os_thread_create_func( abort(); } - my_atomic_addlint(&os_thread_count, 1); + os_thread_count++; ret = pthread_create(&new_thread_id, &attr, func, arg); @@ -188,7 +188,7 @@ os_thread_exit(bool detach) pfs_delete_thread(); #endif - my_atomic_addlint(&os_thread_count, ulint(-1)); + os_thread_count--; #ifdef _WIN32 ExitThread(0); diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index 46b2c73cf37..ad8d4ab2c1f 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -701,7 +701,7 @@ up_slot_match: & REC_INFO_MIN_REC_FLAG)) { ut_ad(!page_has_prev(page_align(mid_rec))); ut_ad(!page_rec_is_leaf(mid_rec) - || rec_is_metadata(mid_rec, index)); + || rec_is_metadata(mid_rec, *index)); cmp = 1; goto low_rec_match; } @@ -1026,7 +1026,7 @@ page_cur_parse_insert_rec( if (offset >= srv_page_size) { - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } @@ -1040,7 +1040,7 @@ page_cur_parse_insert_rec( } if (end_seg_len >= srv_page_size << 1) { - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } @@ -1330,7 +1330,7 @@ use_heap: switch (rec_get_status(current_rec)) { case REC_STATUS_ORDINARY: case REC_STATUS_NODE_PTR: - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: case REC_STATUS_INFIMUM: break; case REC_STATUS_SUPREMUM: @@ -1339,7 +1339,7 @@ use_heap: switch (rec_get_status(insert_rec)) { case REC_STATUS_ORDINARY: case REC_STATUS_NODE_PTR: - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: break; case REC_STATUS_INFIMUM: case REC_STATUS_SUPREMUM: @@ 
-1526,7 +1526,7 @@ page_cur_insert_rec_zip( get rid of the modification log. */ page_create_zip(page_cur_get_block(cursor), index, page_header_get_field(page, PAGE_LEVEL), - 0, NULL, mtr); + 0, mtr); ut_ad(!page_header_get_ptr(page, PAGE_FREE)); if (page_zip_available( @@ -1601,7 +1601,7 @@ page_cur_insert_rec_zip( if (!log_compressed) { if (page_zip_compress( page_zip, page, index, - level, NULL, NULL)) { + level, NULL)) { page_cur_insert_rec_write_log( insert_rec, rec_size, cursor->rec, index, mtr); @@ -1747,17 +1747,11 @@ too_small: columns of free_rec, in case it will not be overwritten by insert_rec. */ - ulint trx_id_col; ulint trx_id_offs; ulint len; - trx_id_col = dict_index_get_sys_col_pos(index, - DATA_TRX_ID); - ut_ad(trx_id_col > 0); - ut_ad(trx_id_col != ULINT_UNDEFINED); - - trx_id_offs = rec_get_nth_field_offs(foffsets, - trx_id_col, &len); + trx_id_offs = rec_get_nth_field_offs( + foffsets, index->db_trx_id(), &len); ut_ad(len == DATA_TRX_ID_LEN); if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs @@ -1773,7 +1767,7 @@ too_small: ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN == rec_get_nth_field(free_rec, foffsets, - trx_id_col + 1, &len)); + index->db_roll_ptr(), &len)); ut_ad(len == DATA_ROLL_PTR_LEN); } @@ -2223,7 +2217,7 @@ page_cur_parse_delete_rec( ptr += 2; if (UNIV_UNLIKELY(offset >= srv_page_size)) { - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; return NULL; } @@ -2297,7 +2291,8 @@ page_cur_delete_rec( /* The record must not be the supremum or infimum record. */ ut_ad(page_rec_is_user_rec(current_rec)); - if (page_get_n_recs(page) == 1 && !recv_recovery_is_on()) { + if (page_get_n_recs(page) == 1 && !recv_recovery_is_on() + && !rec_is_alter_metadata(current_rec, *index)) { /* Empty the page, unless we are applying the redo log during crash recovery. 
During normal operation, the page_create_empty() gets logged as one of MLOG_PAGE_CREATE, diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc index ba2e08df2d2..ef5324e2a8d 100644 --- a/storage/innobase/page/page0page.cc +++ b/storage/innobase/page/page0page.cc @@ -29,8 +29,8 @@ Created 2/2/1994 Heikki Tuuri #include "page0cur.h" #include "page0zip.h" #include "buf0buf.h" +#include "buf0checksum.h" #include "btr0btr.h" -#include "row0trunc.h" #include "srv0srv.h" #include "lock0lock.h" #include "fut0lst.h" @@ -449,28 +449,19 @@ page_create_zip( buf_block_t* block, /*!< in/out: a buffer frame where the page is created */ dict_index_t* index, /*!< in: the index of the - page, or NULL when applying - TRUNCATE log - record during recovery */ + page */ ulint level, /*!< in: the B-tree level of the page */ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ - const redo_page_compress_t* page_comp_info, - /*!< in: used for applying - TRUNCATE log - record during recovery */ mtr_t* mtr) /*!< in/out: mini-transaction handle */ { page_t* page; page_zip_des_t* page_zip = buf_block_get_page_zip(block); - bool is_spatial; ut_ad(block); ut_ad(page_zip); - ut_ad(index == NULL || dict_table_is_comp(index->table)); - is_spatial = index ? dict_index_is_spatial(index) - : page_comp_info->type & DICT_SPATIAL; + ut_ad(dict_table_is_comp(index->table)); /* PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC are always 0 for temporary tables. 
*/ @@ -488,22 +479,11 @@ page_create_zip( || !dict_index_is_sec_or_ibuf(index) || index->table->is_temporary()); - page = page_create_low(block, TRUE, is_spatial); + page = page_create_low(block, TRUE, dict_index_is_spatial(index)); mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level); mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + page, max_trx_id); - if (truncate_t::s_fix_up_active) { - /* Compress the index page created when applying - TRUNCATE log during recovery */ - if (!page_zip_compress(page_zip, page, index, page_zip_level, - page_comp_info, NULL)) { - /* The compression of a newly created - page should always succeed. */ - ut_error; - } - - } else if (!page_zip_compress(page_zip, page, index, - page_zip_level, NULL, mtr)) { + if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) { /* The compression of a newly created page should always succeed. */ ut_error; @@ -549,7 +529,7 @@ page_create_empty( ut_ad(!index->table->is_temporary()); page_create_zip(block, index, page_header_get_field(page, PAGE_LEVEL), - max_trx_id, NULL, mtr); + max_trx_id, mtr); } else { page_create(block, mtr, page_is_comp(page), dict_index_is_spatial(index)); @@ -724,11 +704,8 @@ page_copy_rec_list_end( if (new_page_zip) { mtr_set_log_mode(mtr, log_mode); - if (!page_zip_compress(new_page_zip, - new_page, - index, - page_zip_level, - NULL, mtr)) { + if (!page_zip_compress(new_page_zip, new_page, index, + page_zip_level, mtr)) { /* Before trying to reorganize the page, store the number of preceding records on the page. 
*/ ulint ret_pos @@ -893,7 +870,7 @@ page_copy_rec_list_start( goto zip_reorganize;); if (!page_zip_compress(new_page_zip, new_page, index, - page_zip_level, NULL, mtr)) { + page_zip_level, mtr)) { ulint ret_pos; #ifndef DBUG_OFF zip_reorganize: @@ -1834,6 +1811,7 @@ page_print_list( count = 0; for (;;) { offsets = rec_get_offsets(cur.rec, index, offsets, + page_rec_is_leaf(cur.rec), ULINT_UNDEFINED, &heap); page_rec_print(cur.rec, offsets); @@ -1856,6 +1834,7 @@ page_print_list( if (count + pr_n >= n_recs) { offsets = rec_get_offsets(cur.rec, index, offsets, + page_rec_is_leaf(cur.rec), ULINT_UNDEFINED, &heap); page_rec_print(cur.rec, offsets); } @@ -2550,16 +2529,14 @@ wrong_page_type: ib::error() << "REC_INFO_MIN_REC_FLAG " "is set in a leaf-page record"; ret = false; - } else if (rec_get_deleted_flag( - rec, page_is_comp(page))) { - /* If this were a 10.4 metadata - record for index->table->instant - we should not get here in 10.3, because - the metadata record should not have - been recognized by - btr_cur_instant_init_low(). */ - ib::error() << "Metadata record " - "is delete-marked"; + } else if (!rec_get_deleted_flag( + rec, page_is_comp(page)) + != !index->table->instant) { + ib::error() << (index->table->instant + ? 
"Metadata record " + "is not delete-marked" + : "Metadata record " + "is delete-marked"); ret = false; } } else if (!page_has_prev(page) diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index ecfea3a2e90..bedbe962fe9 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -25,8 +25,12 @@ Compressed page interface Created June 2005 by Marko Makela *******************************************************/ -#include "page0size.h" #include "page0zip.h" +#include "fsp0types.h" +#include "page0page.h" +#include "buf0checksum.h" +#include "ut0crc32.h" +#include "zlib.h" #include "span.h" using st_::span; @@ -34,28 +38,21 @@ using st_::span; /** A BLOB field reference full of zero, for use in assertions and tests. Initially, BLOB field references are set to zero, in dtuple_convert_big_rec(). */ +alignas(UNIV_PAGE_SIZE_MIN) const byte field_ref_zero[UNIV_PAGE_SIZE_MAX] = { 0, }; #ifndef UNIV_INNOCHECKSUM -#include "page0page.h" #include "mtr0log.h" #include "dict0dict.h" #include "btr0cur.h" -#include "page0types.h" #include "log0recv.h" #include "row0row.h" -#include "row0trunc.h" -#include "zlib.h" -#include "buf0buf.h" -#include "buf0types.h" -#include "buf0checksum.h" #include "btr0sea.h" #include "dict0boot.h" #include "lock0lock.h" #include "srv0srv.h" #include "buf0lru.h" #include "srv0mon.h" -#include "ut0crc32.h" #include <map> #include <algorithm> @@ -104,7 +101,7 @@ Compare at most sizeof(field_ref_zero) bytes. @param s in: size of the memory block, in bytes */ #define ASSERT_ZERO(b, s) \ ut_ad(!memcmp(b, field_ref_zero, \ - ut_min(static_cast<size_t>(s), sizeof field_ref_zero))); + std::min<size_t>(s, sizeof field_ref_zero))); /** Assert that a BLOB pointer is filled with zero bytes. 
@param b in: BLOB pointer */ #define ASSERT_ZERO_BLOB(b) \ @@ -176,18 +173,17 @@ page_zip_is_too_big( const dict_index_t* index, const dtuple_t* entry) { - const page_size_t& page_size = - dict_table_page_size(index->table); + const ulint zip_size = index->table->space->zip_size(); /* Estimate the free space of an empty compressed page. Subtract one byte for the encoded heap_no in the modification log. */ ulint free_space_zip = page_zip_empty_size( - index->n_fields, page_size.physical()); + index->n_fields, zip_size); ulint n_uniq = dict_index_get_n_unique_in_tree(index); ut_ad(dict_table_is_comp(index->table)); - ut_ad(page_size.is_compressed()); + ut_ad(zip_size); if (free_space_zip == 0) { return(true); @@ -1246,17 +1242,11 @@ page_zip_compress( dict_index_t* index, /*!< in: index of the B-tree node */ ulint level, /*!< in: commpression level */ - const redo_page_compress_t* page_comp_info, - /*!< in: used for applying - TRUNCATE log - record during recovery */ mtr_t* mtr) /*!< in/out: mini-transaction, or NULL */ { z_stream c_stream; int err; - ulint n_fields; /* number of index fields - needed */ byte* fields; /*!< index field information */ byte* buf; /*!< compressed payload of the page */ @@ -1271,7 +1261,6 @@ page_zip_compress( ulint n_blobs = 0; byte* storage; /* storage of uncompressed columns */ - index_id_t ind_id; const ulonglong ns = my_interval_timer(); #ifdef PAGE_ZIP_COMPRESS_DBG FILE* logfile = NULL; @@ -1286,10 +1275,8 @@ page_zip_compress( ut_a(fil_page_index_page_check(page)); ut_ad(page_simple_validate_new((page_t*) page)); ut_ad(page_zip_simple_validate(page_zip)); - ut_ad(!index - || (index - && dict_table_is_comp(index->table) - && !dict_index_is_ibuf(index))); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(!dict_index_is_ibuf(index)); MEM_CHECK_DEFINED(page, srv_page_size); @@ -1309,18 +1296,10 @@ page_zip_compress( == PAGE_NEW_SUPREMUM); } - if (truncate_t::s_fix_up_active) { - ut_ad(page_comp_info != NULL); - n_fields = 
page_comp_info->n_fields; - ind_id = page_comp_info->index_id; - } else { - if (page_is_leaf(page)) { - n_fields = dict_index_get_n_fields(index); - } else { - n_fields = dict_index_get_n_unique_in_tree_nonleaf(index); - } - ind_id = index->id; - } + const ulint n_fields = page_is_leaf(page) + ? dict_index_get_n_fields(index) + : dict_index_get_n_unique_in_tree_nonleaf(index); + index_id_t ind_id = index->id; /* The dense directory excludes the infimum and supremum records. */ n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW; @@ -1431,20 +1410,8 @@ page_zip_compress( /* Dense page directory and uncompressed columns, if any */ if (page_is_leaf(page)) { - if ((index && dict_index_is_clust(index)) - || (page_comp_info - && (page_comp_info->type & DICT_CLUSTERED))) { - - if (index) { - trx_id_col = dict_index_get_sys_col_pos( - index, DATA_TRX_ID); - ut_ad(trx_id_col > 0); - ut_ad(trx_id_col != ULINT_UNDEFINED); - } else if (page_comp_info - && (page_comp_info->type - & DICT_CLUSTERED)) { - trx_id_col = page_comp_info->trx_id_pos; - } + if (dict_index_is_clust(index)) { + trx_id_col = index->db_trx_id(); slot_size = PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; @@ -1452,10 +1419,6 @@ page_zip_compress( } else { /* Signal the absence of trx_id in page_zip_fields_encode() */ - if (index) { - ut_ad(dict_index_get_sys_col_pos( - index, DATA_TRX_ID) == ULINT_UNDEFINED); - } trx_id_col = 0; slot_size = PAGE_ZIP_DIR_SLOT_SIZE; } @@ -1469,19 +1432,9 @@ page_zip_compress( goto zlib_error; } - c_stream.avail_out -= static_cast<uInt>(n_dense * slot_size); - if (truncate_t::s_fix_up_active) { - ut_ad(page_comp_info != NULL); - c_stream.avail_in = static_cast<uInt>( - page_comp_info->field_len); - for (ulint i = 0; i < page_comp_info->field_len; i++) { - fields[i] = page_comp_info->fields[i]; - } - } else { - c_stream.avail_in = static_cast<uInt>( - page_zip_fields_encode( - n_fields, index, trx_id_col, fields)); - } + c_stream.avail_out -= 
uInt(n_dense * slot_size); + c_stream.avail_in = uInt(page_zip_fields_encode(n_fields, index, + trx_id_col, fields)); c_stream.next_in = fields; if (UNIV_LIKELY(!trx_id_col)) { @@ -1637,7 +1590,7 @@ err_exit: mutex_exit(&page_zip_stat_per_index_mutex); } - if (page_is_leaf(page) && !truncate_t::s_fix_up_active) { + if (page_is_leaf(page)) { dict_index_zip_success(index); } @@ -1654,7 +1607,7 @@ page_zip_fields_free( { if (index) { dict_table_t* table = index->table; - dict_index_zip_pad_mutex_destroy(index); + mutex_free(&index->zip_pad.mutex); mem_heap_free(index->heap); dict_mem_table_free(table); @@ -2169,6 +2122,10 @@ page_zip_apply_log( rec_get_offsets_reverse(data, index, hs & REC_STATUS_NODE_PTR, offsets); + /* Silence a debug assertion in rec_offs_make_valid(). + This will be overwritten in page_zip_set_extra_bytes(), + called by page_zip_decompress_low(). */ + ut_d(rec[-REC_NEW_INFO_BITS] = 0); rec_offs_make_valid(rec, index, is_leaf, offsets); /* Copy the extra bytes (backwards). */ @@ -3783,29 +3740,25 @@ page_zip_write_rec( ulint len; if (dict_index_is_clust(index)) { - ulint trx_id_col; - - trx_id_col = dict_index_get_sys_col_pos(index, - DATA_TRX_ID); - ut_ad(trx_id_col != ULINT_UNDEFINED); - /* Store separately trx_id, roll_ptr and the BTR_EXTERN_FIELD_REF of each BLOB column. */ if (rec_offs_any_extern(offsets)) { data = page_zip_write_rec_ext( page_zip, page, rec, index, offsets, create, - trx_id_col, heap_no, storage, data); + index->db_trx_id(), heap_no, + storage, data); } else { /* Locate trx_id and roll_ptr. */ const byte* src = rec_get_nth_field(rec, offsets, - trx_id_col, &len); + index->db_trx_id(), + &len); ut_ad(len == DATA_TRX_ID_LEN); ut_ad(src + DATA_TRX_ID_LEN == rec_get_nth_field( rec, offsets, - trx_id_col + 1, &len)); + index->db_roll_ptr(), &len)); ut_ad(len == DATA_ROLL_PTR_LEN); /* Log the preceding fields. 
*/ @@ -3833,8 +3786,6 @@ page_zip_write_rec( } else { /* Leaf page of a secondary index: no externally stored columns */ - ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) - == ULINT_UNDEFINED); ut_ad(!rec_offs_any_extern(offsets)); /* Log the entire record. */ @@ -3908,7 +3859,7 @@ page_zip_parse_write_blob_ptr( || offset >= srv_page_size || z_offset >= srv_page_size) { corrupt: - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } @@ -4050,7 +4001,7 @@ page_zip_parse_write_node_ptr( || offset >= srv_page_size || z_offset >= srv_page_size) { corrupt: - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } @@ -4265,7 +4216,7 @@ page_zip_parse_write_trx_id( || offset >= srv_page_size || z_offset >= srv_page_size) { corrupt: - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } @@ -4682,7 +4633,7 @@ page_zip_parse_write_header( if (len == 0 || offset + len >= PAGE_DATA) { corrupt: - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } @@ -4818,9 +4769,7 @@ page_zip_reorganize( /* Restore logging. 
*/ mtr_set_log_mode(mtr, log_mode); - if (!page_zip_compress(page_zip, page, index, - page_zip_level, NULL, mtr)) { - + if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) { buf_block_free(temp_block); return(FALSE); } @@ -4962,7 +4911,7 @@ byte* page_zip_parse_compress(const byte* ptr, const byte* end_ptr, if (!page_zip || page_zip_get_size(page_zip) < size || block->page.id.page_no() < 3) { corrupt: - recv_sys->found_corrupt_log = TRUE; + recv_sys.found_corrupt_log = TRUE; return(NULL); } @@ -5001,12 +4950,7 @@ uint32_t page_zip_calc_checksum( const void* data, ulint size, - srv_checksum_algorithm_t algo -#ifdef INNODB_BUG_ENDIAN_CRC32 - /** for crc32, use the big-endian bug-compatible crc32 variant */ - , bool use_legacy_big_endian -#endif -) + srv_checksum_algorithm_t algo) { uLong adler; const Bytef* s = static_cast<const byte*>(data); @@ -5015,22 +4959,11 @@ page_zip_calc_checksum( and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */ switch (algo) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: case SRV_CHECKSUM_ALGORITHM_CRC32: case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); -#ifdef INNODB_BUG_ENDIAN_CRC32 - if (use_legacy_big_endian) { - return ut_crc32_legacy_big_endian(s + FIL_PAGE_OFFSET, - FIL_PAGE_LSN - - FIL_PAGE_OFFSET) - ^ ut_crc32_legacy_big_endian( - s + FIL_PAGE_TYPE, 2) - ^ ut_crc32_legacy_big_endian( - s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, - size - - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - } -#endif return ut_crc32(s + FIL_PAGE_OFFSET, FIL_PAGE_LSN - FIL_PAGE_OFFSET) ^ ut_crc32(s + FIL_PAGE_TYPE, 2) @@ -5113,40 +5046,26 @@ bool page_zip_verify_checksum(const byte *data, size_t size) } switch (curr_algo) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: -#ifdef INNODB_BUG_ENDIAN_CRC32 - return stored == page_zip_calc_checksum(data, size, curr_algo, - true); -#endif - /* fall through */ case 
SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: return FALSE; + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: case SRV_CHECKSUM_ALGORITHM_CRC32: if (stored == BUF_NO_CHECKSUM_MAGIC) { return(TRUE); } - return -#ifdef INNODB_BUG_ENDIAN_CRC32 - stored == page_zip_calc_checksum(data, size, curr_algo, - true) || -#endif - stored == page_zip_calc_checksum( - data, size, SRV_CHECKSUM_ALGORITHM_INNODB); + return stored == page_zip_calc_checksum( + data, size, SRV_CHECKSUM_ALGORITHM_INNODB); case SRV_CHECKSUM_ALGORITHM_INNODB: if (stored == BUF_NO_CHECKSUM_MAGIC) { return TRUE; } return stored == page_zip_calc_checksum( - data, size, SRV_CHECKSUM_ALGORITHM_CRC32) -#ifdef INNODB_BUG_ENDIAN_CRC32 - || stored == page_zip_calc_checksum( - data, size, - SRV_CHECKSUM_ALGORITHM_CRC32, true) -#endif - ; + data, size, SRV_CHECKSUM_ALGORITHM_CRC32); case SRV_CHECKSUM_ALGORITHM_NONE: return TRUE; } diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc index 43f089e3bf7..ebfe7ada3b1 100644 --- a/storage/innobase/pars/pars0pars.cc +++ b/storage/innobase/pars/pars0pars.cc @@ -1986,7 +1986,7 @@ pars_sql( heap = mem_heap_create(16000); /* Currently, the parser is not reentrant: */ - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); pars_sym_tab_global = sym_tab_create(heap); diff --git a/storage/innobase/pars/pars0sym.cc b/storage/innobase/pars/pars0sym.cc index 6e416d7b635..5e4c0e0f6e0 100644 --- a/storage/innobase/pars/pars0sym.cc +++ b/storage/innobase/pars/pars0sym.cc @@ -67,7 +67,7 @@ sym_tab_free_private( sym_node_t* sym; func_node_t* func; - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list); sym != NULL; diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc index abd4672517e..3f4810dcc0e 100644 --- a/storage/innobase/que/que0que.cc +++ b/storage/innobase/que/que0que.cc @@ -1185,9 +1185,9 @@ que_eval_sql( 
/*=========*/ pars_info_t* info, /*!< in: info struct, or NULL */ const char* sql, /*!< in: SQL string */ - ibool reserve_dict_mutex, - /*!< in: if TRUE, acquire/release - dict_sys->mutex around call to pars_sql. */ + bool reserve_dict_mutex, + /*!< in: whether to acquire/release + dict_sys.mutex around call to pars_sql. */ trx_t* trx) /*!< in: trx */ { que_thr_t* thr; @@ -1199,13 +1199,13 @@ que_eval_sql( ut_a(trx->error_state == DB_SUCCESS); if (reserve_dict_mutex) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } graph = pars_sql(info, sql); if (reserve_dict_mutex) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } graph->trx = trx; @@ -1218,13 +1218,13 @@ que_eval_sql( que_run_threads(thr); if (reserve_dict_mutex) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } que_graph_free(graph); if (reserve_dict_mutex) { - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } DBUG_RETURN(trx->error_state); diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc index e14f564e264..1cc88f4b758 100644 --- a/storage/innobase/read/read0read.cc +++ b/storage/innobase/read/read0read.cc @@ -200,7 +200,7 @@ inline void ReadView::snapshot(trx_t *trx) void ReadView::open(trx_t *trx) { ut_ad(this == &trx->read_view); - switch (m_state) + switch (state()) { case READ_VIEW_STATE_OPEN: ut_ad(!srv_read_only_mode); @@ -254,8 +254,7 @@ void ReadView::open(trx_t *trx) */ mutex_enter(&trx_sys.mutex); mutex_exit(&trx_sys.mutex); - my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_SNAPSHOT, - MY_MEMORY_ORDER_RELAXED); + m_state.store(READ_VIEW_STATE_SNAPSHOT, std::memory_order_relaxed); break; default: ut_ad(0); @@ -264,8 +263,7 @@ void ReadView::open(trx_t *trx) snapshot(trx); reopen: m_creator_trx_id= trx->id; - my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_OPEN, - MY_MEMORY_ORDER_RELEASE); + m_state.store(READ_VIEW_STATE_OPEN, std::memory_order_release); } @@ -284,7 +282,7 @@ void 
trx_sys_t::clone_oldest_view() for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx; trx= UT_LIST_GET_NEXT(trx_list, trx)) { - int32_t state; + uint32_t state; while ((state= trx->read_view.get_state()) == READ_VIEW_STATE_SNAPSHOT) ut_delay(1); diff --git a/storage/innobase/rem/rem0cmp.cc b/storage/innobase/rem/rem0cmp.cc index 35cd24f06d3..93dfb58643a 100644 --- a/storage/innobase/rem/rem0cmp.cc +++ b/storage/innobase/rem/rem0cmp.cc @@ -798,7 +798,7 @@ cmp_dtuple_rec_with_match_bytes( & rec_get_info_bits(rec, rec_offs_comp(offsets)))) { ut_ad(page_rec_is_first(rec, page_align(rec))); ut_ad(!page_has_prev(page_align(rec))); - ut_ad(rec_is_metadata(rec, index)); + ut_ad(rec_is_metadata(rec, *index)); return 1; } diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc index 1b40fa23841..b3c2fc84231 100644 --- a/storage/innobase/rem/rem0rec.cc +++ b/storage/innobase/rem/rem0rec.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -169,7 +169,7 @@ rec_get_n_extern_new( ut_ad(!index->table->supports_instant() || index->is_dummy); ut_ad(!index->is_instant()); ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY - || rec_get_status(rec) == REC_STATUS_COLUMNS_ADDED); + || rec_get_status(rec) == REC_STATUS_INSTANT); ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index)); if (n == ULINT_UNDEFINED) { @@ -231,40 +231,22 @@ rec_get_n_extern_new( return(n_extern); } -/** Get the added field count in a REC_STATUS_COLUMNS_ADDED record. 
-@param[in,out] header variable header of a REC_STATUS_COLUMNS_ADDED record -@return number of added fields */ -static inline unsigned rec_get_n_add_field(const byte*& header) -{ - unsigned n_fields_add = *--header; - if (n_fields_add < 0x80) { - ut_ad(rec_get_n_add_field_len(n_fields_add) == 1); - return n_fields_add; - } - - n_fields_add &= 0x7f; - n_fields_add |= unsigned(*--header) << 7; - ut_ad(n_fields_add < REC_MAX_N_FIELDS); - ut_ad(rec_get_n_add_field_len(n_fields_add) == 2); - return n_fields_add; -} - /** Format of a leaf-page ROW_FORMAT!=REDUNDANT record */ enum rec_leaf_format { /** Temporary file record */ REC_LEAF_TEMP, - /** Temporary file record, with added columns - (REC_STATUS_COLUMNS_ADDED) */ - REC_LEAF_TEMP_COLUMNS_ADDED, + /** Temporary file record, with added columns (REC_STATUS_INSTANT) */ + REC_LEAF_TEMP_INSTANT, /** Normal (REC_STATUS_ORDINARY) */ REC_LEAF_ORDINARY, - /** With added columns (REC_STATUS_COLUMNS_ADDED) */ - REC_LEAF_COLUMNS_ADDED + /** With add or drop columns (REC_STATUS_INSTANT) */ + REC_LEAF_INSTANT }; /** Determine the offset to each field in a leaf-page record in ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED. This is a special case of rec_init_offsets() and rec_get_offsets_func(). +@tparam mblob whether the record includes a metadata BLOB @param[in] rec leaf-page record @param[in] index the index that the record belongs in @param[in] n_core number of core fields (index->n_core_fields) @@ -272,6 +254,7 @@ This is a special case of rec_init_offsets() and rec_get_offsets_func(). 
NULL to refer to index->fields[].col->def_val @param[in,out] offsets offsets, with valid rec_offs_n_fields(offsets) @param[in] format record format */ +template<bool mblob = false> static inline void rec_init_offsets_comp_ordinary( @@ -293,12 +276,32 @@ rec_init_offsets_comp_ordinary( ut_ad(n_core > 0); ut_ad(index->n_fields >= n_core); ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); - ut_ad(format == REC_LEAF_TEMP || format == REC_LEAF_TEMP_COLUMNS_ADDED + ut_ad(format == REC_LEAF_TEMP || format == REC_LEAF_TEMP_INSTANT || dict_table_is_comp(index->table)); - ut_ad(format != REC_LEAF_TEMP_COLUMNS_ADDED + ut_ad(format != REC_LEAF_TEMP_INSTANT || index->n_fields == rec_offs_n_fields(offsets)); ut_d(ulint n_null= 0); + if (mblob) { + ut_ad(index->is_dummy || index->table->instant); + ut_ad(index->is_dummy || index->is_instant()); + ut_ad(rec_offs_n_fields(offsets) + <= ulint(index->n_fields) + 1); + ut_ad(!def_val); + ut_ad(format == REC_LEAF_INSTANT); + nulls -= REC_N_NEW_EXTRA_BYTES; + n_fields = n_core + 1 + rec_get_n_add_field(nulls); + ut_ad(n_fields <= ulint(index->n_fields) + 1); + const ulint n_nullable = index->get_n_nullable(n_fields - 1); + const ulint n_null_bytes = UT_BITS_IN_BYTES(n_nullable); + ut_d(n_null = n_nullable); + ut_ad(n_null <= index->n_nullable); + ut_ad(n_null_bytes >= index->n_core_null_bytes + || n_core < index->n_core_fields); + lens = --nulls - n_null_bytes; + goto start; + } + switch (format) { case REC_LEAF_TEMP: if (dict_table_is_comp(index->table)) { @@ -312,17 +315,15 @@ rec_init_offsets_comp_ordinary( ordinary: lens = --nulls - index->n_core_null_bytes; - ut_d(n_null = std::min(index->n_core_null_bytes * 8U, - index->n_nullable)); + ut_d(n_null = std::min<uint>(index->n_core_null_bytes * 8U, + index->n_nullable)); break; - case REC_LEAF_COLUMNS_ADDED: - /* We would have !index->is_instant() when rolling back - an instant ADD COLUMN operation. 
*/ + case REC_LEAF_INSTANT: nulls -= REC_N_NEW_EXTRA_BYTES; ut_ad(index->is_instant()); /* fall through */ - case REC_LEAF_TEMP_COLUMNS_ADDED: - n_fields = n_core + 1 + rec_get_n_add_field(nulls); + case REC_LEAF_TEMP_INSTANT: + n_fields = n_core + rec_get_n_add_field(nulls) + 1; ut_ad(n_fields <= index->n_fields); const ulint n_nullable = index->get_n_nullable(n_fields); const ulint n_null_bytes = UT_BITS_IN_BYTES(n_nullable); @@ -333,6 +334,7 @@ ordinary: lens = --nulls - n_null_bytes; } +start: #ifdef UNIV_DEBUG /* We cannot invoke rec_offs_make_valid() if format==REC_LEAF_TEMP. Similarly, rec_offs_validate() will fail in that case, because @@ -342,17 +344,26 @@ ordinary: #endif /* UNIV_DEBUG */ /* read the lengths of fields 0..n_fields */ + rec_offs len; ulint i = 0; - do { - const dict_field_t* field - = dict_index_get_nth_field(index, i); - const dict_col_t* col - = dict_field_get_col(field); - rec_offs len; + const dict_field_t* field = index->fields; - /* set default value flag */ - if (i < n_fields) { - } else if (def_val) { + do { + if (mblob) { + if (i == index->first_user_field()) { + offs += FIELD_REF_SIZE; + len = combine(offs, STORED_OFFPAGE); + any |= REC_OFFS_EXTERNAL; + field--; + continue; + } else if (i >= n_fields) { + len = combine(offs, DEFAULT); + any |= REC_OFFS_DEFAULT; + continue; + } + } else if (i < n_fields) { + /* The field is present, and will be covered below. 
*/ + } else if (!mblob && def_val) { const dict_col_t::def_t& d = def_val[i - n_core]; if (!d.data) { len = combine(offs, SQL_NULL); @@ -362,7 +373,7 @@ ordinary: any |= REC_OFFS_DEFAULT; } - goto resolved; + continue; } else { ulint dlen; if (!index->instant_field_value(i, &dlen)) { @@ -373,10 +384,12 @@ ordinary: any |= REC_OFFS_DEFAULT; } - goto resolved; + continue; } - if (!(col->prtype & DATA_NOT_NULL)) { + const dict_col_t* col = field->col; + + if (col->is_nullable()) { /* nullable field => read the null flag */ ut_ad(n_null--); @@ -392,7 +405,7 @@ ordinary: the length to zero and enable the SQL NULL flag in offsets[]. */ len = combine(offs, SQL_NULL); - goto resolved; + continue; } null_mask <<= 1; } @@ -423,16 +436,15 @@ ordinary: len = offs; } - goto resolved; + continue; } len = offs += len; } else { len = offs += field->fixed_len; } -resolved: - rec_offs_base(offsets)[i + 1] = len; - } while (++i < rec_offs_n_fields(offsets)); + } while (field++, rec_offs_base(offsets)[++i] = len, + i < rec_offs_n_fields(offsets)); *rec_offs_base(offsets) = static_cast<rec_offs>(rec - (lens + 1)) | REC_OFFS_COMPACT | any; @@ -451,7 +463,10 @@ rec_offs_make_valid( bool leaf, rec_offs* offsets) { - ut_ad(rec_offs_n_fields(offsets) + const bool is_alter_metadata = leaf + && rec_is_alter_metadata(rec, *index); + ut_ad(is_alter_metadata + || rec_offs_n_fields(offsets) <= (leaf ? 
dict_index_get_n_fields(index) : dict_index_get_n_unique_in_tree_nonleaf(index) + 1) @@ -469,7 +484,8 @@ rec_offs_make_valid( || n >= rec_offs_n_fields(offsets)); for (; n < rec_offs_n_fields(offsets); n++) { ut_ad(leaf); - ut_ad(get_type(rec_offs_base(offsets)[1 + n]) == DEFAULT); + ut_ad(is_alter_metadata + || get_type(rec_offs_base(offsets)[1 + n]) == DEFAULT); } memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec)); memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index)); @@ -509,14 +525,18 @@ rec_offs_validate( } } if (index) { - ulint max_n_fields; ut_ad(!memcmp(&index, &offsets[INDEX_OFFSET], sizeof(index))); - max_n_fields = ut_max( + ulint max_n_fields = std::max( dict_index_get_n_fields(index), dict_index_get_n_unique_in_tree(index) + 1); if (comp && rec) { switch (rec_get_status(rec)) { - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: + ut_ad(index->is_instant() || index->is_dummy); + ut_ad(max_n_fields == index->n_fields); + max_n_fields += index->table->instant + || index->is_dummy; + break; case REC_STATUS_ORDINARY: break; case REC_STATUS_NODE_PTR: @@ -530,14 +550,19 @@ rec_offs_validate( default: ut_error; } + } else if (max_n_fields == index->n_fields + && (index->is_dummy + || (index->is_instant() + && index->table->instant))) { + max_n_fields++; } /* index->n_def == 0 for dummy indexes if !comp */ - ut_a(!comp || index->n_def); - ut_a(!index->n_def || i <= max_n_fields); + ut_ad(!comp || index->n_def); + ut_ad(!index->n_def || i <= max_n_fields); } while (i--) { ulint curr = get_value(rec_offs_base(offsets)[1 + i]); - ut_a(curr <= last); + ut_ad(curr <= last); last = curr; } return(TRUE); @@ -571,7 +596,13 @@ rec_init_offsets( ulint i = 0; rec_offs offs; - ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); + /* This assertion was relaxed for the btr_cur_open_at_index_side() + call in btr_cur_instant_init_low(). 
We cannot invoke + index->is_instant(), because the same assertion would fail there + until btr_cur_instant_init_low() has invoked + dict_table_t::deserialise_columns(). */ + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable) + || index->in_instant_init); ut_d(memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec))); ut_d(memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index))); @@ -597,13 +628,13 @@ rec_init_offsets( = dict_index_get_n_unique_in_tree_nonleaf( index); break; - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: ut_ad(leaf); ut_ad(index->is_instant()); rec_init_offsets_comp_ordinary(rec, index, offsets, index->n_core_fields, NULL, - REC_LEAF_COLUMNS_ADDED); + REC_LEAF_INSTANT); return; case REC_STATUS_ORDINARY: ut_ad(leaf); @@ -785,14 +816,17 @@ rec_get_offsets_func( { ulint n; ulint size; + bool alter_metadata = false; if (dict_table_is_comp(index->table)) { switch (UNIV_EXPECT(rec_get_status(rec), REC_STATUS_ORDINARY)) { - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: + alter_metadata = rec_is_alter_metadata(rec, true); + /* fall through */ case REC_STATUS_ORDINARY: ut_ad(leaf); - n = dict_index_get_n_fields(index); + n = dict_index_get_n_fields(index) + alter_metadata; break; case REC_STATUS_NODE_PTR: /* Node pointer records consist of the @@ -836,7 +870,8 @@ rec_get_offsets_func( || dict_index_is_ibuf(index) || n == n_fields /* btr_pcur_restore_position() */ || (n + (index->id == DICT_INDEXES_ID) - >= index->n_core_fields && n <= index->n_fields)); + >= index->n_core_fields && n <= index->n_fields + + unsigned(rec_is_alter_metadata(rec, false)))); if (is_user_rec && leaf && n < index->n_fields) { ut_ad(!index->is_dummy); @@ -866,8 +901,25 @@ rec_get_offsets_func( } rec_offs_set_n_fields(offsets, n); - rec_init_offsets(rec, index, leaf, offsets); - return(offsets); + + if (UNIV_UNLIKELY(alter_metadata) && index->table->not_redundant()) { +#ifdef UNIV_DEBUG + memcpy(&offsets[RECORD_OFFSET], &rec, sizeof rec); + 
memcpy(&offsets[INDEX_OFFSET], &index, sizeof index); +#endif /* UNIV_DEBUG */ + ut_ad(leaf); + ut_ad(index->is_dummy || index->table->instant); + ut_ad(index->is_dummy || index->is_instant()); + ut_ad(rec_offs_n_fields(offsets) + <= ulint(index->n_fields) + 1); + rec_init_offsets_comp_ordinary<true>(rec, index, offsets, + index->n_core_fields, + NULL, + REC_LEAF_INSTANT); + } else { + rec_init_offsets(rec, index, leaf, offsets); + } + return offsets; } /******************************************************//** @@ -1041,36 +1093,45 @@ rec_get_nth_field_offs_old( return(os); } -/**********************************************************//** -Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. +/** Determine the size of a data tuple prefix in ROW_FORMAT=COMPACT. +@tparam mblob whether the record includes a metadata BLOB +@param[in] index record descriptor; dict_table_is_comp() + is assumed to hold, even if it doesn't +@param[in] dfield array of data fields +@param[in] n_fields number of data fields +@param[out] extra extra size +@param[in] status status flags +@param[in] temp whether this is a temporary file record @return total size */ -MY_ATTRIBUTE((warn_unused_result, nonnull(1,2))) +template<bool mblob = false> static inline ulint rec_get_converted_size_comp_prefix_low( -/*===================================*/ - const dict_index_t* index, /*!< in: record descriptor; - dict_table_is_comp() is - assumed to hold, even if - it does not */ - const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields,/*!< in: number of data fields */ - ulint* extra, /*!< out: extra size */ - rec_comp_status_t status, /*!< in: status flags */ - bool temp) /*!< in: whether this is a - temporary file record */ + const dict_index_t* index, + const dfield_t* dfield, + ulint n_fields, + ulint* extra, + rec_comp_status_t status, + bool temp) { ulint extra_size = temp ? 
0 : REC_N_NEW_EXTRA_BYTES; - ulint data_size; - ulint i; ut_ad(n_fields > 0); - ut_ad(n_fields <= dict_index_get_n_fields(index)); + ut_ad(n_fields <= dict_index_get_n_fields(index) + mblob); ut_d(ulint n_null = index->n_nullable); ut_ad(status == REC_STATUS_ORDINARY || status == REC_STATUS_NODE_PTR - || status == REC_STATUS_COLUMNS_ADDED); + || status == REC_STATUS_INSTANT); - if (status == REC_STATUS_COLUMNS_ADDED - && (!temp || n_fields > index->n_core_fields)) { + if (mblob) { + ut_ad(!temp); + ut_ad(index->table->instant); + ut_ad(index->is_instant()); + ut_ad(status == REC_STATUS_INSTANT); + ut_ad(n_fields == ulint(index->n_fields) + 1); + extra_size += UT_BITS_IN_BYTES(index->n_nullable) + + rec_get_n_add_field_len(n_fields - 1 + - index->n_core_fields); + } else if (status == REC_STATUS_INSTANT + && (!temp || n_fields > index->n_core_fields)) { ut_ad(index->is_instant()); ut_ad(UT_BITS_IN_BYTES(n_null) >= index->n_core_null_bytes); extra_size += UT_BITS_IN_BYTES(index->get_n_nullable(n_fields)) @@ -1081,7 +1142,7 @@ rec_get_converted_size_comp_prefix_low( extra_size += index->n_core_null_bytes; } - data_size = 0; + ulint data_size = 0; if (temp && dict_table_is_comp(index->table)) { /* No need to do adjust fixed_len=0. 
We only need to @@ -1089,46 +1150,53 @@ rec_get_converted_size_comp_prefix_low( temp = false; } + const dfield_t* const end = dfield + n_fields; /* read the lengths of fields 0..n */ - for (i = 0; i < n_fields; i++) { - const dict_field_t* field; - ulint len; - ulint fixed_len; - const dict_col_t* col; + for (ulint i = 0; dfield < end; i++, dfield++) { + if (mblob && i == index->first_user_field()) { + data_size += FIELD_REF_SIZE; + if (++dfield == end) { + ut_ad(i == index->n_fields); + break; + } + } - field = dict_index_get_nth_field(index, i); - len = dfield_get_len(&fields[i]); - col = dict_field_get_col(field); + ulint len = dfield_get_len(dfield); + const dict_field_t* field = dict_index_get_nth_field(index, i); #ifdef UNIV_DEBUG - const dtype_t* type = dfield_get_type(&fields[i]); if (dict_index_is_spatial(index)) { - if (DATA_GEOMETRY_MTYPE(col->mtype) && i == 0) { - ut_ad(type->prtype & DATA_GIS_MBR); + if (DATA_GEOMETRY_MTYPE(field->col->mtype) && i == 0) { + ut_ad(dfield->type.prtype & DATA_GIS_MBR); } else { - ut_ad(type->mtype == DATA_SYS_CHILD - || dict_col_type_assert_equal(col, type)); + ut_ad(dfield->type.mtype == DATA_SYS_CHILD + || dict_col_type_assert_equal( + field->col, &dfield->type)); } } else { - ut_ad(dict_col_type_assert_equal(col, type)); + ut_ad(field->col->is_dropped() + || dict_col_type_assert_equal(field->col, + &dfield->type)); } #endif /* All NULLable fields must be included in the n_null count. */ - ut_ad((col->prtype & DATA_NOT_NULL) || n_null--); + ut_ad(!field->col->is_nullable() || n_null--); - if (dfield_is_null(&fields[i])) { + if (dfield_is_null(dfield)) { /* No length is stored for NULL fields. 
*/ - ut_ad(!(col->prtype & DATA_NOT_NULL)); + ut_ad(field->col->is_nullable()); continue; } - ut_ad(len <= col->len || DATA_LARGE_MTYPE(col->mtype) - || (col->len == 0 && col->mtype == DATA_VARCHAR)); + ut_ad(len <= field->col->len + || DATA_LARGE_MTYPE(field->col->mtype) + || (field->col->len == 0 + && field->col->mtype == DATA_VARCHAR)); - fixed_len = field->fixed_len; + ulint fixed_len = field->fixed_len; if (temp && fixed_len - && !dict_col_get_fixed_size(col, temp)) { + && !dict_col_get_fixed_size(field->col, temp)) { fixed_len = 0; } /* If the maximum length of a variable-length field @@ -1143,25 +1211,27 @@ rec_get_converted_size_comp_prefix_low( ut_ad(len <= fixed_len); if (dict_index_is_spatial(index)) { - ut_ad(type->mtype == DATA_SYS_CHILD - || !col->mbmaxlen - || len >= col->mbminlen - * fixed_len / col->mbmaxlen); + ut_ad(dfield->type.mtype == DATA_SYS_CHILD + || !field->col->mbmaxlen + || len >= field->col->mbminlen + * fixed_len / field->col->mbmaxlen); } else { - ut_ad(type->mtype != DATA_SYS_CHILD); - ut_ad(!col->mbmaxlen - || len >= col->mbminlen - * fixed_len / col->mbmaxlen); + ut_ad(dfield->type.mtype != DATA_SYS_CHILD); + + ut_ad(field->col->is_dropped() + || !field->col->mbmaxlen + || len >= field->col->mbminlen + * fixed_len / field->col->mbmaxlen); } /* dict_index_add_col() should guarantee this */ ut_ad(!field->prefix_len || fixed_len == field->prefix_len); #endif /* UNIV_DEBUG */ - } else if (dfield_is_ext(&fields[i])) { - ut_ad(DATA_BIG_COL(col)); + } else if (dfield_is_ext(dfield)) { + ut_ad(DATA_BIG_COL(field->col)); extra_size += 2; - } else if (len < 128 || !DATA_BIG_COL(col)) { + } else if (len < 128 || !DATA_BIG_COL(field->col)) { extra_size++; } else { /* For variable-length columns, we look up the @@ -1197,43 +1267,51 @@ rec_get_converted_size_comp_prefix( REC_STATUS_ORDINARY, false)); } -/**********************************************************//** -Determines the size of a data tuple in ROW_FORMAT=COMPACT. 
+/** Determine the size of a record in ROW_FORMAT=COMPACT. +@param[in] index record descriptor. dict_table_is_comp() + is assumed to hold, even if it doesn't +@param[in] tuple logical record +@param[out] extra extra size @return total size */ ulint rec_get_converted_size_comp( -/*========================*/ - const dict_index_t* index, /*!< in: record descriptor; - dict_table_is_comp() is - assumed to hold, even if - it does not */ - rec_comp_status_t status, /*!< in: status bits of the record */ - const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields,/*!< in: number of data fields */ - ulint* extra) /*!< out: extra size */ + const dict_index_t* index, + const dtuple_t* tuple, + ulint* extra) { - ut_ad(n_fields > 0); + ut_ad(tuple->n_fields > 0); + + rec_comp_status_t status = rec_comp_status_t(tuple->info_bits + & REC_NEW_STATUS_MASK); switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { case REC_STATUS_ORDINARY: - if (n_fields > index->n_core_fields) { + ut_ad(!tuple->is_metadata()); + if (tuple->n_fields > index->n_core_fields) { ut_ad(index->is_instant()); - status = REC_STATUS_COLUMNS_ADDED; + status = REC_STATUS_INSTANT; } /* fall through */ - case REC_STATUS_COLUMNS_ADDED: - ut_ad(n_fields >= index->n_core_fields); - ut_ad(n_fields <= index->n_fields); + case REC_STATUS_INSTANT: + ut_ad(tuple->n_fields >= index->n_core_fields); + if (tuple->is_alter_metadata()) { + return rec_get_converted_size_comp_prefix_low<true>( + index, tuple->fields, tuple->n_fields, + extra, status, false); + } + ut_ad(tuple->n_fields <= index->n_fields); return rec_get_converted_size_comp_prefix_low( - index, fields, n_fields, extra, status, false); + index, tuple->fields, tuple->n_fields, + extra, status, false); case REC_STATUS_NODE_PTR: - n_fields--; - ut_ad(n_fields == dict_index_get_n_unique_in_tree_nonleaf( - index)); - ut_ad(dfield_get_len(&fields[n_fields]) == REC_NODE_PTR_SIZE); + ut_ad(tuple->n_fields - 1 + == 
dict_index_get_n_unique_in_tree_nonleaf(index)); + ut_ad(dfield_get_len(&tuple->fields[tuple->n_fields - 1]) + == REC_NODE_PTR_SIZE); return REC_NODE_PTR_SIZE /* child page number */ + rec_get_converted_size_comp_prefix_low( - index, fields, n_fields, extra, status, false); + index, tuple->fields, tuple->n_fields - 1, + extra, status, false); case REC_STATUS_INFIMUM: case REC_STATUS_SUPREMUM: /* not supported */ @@ -1411,33 +1489,30 @@ rec_convert_dtuple_to_rec_old( } /** Convert a data tuple into a ROW_FORMAT=COMPACT record. +@tparam mblob whether the record includes a metadata BLOB @param[out] rec converted record @param[in] index index -@param[in] fields data fields to convert +@param[in] field data fields to convert @param[in] n_fields number of data fields @param[in] status rec_get_status(rec) @param[in] temp whether to use the format for temporary files in index creation */ +template<bool mblob = false> static inline void rec_convert_dtuple_to_rec_comp( rec_t* rec, const dict_index_t* index, - const dfield_t* fields, + const dfield_t* field, ulint n_fields, rec_comp_status_t status, bool temp) { - const dfield_t* field; - const dtype_t* type; byte* end; byte* nulls = temp ? 
rec - 1 : rec - (REC_N_NEW_EXTRA_BYTES + 1); byte* UNINIT_VAR(lens); - ulint len; - ulint i; ulint UNINIT_VAR(n_node_ptr_field); - ulint fixed_len; ulint null_mask = 1; ut_ad(n_fields > 0); @@ -1446,8 +1521,22 @@ rec_convert_dtuple_to_rec_comp( ut_d(ulint n_null = index->n_nullable); + if (mblob) { + ut_ad(!temp); + ut_ad(index->table->instant); + ut_ad(index->is_instant()); + ut_ad(status == REC_STATUS_INSTANT); + ut_ad(n_fields == ulint(index->n_fields) + 1); + rec_set_n_add_field(nulls, n_fields - 1 + - index->n_core_fields); + rec_set_heap_no_new(rec, PAGE_HEAP_NO_USER_LOW); + rec_set_status(rec, REC_STATUS_INSTANT); + n_node_ptr_field = ULINT_UNDEFINED; + lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + goto start; + } switch (status) { - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: ut_ad(index->is_instant()); ut_ad(n_fields > index->n_core_fields); rec_set_n_add_field(nulls, n_fields - 1 @@ -1457,19 +1546,24 @@ rec_convert_dtuple_to_rec_comp( ut_ad(n_fields <= dict_index_get_n_fields(index)); if (!temp) { rec_set_heap_no_new(rec, PAGE_HEAP_NO_USER_LOW); - rec_set_status(rec, n_fields == index->n_core_fields - ? REC_STATUS_ORDINARY - : REC_STATUS_COLUMNS_ADDED); - } if (dict_table_is_comp(index->table)) { + + rec_set_status( + rec, n_fields == index->n_core_fields + ? REC_STATUS_ORDINARY + : REC_STATUS_INSTANT); + } + + if (dict_table_is_comp(index->table)) { /* No need to do adjust fixed_len=0. We only need to adjust it for ROW_FORMAT=REDUNDANT. */ temp = false; } n_node_ptr_field = ULINT_UNDEFINED; + lens = nulls - (index->is_instant() ? 
UT_BITS_IN_BYTES(index->get_n_nullable( - n_fields)) + n_fields)) : UT_BITS_IN_BYTES( unsigned(index->n_nullable))); break; @@ -1479,8 +1573,8 @@ rec_convert_dtuple_to_rec_comp( rec_set_status(rec, status); ut_ad(n_fields == dict_index_get_n_unique_in_tree_nonleaf(index) + 1); - ut_d(n_null = std::min(index->n_core_null_bytes * 8U, - index->n_nullable)); + ut_d(n_null = std::min<uint>(index->n_core_null_bytes * 8U, + index->n_nullable)); n_node_ptr_field = n_fields - 1; lens = nulls - index->n_core_null_bytes; break; @@ -1490,30 +1584,37 @@ rec_convert_dtuple_to_rec_comp( return; } +start: end = rec; /* clear the SQL-null flags */ memset(lens + 1, 0, ulint(nulls - lens)); + const dfield_t* const fend = field + n_fields; /* Store the data and the offsets */ - - for (i = 0; i < n_fields; i++) { - const dict_field_t* ifield; - dict_col_t* col = NULL; - - field = &fields[i]; - - type = dfield_get_type(field); - len = dfield_get_len(field); - - if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + for (ulint i = 0; field < fend; i++, field++) { + ulint len = dfield_get_len(field); + + if (mblob) { + if (i == index->first_user_field()) { + ut_ad(len == FIELD_REF_SIZE); + ut_ad(dfield_is_ext(field)); + memcpy(end, dfield_get_data(field), len); + end += len; + if (++field == fend) { + ut_ad(i == index->n_fields); + break; + } + len = dfield_get_len(field); + } + } else if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + ut_ad(field->type.prtype & DATA_NOT_NULL); ut_ad(len == REC_NODE_PTR_SIZE); memcpy(end, dfield_get_data(field), len); end += REC_NODE_PTR_SIZE; break; } - if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + if (!(field->type.prtype & DATA_NOT_NULL)) { /* nullable field */ ut_ad(n_null--); @@ -1536,11 +1637,12 @@ rec_convert_dtuple_to_rec_comp( /* only nullable fields can be null */ ut_ad(!dfield_is_null(field)); - ifield = dict_index_get_nth_field(index, i); - fixed_len = ifield->fixed_len; - col = ifield->col; + const 
dict_field_t* ifield + = dict_index_get_nth_field(index, i); + ulint fixed_len = ifield->fixed_len; + if (temp && fixed_len - && !dict_col_get_fixed_size(col, temp)) { + && !dict_col_get_fixed_size(ifield->col, temp)) { fixed_len = 0; } @@ -1552,23 +1654,23 @@ rec_convert_dtuple_to_rec_comp( it is 128 or more, or when the field is stored externally. */ if (fixed_len) { ut_ad(len <= fixed_len); - ut_ad(!col->mbmaxlen - || len >= col->mbminlen - * fixed_len / col->mbmaxlen); + ut_ad(!ifield->col->mbmaxlen + || len >= ifield->col->mbminlen + * fixed_len / ifield->col->mbmaxlen); ut_ad(!dfield_is_ext(field)); } else if (dfield_is_ext(field)) { - ut_ad(DATA_BIG_COL(col)); + ut_ad(DATA_BIG_COL(ifield->col)); ut_ad(len <= REC_ANTELOPE_MAX_INDEX_COL_LEN - + BTR_EXTERN_FIELD_REF_SIZE); + + BTR_EXTERN_FIELD_REF_SIZE); *lens-- = (byte) (len >> 8) | 0xc0; *lens-- = (byte) len; } else { - ut_ad(len <= dtype_get_len(type) - || DATA_LARGE_MTYPE(dtype_get_mtype(type)) + ut_ad(len <= field->type.len + || DATA_LARGE_MTYPE(field->type.mtype) || !strcmp(index->name, FTS_INDEX_TABLE_IND_NAME)); if (len < 128 || !DATA_BIG_LEN_MTYPE( - dtype_get_len(type), dtype_get_mtype(type))) { + field->type.len, field->type.mtype)) { *lens-- = (byte) len; } else { @@ -1601,24 +1703,37 @@ rec_convert_dtuple_to_rec_new( ut_ad(!(dtuple->info_bits & ~(REC_NEW_STATUS_MASK | REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG))); - rec_comp_status_t status = static_cast<rec_comp_status_t>( - dtuple->info_bits & REC_NEW_STATUS_MASK); - if (status == REC_STATUS_ORDINARY - && dtuple->n_fields > index->n_core_fields) { - ut_ad(index->is_instant()); - status = REC_STATUS_COLUMNS_ADDED; - } ulint extra_size; - rec_get_converted_size_comp( - index, status, dtuple->fields, dtuple->n_fields, &extra_size); - rec_t* rec = buf + extra_size; + if (UNIV_UNLIKELY(dtuple->is_alter_metadata())) { + ut_ad((dtuple->info_bits & REC_NEW_STATUS_MASK) + == REC_STATUS_INSTANT); + rec_get_converted_size_comp_prefix_low<true>( + index, 
dtuple->fields, dtuple->n_fields, + &extra_size, REC_STATUS_INSTANT, false); + buf += extra_size; + rec_convert_dtuple_to_rec_comp<true>( + buf, index, dtuple->fields, dtuple->n_fields, + REC_STATUS_INSTANT, false); + } else { + rec_get_converted_size_comp(index, dtuple, &extra_size); + buf += extra_size; + rec_comp_status_t status = rec_comp_status_t( + dtuple->info_bits & REC_NEW_STATUS_MASK); + if (status == REC_STATUS_ORDINARY + && dtuple->n_fields > index->n_core_fields) { + ut_ad(index->is_instant()); + status = REC_STATUS_INSTANT; + } - rec_convert_dtuple_to_rec_comp( - rec, index, dtuple->fields, dtuple->n_fields, status, false); - rec_set_info_bits_new(rec, dtuple->info_bits & ~REC_NEW_STATUS_MASK); - return(rec); + rec_convert_dtuple_to_rec_comp( + buf, index, dtuple->fields, dtuple->n_fields, + status, false); + } + + rec_set_info_bits_new(buf, dtuple->info_bits & ~REC_NEW_STATUS_MASK); + return buf; } /*********************************************************//** @@ -1657,7 +1772,7 @@ rec_convert_dtuple_to_rec( @param[in] fields data fields @param[in] n_fields number of data fields @param[out] extra record header size -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT @return total size, in bytes */ ulint rec_get_converted_size_temp( @@ -1677,7 +1792,7 @@ rec_get_converted_size_temp( @param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) @param[in] n_core number of core fields (index->n_core_fields) @param[in] def_val default values for non-core fields -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED */ +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */ void rec_init_offsets_temp( const rec_t* rec, @@ -1688,14 +1803,14 @@ rec_init_offsets_temp( rec_comp_status_t status) { ut_ad(status == REC_STATUS_ORDINARY - || status == REC_STATUS_COLUMNS_ADDED); + || status == REC_STATUS_INSTANT); /* The table may have been converted to 
plain format if it was emptied during an ALTER TABLE operation. */ ut_ad(index->n_core_fields == n_core || !index->is_instant()); ut_ad(index->n_core_fields >= n_core); rec_init_offsets_comp_ordinary(rec, index, offsets, n_core, def_val, - status == REC_STATUS_COLUMNS_ADDED - ? REC_LEAF_TEMP_COLUMNS_ADDED + status == REC_STATUS_INSTANT + ? REC_LEAF_TEMP_INSTANT : REC_LEAF_TEMP); } @@ -1721,7 +1836,7 @@ rec_init_offsets_temp( @param[in] index clustered or secondary index @param[in] fields data fields @param[in] n_fields number of data fields -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */ void rec_convert_dtuple_to_temp( @@ -1891,13 +2006,15 @@ rec_copy_prefix_to_buf( ut_ad(n_fields <= dict_index_get_n_unique_in_tree_nonleaf(index)); break; - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: /* We would have !index->is_instant() when rolling back an instant ADD COLUMN operation. */ ut_ad(index->is_instant() || page_rec_is_metadata(rec)); + ut_ad(n_fields <= index->first_user_field()); nulls++; const ulint n_rec = ulint(index->n_core_fields) + 1 - + rec_get_n_add_field(nulls); + + rec_get_n_add_field(nulls) + - rec_is_alter_metadata(rec, true); instant_omit = ulint(&rec[-REC_N_NEW_EXTRA_BYTES] - nulls); ut_ad(instant_omit == 1 || instant_omit == 2); nullf = nulls; @@ -1986,7 +2103,7 @@ rec_copy_prefix_to_buf( /* copy the fixed-size header and the record prefix */ memcpy(b - REC_N_NEW_EXTRA_BYTES, rec - REC_N_NEW_EXTRA_BYTES, prefix_len + REC_N_NEW_EXTRA_BYTES); - ut_ad(rec_get_status(b) == REC_STATUS_COLUMNS_ADDED); + ut_ad(rec_get_status(b) == REC_STATUS_INSTANT); rec_set_status(b, REC_STATUS_ORDINARY); return b; } else { @@ -2504,8 +2621,6 @@ rec_get_trx_id( const rec_t* rec, const dict_index_t* index) { - ulint trx_id_col - = dict_index_get_sys_col_pos(index, DATA_TRX_ID); const byte* trx_id; ulint len; mem_heap_t* heap = NULL; @@ -2513,15 +2628,10 @@ rec_get_trx_id( 
rec_offs_init(offsets_); rec_offs* offsets = offsets_; - ut_ad(trx_id_col <= MAX_REF_PARTS); - ut_ad(dict_index_is_clust(index)); - ut_ad(trx_id_col > 0); - ut_ad(trx_id_col != ULINT_UNDEFINED); - offsets = rec_get_offsets(rec, index, offsets, true, - trx_id_col + 1, &heap); + index->db_trx_id() + 1, &heap); - trx_id = rec_get_nth_field(rec, offsets, trx_id_col, &len); + trx_id = rec_get_nth_field(rec, offsets, index->db_trx_id(), &len); ut_ad(len == DATA_TRX_ID_LEN); diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc index 5892cbf31fe..b7a627603d9 100644 --- a/storage/innobase/row/row0ext.cc +++ b/storage/innobase/row/row0ext.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2020, MariaDB Corporation. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,14 +30,14 @@ Created September 2006 Marko Makela /** Fills the column prefix cache of an externally stored column. @param[in,out] ext column prefix cache @param[in] i index of ext->ext[] -@param[in] page_size page size +@param[in] space tablespace @param[in] dfield data field */ static void row_ext_cache_fill( row_ext_t* ext, ulint i, - const page_size_t& page_size, + fil_space_t* space, const dfield_t* dfield) { const byte* field = static_cast<const byte*>( @@ -76,7 +76,8 @@ row_ext_cache_fill( crashed during the execution of btr_free_externally_stored_field(). 
*/ ext->len[i] = btr_copy_externally_stored_field_prefix( - buf, ext->max_len, page_size, field, f_len); + buf, ext->max_len, ext->zip_size, + field, f_len); } } } @@ -92,7 +93,7 @@ row_ext_create( in the InnoDB table object, as reported by dict_col_get_no(); NOT relative to the records in the clustered index */ - ulint flags, /*!< in: table->flags */ + const dict_table_t& table, /*!< in: table */ const dtuple_t* tuple, /*!< in: data tuple containing the field references of the externally stored columns; must be indexed by col_no; @@ -101,31 +102,30 @@ row_ext_create( to prevent deletion (rollback or purge). */ mem_heap_t* heap) /*!< in: heap where created */ { - ulint i; - const page_size_t& page_size = dict_tf_get_page_size(flags); - - row_ext_t* ret; + if (!table.space) { + return NULL; + } ut_ad(n_ext > 0); - ret = static_cast<row_ext_t*>( + row_ext_t* ret = static_cast<row_ext_t*>( mem_heap_alloc(heap, (sizeof *ret) + (n_ext - 1) * sizeof ret->len)); ret->n_ext = n_ext; ret->ext = ext; - ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags); - ret->page_size.copy_from(page_size); + ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(table.flags); + ret->zip_size = dict_tf_get_zip_size(table.flags); ret->buf = static_cast<byte*>( mem_heap_alloc(heap, n_ext * ret->max_len)); /* Fetch the BLOB prefixes */ - for (i = 0; i < n_ext; i++) { + for (ulint i = 0; i < n_ext; i++) { const dfield_t* dfield; dfield = dtuple_get_nth_field(tuple, ext[i]); - row_ext_cache_fill(ret, i, page_size, dfield); + row_ext_cache_fill(ret, i, table.space, dfield); } return(ret); diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index c5b6276caf5..3f24564e7f6 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -163,22 +163,22 @@ row_merge_create_fts_sort_index( /** Initialize FTS parallel sort structures. 
@param[in] trx transaction @param[in,out] dup descriptor of FTS index being created -@param[in] new_table table where indexes are created +@param[in,out] new_table table where indexes are created @param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes integer to store Doc ID during sort -@param[in] old_page_size page size of the old table during alter +@param[in] old_zip_size page size of the old table during alter @param[out] psort parallel sort info to be instantiated @param[out] merge parallel merge info to be instantiated -@return TRUE if all successful */ -ibool +@return true if all successful */ +bool row_fts_psort_info_init( - trx_t* trx, - row_merge_dup_t* dup, - const dict_table_t* new_table, - ibool opt_doc_id_size, - const page_size_t old_page_size, - fts_psort_t** psort, - fts_psort_t** merge) + trx_t* trx, + row_merge_dup_t*dup, + dict_table_t* new_table, + bool opt_doc_id_size, + ulint old_zip_size, + fts_psort_t** psort, + fts_psort_t** merge) { ulint i; ulint j; @@ -188,6 +188,7 @@ row_fts_psort_info_init( ulint block_size; ibool ret = TRUE; bool encrypted = false; + ut_ad(ut_is_2pow(old_zip_size)); block_size = 3 * srv_sort_buf_size; @@ -210,8 +211,8 @@ row_fts_psort_info_init( } common_info->dup = dup; - common_info->new_table = (dict_table_t*) new_table; - common_info->old_page_size = old_page_size; + common_info->new_table = new_table; + common_info->old_zip_size = old_zip_size; common_info->trx = trx; common_info->all_info = psort_info; common_info->sort_event = os_event_create(0); @@ -805,8 +806,7 @@ DECLARE_THREAD(fts_parallel_tokenization)( block = psort_info->merge_block; crypt_block = psort_info->crypt_block; - const page_size_t old_page_size = - psort_info->psort_common->old_page_size; + const ulint zip_size = psort_info->psort_common->old_zip_size; row_merge_fts_get_next_doc_item(psort_info, &doc_item); @@ -836,7 +836,7 @@ loop: doc.text.f_str = btr_copy_externally_stored_field( &doc.text.f_len, data, - old_page_size, 
data_len, blob_heap); + zip_size, data_len, blob_heap); } else { doc.text.f_str = data; doc.text.f_len = data_len; diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index 02e6a97453f..275a9074c46 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -53,10 +53,12 @@ Created 2012-02-08 by Sunny Bains. #include <my_aes.h> #endif +using st_::span; + /** The size of the buffer to use for IO. @param n physical page size @return number of pages */ -#define IO_BUFFER_SIZE(n) ((1024 * 1024) / n) +#define IO_BUFFER_SIZE(n) ((1024 * 1024) / (n)) /** For gathering stats on records during phase I */ struct row_stats_t { @@ -118,7 +120,7 @@ struct row_import { m_hostname(NULL), m_table_name(NULL), m_autoinc(0), - m_page_size(0, 0, false), + m_zip_size(0), m_flags(0), m_n_cols(0), m_cols(NULL), @@ -199,7 +201,8 @@ struct row_import { ib_uint64_t m_autoinc; /*!< Next autoinc value */ - page_size_t m_page_size; /*!< Tablespace page size */ + ulint m_zip_size; /*!< ROW_FORMAT=COMPRESSED + page size, or 0 */ ulint m_flags; /*!< Table flags */ @@ -359,7 +362,7 @@ public: @param trx covering transaction */ AbstractCallback(trx_t* trx, ulint space_id) : - m_page_size(0, 0, false), + m_zip_size(0), m_trx(trx), m_space(space_id), m_xdes(), @@ -383,7 +386,7 @@ public: /** @return true if compressed table. */ bool is_compressed_table() const UNIV_NOTHROW { - return(get_page_size().is_compressed()); + return get_zip_size(); } /** @return the tablespace flags */ @@ -403,7 +406,11 @@ public: m_filepath = filename; } - const page_size_t& get_page_size() const { return m_page_size; } + ulint get_zip_size() const { return m_zip_size; } + ulint physical_size() const + { + return m_zip_size ? 
m_zip_size : srv_page_size; + } const char* filename() const { return m_filepath; } @@ -442,7 +449,7 @@ protected: { ulint offset; - offset = xdes_calc_descriptor_index(get_page_size(), page_no); + offset = xdes_calc_descriptor_index(get_zip_size(), page_no); return(page + XDES_ARR_OFFSET + XDES_SIZE * offset); } @@ -464,15 +471,12 @@ protected: UT_DELETE_ARRAY(m_xdes); m_xdes = NULL; - ulint state; - const xdes_t* xdesc = page + XDES_ARR_OFFSET; - - state = mach_read_ulint(xdesc + XDES_STATE, MLOG_4BYTES); + if (mach_read_from_4(XDES_ARR_OFFSET + XDES_STATE + page) + != XDES_FREE) { + const ulint physical_size = m_zip_size + ? m_zip_size : srv_page_size; - if (state != XDES_FREE) { - - m_xdes = UT_NEW_ARRAY_NOKEY(xdes_t, - m_page_size.physical()); + m_xdes = UT_NEW_ARRAY_NOKEY(xdes_t, physical_size); /* Trigger OOM */ DBUG_EXECUTE_IF( @@ -485,7 +489,7 @@ protected: return(DB_OUT_OF_MEMORY); } - memcpy(m_xdes, page, m_page_size.physical()); + memcpy(m_xdes, page, physical_size); } return(DB_SUCCESS); @@ -496,7 +500,7 @@ protected: @return true if the page is marked as free */ bool is_free(ulint page_no) const UNIV_NOTHROW { - ut_a(xdes_calc_descriptor_page(get_page_size(), page_no) + ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no) == m_xdes_page_no); if (m_xdes != 0) { @@ -511,8 +515,8 @@ protected: } protected: - /** The tablespace page size. */ - page_size_t m_page_size; + /** The ROW_FORMAT=COMPRESSED page size, or 0. 
*/ + ulint m_zip_size; /** File handle to the tablespace */ pfs_os_file_t m_file; @@ -559,7 +563,7 @@ AbstractCallback::init( const page_t* page = block->frame; m_space_flags = fsp_header_get_flags(page); - if (!fsp_flags_is_valid(m_space_flags, true)) { + if (!fil_space_t::is_valid_flags(m_space_flags, true)) { ulint cflags = fsp_flags_convert_from_101(m_space_flags); if (cflags == ULINT_UNDEFINED) { ib::error() << "Invalid FSP_SPACE_FLAGS=" @@ -571,21 +575,23 @@ AbstractCallback::init( /* Clear the DATA_DIR flag, which is basically garbage. */ m_space_flags &= ~(1U << FSP_FLAGS_POS_RESERVED); - m_page_size.copy_from(page_size_t(m_space_flags)); + m_zip_size = fil_space_t::zip_size(m_space_flags); + const ulint logical_size = fil_space_t::logical_size(m_space_flags); + const ulint physical_size = fil_space_t::physical_size(m_space_flags); - if (!is_compressed_table() && !m_page_size.equals_to(univ_page_size)) { + if (logical_size != srv_page_size) { - ib::error() << "Page size " << m_page_size.physical() + ib::error() << "Page size " << logical_size << " of ibd file is not the same as the server page" " size " << srv_page_size; return(DB_CORRUPTION); - } else if (file_size % m_page_size.physical() != 0) { + } else if (file_size & (physical_size - 1)) { ib::error() << "File size " << file_size << " is not a" " multiple of the page size " - << m_page_size.physical(); + << physical_size; return(DB_CORRUPTION); } @@ -628,12 +634,12 @@ struct FetchIndexRootPages : public AbstractCallback { m_table(table) UNIV_NOTHROW { } /** Destructor */ - virtual ~FetchIndexRootPages() UNIV_NOTHROW { } + ~FetchIndexRootPages() UNIV_NOTHROW override { } /** Called for each block as it is read from the file. @param block block to convert, it is not from the buffer pool. @retval DB_SUCCESS or error code. 
*/ - dberr_t operator()(buf_block_t* block) UNIV_NOTHROW; + dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override; /** Update the import configuration that will be used to import the tablespace. */ @@ -697,7 +703,7 @@ FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW Indexes::const_iterator end = m_indexes.end(); ut_a(cfg->m_table == m_table); - cfg->m_page_size.copy_from(m_page_size); + cfg->m_zip_size = m_zip_size; cfg->m_n_indexes = m_indexes.size(); if (cfg->m_n_indexes == 0) { @@ -806,7 +812,7 @@ public: rec_offs_init(m_offsets_); } - virtual ~PageConverter() UNIV_NOTHROW + ~PageConverter() UNIV_NOTHROW override { if (m_heap != 0) { mem_heap_free(m_heap); @@ -816,7 +822,8 @@ public: /** Called for each block as it is read from the file. @param block block to convert, it is not from the buffer pool. @retval DB_SUCCESS or error code. */ - dberr_t operator()(buf_block_t* block) UNIV_NOTHROW; + dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override; + private: /** Update the page, set the space id, max trx id and index id. @param block block read from file @@ -1484,7 +1491,7 @@ IndexPurge::open() UNIV_NOTHROW btr_pcur_open_at_index_side( true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr); btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr); - if (rec_is_metadata(btr_pcur_get_rec(&m_pcur), m_index)) { + if (rec_is_metadata(btr_pcur_get_rec(&m_pcur), *m_index)) { ut_ad(btr_pcur_is_on_user_rec(&m_pcur)); /* Skip the metadata pseudo-record. */ } else { @@ -1563,7 +1570,7 @@ IndexPurge::next() UNIV_NOTHROW dict_index_t* index = m_pcur.btr_cur.index; buf_block_t* next_block = btr_block_get( page_id_t(block->page.id.space(), next_page), - block->page.size, BTR_MODIFY_LEAF, index, + block->zip_size(), BTR_MODIFY_LEAF, index, &m_mtr); if (UNIV_UNLIKELY(!next_block @@ -1927,6 +1934,23 @@ PageConverter::update_index_page( return(DB_CORRUPTION); } + if (index->n_core_fields > index->n_fields) { + /* Some columns have been dropped. 
+ Refuse to IMPORT TABLESPACE for now. + + NOTE: This is not an accurate check. + Columns could have been both + added and dropped instantly. + For an accurate check, we must read + the metadata BLOB page pointed to + by the leftmost leaf page. + + But we would have to read + those pages in a special way, + bypassing the buffer pool! */ + return DB_UNSUPPORTED; + } + /* Provisionally set all instantly added columns to be DEFAULT NULL. */ for (unsigned i = index->n_core_fields; @@ -2088,27 +2112,30 @@ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW /* If we already had an old page with matching number in the buffer pool, evict it now, because we no longer evict the pages on DISCARD TABLESPACE. */ - buf_page_get_gen(block->page.id, get_page_size(), + buf_page_get_gen(block->page.id, get_zip_size(), RW_NO_LATCH, NULL, BUF_EVICT_IF_IN_POOL, __FILE__, __LINE__, NULL, NULL); ulint page_type; - dberr_t err = update_page(block, page_type); - if (err != DB_SUCCESS) return err; + if (dberr_t err = update_page(block, page_type)) { + return err; + } + + const bool full_crc32 = fil_space_t::full_crc32(get_space_flags()); if (!block->page.zip.data) { buf_flush_init_for_writing( - NULL, block->frame, NULL, m_current_lsn); + NULL, block->frame, NULL, m_current_lsn, full_crc32); } else if (fil_page_type_is_index(page_type)) { buf_flush_init_for_writing( NULL, block->page.zip.data, &block->page.zip, - m_current_lsn); + m_current_lsn, full_crc32); } else { /* Calculate and update the checksum of non-index pages for ROW_FORMAT=COMPRESSED tables. */ buf_flush_update_zip_checksum( - block->page.zip.data, get_page_size().physical(), + block->page.zip.data, block->zip_size(), m_current_lsn); } @@ -2332,17 +2359,15 @@ row_import_adjust_root_pages_of_secondary_indexes( } /*****************************************************************//** -Ensure that dict_sys->row_id exceeds SELECT MAX(DB_ROW_ID). 
-@return error code */ -static MY_ATTRIBUTE((nonnull, warn_unused_result)) -dberr_t +Ensure that dict_sys.row_id exceeds SELECT MAX(DB_ROW_ID). */ +MY_ATTRIBUTE((nonnull)) static +void row_import_set_sys_max_row_id( /*==========================*/ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ const dict_table_t* table) /*!< in: table to import */ { - dberr_t err; const rec_t* rec; mtr_t mtr; btr_pcur_t pcur; @@ -2350,7 +2375,8 @@ row_import_set_sys_max_row_id( dict_index_t* index; index = dict_table_get_first_index(table); - ut_a(dict_index_is_clust(index)); + ut_ad(index->is_primary()); + ut_ad(dict_index_is_auto_gen_clust(index)); mtr_start(&mtr); @@ -2371,71 +2397,29 @@ row_import_set_sys_max_row_id( /* Check for empty table. */ if (page_rec_is_infimum(rec)) { /* The table is empty. */ - err = DB_SUCCESS; - } else if (rec_is_metadata(rec, index)) { + } else if (rec_is_metadata(rec, *index)) { /* The clustered index contains the metadata record only, that is, the table is empty. 
*/ - err = DB_SUCCESS; } else { - ulint len; - const byte* field; - mem_heap_t* heap = NULL; - rec_offs offsets_[1 + REC_OFFS_HEADER_SIZE]; - rec_offs* offsets; - - rec_offs_init(offsets_); - - offsets = rec_get_offsets( - rec, index, offsets_, true, ULINT_UNDEFINED, &heap); - - field = rec_get_nth_field( - rec, offsets, - dict_index_get_sys_col_pos(index, DATA_ROW_ID), - &len); - - if (len == DATA_ROW_ID_LEN) { - row_id = mach_read_from_6(field); - err = DB_SUCCESS; - } else { - err = DB_CORRUPTION; - } - - if (heap != NULL) { - mem_heap_free(heap); - } + row_id = mach_read_from_6(rec); } btr_pcur_close(&pcur); mtr_commit(&mtr); - DBUG_EXECUTE_IF("ib_import_set_max_rowid_failure", - err = DB_CORRUPTION;); - - if (err != DB_SUCCESS) { - ib_errf(prebuilt->trx->mysql_thd, - IB_LOG_LEVEL_WARN, - ER_INNODB_INDEX_CORRUPT, - "Index `%s` corruption detected, invalid DB_ROW_ID" - " in index.", index->name()); - - return(err); - - } else if (row_id > 0) { - + if (row_id) { /* Update the system row id if the imported index row id is greater than the max system row id. 
*/ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); - if (row_id >= dict_sys->row_id) { - dict_sys->row_id = row_id + 1; + if (row_id >= dict_sys.row_id) { + dict_sys.row_id = row_id + 1; dict_hdr_flush_row_id(); } - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); } - - return(DB_SUCCESS); } /*****************************************************************//** @@ -3055,10 +3039,7 @@ row_import_read_v1( cfg->m_flags = mach_read_from_4(ptr); ptr += sizeof(ib_uint32_t); - cfg->m_page_size.copy_from(dict_tf_get_page_size(cfg->m_flags)); - - ut_a(logical_page_size == cfg->m_page_size.logical()); - + cfg->m_zip_size = dict_tf_get_zip_size(cfg->m_flags); cfg->m_n_cols = mach_read_from_4(ptr); if (!dict_tf_is_valid(cfg->m_flags)) { @@ -3400,7 +3381,7 @@ fil_iterate( AbstractCallback& callback) { os_offset_t offset; - const ulint size = callback.get_page_size().physical(); + const ulint size = callback.physical_size(); ulint n_bytes = iter.n_io_buffers * size; const ulint buf_size = srv_page_size @@ -3417,6 +3398,10 @@ fil_iterate( return DB_OUT_OF_MEMORY; } + ulint actual_space_id = 0; + const bool full_crc32 = fil_space_t::full_crc32( + callback.get_space_flags()); + /* TODO: For ROW_FORMAT=COMPRESSED tables we do a lot of useless copying for non-index pages. Unfortunately, it is required by buf_zip_decompress() */ @@ -3474,15 +3459,10 @@ fil_iterate( byte* src = readptr + i * size; const ulint page_no = page_get_page_no(src); if (!page_no && block->page.id.page_no()) { - const ulint* b = reinterpret_cast<const ulint*> - (src); - const ulint* const e = b + size / sizeof *b; - do { - if (*b++) { - goto page_corrupted; - } - } while (b != e); - + if (!buf_is_zeroes(span<const byte>(src, + size))) { + goto page_corrupted; + } /* Proceed to the next page, because this one is all zero. 
*/ continue; @@ -3498,9 +3478,19 @@ page_corrupted: goto func_exit; } - const bool page_compressed - = fil_page_is_compressed_encrypted(src) - || fil_page_is_compressed(src); + if (block->page.id.page_no() == 0) { + actual_space_id = mach_read_from_4( + src + FIL_PAGE_SPACE_ID); + } + + const bool page_compressed = + (full_crc32 + && fil_space_t::is_compressed( + callback.get_space_flags()) + && buf_page_is_compressed( + src, callback.get_space_flags())) + || (fil_page_is_compressed_encrypted(src) + || fil_page_is_compressed(src)); if (page_compressed && block->page.zip.data) { goto page_corrupted; @@ -3509,11 +3499,11 @@ page_corrupted: bool decrypted = false; byte* dst = io_buffer + i * size; bool frame_changed = false; + uint key_version = buf_page_get_key_version( + src, callback.get_space_flags()); if (!encrypted) { - } else if (!mach_read_from_4( - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - + src)) { + } else if (!key_version) { not_encrypted: if (block->page.id.page_no() == 0 && block->page.zip.data) { @@ -3528,14 +3518,17 @@ not_encrypted: memcpy(dst, src, size); } } else { - if (!fil_space_verify_crypt_checksum( - src, callback.get_page_size())) { + if (!buf_page_verify_crypt_checksum( + src, callback.get_space_flags())) { goto page_corrupted; } decrypted = fil_space_decrypt( + actual_space_id, iter.crypt_data, dst, - callback.get_page_size(), src, &err); + callback.physical_size(), + callback.get_space_flags(), + src, &err); if (err != DB_SUCCESS) { goto func_exit; @@ -3548,21 +3541,27 @@ not_encrypted: updated = true; } + /* For full_crc32 format, skip checksum check + after decryption. */ + bool skip_checksum_check = full_crc32 && encrypted; + /* If the original page is page_compressed, we need to decompress it before adjusting further. 
*/ if (page_compressed) { ulint compress_length = fil_page_decompress( - page_compress_buf, dst); + page_compress_buf, dst, + callback.get_space_flags()); ut_ad(compress_length != srv_page_size); if (compress_length == 0) { goto page_corrupted; } updated = true; - } else if (buf_page_is_corrupted( + } else if (!skip_checksum_check + && buf_page_is_corrupted( false, encrypted && !frame_changed ? dst : src, - callback.get_page_size(), NULL)) { + callback.get_space_flags())) { goto page_corrupted; } @@ -3625,7 +3624,7 @@ not_encrypted: if (ulint len = fil_page_compress( src, page_compress_buf, - 0,/* FIXME: compression level */ + callback.get_space_flags(), 512,/* FIXME: proper block size */ encrypted)) { /* FIXME: remove memcpy() */ @@ -3638,12 +3637,14 @@ not_encrypted: /* Encrypt the page if encryption was used. */ if (encrypted && decrypted) { byte *dest = writeptr + i * size; + byte* tmp = fil_encrypt_buf( iter.crypt_data, block->page.id.space(), block->page.id.page_no(), mach_read_from_8(src + FIL_PAGE_LSN), - src, callback.get_page_size(), dest); + src, block->zip_size(), dest, + full_crc32); if (tmp == src) { /* TODO: remove unnecessary memcpy's */ @@ -3653,6 +3654,26 @@ not_encrypted: updated = true; } + + /* Write checksum for the compressed full crc32 page.*/ + if (full_crc32 && page_compressed) { + ut_ad(updated); + byte* dest = writeptr + i * size; + ut_d(bool comp = false); + ut_d(bool corrupt = false); + ulint size = buf_page_full_crc32_size( + dest, +#ifdef UNIV_DEBUG + &comp, &corrupt +#else + NULL, NULL +#endif + ); + ut_ad(!comp == (size == srv_page_size)); + ut_ad(!corrupt); + mach_write_to_4(dest + (size - 4), + ut_crc32(dest, size - 4)); + } } /* A page was updated in the set, write back to disk. 
*/ @@ -3768,10 +3789,8 @@ fil_tablespace_iterate( if (err == DB_SUCCESS) { block->page.id = page_id_t(callback.get_space_id(), 0); - block->page.size.copy_from(callback.get_page_size()); - if (block->page.size.is_compressed()) { - page_zip_set_size(&block->page.zip, - callback.get_page_size().physical()); + if (ulint zip_size = callback.get_zip_size()) { + page_zip_set_size(&block->page.zip, zip_size); /* ROW_FORMAT=COMPRESSED is not optimised for block IO for now. We do the IMPORT page by page. */ n_io_buffers = 1; @@ -3781,7 +3800,7 @@ fil_tablespace_iterate( /* read (optional) crypt data */ iter.crypt_data = fil_space_read_crypt_data( - callback.get_page_size(), page); + callback.get_zip_size(), page); /* If tablespace is encrypted, it needs extra buffers */ if (iter.crypt_data && n_io_buffers > 1) { @@ -3922,7 +3941,7 @@ row_import_for_mysql( /* Prevent DDL operations while we are checking. */ - rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__); + rw_lock_s_lock(&dict_sys.latch); row_import cfg; @@ -3947,14 +3966,14 @@ row_import_for_mysql( autoinc = cfg.m_autoinc; } - rw_lock_s_unlock_gen(&dict_operation_lock, 0); + rw_lock_s_unlock(&dict_sys.latch); DBUG_EXECUTE_IF("ib_import_set_index_root_failure", err = DB_TOO_MANY_CONCURRENT_TRXS;); } else if (cfg.m_missing) { - rw_lock_s_unlock_gen(&dict_operation_lock, 0); + rw_lock_s_unlock(&dict_sys.latch); /* We don't have a schema file, we will have to discover the index root pages from the .ibd file and skip the schema @@ -3962,12 +3981,12 @@ row_import_for_mysql( ut_a(err == DB_FAIL); - cfg.m_page_size.copy_from(univ_page_size); + cfg.m_zip_size = 0; FetchIndexRootPages fetchIndexRootPages(table, trx); err = fil_tablespace_iterate( - table, IO_BUFFER_SIZE(cfg.m_page_size.physical()), + table, IO_BUFFER_SIZE(srv_page_size), fetchIndexRootPages); if (err == DB_SUCCESS) { @@ -3986,7 +4005,7 @@ row_import_for_mysql( space_flags = fetchIndexRootPages.get_space_flags(); } else { - 
rw_lock_s_unlock_gen(&dict_operation_lock, 0); + rw_lock_s_unlock(&dict_sys.latch); } if (err != DB_SUCCESS) { @@ -4005,7 +4024,8 @@ row_import_for_mysql( /* Set the IO buffer size in pages. */ err = fil_tablespace_iterate( - table, IO_BUFFER_SIZE(cfg.m_page_size.physical()), converter); + table, IO_BUFFER_SIZE(cfg.m_zip_size ? cfg.m_zip_size + : srv_page_size), converter); DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure", err = DB_TOO_MANY_CONCURRENT_TRXS;); @@ -4073,7 +4093,7 @@ row_import_for_mysql( /* Open the tablespace so that we can access via the buffer pool. We set the 2nd param (fix_dict = true) here because we already - have an x-lock on dict_operation_lock and dict_sys->mutex. + have an x-lock on dict_sys.latch and dict_sys.mutex. The tablespace is initially opened as a temporary one, because we will not be writing any redo log for it before we have invoked fil_space_t::set_imported() to declare it a persistent tablespace. */ @@ -4168,12 +4188,7 @@ row_import_for_mysql( any DB_ROW_ID stored in the table. 
*/ if (prebuilt->clust_index_was_generated) { - - err = row_import_set_sys_max_row_id(prebuilt, table); - - if (err != DB_SUCCESS) { - return(row_import_error(prebuilt, trx, err)); - } + row_import_set_sys_max_row_id(prebuilt, table); } ib::info() << "Phase III - Flush changes to disk"; diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index f659cd4a0a1..136458e7440 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -44,11 +44,8 @@ Created 4/20/1996 Heikki Tuuri #include "buf0lru.h" #include "fts0fts.h" #include "fts0types.h" - #ifdef WITH_WSREP -#include <mysql/service_wsrep.h> -#include "../../../wsrep/wsrep_api.h" -#include "wsrep_mysqld_c.h" +#include "wsrep_mysqld.h" #endif /* WITH_WSREP */ /************************************************************************* @@ -1002,11 +999,11 @@ func_exit: #ifdef WITH_WSREP dberr_t wsrep_append_foreign_key(trx_t *trx, - dict_foreign_t* foreign, - const rec_t* clust_rec, - dict_index_t* clust_index, - ibool referenced, - enum wsrep_key_type key_type); + dict_foreign_t* foreign, + const rec_t* clust_rec, + dict_index_t* clust_index, + ibool referenced, + Wsrep_service_key_type key_type); #endif /* WITH_WSREP */ /*********************************************************************//** @@ -1229,8 +1226,10 @@ row_ins_foreign_check_on_constraint( } if (table->fts) { - doc_id = fts_get_doc_id_from_rec(table, clust_rec, - clust_index, tmp_heap); + doc_id = fts_get_doc_id_from_rec( + clust_rec, clust_index, + rec_get_offsets(clust_rec, clust_index, NULL, true, + ULINT_UNDEFINED, &tmp_heap)); } if (node->is_delete @@ -1388,8 +1387,9 @@ row_ins_foreign_check_on_constraint( cascade->state = UPD_NODE_UPDATE_CLUSTERED; #ifdef WITH_WSREP - err = wsrep_append_foreign_key(trx, foreign, cascade->pcur->old_rec, clust_index, - FALSE, WSREP_KEY_EXCLUSIVE); + err = wsrep_append_foreign_key(trx, foreign, cascade->pcur->old_rec, + clust_index, + FALSE, 
WSREP_SERVICE_KEY_EXCLUSIVE); if (err != DB_SUCCESS) { fprintf(stderr, "WSREP: foreign key append failed: %d\n", err); @@ -1501,7 +1501,7 @@ row_ins_set_exclusive_rec_lock( /***************************************************************//** Checks if foreign key constraint fails for an index entry. Sets shared locks which lock either the success or the failure of the constraint. NOTE that -the caller must have a shared latch on dict_operation_lock. +the caller must have a shared latch on dict_sys.latch. @return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ dberr_t row_ins_check_foreign_constraint( @@ -1542,7 +1542,7 @@ row_ins_check_foreign_constraint( upd_node= NULL; #endif /* WITH_WSREP */ - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_S)); + ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S)); err = DB_SUCCESS; @@ -1781,32 +1781,16 @@ row_ins_check_foreign_constraint( if (check_ref) { err = DB_SUCCESS; #ifdef WITH_WSREP - if (!trx->is_wsrep()) { - goto end_scan; - } - enum wsrep_key_type key_type; - if (upd_node != NULL) { - key_type = WSREP_KEY_SHARED; - } else { - switch (wsrep_certification_rules) { - default: - case WSREP_CERTIFICATION_RULES_STRICT: - key_type = WSREP_KEY_EXCLUSIVE; - break; - case WSREP_CERTIFICATION_RULES_OPTIMIZED: - key_type = WSREP_KEY_SEMI; - break; - } - } - err = wsrep_append_foreign_key( - trx, + thr_get_trx(thr), foreign, rec, check_index, check_ref, - key_type); - + (upd_node != NULL + && wsrep_protocol_version < 4) + ? WSREP_SERVICE_KEY_SHARED + : WSREP_SERVICE_KEY_REFERENCE); if (err != DB_SUCCESS) { fprintf(stderr, "WSREP: foreign key append failed: %d\n", err); @@ -1989,7 +1973,7 @@ row_ins_check_foreign_constraints( } /* NOTE that if the thread ends up waiting for a lock - we will release dict_operation_lock temporarily! + we will release dict_sys.latch temporarily! But the counter on the table protects the referenced table from being dropped while the check is running. 
*/ @@ -2589,25 +2573,32 @@ row_ins_clust_index_entry_low( } else { index->set_modified(mtr); - if (mode == BTR_MODIFY_LEAF - && dict_index_is_online_ddl(index)) { - mode = BTR_MODIFY_LEAF_ALREADY_S_LATCHED; - mtr_s_lock_index(index, &mtr); - } + if (UNIV_UNLIKELY(entry->is_metadata())) { + ut_ad(index->is_instant()); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(mode == BTR_MODIFY_TREE); + } else { + if (mode == BTR_MODIFY_LEAF + && dict_index_is_online_ddl(index)) { + mode = BTR_MODIFY_LEAF_ALREADY_S_LATCHED; + mtr_s_lock_index(index, &mtr); + } - if (unsigned ai = index->table->persistent_autoinc) { - /* Prepare to persist the AUTO_INCREMENT value - from the index entry to PAGE_ROOT_AUTO_INC. */ - const dfield_t* dfield = dtuple_get_nth_field( - entry, ai - 1); - auto_inc = dfield_is_null(dfield) - ? 0 - : row_parse_int(static_cast<const byte*>( + if (unsigned ai = index->table->persistent_autoinc) { + /* Prepare to persist the AUTO_INCREMENT value + from the index entry to PAGE_ROOT_AUTO_INC. 
*/ + const dfield_t* dfield = dtuple_get_nth_field( + entry, ai - 1); + if (!dfield_is_null(dfield)) { + auto_inc = row_parse_int( + static_cast<const byte*>( dfield->data), dfield->len, dfield->type.mtype, dfield->type.prtype & DATA_UNSIGNED); + } + } } } @@ -2637,35 +2628,25 @@ row_ins_clust_index_entry_low( #endif /* UNIV_DEBUG */ if (UNIV_UNLIKELY(entry->info_bits != 0)) { - ut_ad(entry->info_bits == REC_INFO_METADATA); + ut_ad(entry->is_metadata()); ut_ad(flags == BTR_NO_LOCKING_FLAG); ut_ad(index->is_instant()); ut_ad(!dict_index_is_online_ddl(index)); const rec_t* rec = btr_cur_get_rec(cursor); - switch (rec_get_info_bits(rec, page_rec_is_comp(rec)) - & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) { - case REC_INFO_MIN_REC_FLAG: + if (rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG) { thr_get_trx(thr)->error_info = index; err = DB_DUPLICATE_KEY; goto err_exit; - case REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG: - /* The metadata record never carries the delete-mark - in MariaDB Server 10.3. - If a table loses its 'instantness', it happens - by the rollback of this first-time insert, or - by a call to btr_page_empty() on the root page - when the table becomes empty. */ - err = DB_CORRUPTION; - goto err_exit; - default: - ut_ad(!row_ins_must_modify_rec(cursor)); - goto do_insert; } + + ut_ad(!row_ins_must_modify_rec(cursor)); + goto do_insert; } - if (rec_is_metadata(btr_cur_get_rec(cursor), index)) { + if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) { goto do_insert; } @@ -3025,9 +3006,9 @@ row_ins_sec_index_entry_low( if (!index->is_committed()) { ut_ad(!thr_get_trx(thr) ->dict_operation_lock_mode); - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); dict_set_corrupted_index_cache_only(index); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); /* Do not return any error to the caller. The duplicate will be reported by ALTER TABLE or CREATE UNIQUE INDEX. 
@@ -3170,9 +3151,27 @@ row_ins_clust_index_entry( n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0; +#ifdef WITH_WSREP + const bool skip_locking + = wsrep_thd_skip_locking(thr_get_trx(thr)->mysql_thd); + ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK + : (index->table->is_temporary() || skip_locking) + ? BTR_NO_LOCKING_FLAG : 0; +#ifdef UNIV_DEBUG + if (skip_locking && strcmp(wsrep_get_sr_table_name(), + index->table->name.m_name)) { + WSREP_ERROR("Record locking is disabled in this thread, " + "but the table being modified is not " + "`%s`: `%s`.", wsrep_get_sr_table_name(), + index->table->name.m_name); + ut_error; + } +#endif /* UNIV_DEBUG */ +#else ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : index->table->is_temporary() ? BTR_NO_LOCKING_FLAG : 0; +#endif /* WITH_WSREP */ const ulint orig_n_fields = entry->n_fields; /* Try first optimistic descent to the B-tree */ @@ -3393,6 +3392,24 @@ row_ins_index_entry_set_vals( ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(index->table)); row_field = dtuple_get_nth_v_field(row, v_col->v_pos); + } else if (col->is_dropped()) { + ut_ad(index->is_primary()); + + if (!(col->prtype & DATA_NOT_NULL)) { + field->data = NULL; + field->len = UNIV_SQL_NULL; + field->type.prtype = DATA_BINARY_TYPE; + } else { + ut_ad(col->len <= sizeof field_ref_zero); + ut_ad(ind_field->fixed_len <= col->len); + dfield_set_data(field, field_ref_zero, + ind_field->fixed_len); + field->type.prtype = DATA_NOT_NULL; + } + + field->type.mtype = col->len + ? 
DATA_FIXBINARY : DATA_BINARY; + continue; } else { row_field = dtuple_get_nth_field( row, ind_field->col->ind); @@ -3402,7 +3419,7 @@ row_ins_index_entry_set_vals( /* Check column prefix indexes */ if (ind_field != NULL && ind_field->prefix_len > 0 - && dfield_get_len(row_field) != UNIV_SQL_NULL) { + && len != UNIV_SQL_NULL) { const dict_col_t* col = dict_field_get_col(ind_field); diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index 97cd7c2a92b..26c17dac1fc 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -42,7 +42,7 @@ Created 2011-05-26 Marko Makela #include <algorithm> #include <map> -ulint onlineddl_rowlog_rows; +Atomic_counter<ulint> onlineddl_rowlog_rows; ulint onlineddl_rowlog_pct_used; ulint onlineddl_pct_progress; @@ -293,7 +293,8 @@ row_log_block_allocate( ); log_buf.block = ut_allocator<byte>(mem_key_row_log_buf) - .allocate_large(srv_sort_buf_size, &log_buf.block_pfx); + .allocate_large(srv_sort_buf_size, + &log_buf.block_pfx); if (log_buf.block == NULL) { DBUG_RETURN(false); @@ -313,7 +314,8 @@ row_log_block_free( DBUG_ENTER("row_log_block_free"); if (log_buf.block != NULL) { ut_allocator<byte>(mem_key_row_log_buf).deallocate_large( - log_buf.block, &log_buf.block_pfx, log_buf.size); + log_buf.block, &log_buf.block_pfx, + log_buf.size); log_buf.block = NULL; } DBUG_VOID_RETURN; @@ -618,7 +620,7 @@ write_failed: err_exit: mutex_exit(&log->mutex); - my_atomic_addlint(&onlineddl_rowlog_rows, 1); + onlineddl_rowlog_rows++; /* 10000 means 100.00%, 4525 means 45.25% */ onlineddl_rowlog_pct_used = static_cast<ulint>((log->tail.total * 10000) / srv_online_max_size); } @@ -696,9 +698,9 @@ row_log_table_delete( fields of the record. 
*/ heap = mem_heap_create( DATA_TRX_ID_LEN - + DTUPLE_EST_ALLOC(unsigned(new_index->n_uniq) + 2)); - old_pk = tuple = dtuple_create( - heap, unsigned(new_index->n_uniq) + 2); + + DTUPLE_EST_ALLOC(new_index->first_user_field())); + old_pk = tuple = dtuple_create(heap, + new_index->first_user_field()); dict_index_copy_types(tuple, new_index, tuple->n_fields); dtuple_set_n_fields_cmp(tuple, new_index->n_uniq); @@ -863,7 +865,7 @@ row_log_table_low_redundant( const bool is_instant = index->online_log->is_instant(index); rec_comp_status_t status = is_instant - ? REC_STATUS_COLUMNS_ADDED : REC_STATUS_ORDINARY; + ? REC_STATUS_INSTANT : REC_STATUS_ORDINARY; size = rec_get_converted_size_temp( index, tuple->fields, tuple->n_fields, &extra_size, status); @@ -917,7 +919,7 @@ row_log_table_low_redundant( *b++ = static_cast<byte>(extra_size); } - if (status == REC_STATUS_COLUMNS_ADDED) { + if (status == REC_STATUS_INSTANT) { ut_ad(is_instant); if (n_fields <= index->online_log->n_core_fields) { status = REC_STATUS_ORDINARY; @@ -983,7 +985,7 @@ row_log_table_low( ut_ad(!"wrong page type"); } #endif /* UNIV_DEBUG */ - ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!rec_is_metadata(rec, *index)); ut_ad(page_rec_is_leaf(rec)); ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets)); /* old_pk=row_log_table_get_pk() [not needed in INSERT] is a prefix @@ -1006,7 +1008,7 @@ row_log_table_low( ut_ad(page_is_comp(page_align(rec))); ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY - || rec_get_status(rec) == REC_STATUS_COLUMNS_ADDED); + || rec_get_status(rec) == REC_STATUS_INSTANT); const ulint omit_size = REC_N_NEW_EXTRA_BYTES; @@ -1080,7 +1082,7 @@ row_log_table_low( if (is_instant) { *b++ = fake_extra_size - ? REC_STATUS_COLUMNS_ADDED + ? REC_STATUS_INSTANT : rec_get_status(rec); } else { ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); @@ -1147,13 +1149,14 @@ row_log_table_get_pk_old_col( /** Maps an old table column of a PRIMARY KEY column. 
@param[in] ifield clustered index field in the new table (after ALTER TABLE) +@param[in] index the clustered index of ifield @param[in,out] dfield clustered index tuple field in the new table @param[in,out] heap memory heap for allocating dfield contents @param[in] rec clustered index leaf page record in the old table @param[in] offsets rec_get_offsets(rec) @param[in] i rec field corresponding to col -@param[in] page_size page size of the old table +@param[in] zip_size ROW_FORMAT=COMPRESSED size of the old table @param[in] max_len maximum length of dfield @param[in] log row log for the table @retval DB_INVALID_NULL if a NULL value is encountered @@ -1162,12 +1165,13 @@ static dberr_t row_log_table_get_pk_col( const dict_field_t* ifield, + const dict_index_t* index, dfield_t* dfield, mem_heap_t* heap, const rec_t* rec, const rec_offs* offsets, ulint i, - const page_size_t& page_size, + ulint zip_size, ulint max_len, const row_log_t* log) { @@ -1211,7 +1215,7 @@ row_log_table_get_pk_col( mem_heap_alloc(heap, field_len)); len = btr_copy_externally_stored_field_prefix( - blob_field, field_len, page_size, field, len); + blob_field, field_len, zip_size, field, len); if (len >= max_len + 1) { return(DB_TOO_BIG_INDEX_COL); } @@ -1262,19 +1266,16 @@ row_log_table_get_pk( ulint trx_id_offs = index->trx_id_offset; if (!trx_id_offs) { - ulint pos = dict_index_get_sys_col_pos( - index, DATA_TRX_ID); ulint len; - ut_ad(pos > 0); if (!offsets) { offsets = rec_get_offsets( rec, index, NULL, true, - pos + 1, heap); + index->db_trx_id() + 1, heap); } trx_id_offs = rec_get_nth_field_offs( - offsets, pos, &len); + offsets, index->db_trx_id(), &len); ut_ad(len == DATA_TRX_ID_LEN); } @@ -1329,8 +1330,7 @@ row_log_table_get_pk( const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table); - const page_size_t& page_size - = dict_table_page_size(index->table); + const ulint zip_size = index->table->space->zip_size(); for (ulint new_i = 0; new_i < new_n_uniq; new_i++) { dict_field_t* 
ifield; @@ -1356,8 +1356,9 @@ row_log_table_get_pk( } log->error = row_log_table_get_pk_col( - ifield, dfield, *heap, - rec, offsets, i, page_size, max_len, log); + ifield, new_index, dfield, *heap, + rec, offsets, i, zip_size, max_len, + log); if (log->error != DB_SUCCESS) { err_exit: @@ -1577,11 +1578,17 @@ row_log_table_apply_convert_mrec( const dict_col_t* col = dict_field_get_col(ind_field); + if (col->is_dropped()) { + /* the column was instantly dropped earlier */ + ut_ad(index->table->instant); + continue; + } + ulint col_no = log->col_map[dict_col_get_no(col)]; if (col_no == ULINT_UNDEFINED) { - /* dropped column */ + /* the column is being dropped now */ continue; } @@ -1618,7 +1625,7 @@ row_log_table_apply_convert_mrec( data = btr_rec_copy_externally_stored_field( mrec, offsets, - dict_table_page_size(index->table), + index->table->space->zip_size(), i, &len, heap); ut_a(data); dfield_set_data(dfield, data, len); @@ -1656,12 +1663,12 @@ blob_done: /* See if any columns were changed to NULL or NOT NULL. */ const dict_col_t* new_col = dict_table_get_nth_col(log->table, col_no); - ut_ad(new_col->mtype == col->mtype); + ut_ad(new_col->same_format(*col)); /* Assert that prtype matches except for nullability. */ - ut_ad(!((new_col->prtype ^ col->prtype) & ~DATA_NOT_NULL)); ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype) - & ~DATA_NOT_NULL)); + & ~(DATA_NOT_NULL | DATA_VERSIONED + | CHAR_COLL_MASK << 16 | DATA_LONG_TRUE_VARCHAR))); if (new_col->prtype == col->prtype) { continue; @@ -1940,8 +1947,7 @@ row_log_table_apply_delete( btr_pcur_t pcur; rec_offs* offsets; - ut_ad(rec_offs_n_fields(moffsets) - == dict_index_get_n_unique(index) + 2); + ut_ad(rec_offs_n_fields(moffsets) == index->first_user_field()); ut_ad(!rec_offs_any_extern(moffsets)); /* Convert the row to a search tuple. */ @@ -2508,8 +2514,7 @@ row_log_table_apply_op( /* The ROW_T_DELETE record was converted by rec_convert_dtuple_to_temp() using new_index. 
*/ ut_ad(!new_index->is_instant()); - rec_offs_set_n_fields(offsets, - unsigned(new_index->n_uniq) + 2); + rec_offs_set_n_fields(offsets, new_index->first_user_field()); rec_init_offsets_temp(mrec, new_index, offsets); next_mrec = mrec + rec_offs_data_size(offsets); if (next_mrec > mrec_end) { @@ -2601,7 +2606,7 @@ row_log_table_apply_op( rec_convert_dtuple_to_temp() using new_index. */ ut_ad(!new_index->is_instant()); rec_offs_set_n_fields(offsets, - unsigned(new_index->n_uniq) + 2); + new_index->first_user_field()); rec_init_offsets_temp(mrec, new_index, offsets); next_mrec = mrec + rec_offs_data_size(offsets); @@ -2611,13 +2616,12 @@ row_log_table_apply_op( /* Copy the PRIMARY KEY fields and DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */ - old_pk = dtuple_create( - heap, unsigned(new_index->n_uniq) + 2); + old_pk = dtuple_create(heap, + new_index->first_user_field()); dict_index_copy_types(old_pk, new_index, old_pk->n_fields); - for (ulint i = 0; - i < dict_index_get_n_unique(new_index) + 2; + for (ulint i = 0; i < new_index->first_user_field(); i++) { const void* field; ulint len; @@ -2699,8 +2703,8 @@ ulint row_log_progress_inc_per_block() { /* We must increment the progress once per page (as in - univ_page_size, usually 16KiB). One block here is srv_sort_buf_size - (usually 1MiB). */ + srv_page_size, default = innodb_page_size=16KiB). + One block here is srv_sort_buf_size (usually 1MiB). 
*/ const ulint pages_per_block = std::max<ulint>( ulint(srv_sort_buf_size >> srv_page_size_shift), 1); @@ -2768,8 +2772,8 @@ row_log_table_apply_ops( dict_index_t* new_index = dict_table_get_first_index( new_table); const ulint i = 1 + REC_OFFS_HEADER_SIZE - + ut_max(dict_index_get_n_fields(index), - dict_index_get_n_unique(new_index) + 2); + + std::max<ulint>(index->n_fields, + new_index->first_user_field()); const ulint new_trx_id_col = dict_col_get_clust_pos( dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index); trx_t* trx = thr_get_trx(thr); @@ -3132,7 +3136,7 @@ row_log_table_apply( stage->begin_phase_log_table(); - ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_S)); + ut_ad(!rw_lock_own(&dict_sys.latch, RW_LOCK_S)); clust_index = dict_table_get_first_index(old_table); if (clust_index->online_log->n_rows == 0) { @@ -3230,7 +3234,8 @@ row_log_allocate( log->head.total = 0; log->path = path; log->n_core_fields = index->n_core_fields; - ut_ad(!table || log->is_instant(index) == index->is_instant()); + ut_ad(!table || log->is_instant(index) + == (index->n_core_fields < index->n_fields)); log->allow_not_null = allow_not_null; log->old_table = old_table; log->n_rows = 0; diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 3d21d1d2efc..e1ecac82457 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -437,7 +437,7 @@ row_merge_buf_redundant_convert( const dfield_t* row_field, dfield_t* field, ulint len, - const page_size_t& page_size, + ulint zip_size, mem_heap_t* heap) { ut_ad(field->type.mbminlen == 1); @@ -457,7 +457,7 @@ row_merge_buf_redundant_convert( field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); byte* data = btr_copy_externally_stored_field( - &ext_len, field_data, page_size, field_len, heap); + &ext_len, field_data, zip_size, field_len, heap); ut_ad(ext_len < len); @@ -699,13 +699,13 @@ row_merge_buf_add( if (conv_heap != NULL) { row_merge_buf_redundant_convert( row_field, field, 
col->len, - dict_table_page_size(old_table), + old_table->space->zip_size(), conv_heap); } else { /* Field length mismatch should not happen when rebuilding redundant row format table. */ - ut_ad(dict_table_is_comp(index->table)); + ut_ad(index->table->not_redundant()); } } } @@ -1856,7 +1856,7 @@ row_merge_read_clustered_index( btr_pcur_open_at_index_side( true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); btr_pcur_move_to_next_user_rec(&pcur, &mtr); - if (rec_is_metadata(btr_pcur_get_rec(&pcur), clust_index)) { + if (rec_is_metadata(btr_pcur_get_rec(&pcur), *clust_index)) { ut_ad(btr_pcur_is_on_user_rec(&pcur)); /* Skip the metadata pseudo-record. */ } else { @@ -1971,8 +1971,7 @@ row_merge_read_clustered_index( goto scan_next; } - if (my_atomic_load32_explicit(&clust_index->lock.waiters, - MY_MEMORY_ORDER_RELAXED)) { + if (clust_index->lock.waiters) { /* There are waiters on the clustered index tree lock, likely the purge thread. Store and restore the cursor @@ -2030,7 +2029,7 @@ end_of_index: block = btr_block_get( page_id_t(block->page.id.space(), next_page_no), - block->page.size, + block->zip_size(), BTR_SEARCH_LEAF, clust_index, &mtr); @@ -3428,7 +3427,7 @@ void row_merge_copy_blobs( const mrec_t* mrec, const rec_offs* offsets, - const page_size_t& page_size, + ulint zip_size, dtuple_t* tuple, mem_heap_t* heap) { @@ -3466,10 +3465,10 @@ row_merge_copy_blobs( BTR_EXTERN_FIELD_REF_SIZE)); data = btr_copy_externally_stored_field( - &len, field_data, page_size, field_len, heap); + &len, field_data, zip_size, field_len, heap); } else { data = btr_rec_copy_externally_stored_field( - mrec, offsets, page_size, i, &len, heap); + mrec, offsets, zip_size, i, &len, heap); } /* Because we have locked the table, any records @@ -3661,8 +3660,7 @@ row_merge_insert_index_tuples( row_log_table_blob_alloc() and row_log_table_blob_free(). 
*/ row_merge_copy_blobs( - mrec, offsets, - dict_table_page_size(old_table), + mrec, offsets, old_table->space->zip_size(), dtuple, tuple_heap); } @@ -3745,10 +3743,9 @@ row_merge_drop_index_dict( pars_info_t* info; ut_ad(!srv_read_only_mode); - ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); info = pars_info_create(); pars_info_add_ull_literal(info, "indexid", index_id); @@ -3808,17 +3805,16 @@ row_merge_drop_indexes_dict( pars_info_t* info; ut_ad(!srv_read_only_mode); - ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); /* It is possible that table->n_ref_count > 1 when locked=TRUE. In this case, all code that should have an open handle to the table be waiting for the next statement to execute, or waiting for a meta-data lock. - A concurrent purge will be prevented by dict_operation_lock. */ + A concurrent purge will be prevented by dict_sys.latch. */ info = pars_info_create(); pars_info_add_ull_literal(info, "tableid", table_id); @@ -3858,10 +3854,9 @@ row_merge_drop_indexes( dict_index_t* next_index; ut_ad(!srv_read_only_mode); - ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); index = dict_table_get_first_index(table); ut_ad(dict_index_is_clust(index)); @@ -3875,7 +3870,7 @@ row_merge_drop_indexes( handle to the table be waiting for the next statement to execute, or waiting for a meta-data lock. - A concurrent purge will be prevented by dict_operation_lock. */ + A concurrent purge will be prevented by dict_sys.latch. 
*/ if (!locked && (table->get_ref_count() > 1 || UT_LIST_GET_FIRST(table->locks))) { @@ -3945,7 +3940,7 @@ row_merge_drop_indexes( rw_lock_x_unlock(dict_index_get_lock(index)); DEBUG_SYNC_C("merge_drop_index_after_abort"); - /* covered by dict_sys->mutex */ + /* covered by dict_sys.mutex */ MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX); /* fall through */ case ONLINE_INDEX_ABORTED: @@ -4009,7 +4004,7 @@ row_merge_drop_indexes( break; case ONLINE_INDEX_ABORTED: case ONLINE_INDEX_ABORTED_DROPPED: - /* covered by dict_sys->mutex */ + /* covered by dict_sys.mutex */ MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX); } @@ -4316,7 +4311,7 @@ row_merge_rename_tables_dict( ut_ad(!srv_read_only_mode); ut_ad(old_table != new_table); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE || trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); @@ -4685,7 +4680,7 @@ row_merge_build_indexes( created */ if (!row_fts_psort_info_init( trx, dup, new_table, opt_doc_id_size, - dict_table_page_size(old_table), + old_table->space->zip_size(), &psort_info, &merge_info)) { error = DB_CORRUPTION; goto func_exit; @@ -5001,7 +4996,8 @@ func_exit: alloc.deallocate_large(block, &block_pfx, block_size); if (crypt_block) { - alloc.deallocate_large(crypt_block, &crypt_pfx, block_size); + alloc.deallocate_large(crypt_block, &crypt_pfx, + block_size); } DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID); diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index b2256e9905d..a8b83ad65b0 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -34,7 +34,6 @@ Created 9/17/2000 Heikki Tuuri #include "btr0sea.h" #include "dict0boot.h" #include "dict0crea.h" -#include <sql_const.h> #include "dict0dict.h" #include "dict0load.h" #include "dict0priv.h" @@ -327,6 +326,7 @@ row_mysql_read_geometry( ulint col_len) /*!< in: MySQL 
format length */ { byte* data; + ut_ad(col_len > 8); *len = mach_read_from_n_little_endian(ref, col_len - 8); @@ -826,7 +826,8 @@ row_create_prebuilt( clust_index = dict_table_get_first_index(table); /* Make sure that search_tuple is long enough for clustered index */ - ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields); + ut_a(2 * unsigned(table->n_cols) >= unsigned(clust_index->n_fields) + - clust_index->table->n_dropped()); ref_len = dict_index_get_n_unique(clust_index); @@ -2116,7 +2117,7 @@ row_mysql_freeze_data_dictionary_func( { ut_a(trx->dict_operation_lock_mode == 0); - rw_lock_s_lock_inline(&dict_operation_lock, 0, file, line); + rw_lock_s_lock_inline(&dict_sys.latch, 0, file, line); trx->dict_operation_lock_mode = RW_S_LATCH; } @@ -2132,7 +2133,7 @@ row_mysql_unfreeze_data_dictionary( ut_a(trx->dict_operation_lock_mode == RW_S_LATCH); - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); trx->dict_operation_lock_mode = 0; } @@ -2323,14 +2324,8 @@ row_mysql_lock_data_dictionary_func( { ut_a(trx->dict_operation_lock_mode == 0 || trx->dict_operation_lock_mode == RW_X_LATCH); - - /* Serialize data dictionary operations with dictionary mutex: - no deadlocks or lock waits can occur then in these operations */ - - rw_lock_x_lock_inline(&dict_operation_lock, 0, file, line); + dict_sys.lock(file, line); trx->dict_operation_lock_mode = RW_X_LATCH; - - mutex_enter(&dict_sys->mutex); } /*********************************************************************//** @@ -2341,16 +2336,9 @@ row_mysql_unlock_data_dictionary( trx_t* trx) /*!< in/out: transaction */ { ut_ad(lock_trx_has_sys_table_locks(trx) == NULL); - ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); - - /* Serialize data dictionary operations with dictionary mutex: - no deadlocks can occur then in these operations */ - - mutex_exit(&dict_sys->mutex); - rw_lock_x_unlock(&dict_operation_lock); - trx->dict_operation_lock_mode = 0; + dict_sys.unlock(); } 
/*********************************************************************//** @@ -2372,8 +2360,7 @@ row_create_table_for_mysql( que_thr_t* thr; dberr_t err; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); DBUG_EXECUTE_IF( @@ -2513,8 +2500,7 @@ row_create_index_for_mysql( ulint len; dict_table_t* table = index->table; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_d(dict_sys.assert_locked()); for (i = 0; i < index->n_def; i++) { /* Check that prefix_len and actual length @@ -2760,7 +2746,7 @@ row_mysql_drop_garbage_tables() mtr.start(); btr_pcur_open_at_index_side( - true, dict_table_get_first_index(dict_sys->sys_tables), + true, dict_table_get_first_index(dict_sys.sys_tables), BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); for (;;) { @@ -2870,11 +2856,15 @@ row_mysql_table_id_reassign( dberr_t err; pars_info_t* info = pars_info_create(); - dict_hdr_get_new_id(new_id, NULL, NULL, table, false); + dict_hdr_get_new_id(new_id, NULL, NULL); pars_info_add_ull_literal(info, "old_id", table->id); pars_info_add_ull_literal(info, "new_id", *new_id); + /* Note: This cannot be rolled back. Rollback would see the + UPDATE SYS_INDEXES as two operations: DELETE and INSERT. + It would invoke btr_free_if_exists() when rolling back the + INSERT, effectively dropping all indexes of the table. */ err = que_eval_sql( info, "PROCEDURE RENUMBER_TABLE_PROC () IS\n" @@ -3104,7 +3094,7 @@ row_discard_tablespace( dict_table_change_id_in_cache(table, new_id); dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); - if (index) index->remove_instant(); + if (index) index->clear_instant_alter(); /* Reset the root page numbers. 
*/ for (; index; index = UT_LIST_GET_NEXT(indexes, index)) { @@ -3174,6 +3164,12 @@ row_discard_tablespace_for_mysql( err = row_discard_tablespace_foreign_key_checks(trx, table); if (err == DB_SUCCESS) { + /* Note: This cannot be rolled back. + Rollback would see the UPDATE SYS_INDEXES + as two operations: DELETE and INSERT. + It would invoke btr_free_if_exists() + when rolling back the INSERT, effectively + dropping all indexes of the table. */ err = row_discard_tablespace(trx, table); } @@ -3306,7 +3302,7 @@ row_drop_table_from_cache( is going to be destroyed below. */ trx->mod_tables.erase(table); - dict_table_remove_from_cache(table); + dict_sys.remove(table); if (dict_load_table(tablename, DICT_ERR_IGNORE_FK_NOKEY)) { ib::error() << "Not able to remove table " @@ -3328,7 +3324,7 @@ will remain locked. @param[in] create_failed true=create table failed because e.g. foreign key column @param[in] nonatomic Whether it is permitted to release - and reacquire dict_operation_lock + and reacquire dict_sys.latch @return error code or DB_SUCCESS */ dberr_t row_drop_table_for_mysql( @@ -3367,8 +3363,7 @@ row_drop_table_for_mysql( nonatomic = true; } - ut_ad(mutex_own(&dict_sys->mutex)); - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_X)); + ut_d(dict_sys.assert_locked()); table = dict_table_open_on_name( name, TRUE, FALSE, @@ -3392,15 +3387,14 @@ row_drop_table_for_mysql( for (dict_index_t* index = dict_table_get_first_index(table); index != NULL; index = dict_table_get_next_index(index)) { - btr_free(page_id_t(SRV_TMP_SPACE_ID, index->page), - univ_page_size); + btr_free(page_id_t(SRV_TMP_SPACE_ID, index->page)); } /* Remove the pointer to this table object from the list of modified tables by the transaction because the object is going to be destroyed below. 
*/ trx->mod_tables.erase(table); table->release(); - dict_table_remove_from_cache(table); + dict_sys.remove(table); err = DB_SUCCESS; goto funct_exit_all_freed; } @@ -4022,8 +4016,8 @@ loop: /* The dict_table_t object must not be accessed before dict_table_open() or after dict_table_close(). But this is OK - if we are holding, the dict_sys->mutex. */ - ut_ad(mutex_own(&dict_sys->mutex)); + if we are holding, the dict_sys.mutex. */ + ut_ad(mutex_own(&dict_sys.mutex)); /* Disable statistics on the found table. */ if (!dict_stats_stop_bg(table)) { diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index d299f948d7c..e9eaf27977d 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -102,33 +102,32 @@ row_purge_remove_clust_if_poss_low( purge_node_t* node, /*!< in/out: row purge node */ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ { - dict_index_t* index; - bool success = true; - mtr_t mtr; - rec_t* rec; - mem_heap_t* heap = NULL; - rec_offs* offsets; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs_init(offsets_); - - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_S) + ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S) || node->vcol_info.is_used()); - index = dict_table_get_first_index(node->table); + dict_index_t* index = dict_table_get_first_index(node->table); log_free_check(); - mtr_start(&mtr); - index->set_modified(mtr); + + mtr_t mtr; + mtr.start(); if (!row_purge_reposition_pcur(mode, node, &mtr)) { /* The record was already removed. 
*/ - goto func_exit; + mtr.commit(); + return true; } - rec = btr_pcur_get_rec(&node->pcur); + ut_d(const bool was_instant = !!index->table->instant); + index->set_modified(mtr); - offsets = rec_get_offsets( + rec_t* rec = btr_pcur_get_rec(&node->pcur); + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets( rec, index, offsets_, true, ULINT_UNDEFINED, &heap); + bool success = true; if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) { /* Someone else has modified the record later: do not remove */ @@ -161,6 +160,10 @@ row_purge_remove_clust_if_poss_low( } } + /* Prove that dict_index_t::clear_instant_alter() was + not called with index->table->instant != NULL. */ + ut_ad(!was_instant || index->table->instant); + func_exit: if (heap) { mem_heap_free(heap); @@ -787,7 +790,7 @@ whose old history can no longer be observed. @param[in,out] mtr mini-transaction (will be started and committed) */ static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr) { - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_S) + ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S) || node->vcol_info.is_used()); /* Reset DB_TRX_ID, DB_ROLL_PTR for old records. 
*/ mtr->start(); @@ -820,8 +823,9 @@ static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr) became purgeable) */ if (node->roll_ptr == row_get_rec_roll_ptr(rec, index, offsets)) { - ut_ad(!rec_get_deleted_flag(rec, - rec_offs_comp(offsets))); + ut_ad(!rec_get_deleted_flag( + rec, rec_offs_comp(offsets)) + || rec_is_alter_metadata(rec, *index)); DBUG_LOG("purge", "reset DB_TRX_ID=" << ib::hex(row_get_rec_trx_id( rec, index, offsets))); @@ -863,7 +867,7 @@ row_purge_upd_exist_or_extern_func( { mem_heap_t* heap; - ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_S) + ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S) || node->vcol_info.is_used()); ut_ad(!node->table->skip_alter_undo); @@ -971,7 +975,7 @@ skip_secondaries: block = buf_page_get( page_id_t(rseg->space->id, page_no), - univ_page_size, RW_X_LATCH, &mtr); + 0, RW_X_LATCH, &mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); @@ -1062,7 +1066,7 @@ row_purge_parse_undo_rec( for this row */ try_again: - rw_lock_s_lock_inline(&dict_operation_lock, 0, __FILE__, __LINE__); + rw_lock_s_lock_inline(&dict_sys.latch, 0, __FILE__, __LINE__); node->table = dict_table_open_on_id( table_id, FALSE, DICT_TABLE_OP_NORMAL); @@ -1093,7 +1097,7 @@ try_again: if (!mysqld_server_started) { dict_table_close(node->table, FALSE, FALSE); - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { return(false); } @@ -1123,7 +1127,7 @@ inaccessible: dict_table_close(node->table, FALSE, FALSE); node->table = NULL; err_exit: - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); node->skip(table_id, trx_id); return(false); } @@ -1258,10 +1262,10 @@ row_purge( node, undo_rec, thr, updated_extern); if (!node->vcol_info.is_used()) { - rw_lock_s_unlock(&dict_operation_lock); + rw_lock_s_unlock(&dict_sys.latch); } - ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_S)); + ut_ad(!rw_lock_own(&dict_sys.latch, RW_LOCK_S)); if (purged || 
srv_shutdown_state > SRV_SHUTDOWN_INITIATED diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc index 94a372bd046..02c8c495e88 100644 --- a/storage/innobase/row/row0quiesce.cc +++ b/storage/innobase/row/row0quiesce.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -70,17 +70,16 @@ row_quiesce_write_index_fields( return(DB_IO_ERROR); } + const char* field_name = field->name ? field->name : ""; /* Include the NUL byte in the length. */ - ib_uint32_t len = static_cast<ib_uint32_t>(strlen(field->name) + 1); - ut_a(len > 1); - + ib_uint32_t len = static_cast<ib_uint32_t>(strlen(field_name) + 1); mach_write_to_4(row, len); DBUG_EXECUTE_IF("ib_export_io_write_failure_10", close(fileno(file));); if (fwrite(row, 1, sizeof(len), file) != sizeof(len) - || fwrite(field->name, 1, len, file) != len) { + || fwrite(field_name, 1, len, file) != len) { ib_senderrf( thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, @@ -670,8 +669,11 @@ row_quiesce_set_state( } row_mysql_lock_data_dictionary(trx); - - dict_table_x_lock_indexes(table); + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_lock(&index->lock); + } switch (state) { case QUIESCE_START: @@ -688,7 +690,11 @@ row_quiesce_set_state( table->quiesce = state; - dict_table_x_unlock_indexes(table); + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_unlock(&index->lock); + } row_mysql_unlock_data_dictionary(trx); diff --git a/storage/innobase/row/row0row.cc 
b/storage/innobase/row/row0row.cc index bcb128f2870..f37b810b7eb 100644 --- a/storage/innobase/row/row0row.cc +++ b/storage/innobase/row/row0row.cc @@ -62,6 +62,10 @@ static bool row_build_spatial_index_key( ulint flag, mem_heap_t* heap) { + if (dfield2->type.mtype == DATA_MISSING) { + return false; + } + double* mbr; dfield_copy(dfield, dfield2); @@ -92,6 +96,7 @@ static bool row_build_spatial_index_key( if (!dfield_is_ext(dfield2)) { dptr = static_cast<const byte*>(dfield_get_data(dfield2)); dlen = dfield_get_len(dfield2); + ut_ad(dptr != &data_error); goto write_mbr; } @@ -152,7 +157,7 @@ static bool row_build_spatial_index_key( temp_heap = mem_heap_create(1000); dptr = btr_copy_externally_stored_field( - &dlen, dptr, ext ? ext->page_size : page_size_t(space->flags), + &dlen, dptr, ext ? ext->zip_size : space->zip_size(), flen, temp_heap); write_mbr: @@ -198,7 +203,7 @@ row_build_index_entry_low( { dtuple_t* entry; ulint entry_len; - ulint i; + ulint i = 0; ulint num_v = 0; entry_len = dict_index_get_n_fields(index); @@ -218,90 +223,87 @@ row_build_index_entry_low( } else { dtuple_set_n_fields_cmp( entry, dict_index_get_n_unique_in_tree(index)); - } + if (dict_index_is_spatial(index)) { + /* Set the MBR field */ + if (!row_build_spatial_index_key( + index, ext, + dtuple_get_nth_field(entry, 0), + dtuple_get_nth_field( + row, + dict_index_get_nth_field(index, i) + ->col->ind), flag, heap)) { + return NULL; + } - for (i = 0; i < entry_len + num_v; i++) { - const dict_field_t* ind_field = NULL; - const dict_col_t* col; - ulint col_no = 0; - dfield_t* dfield; - const dfield_t* dfield2; - ulint len; - - if (i >= entry_len) { - /* This is to insert new rows to cluster index */ - ut_ad(dict_index_is_clust(index) - && flag == ROW_BUILD_FOR_INSERT); - dfield = dtuple_get_nth_v_field(entry, i - entry_len); - col = &dict_table_get_nth_v_col( - index->table, i - entry_len)->m_col; + i = 1; + } + } - } else { - ind_field = dict_index_get_nth_field(index, i); - col = 
ind_field->col; - col_no = dict_col_get_no(col); - dfield = dtuple_get_nth_field(entry, i); + for (; i < entry_len; i++) { + const dict_field_t& f = index->fields[i]; + dfield_t* dfield = dtuple_get_nth_field(entry, i); + + if (f.col->is_dropped()) { + ut_ad(index->is_primary()); + ut_ad(index->is_instant()); + ut_ad(!f.col->is_virtual()); + dict_col_copy_type(f.col, &dfield->type); + if (f.col->is_nullable()) { + dfield_set_null(dfield); + } else { + dfield_set_data(dfield, field_ref_zero, + f.fixed_len); + } + continue; } - compile_time_assert(DATA_MISSING == 0); + const dfield_t* dfield2; - if (col->is_virtual()) { - const dict_v_col_t* v_col - = reinterpret_cast<const dict_v_col_t*>(col); + if (f.col->is_virtual()) { + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(f.col); ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row)); dfield2 = dtuple_get_nth_v_field(row, v_col->v_pos); ut_ad(dfield_is_null(dfield2) || dfield_get_len(dfield2) == 0 || dfield2->data); + ut_ad(!dfield_is_ext(dfield2)); + if (UNIV_UNLIKELY(dfield2->type.mtype + == DATA_MISSING)) { + ut_ad(flag == ROW_BUILD_FOR_PURGE); + return(NULL); + } } else { - dfield2 = dtuple_get_nth_field(row, col_no); - ut_ad(dfield_get_type(dfield2)->mtype == DATA_MISSING - || (!(dfield_get_type(dfield2)->prtype - & DATA_VIRTUAL))); - } - - if (UNIV_UNLIKELY(dfield_get_type(dfield2)->mtype - == DATA_MISSING)) { - /* The field has not been initialized in the row. - This should be from trx_undo_rec_get_partial_row(). */ - return(NULL); - } - -#ifdef UNIV_DEBUG - if (dfield_get_type(dfield2)->prtype & DATA_VIRTUAL - && dict_index_is_clust(index)) { - ut_ad(flag == ROW_BUILD_FOR_INSERT); - } -#endif /* UNIV_DEBUG */ - - /* Special handle spatial index, set the first field - which is for store MBR. 
*/ - if (dict_index_is_spatial(index) && i == 0) { - if (!row_build_spatial_index_key( - index, ext, dfield, dfield2, flag, heap)) { - return NULL; + dfield2 = dtuple_get_nth_field(row, f.col->ind); + if (UNIV_UNLIKELY(dfield2->type.mtype + == DATA_MISSING)) { + /* The field has not been initialized in + the row. This should be from + trx_undo_rec_get_partial_row(). */ + return(NULL); } - continue; + ut_ad(!(dfield2->type.prtype & DATA_VIRTUAL)); } - len = dfield_get_len(dfield2); + compile_time_assert(DATA_MISSING == 0); - dfield_copy(dfield, dfield2); + *dfield = *dfield2; if (dfield_is_null(dfield)) { continue; } - if ((!ind_field || ind_field->prefix_len == 0) + ulint len = dfield_get_len(dfield); + + if (f.prefix_len == 0 && (!dfield_is_ext(dfield) || dict_index_is_clust(index))) { /* The dfield_copy() above suffices for columns that are stored in-page, or for clustered index record columns that are not - part of a column prefix in the PRIMARY KEY, - or for virtaul columns in cluster index record. */ + part of a column prefix in the PRIMARY KEY. */ continue; } @@ -312,11 +314,11 @@ row_build_index_entry_low( index record with an off-page column is when it is a column prefix index. If atomic_blobs, also fully indexed long columns may be stored off-page. */ - ut_ad(col->ord_part); + ut_ad(f.col->ord_part); - if (ext && !col->is_virtual()) { + if (ext && !f.col->is_virtual()) { /* See if the column is stored externally. */ - const byte* buf = row_ext_lookup(ext, col_no, + const byte* buf = row_ext_lookup(ext, f.col->ind, &len); if (UNIV_LIKELY_NULL(buf)) { if (UNIV_UNLIKELY(buf == field_ref_zero)) { @@ -325,7 +327,7 @@ row_build_index_entry_low( dfield_set_data(dfield, buf, len); } - if (ind_field->prefix_len == 0) { + if (f.prefix_len == 0) { /* If ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED, we can have a secondary index on an entire column @@ -352,16 +354,33 @@ row_build_index_entry_low( } /* If a column prefix index, take only the prefix. 
*/ - if (ind_field->prefix_len) { + if (f.prefix_len) { len = dtype_get_at_most_n_mbchars( - col->prtype, col->mbminlen, col->mbmaxlen, - ind_field->prefix_len, len, + f.col->prtype, + f.col->mbminlen, f.col->mbmaxlen, + f.prefix_len, len, static_cast<char*>(dfield_get_data(dfield))); dfield_set_len(dfield, len); } } - return(entry); + for (i = num_v; i--; ) { + ut_ad(index->is_primary()); + ut_ad(flag == ROW_BUILD_FOR_INSERT); + dfield_t* dfield = dtuple_get_nth_v_field(entry, i); + const dict_v_col_t* v_col = dict_table_get_nth_v_col( + index->table, i); + ut_ad(!v_col->m_col.is_dropped()); + ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row)); + const dfield_t* dfield2 = dtuple_get_nth_v_field( + row, v_col->v_pos); + ut_ad(dfield_is_null(dfield2) || + dfield_get_len(dfield2) == 0 || dfield2->data); + ut_ad(dfield2->type.mtype != DATA_MISSING); + *dfield = *dfield2; + } + + return entry; } /** An inverse function to row_build_index_entry. Builds a row from a @@ -498,11 +517,23 @@ row_build_low( j = 0; + const dict_field_t* ind_field = index->fields; + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { - const dict_field_t* ind_field - = dict_index_get_nth_field(index, i); + if (i == index->first_user_field() + && rec_is_alter_metadata(rec, *index)) { + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_d(ulint len); + ut_d(rec_get_nth_field_offs(offsets, i, &len)); + ut_ad(len == FIELD_REF_SIZE); + continue; + } + + ut_ad(ind_field < &index->fields[index->n_fields]); + + const dict_col_t* col = dict_field_get_col(ind_field); - if (ind_field->prefix_len) { + if ((ind_field++)->prefix_len) { /* Column prefixes can only occur in key fields, which cannot be stored externally. 
For a column prefix, there should also be the full @@ -512,10 +543,11 @@ row_build_low( continue; } - const dict_col_t* col - = dict_field_get_col(ind_field); - ulint col_no - = dict_col_get_no(col); + if (col->is_dropped()) { + continue; + } + + ulint col_no = dict_col_get_no(col); if (col_map) { col_no = col_map[col_no]; @@ -527,6 +559,7 @@ row_build_low( } dfield_t* dfield = dtuple_get_nth_field(row, col_no); + const void* field = rec_get_nth_field( copy, offsets, i, &len); if (len == UNIV_SQL_DEFAULT) { @@ -566,7 +599,7 @@ row_build_low( row_log_table_delete(). */ } else if (j) { - *ext = row_ext_create(j, ext_cols, index->table->flags, row, + *ext = row_ext_create(j, ext_cols, *index->table, row, heap); } else { *ext = NULL; @@ -670,57 +703,83 @@ row_build_w_add_vcol( } /** Convert an index record to a data tuple. -@tparam def whether the index->instant_field_value() needs to be accessed -@param[in] rec index record -@param[in] index index -@param[in] offsets rec_get_offsets(rec, index) -@param[out] n_ext number of externally stored columns -@param[in,out] heap memory heap for allocations +@tparam metadata whether the index->instant_field_value() needs to be accessed +@tparam mblob 1 if rec_is_alter_metadata(); +2 if we want converted metadata corresponding to info_bits +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[out] n_ext number of externally stored columns +@param[in,out] heap memory heap for allocations +@param[in] info_bits (only used if mblob=2) +@param[in] pad (only used if mblob=2) @return index entry built; does not set info_bits, and the data fields in the entry will point directly to rec */ -template<bool def> +template<bool metadata, int mblob = 0> static inline dtuple_t* row_rec_to_index_entry_impl( const rec_t* rec, const dict_index_t* index, const rec_offs* offsets, - mem_heap_t* heap) + mem_heap_t* heap, + ulint info_bits = 0, + bool pad = false) { - dtuple_t* entry; - dfield_t* 
dfield; - ulint i; - const byte* field; - ulint len; - ulint rec_len; - ut_ad(rec != NULL); ut_ad(heap != NULL); ut_ad(index != NULL); - ut_ad(def || !rec_offs_any_default(offsets)); - + ut_ad(!mblob || index->is_primary()); + ut_ad(!mblob || !index->table->is_temporary()); + ut_ad(!mblob || !dict_index_is_spatial(index)); + compile_time_assert(!mblob || metadata); + compile_time_assert(mblob <= 2); /* Because this function may be invoked by row0merge.cc on a record whose header is in different format, the check rec_offs_validate(rec, index, offsets) must be avoided here. */ - rec_len = rec_offs_n_fields(offsets); - - entry = dtuple_create(heap, rec_len); + const bool got = mblob == 2 && rec_is_alter_metadata(rec, *index); + ulint rec_len = rec_offs_n_fields(offsets); + if (mblob == 2) { + ut_ad(info_bits == REC_INFO_METADATA_ALTER + || info_bits == REC_INFO_METADATA_ADD); + ut_ad(rec_len <= ulint(index->n_fields + got)); + if (pad) { + rec_len = ulint(index->n_fields) + + (info_bits == REC_INFO_METADATA_ALTER); + } else if (!got && info_bits == REC_INFO_METADATA_ALTER) { + rec_len++; + } + } else { + ut_ad(info_bits == 0); + ut_ad(!pad); + } + dtuple_t* entry = dtuple_create(heap, rec_len); + dfield_t* dfield = entry->fields; dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique_in_tree(index)); - ut_ad(rec_len == dict_index_get_n_fields(index) + ut_ad(mblob == 2 + || rec_len == dict_index_get_n_fields(index) + uint(mblob == 1) /* a record for older SYS_INDEXES table (missing merge_threshold column) is acceptable. */ - || (index->table->id == DICT_INDEXES_ID + || (!index->table->is_temporary() + && index->table->id == DICT_INDEXES_ID && rec_len == dict_index_get_n_fields(index) - 1)); - dict_index_copy_types(entry, index, rec_len); - - for (i = 0; i < rec_len; i++) { + ulint i; + for (i = 0; i < (mblob ? 
index->first_user_field() : rec_len); + i++, dfield++) { + dict_col_copy_type(dict_index_get_nth_col(index, i), + &dfield->type); + if (!mblob + && dict_index_is_spatial(index) + && DATA_GEOMETRY_MTYPE(dfield->type.mtype)) { + dfield->type.prtype |= DATA_GIS_MBR; + } - dfield = dtuple_get_nth_field(entry, i); - field = def + ulint len; + const byte* field = metadata ? rec_get_nth_cfield(rec, index, offsets, i, &len) : rec_get_nth_field(rec, offsets, i, &len); @@ -731,8 +790,74 @@ row_rec_to_index_entry_impl( } } + if (mblob) { + ulint len; + const byte* field; + ulint j = i; + + if (mblob == 2) { + const bool want = info_bits == REC_INFO_METADATA_ALTER; + if (got == want) { + if (got) { + goto copy_metadata; + } + } else { + if (want) { + /* Allocate a placeholder for + adding metadata in an update. */ + len = FIELD_REF_SIZE; + field = static_cast<byte*>( + mem_heap_zalloc(heap, len)); + /* In reality there is one fewer + field present in the record. */ + rec_len--; + goto init_metadata; + } + + /* Skip the undesired metadata blob + (for example, when rolling back an + instant ALTER TABLE). 
*/ + i++; + } + goto copy_user_fields; + } +copy_metadata: + ut_ad(rec_offs_nth_extern(offsets, i)); + field = rec_get_nth_field(rec, offsets, i++, &len); +init_metadata: + dfield->type.metadata_blob_init(); + ut_ad(len == FIELD_REF_SIZE); + dfield_set_data(dfield, field, len); + dfield_set_ext(dfield++); +copy_user_fields: + for (; i < rec_len; i++, dfield++) { + dict_col_copy_type(dict_index_get_nth_col(index, j++), + &dfield->type); + if (mblob == 2 && pad + && i >= rec_offs_n_fields(offsets)) { + field = index->instant_field_value(j - 1, + &len); + dfield_set_data(dfield, field, len); + continue; + } + + field = rec_get_nth_field(rec, offsets, i, &len); + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + } + } + } + + if (mblob == 2) { + ulint n_fields = ulint(dfield - entry->fields); + ut_ad(entry->n_fields >= n_fields); + entry->n_fields = n_fields; + } + ut_ad(dfield == entry->fields + entry->n_fields); ut_ad(dtuple_check_typed(entry)); - return(entry); + return entry; } /** Convert an index record to a data tuple. @@ -763,25 +888,26 @@ row_rec_to_index_entry( mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ { - dtuple_t* entry; - byte* buf; - const rec_t* copy_rec; - ut_ad(rec != NULL); ut_ad(heap != NULL); ut_ad(index != NULL); ut_ad(rec_offs_validate(rec, index, offsets)); /* Take a copy of rec to heap */ - buf = static_cast<byte*>( - mem_heap_alloc(heap, rec_offs_size(offsets))); - - copy_rec = rec_copy(buf, rec, offsets); + const rec_t* copy_rec = rec_copy( + static_cast<byte*>(mem_heap_alloc(heap, + rec_offs_size(offsets))), + rec, offsets); rec_offs_make_valid(copy_rec, index, true, const_cast<rec_offs*>(offsets)); - entry = row_rec_to_index_entry_impl<true>( - copy_rec, index, offsets, heap); + + dtuple_t* entry = rec_is_alter_metadata(copy_rec, *index) + ? 
row_rec_to_index_entry_impl<true,1>( + copy_rec, index, offsets, heap) + : row_rec_to_index_entry_impl<true>( + copy_rec, index, offsets, heap); + rec_offs_make_valid(rec, index, true, const_cast<rec_offs*>(offsets)); @@ -791,6 +917,49 @@ row_rec_to_index_entry( return(entry); } +/** Convert a metadata record to a data tuple. +@param[in] rec metadata record +@param[in] index clustered index after instant ALTER TABLE +@param[in] offsets rec_get_offsets(rec) +@param[in,out] heap memory heap for allocations +@param[in] info_bits the info_bits after an update +@param[in] pad whether to pad to index->n_fields */ +dtuple_t* +row_metadata_to_tuple( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits, + bool pad) +{ + ut_ad(info_bits == REC_INFO_METADATA_ALTER + || info_bits == REC_INFO_METADATA_ADD); + ut_ad(rec_is_metadata(rec, *index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + const rec_t* copy_rec = rec_copy( + static_cast<byte*>(mem_heap_alloc(heap, + rec_offs_size(offsets))), + rec, offsets); + + rec_offs_make_valid(copy_rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_t* entry = info_bits == REC_INFO_METADATA_ALTER + || rec_is_alter_metadata(copy_rec, *index) + ? row_rec_to_index_entry_impl<true,2>( + copy_rec, index, offsets, heap, info_bits, pad) + : row_rec_to_index_entry_impl<true>( + copy_rec, index, offsets, heap); + + rec_offs_make_valid(rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_set_info_bits(entry, info_bits); + return entry; +} + /*******************************************************************//** Builds from a secondary index record a row reference with which we can search the clustered index record. 
@@ -1022,7 +1191,7 @@ row_search_on_row_ref( index = dict_table_get_first_index(table); if (UNIV_UNLIKELY(ref->info_bits != 0)) { - ut_ad(ref->info_bits == REC_INFO_METADATA); + ut_ad(ref->is_metadata()); ut_ad(ref->n_fields <= index->n_uniq); if (btr_pcur_open_at_index_side( true, index, mode, pcur, true, 0, mtr) diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index c3dc4e14094..4e36bce4a77 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -54,6 +54,9 @@ Created 12/19/1997 Heikki Tuuri #include "buf0lru.h" #include "srv0srv.h" #include "srv0mon.h" +#ifdef WITH_WSREP +#include "mysql/service_wsrep.h" /* For wsrep_thd_skip_locking */ +#endif /* Maximum number of rows to prefetch; MySQL interface has another parameter */ #define SEL_MAX_N_PREFETCH 16 @@ -124,7 +127,7 @@ row_sel_sec_rec_is_for_blob( } len = btr_copy_externally_stored_field_prefix( - buf, prefix_len, page_size_t(table->space->flags), + buf, prefix_len, table->space->zip_size(), clust_field, clust_len); if (len == 0) { @@ -305,8 +308,7 @@ row_sel_sec_rec_is_for_clust_rec( if (rec_offs_nth_extern(clust_offs, clust_pos)) { dptr = btr_copy_externally_stored_field( &clust_len, dptr, - page_size_t(clust_index->table->space - ->flags), + clust_index->table->space->zip_size(), len, heap); } @@ -529,7 +531,7 @@ row_sel_fetch_columns( data = btr_rec_copy_externally_stored_field( rec, offsets, - dict_table_page_size(index->table), + index->table->space->zip_size(), field_no, &len, heap); /* data == NULL means that the @@ -1134,7 +1136,7 @@ re_scan: cur_block = buf_page_get_gen( page_id_t(index->table->space_id, page_no), - page_size_t(index->table->space->flags), + index->table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err); } else { @@ -1482,7 +1484,7 @@ row_sel_try_search_shortcut( const rec_t* rec = btr_pcur_get_rec(&(plan->pcur)); - if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, index)) { + if 
(!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) { retry: return(SEL_RETRY); } @@ -1775,7 +1777,7 @@ skip_lock: goto next_rec; } - if (rec_is_metadata(rec, index)) { + if (rec_is_metadata(rec, *index)) { /* Skip the metadata pseudo-record. */ cost_counter++; goto next_rec; @@ -2681,44 +2683,6 @@ row_sel_convert_mysql_key_to_innobase( } /**************************************************************//** -Stores the row id to the prebuilt struct. */ -static -void -row_sel_store_row_id_to_prebuilt( -/*=============================*/ - row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */ - const rec_t* index_rec, /*!< in: record */ - const dict_index_t* index, /*!< in: index of the record */ - const rec_offs* offsets) /*!< in: rec_get_offsets - (index_rec, index) */ -{ - const byte* data; - ulint len; - - ut_ad(rec_offs_validate(index_rec, index, offsets)); - - data = rec_get_nth_field( - index_rec, offsets, - dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); - - if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) { - - ib::error() << "Row id field is wrong length " << len << " in" - " index " << index->name - << " of table " << index->table->name - << ", Field number " - << dict_index_get_sys_col_pos(index, DATA_ROW_ID) - << ", record:"; - - rec_print_new(stderr, index_rec, offsets); - putc('\n', stderr); - ut_error; - } - - ut_memcpy(prebuilt->row_id, data, len); -} - -/**************************************************************//** Stores a non-SQL-NULL field in the MySQL format. The counterpart of this function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. 
*/ void @@ -2732,7 +2696,6 @@ row_sel_field_store_in_mysql_format_func( const byte* data, ulint len) { - byte* ptr; #ifdef UNIV_DEBUG const dict_field_t* field = templ->is_virtual @@ -2746,31 +2709,10 @@ row_sel_field_store_in_mysql_format_func( MEM_UNDEFINED(dest, templ->mysql_col_len); #endif /* HAVE_valgrind_or_MSAN */ + byte* pad = dest + len; + switch (templ->type) { const byte* field_end; - byte* pad; - case DATA_INT: - /* Convert integer data from Innobase to a little-endian - format, sign bit restored to normal */ - - ptr = dest + len; - - for (;;) { - ptr--; - *ptr = *data; - if (ptr == dest) { - break; - } - data++; - } - - if (!templ->is_unsigned) { - dest[len - 1] = (byte) (dest[len - 1] ^ 128); - } - - ut_ad(templ->mysql_col_len == len); - break; - case DATA_VARCHAR: case DATA_VARMYSQL: case DATA_BINARY: @@ -2794,7 +2736,14 @@ row_sel_field_store_in_mysql_format_func( /* Pad with trailing spaces. */ - pad = dest + len; + if (pad == field_end) { + break; + } + + if (UNIV_UNLIKELY(templ->type == DATA_FIXBINARY)) { + memset(pad, 0, field_end - pad); + break; + } ut_ad(templ->mbminlen <= templ->mbmaxlen); @@ -2871,7 +2820,7 @@ row_sel_field_store_in_mysql_format_func( done in row0mysql.cc, function row_mysql_store_col_in_innobase_format(). */ - memset(dest + len, 0x20, templ->mysql_col_len - len); + memset(pad, 0x20, templ->mysql_col_len - len); } break; @@ -2888,13 +2837,24 @@ row_sel_field_store_in_mysql_format_func( case DATA_FLOAT: case DATA_DOUBLE: case DATA_DECIMAL: - /* Above are the valid column types for MySQL data. */ #endif /* UNIV_DEBUG */ ut_ad((templ->is_virtual && !field) || (field && field->prefix_len ? 
field->prefix_len == len : templ->mysql_col_len == len)); memcpy(dest, data, len); + break; + + case DATA_INT: + /* Convert InnoDB big-endian integer to little-endian + format, sign bit restored to 2's complement form */ + DBUG_ASSERT(templ->mysql_col_len == len); + + byte* ptr = pad; + do *--ptr = *data++; while (ptr != dest); + if (!templ->is_unsigned) { + pad[-1] ^= 0x80; + } } } @@ -2958,8 +2918,7 @@ row_sel_store_mysql_field( causes an assert */ data = btr_rec_copy_externally_stored_field( - rec, offsets, - dict_table_page_size(prebuilt->table), + rec, offsets, prebuilt->table->space->zip_size(), field_no, &len, heap); if (UNIV_UNLIKELY(!data)) { @@ -3088,9 +3047,6 @@ static bool row_sel_store_mysql_rec( const mysql_row_templ_t*templ = &prebuilt->mysql_template[i]; if (templ->is_virtual && dict_index_is_clust(index)) { - /* Virtual columns are never declared NOT NULL. */ - ut_ad(templ->mysql_null_bit_mask); - /* Skip virtual columns if it is not a covered search or virtual key read is not requested. */ if (!rec_clust @@ -3098,8 +3054,10 @@ static bool row_sel_store_mysql_rec( || (!prebuilt->read_just_key && !prebuilt->m_read_virtual_key)) { /* Initialize the NULL bit. */ - mysql_rec[templ->mysql_null_byte_offset] - |= (byte) templ->mysql_null_bit_mask; + if (templ->mysql_null_bit_mask) { + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; + } continue; } @@ -3159,8 +3117,9 @@ static bool row_sel_store_mysql_rec( = rec_clust ? templ->clust_rec_field_no : templ->rec_field_no; - /* We should never deliver column prefixes to MySQL, - except for evaluating innobase_index_cond(). */ + /* We should never deliver column prefixes to the SQL layer, + except for evaluating handler_index_cond_check() + or handler_rowid_filter_check(). */ /* ...actually, we do want to do this in order to support the prefix query optimization. 
@@ -3186,7 +3145,7 @@ static bool row_sel_store_mysql_rec( if (dict_index_is_clust(index) || prebuilt->fts_doc_id_in_read_set) { prebuilt->fts_doc_id = fts_get_doc_id_from_rec( - prebuilt->table, rec, index, NULL); + rec, index, offsets); } } @@ -3353,7 +3312,7 @@ Row_sel_get_clust_rec_for_mysql::operator()( and is it not unsafe to use RW_NO_LATCH here? */ buf_block_t* block = buf_page_get_gen( btr_pcur_get_block(prebuilt->pcur)->page.id, - dict_table_page_size(sec_index->table), + btr_pcur_get_block(prebuilt->pcur)->zip_size(), RW_NO_LATCH, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err); mem_heap_t* heap = mem_heap_create(256); @@ -3572,7 +3531,7 @@ sel_restore_position_for_mysql( next: if (btr_pcur_move_to_next(pcur, mtr) && rec_is_metadata(btr_pcur_get_rec(pcur), - pcur->btr_cur.index)) { + *pcur->btr_cur.index)) { btr_pcur_move_to_next(pcur, mtr); } @@ -3588,7 +3547,7 @@ next: prev: if (btr_pcur_is_on_user_rec(pcur) && !moves_up && !rec_is_metadata(btr_pcur_get_rec(pcur), - pcur->btr_cur.index)) { + *pcur->btr_cur.index)) { btr_pcur_move_to_prev(pcur, mtr); } return true; @@ -3830,7 +3789,7 @@ row_sel_enqueue_cache_row_for_mysql( /* For non ICP code path the row should already exist in the next fetch cache slot. */ - if (prebuilt->idx_cond != NULL) { + if (prebuilt->pk_filter || prebuilt->idx_cond) { byte* dest = row_sel_fetch_last_buf(prebuilt); ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len); @@ -3871,7 +3830,7 @@ row_sel_try_search_shortcut_for_mysql( BTR_SEARCH_LEAF, pcur, ahi_latch, mtr); rec = btr_pcur_get_rec(pcur); - if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, index)) { + if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) { retry: rw_lock_s_unlock(ahi_latch); return(SEL_RETRY); @@ -3914,9 +3873,9 @@ exhausted: /*********************************************************************//** Check a pushed-down index condition. 
-@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */ +@return CHECK_NEG, CHECK_POS, or CHECK_OUT_OF_RANGE */ static -ICP_RESULT +check_result_t row_search_idx_cond_check( /*======================*/ byte* mysql_rec, /*!< out: record @@ -3928,17 +3887,18 @@ row_search_idx_cond_check( const rec_t* rec, /*!< in: InnoDB record */ const rec_offs* offsets) /*!< in: rec_get_offsets() */ { - ICP_RESULT result; ulint i; ut_ad(rec_offs_validate(rec, prebuilt->index, offsets)); if (!prebuilt->idx_cond) { - return(ICP_MATCH); + if (!handler_rowid_filter_is_active(prebuilt->pk_filter)) { + return(CHECK_POS); + } + } else { + MONITOR_INC(MONITOR_ICP_ATTEMPTS); } - MONITOR_INC(MONITOR_ICP_ATTEMPTS); - /* Convert to MySQL format those fields that are needed for evaluating the index condition. */ @@ -3958,7 +3918,7 @@ row_search_idx_cond_check( rec, prebuilt->index, offsets, templ->icp_rec_field_no, templ)) { - return(ICP_NO_MATCH); + return(CHECK_NEG); } } @@ -3968,9 +3928,40 @@ row_search_idx_cond_check( index, if the case of the column has been updated in the past, or a record has been deleted and a record inserted in a different case. */ - result = innobase_index_cond(prebuilt->idx_cond); + check_result_t result = prebuilt->idx_cond + ? 
handler_index_cond_check(prebuilt->idx_cond) + : CHECK_POS; + switch (result) { - case ICP_MATCH: + case CHECK_POS: + if (handler_rowid_filter_is_active(prebuilt->pk_filter)) { + ut_ad(!prebuilt->index->is_primary()); + if (prebuilt->clust_index_was_generated) { + ulint len; + dict_index_t* index = prebuilt->index; + const byte* data = rec_get_nth_field( + rec, offsets, index->n_fields - 1, + &len); + ut_ad(dict_index_get_nth_col(index, + index->n_fields - 1) + ->prtype == (DATA_ROW_ID | DATA_NOT_NULL)); + ut_ad(len == DATA_ROW_ID_LEN); + memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN); + } + result = handler_rowid_filter_check(prebuilt->pk_filter); + switch (result) { + case CHECK_NEG: + MONITOR_INC(MONITOR_ICP_NO_MATCH); + return(result); + case CHECK_OUT_OF_RANGE: + MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE); + return(result); + case CHECK_POS: + break; + default: + ut_error; + } + } /* Convert the remaining fields to MySQL format. If this is a secondary index record, we must defer this until we have fetched the clustered index record. */ @@ -3980,19 +3971,19 @@ row_search_idx_cond_check( mysql_rec, prebuilt, rec, NULL, false, prebuilt->index, offsets)) { ut_ad(dict_index_is_clust(prebuilt->index)); - return(ICP_NO_MATCH); + return(CHECK_NEG); } } MONITOR_INC(MONITOR_ICP_MATCH); return(result); - case ICP_NO_MATCH: + case CHECK_NEG: MONITOR_INC(MONITOR_ICP_NO_MATCH); return(result); - case ICP_OUT_OF_RANGE: + case CHECK_OUT_OF_RANGE: MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE); return(result); - case ICP_ERROR: - case ICP_ABORTED_BY_USER: + case CHECK_ERROR: + case CHECK_ABORTED_BY_USER: return(result); } @@ -4424,16 +4415,16 @@ row_search_mvcc( mtr.commit(). 
*/ ut_ad(!rec_get_deleted_flag(rec, comp)); - if (prebuilt->idx_cond) { + if (prebuilt->pk_filter || prebuilt->idx_cond) { switch (row_search_idx_cond_check( buf, prebuilt, rec, offsets)) { - case ICP_NO_MATCH: - case ICP_OUT_OF_RANGE: - case ICP_ABORTED_BY_USER: - case ICP_ERROR: + case CHECK_NEG: + case CHECK_OUT_OF_RANGE: + case CHECK_ABORTED_BY_USER: + case CHECK_ERROR: goto shortcut_mismatch; - case ICP_MATCH: + case CHECK_POS: goto shortcut_match; } } @@ -4518,6 +4509,13 @@ row_search_mvcc( set_also_gap_locks = FALSE; } +#ifdef WITH_WSREP + else if (wsrep_thd_skip_locking(trx->mysql_thd)) { + ut_ad(!strcmp(wsrep_get_sr_table_name(), + prebuilt->table->name.m_name)); + set_also_gap_locks = FALSE; + } +#endif /* WITH_WSREP */ /* Note that if the search mode was GE or G, then the cursor naturally moves upward (in fetch next) in alphabetical order, @@ -5205,14 +5203,14 @@ no_gap_lock: index entry. */ switch (row_search_idx_cond_check( buf, prebuilt, rec, offsets)) { - case ICP_NO_MATCH: + case CHECK_NEG: goto next_rec; - case ICP_OUT_OF_RANGE: - case ICP_ABORTED_BY_USER: - case ICP_ERROR: + case CHECK_OUT_OF_RANGE: + case CHECK_ABORTED_BY_USER: + case CHECK_ERROR: err = DB_RECORD_NOT_FOUND; goto idx_cond_failed; - case ICP_MATCH: + case CHECK_POS: goto requires_clust_rec; } @@ -5262,17 +5260,17 @@ locks_ok_del_marked: /* Check if the record matches the index condition. 
*/ switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) { - case ICP_NO_MATCH: + case CHECK_NEG: if (did_semi_consistent_read) { row_unlock_for_mysql(prebuilt, TRUE); } goto next_rec; - case ICP_OUT_OF_RANGE: - case ICP_ABORTED_BY_USER: - case ICP_ERROR: + case CHECK_OUT_OF_RANGE: + case CHECK_ABORTED_BY_USER: + case CHECK_ERROR: err = DB_RECORD_NOT_FOUND; goto idx_cond_failed; - case ICP_MATCH: + case CHECK_POS: break; } @@ -5357,7 +5355,7 @@ requires_clust_rec: result_rec = clust_rec; ut_ad(rec_offs_validate(result_rec, clust_index, offsets)); - if (prebuilt->idx_cond) { + if (prebuilt->pk_filter || prebuilt->idx_cond) { /* Convert the record to MySQL format. We were unable to do this in row_search_idx_cond_check(), because the condition is on the secondary index @@ -5418,8 +5416,7 @@ use_covering_index: /* We only convert from InnoDB row format to MySQL row format when ICP is disabled. */ - if (!prebuilt->idx_cond) { - + if (!prebuilt->pk_filter && !prebuilt->idx_cond) { /* We use next_buf to track the allocation of buffers where we store and enqueue the buffers for our pre-fetch optimisation. @@ -5491,7 +5488,7 @@ use_covering_index: rec_offs_size(offsets)); mach_write_to_4(buf, rec_offs_extra_size(offsets) + 4); - } else if (!prebuilt->idx_cond) { + } else if (!prebuilt->pk_filter && !prebuilt->idx_cond) { /* The record was not yet converted to MySQL format. */ if (!row_sel_store_mysql_rec( buf, prebuilt, result_rec, vrow, @@ -5510,11 +5507,19 @@ use_covering_index: } } - if (prebuilt->clust_index_was_generated) { - row_sel_store_row_id_to_prebuilt( - prebuilt, result_rec, - result_rec == rec ? 
index : clust_index, - offsets); + if (!prebuilt->clust_index_was_generated) { + } else if (result_rec != rec || index->is_primary()) { + memcpy(prebuilt->row_id, result_rec, DATA_ROW_ID_LEN); + } else { + ulint len; + const byte* data = rec_get_nth_field( + result_rec, offsets, index->n_fields - 1, + &len); + ut_ad(dict_index_get_nth_col(index, + index->n_fields - 1) + ->prtype == (DATA_ROW_ID | DATA_NOT_NULL)); + ut_ad(len == DATA_ROW_ID_LEN); + memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN); } } @@ -5733,8 +5738,7 @@ normal_return: DEBUG_SYNC_C("row_search_for_mysql_before_return"); - if (prebuilt->idx_cond != 0) { - + if (prebuilt->pk_filter || prebuilt->idx_cond) { /* When ICP is active we don't write to the MySQL buffer directly, only to buffers that are enqueued in the pre-fetch queue. We need to dequeue the first buffer and copy the contents diff --git a/storage/innobase/row/row0trunc.cc b/storage/innobase/row/row0trunc.cc deleted file mode 100644 index bd2ede21587..00000000000 --- a/storage/innobase/row/row0trunc.cc +++ /dev/null @@ -1,1742 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2013, 2018, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file row/row0trunc.cc -TRUNCATE implementation - -Created 2013-04-12 Sunny Bains -*******************************************************/ - -#include "row0trunc.h" -#include "btr0sea.h" -#include "pars0pars.h" -#include "btr0pcur.h" -#include "dict0crea.h" -#include "dict0stats.h" -#include "dict0stats_bg.h" -#include "lock0lock.h" -#include "fts0fts.h" -#include "ibuf0ibuf.h" -#include "os0file.h" -#include "que0que.h" -#include "trx0undo.h" - -/* FIXME: For temporary tables, use a simple approach of btr_free() -and btr_create() of each index tree. */ - -/* FIXME: For persistent tables, remove this code in MDEV-11655 -and use a combination of the transactional DDL log to make atomic the -low-level operations ha_innobase::delete_table(), ha_innobase::create(). */ - -bool truncate_t::s_fix_up_active = false; -truncate_t::tables_t truncate_t::s_tables; -truncate_t::truncated_tables_t truncate_t::s_truncated_tables; - -/** -Iterator over the the raw records in an index, doesn't support MVCC. */ -class IndexIterator { - -public: - /** - Iterate over an indexes records - @param index index to iterate over */ - explicit IndexIterator(dict_index_t* index) - : - m_index(index) - { - /* Do nothing */ - } - - /** - Search for key. Position the cursor on a record GE key. - @return DB_SUCCESS or error code. 
*/ - dberr_t search(dtuple_t& key, bool noredo) - { - mtr_start(&m_mtr); - - if (noredo) { - mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); - } - - btr_pcur_open_on_user_rec( - m_index, - &key, - PAGE_CUR_GE, - BTR_MODIFY_LEAF, - &m_pcur, &m_mtr); - - return(DB_SUCCESS); - } - - /** - Iterate over all the records - @return DB_SUCCESS or error code */ - template <typename Callback> - dberr_t for_each(Callback& callback) - { - dberr_t err = DB_SUCCESS; - - for (;;) { - - if (!btr_pcur_is_on_user_rec(&m_pcur) - || !callback.match(&m_pcur)) { - - /* The end of of the index has been reached. */ - err = DB_END_OF_INDEX; - break; - } - - rec_t* rec = btr_pcur_get_rec(&m_pcur); - - if (!rec_get_deleted_flag(rec, FALSE)) { - - err = callback(&m_mtr, &m_pcur); - - if (err != DB_SUCCESS) { - break; - } - } - - btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr); - } - - btr_pcur_close(&m_pcur); - mtr_commit(&m_mtr); - - return(err == DB_END_OF_INDEX ? DB_SUCCESS : err); - } - -private: - // Disable copying - IndexIterator(const IndexIterator&); - IndexIterator& operator=(const IndexIterator&); - -private: - mtr_t m_mtr; - btr_pcur_t m_pcur; - dict_index_t* m_index; -}; - -/** SysIndex table iterator, iterate over records for a table. */ -class SysIndexIterator { - -public: - /** - Iterate over all the records that match the table id. - @return DB_SUCCESS or error code */ - template <typename Callback> - dberr_t for_each(Callback& callback) const - { - dict_index_t* sys_index; - byte buf[DTUPLE_EST_ALLOC(1)]; - dtuple_t* tuple = - dtuple_create_from_mem(buf, sizeof(buf), 1, 0); - dfield_t* dfield = dtuple_get_nth_field(tuple, 0); - - dfield_set_data( - dfield, - callback.table_id(), - sizeof(*callback.table_id())); - - sys_index = dict_table_get_first_index(dict_sys->sys_indexes); - - dict_index_copy_types(tuple, sys_index, 1); - - IndexIterator iterator(sys_index); - - /* Search on the table id and position the cursor - on GE table_id. 
*/ - iterator.search(*tuple, callback.get_logging_status()); - - return(iterator.for_each(callback)); - } -}; - -/** Generic callback abstract class. */ -class Callback -{ - -public: - /** - Constructor - @param table_id id of the table being operated. - @param noredo if true turn off logging. */ - Callback(table_id_t table_id, bool noredo) - : - m_id(), - m_noredo(noredo) - { - /* Convert to storage byte order. */ - mach_write_to_8(&m_id, table_id); - } - - /** - Destructor */ - virtual ~Callback() - { - /* Do nothing */ - } - - /** - @param pcur persistent cursor used for iteration - @return true if the table id column matches. */ - bool match(btr_pcur_t* pcur) const - { - ulint len; - const byte* field; - rec_t* rec = btr_pcur_get_rec(pcur); - - field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len); - - ut_ad(len == 8); - - return(memcmp(&m_id, field, len) == 0); - } - - /** - @return pointer to table id storage format buffer */ - const table_id_t* table_id() const - { - return(&m_id); - } - - /** - @return return if logging needs to be turned off. */ - bool get_logging_status() const - { - return(m_noredo); - } - -protected: - // Disably copying - Callback(const Callback&); - Callback& operator=(const Callback&); - -protected: - /** Table id in storage format */ - table_id_t m_id; - - /** Turn off logging. */ - const bool m_noredo; -}; - -/** -Scan to find out truncate log file from the given directory path. - -@param dir_path look for log directory in following path. -@param log_files cache to hold truncate log file name found. -@return DB_SUCCESS or error code. */ -dberr_t -TruncateLogParser::scan( - const char* dir_path, - trunc_log_files_t& log_files) -{ - os_file_dir_t dir; - os_file_stat_t fileinfo; - dberr_t err = DB_SUCCESS; - const ulint dir_len = strlen(dir_path); - - /* Scan and look out for the truncate log files. 
*/ - dir = os_file_opendir(dir_path, true); - if (dir == NULL) { - return(DB_IO_ERROR); - } - - while (fil_file_readdir_next_file( - &err, dir_path, dir, &fileinfo) == 0) { - - const size_t nm_len = strlen(fileinfo.name); - - if (fileinfo.type == OS_FILE_TYPE_FILE - && nm_len > sizeof "ib_trunc.log" - && (0 == strncmp(fileinfo.name + nm_len - - ((sizeof "trunc.log") - 1), - "trunc.log", (sizeof "trunc.log") - 1)) - && (0 == strncmp(fileinfo.name, "ib_", 3))) { - - if (fileinfo.size == 0) { - /* Truncate log not written. Remove the file. */ - os_file_delete( - innodb_log_file_key, fileinfo.name); - continue; - } - - /* Construct file name by appending directory path */ - ulint sz = dir_len + 22 + 22 + sizeof "ib_trunc.log"; - char* log_file_name = UT_NEW_ARRAY_NOKEY(char, sz); - if (log_file_name == NULL) { - err = DB_OUT_OF_MEMORY; - break; - } - - memcpy(log_file_name, dir_path, dir_len); - char* e = log_file_name + dir_len; - if (e[-1] != OS_PATH_SEPARATOR) { - *e++ = OS_PATH_SEPARATOR; - } - strcpy(e, fileinfo.name); - log_files.push_back(log_file_name); - } - } - - os_file_closedir(dir); - - return(err); -} - -/** -Parse the log file and populate table to truncate information. -(Add this table to truncate information to central vector that is then - used by truncate fix-up routine to fix-up truncate action of the table.) - -@param log_file_name log file to parse -@return DB_SUCCESS or error code. */ -dberr_t -TruncateLogParser::parse( - const char* log_file_name) -{ - dberr_t err = DB_SUCCESS; - truncate_t* truncate = NULL; - - /* Open the file and read magic-number to findout if truncate action - was completed. 
*/ - bool ret; - os_file_t handle = os_file_create_simple( - innodb_log_file_key, log_file_name, - OS_FILE_OPEN, OS_FILE_READ_ONLY, srv_read_only_mode, &ret); - if (!ret) { - ib::error() << "Error opening truncate log file: " - << log_file_name; - return(DB_IO_ERROR); - } - - ulint sz = srv_page_size; - void* buf = ut_zalloc_nokey(sz + srv_page_size); - if (buf == 0) { - os_file_close(handle); - return(DB_OUT_OF_MEMORY); - } - - IORequest request(IORequest::READ); - - /* Align the memory for file i/o if we might have O_DIRECT set*/ - byte* log_buf = static_cast<byte*>(ut_align(buf, srv_page_size)); - - do { - err = os_file_read(request, handle, log_buf, 0, sz); - - if (err != DB_SUCCESS) { - os_file_close(handle); - break; - } - - if (mach_read_from_4(log_buf) == 32743712) { - - /* Truncate action completed. Avoid parsing the file. */ - os_file_close(handle); - - os_file_delete(innodb_log_file_key, log_file_name); - break; - } - - if (truncate == NULL) { - truncate = UT_NEW_NOKEY(truncate_t(log_file_name)); - if (truncate == NULL) { - os_file_close(handle); - err = DB_OUT_OF_MEMORY; - break; - } - } - - err = truncate->parse(log_buf + 4, log_buf + sz - 4); - - if (err != DB_SUCCESS) { - - ut_ad(err == DB_FAIL); - - ut_free(buf); - buf = 0; - - sz *= 2; - - buf = ut_zalloc_nokey(sz + srv_page_size); - - if (buf == 0) { - os_file_close(handle); - err = DB_OUT_OF_MEMORY; - UT_DELETE(truncate); - truncate = NULL; - break; - } - - log_buf = static_cast<byte*>( - ut_align(buf, srv_page_size)); - } - } while (err != DB_SUCCESS); - - ut_free(buf); - - if (err == DB_SUCCESS && truncate != NULL) { - truncate_t::add(truncate); - os_file_close(handle); - } - - return(err); -} - -/** -Scan and Parse truncate log files. - -@param dir_path look for log directory in following path -@return DB_SUCCESS or error code. 
*/ -dberr_t -TruncateLogParser::scan_and_parse( - const char* dir_path) -{ - dberr_t err; - trunc_log_files_t log_files; - - /* Scan and trace all the truncate log files. */ - err = TruncateLogParser::scan(dir_path, log_files); - - /* Parse truncate lof files if scan was successful. */ - if (err == DB_SUCCESS) { - - for (ulint i = 0; - i < log_files.size() && err == DB_SUCCESS; - i++) { - err = TruncateLogParser::parse(log_files[i]); - } - } - - trunc_log_files_t::const_iterator end = log_files.end(); - for (trunc_log_files_t::const_iterator it = log_files.begin(); - it != end; - ++it) { - if (*it != NULL) { - UT_DELETE_ARRAY(*it); - } - } - log_files.clear(); - - return(err); -} - -/** Check for presence of table-id in SYS_XXXX tables. */ -class TableLocator : public Callback { - -public: - /** - Constructor - @param table_id table_id to look for */ - explicit TableLocator(table_id_t table_id) - : - Callback(table_id, false), - m_table_found() - { - /* No op */ - } - - /** - @return true if table is found */ - bool is_table_found() const - { - return(m_table_found); - } - - /** - Look for table-id in SYS_XXXX tables without loading the table. - - @param pcur persistent cursor used for reading - @return DB_SUCCESS */ - dberr_t operator()(mtr_t*, btr_pcur_t*) - { - m_table_found = true; - return(DB_SUCCESS); - } - -private: - /** Set to true if table is present */ - bool m_table_found; -}; - -/** -Update system table to reflect new table id. -@param old_table_id old table id -@param new_table_id new table id -@param reserve_dict_mutex if TRUE, acquire/release - dict_sys->mutex around call to pars_sql. -@param trx transaction -@return error code or DB_SUCCESS */ -static MY_ATTRIBUTE((warn_unused_result)) -dberr_t -row_truncate_update_table_id( - table_id_t old_table_id, - table_id_t new_table_id, - ibool reserve_dict_mutex, - trx_t* trx) -{ - pars_info_t* info = NULL; - dberr_t err = DB_SUCCESS; - - /* Scan the SYS_XXXX table and update to reflect new table-id. 
*/ - info = pars_info_create(); - pars_info_add_ull_literal(info, "old_id", old_table_id); - pars_info_add_ull_literal(info, "new_id", new_table_id); - - err = que_eval_sql( - info, - "PROCEDURE RENUMBER_TABLE_ID_PROC () IS\n" - "BEGIN\n" - "UPDATE SYS_TABLES" - " SET ID = :new_id\n" - " WHERE ID = :old_id;\n" - "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" - " WHERE TABLE_ID = :old_id;\n" - "UPDATE SYS_INDEXES" - " SET TABLE_ID = :new_id\n" - " WHERE TABLE_ID = :old_id;\n" - "UPDATE SYS_VIRTUAL" - " SET TABLE_ID = :new_id\n" - " WHERE TABLE_ID = :old_id;\n" - "END;\n", reserve_dict_mutex, trx); - - return(err); -} - -/** -Get the table id to truncate. -@param truncate_t old/new table id of table to truncate -@return table_id_t table_id to use in SYS_XXXX table update. */ -static MY_ATTRIBUTE((warn_unused_result)) -table_id_t -row_truncate_get_trunc_table_id( - const truncate_t& truncate) -{ - TableLocator tableLocator(truncate.old_table_id()); - - SysIndexIterator().for_each(tableLocator); - - return(tableLocator.is_table_found() ? - truncate.old_table_id(): truncate.new_table_id()); -} - -/** -Update system table to reflect new table id and root page number. -@param truncate_t old/new table id of table to truncate - and updated root_page_no of indexes. -@param new_table_id new table id -@param reserve_dict_mutex if TRUE, acquire/release - dict_sys->mutex around call to pars_sql. -@param mark_index_corrupted if true, then mark index corrupted. 
-@return error code or DB_SUCCESS */ -static MY_ATTRIBUTE((warn_unused_result)) -dberr_t -row_truncate_update_sys_tables_during_fix_up( - const truncate_t& truncate, - table_id_t new_table_id, - ibool reserve_dict_mutex, - bool mark_index_corrupted) -{ - trx_t* trx = trx_create(); - - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - - table_id_t table_id = row_truncate_get_trunc_table_id(truncate); - - /* Step-1: Update the root-page-no */ - - dberr_t err; - - err = truncate.update_root_page_no( - trx, table_id, reserve_dict_mutex, mark_index_corrupted); - - if (err != DB_SUCCESS) { - return(err); - } - - /* Step-2: Update table-id. */ - - err = row_truncate_update_table_id( - table_id, new_table_id, reserve_dict_mutex, trx); - - if (err == DB_SUCCESS) { - dict_mutex_enter_for_mysql(); - - /* Remove the table with old table_id from cache. */ - dict_table_t* old_table = dict_table_open_on_id( - table_id, true, DICT_TABLE_OP_NORMAL); - - if (old_table != NULL) { - dict_table_close(old_table, true, false); - dict_table_remove_from_cache(old_table); - } - - /* Open table with new table_id and set table as - corrupted if it has FTS index. */ - - dict_table_t* table = dict_table_open_on_id( - new_table_id, true, DICT_TABLE_OP_NORMAL); - ut_ad(table->id == new_table_id); - - bool has_internal_doc_id = - dict_table_has_fts_index(table) - || DICT_TF2_FLAG_IS_SET( - table, DICT_TF2_FTS_HAS_DOC_ID); - - if (has_internal_doc_id) { - trx->dict_operation_lock_mode = RW_X_LATCH; - fts_check_corrupt(table, trx); - trx->dict_operation_lock_mode = 0; - } - - dict_table_close(table, true, false); - dict_mutex_exit_for_mysql(); - } - - trx_commit_for_mysql(trx); - trx_free(trx); - - return(err); -} - -/********************************************************//** -Recreates table indexes by applying -TRUNCATE log record during recovery. 
-@return DB_SUCCESS or error code */ -static -dberr_t -fil_recreate_table( -/*===============*/ - ulint format_flags, /*!< in: page format */ - const char* name, /*!< in: table name */ - truncate_t& truncate) /*!< in: The information of - TRUNCATE log record */ -{ - ut_ad(!truncate_t::s_fix_up_active); - truncate_t::s_fix_up_active = true; - - /* Step-1: Scan for active indexes from REDO logs and drop - all the indexes using low level function that take root_page_no - and space-id. */ - truncate.drop_indexes(fil_system.sys_space); - - /* Step-2: Scan for active indexes and re-create them. */ - dberr_t err = truncate.create_indexes( - name, fil_system.sys_space, format_flags); - if (err != DB_SUCCESS) { - ib::info() << "Recovery failed for TRUNCATE TABLE '" - << name << "' within the system tablespace"; - } - - truncate_t::s_fix_up_active = false; - - return(err); -} - -/********************************************************//** -Recreates the tablespace and table indexes by applying -TRUNCATE log record during recovery. -@return DB_SUCCESS or error code */ -static -dberr_t -fil_recreate_tablespace( -/*====================*/ - ulint space_id, /*!< in: space id */ - ulint format_flags, /*!< in: page format */ - ulint flags, /*!< in: tablespace flags */ - const char* name, /*!< in: table name */ - truncate_t& truncate, /*!< in: The information of - TRUNCATE log record */ - lsn_t recv_lsn) /*!< in: the end LSN of - the log record */ -{ - dberr_t err = DB_SUCCESS; - mtr_t mtr; - - ut_ad(!truncate_t::s_fix_up_active); - truncate_t::s_fix_up_active = true; - - /* Step-1: Invalidate buffer pool pages belonging to the tablespace - to re-create. */ - buf_LRU_flush_or_remove_pages(space_id, NULL); - - /* Remove all insert buffer entries for the tablespace */ - ibuf_delete_for_discarded_space(space_id); - - /* Step-2: truncate tablespace (reset the size back to original or - default size) of tablespace. 
*/ - err = truncate.truncate( - space_id, truncate.get_dir_path(), name, flags, true); - - if (err != DB_SUCCESS) { - - ib::info() << "Cannot access .ibd file for table '" - << name << "' with tablespace " << space_id - << " while truncating"; - return(DB_ERROR); - } - - fil_space_t* space = fil_space_acquire(space_id); - if (!space) { - ib::info() << "Missing .ibd file for table '" << name - << "' with tablespace " << space_id; - return(DB_ERROR); - } - - const page_size_t page_size(space->flags); - - /* Step-3: Initialize Header. */ - if (page_size.is_compressed()) { - byte* buf; - page_t* page; - - buf = static_cast<byte*>( - ut_zalloc_nokey(3U << srv_page_size_shift)); - - /* Align the memory for file i/o */ - page = static_cast<byte*>(ut_align(buf, srv_page_size)); - - flags |= FSP_FLAGS_PAGE_SSIZE(); - - fsp_header_init_fields(page, space_id, flags); - - mach_write_to_4( - page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); - - page_zip_des_t page_zip; - page_zip_set_size(&page_zip, page_size.physical()); - page_zip.data = page + srv_page_size; - -#ifdef UNIV_DEBUG - page_zip.m_start = -#endif /* UNIV_DEBUG */ - page_zip.m_end = page_zip.m_nonempty = page_zip.n_blobs = 0; - buf_flush_init_for_writing(NULL, page, &page_zip, 0); - - err = fil_io(IORequestWrite, true, page_id_t(space_id, 0), - page_size, 0, page_size.physical(), page_zip.data, - NULL); - - ut_free(buf); - - if (err != DB_SUCCESS) { - ib::info() << "Failed to clean header of the" - " table '" << name << "' with tablespace " - << space_id; - goto func_exit; - } - } - - mtr_start(&mtr); - /* Don't log the operation while fixing up table truncate operation - as crash at this level can still be sustained with recovery restarting - from last checkpoint. */ - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - - /* Initialize the first extent descriptor page and - the second bitmap page for the new tablespace. 
*/ - fsp_header_init(space, FIL_IBD_FILE_INITIAL_SIZE, &mtr); - mtr_commit(&mtr); - - /* Step-4: Re-Create Indexes to newly re-created tablespace. - This operation will restore tablespace back to what it was - when it was created during CREATE TABLE. */ - err = truncate.create_indexes(name, space, format_flags); - if (err != DB_SUCCESS) { - goto func_exit; - } - - /* Step-5: Write new created pages into ibd file handle and - flush it to disk for the tablespace, in case i/o-handler thread - deletes the bitmap page from buffer. */ - mtr_start(&mtr); - - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - - for (ulint page_no = 0; - page_no < UT_LIST_GET_FIRST(space->chain)->size; ++page_no) { - - const page_id_t cur_page_id(space_id, page_no); - - buf_block_t* block = buf_page_get(cur_page_id, page_size, - RW_X_LATCH, &mtr); - - byte* page = buf_block_get_frame(block); - - if (!FSP_FLAGS_GET_ZIP_SSIZE(flags)) { - ut_ad(!page_size.is_compressed()); - - buf_flush_init_for_writing( - block, page, NULL, recv_lsn); - - err = fil_io(IORequestWrite, true, cur_page_id, - page_size, 0, srv_page_size, page, NULL); - } else { - ut_ad(page_size.is_compressed()); - - /* We don't want to rewrite empty pages. 
*/ - - if (fil_page_get_type(page) != 0) { - page_zip_des_t* page_zip = - buf_block_get_page_zip(block); - - buf_flush_init_for_writing( - block, page, page_zip, recv_lsn); - - err = fil_io(IORequestWrite, true, - cur_page_id, - page_size, 0, - page_size.physical(), - page_zip->data, NULL); - } else { -#ifdef UNIV_DEBUG - const byte* data = block->page.zip.data; - - /* Make sure that the page is really empty */ - for (ulint i = 0; - i < page_size.physical(); - ++i) { - - ut_a(data[i] == 0); - } -#endif /* UNIV_DEBUG */ - } - } - - if (err != DB_SUCCESS) { - ib::info() << "Cannot write page " << page_no - << " into a .ibd file for table '" - << name << "' with tablespace " << space_id; - } - } - - mtr_commit(&mtr); - - truncate_t::s_fix_up_active = false; -func_exit: - space->release(); - return(err); -} - -/** -Fix the table truncate by applying information parsed from TRUNCATE log. -Fix-up includes re-creating table (drop and re-create indexes) -@return error code or DB_SUCCESS */ -dberr_t -truncate_t::fixup_tables_in_system_tablespace() -{ - dberr_t err = DB_SUCCESS; - - /* Using the info cached during REDO log scan phase fix the - table truncate. */ - - for (tables_t::iterator it = s_tables.begin(); - it != s_tables.end();) { - - if ((*it)->m_space_id == TRX_SYS_SPACE) { - /* Step-1: Drop and re-create indexes. */ - ib::info() << "Completing truncate for table with " - "id (" << (*it)->m_old_table_id << ") " - "residing in the system tablespace."; - - err = fil_recreate_table( - (*it)->m_format_flags, - (*it)->m_tablename, - **it); - - /* Step-2: Update the SYS_XXXX tables to reflect - this new table_id and root_page_no. */ - table_id_t new_id; - - dict_hdr_get_new_id(&new_id, NULL, NULL, NULL, true); - - err = row_truncate_update_sys_tables_during_fix_up( - **it, new_id, TRUE, - (err == DB_SUCCESS) ? 
false : true); - - if (err != DB_SUCCESS) { - break; - } - - os_file_delete( - innodb_log_file_key, (*it)->m_log_file_name); - UT_DELETE(*it); - it = s_tables.erase(it); - } else { - ++it; - } - } - - /* Also clear the map used to track tablespace truncated. */ - s_truncated_tables.clear(); - - return(err); -} - -/** -Fix the table truncate by applying information parsed from TRUNCATE log. -Fix-up includes re-creating tablespace. -@return error code or DB_SUCCESS */ -dberr_t -truncate_t::fixup_tables_in_non_system_tablespace() -{ - dberr_t err = DB_SUCCESS; - - /* Using the info cached during REDO log scan phase fix the - table truncate. */ - tables_t::iterator end = s_tables.end(); - - for (tables_t::iterator it = s_tables.begin(); it != end; ++it) { - - /* All tables in the system tablespace have already been - done and erased from this list. */ - ut_a((*it)->m_space_id != TRX_SYS_SPACE); - - /* Drop tablespace, drop indexes and re-create indexes. */ - - ib::info() << "Completing truncate for table with " - "id (" << (*it)->m_old_table_id << ") " - "residing in file-per-table tablespace with " - "id (" << (*it)->m_space_id << ")"; - - fil_space_t* space = fil_space_get((*it)->m_space_id); - - if (!space) { - /* Create the database directory for name, - if it does not exist yet */ - fil_create_directory_for_tablename( - (*it)->m_tablename); - - space = fil_ibd_create((*it)->m_space_id, - (*it)->m_tablename, - (*it)->m_dir_path, - (*it)->m_tablespace_flags, - FIL_IBD_FILE_INITIAL_SIZE, - (*it)->m_encryption, - (*it)->m_key_id, &err); - if (!space) { - /* If checkpoint is not yet done - and table is dropped and then we might - still have REDO entries for this table - which are INVALID. Ignore them. 
*/ - ib::warn() << "Failed to create" - " tablespace for " - << (*it)->m_space_id - << " space-id"; - err = DB_ERROR; - break; - } - } - - err = fil_recreate_tablespace( - (*it)->m_space_id, - (*it)->m_format_flags, - (*it)->m_tablespace_flags, - (*it)->m_tablename, - **it, log_get_lsn()); - - /* Step-2: Update the SYS_XXXX tables to reflect new - table-id and root_page_no. */ - table_id_t new_id; - - dict_hdr_get_new_id(&new_id, NULL, NULL, NULL, true); - - err = row_truncate_update_sys_tables_during_fix_up( - **it, new_id, TRUE, (err == DB_SUCCESS) ? false : true); - - if (err != DB_SUCCESS) { - break; - } - } - - if (err == DB_SUCCESS && s_tables.size() > 0) { - - log_make_checkpoint(); - } - - for (ulint i = 0; i < s_tables.size(); ++i) { - os_file_delete( - innodb_log_file_key, s_tables[i]->m_log_file_name); - UT_DELETE(s_tables[i]); - } - - s_tables.clear(); - - return(err); -} - -/** -Constructor - -@param old_table_id old table id assigned to table before truncate -@param new_table_id new table id that will be assigned to table - after truncate -@param dir_path directory path */ - -truncate_t::truncate_t( - table_id_t old_table_id, - table_id_t new_table_id, - const char* dir_path) - : - m_space_id(), - m_old_table_id(old_table_id), - m_new_table_id(new_table_id), - m_dir_path(), - m_tablename(), - m_tablespace_flags(), - m_format_flags(), - m_indexes(), - m_log_lsn(), - m_log_file_name(), - /* JAN: TODO: Encryption */ - m_encryption(FIL_ENCRYPTION_DEFAULT), - m_key_id(FIL_DEFAULT_ENCRYPTION_KEY) -{ - if (dir_path != NULL) { - m_dir_path = mem_strdup(dir_path); - } -} - -/** -Consturctor - -@param log_file_name parse the log file during recovery to populate - information related to table to truncate */ -truncate_t::truncate_t( - const char* log_file_name) - : - m_space_id(), - m_old_table_id(), - m_new_table_id(), - m_dir_path(), - m_tablename(), - m_tablespace_flags(), - m_format_flags(), - m_indexes(), - m_log_lsn(), - m_log_file_name(), - /* JAN: TODO: 
Encryption */ - m_encryption(FIL_ENCRYPTION_DEFAULT), - m_key_id(FIL_DEFAULT_ENCRYPTION_KEY) - -{ - m_log_file_name = mem_strdup(log_file_name); - if (m_log_file_name == NULL) { - ib::fatal() << "Failed creating truncate_t; out of memory"; - } -} - -/** Constructor */ - -truncate_t::index_t::index_t() - : - m_id(), - m_type(), - m_root_page_no(FIL_NULL), - m_new_root_page_no(FIL_NULL), - m_n_fields(), - m_trx_id_pos(ULINT_UNDEFINED), - m_fields() -{ - /* Do nothing */ -} - -/** Destructor */ - -truncate_t::~truncate_t() -{ - if (m_dir_path != NULL) { - ut_free(m_dir_path); - m_dir_path = NULL; - } - - if (m_tablename != NULL) { - ut_free(m_tablename); - m_tablename = NULL; - } - - if (m_log_file_name != NULL) { - ut_free(m_log_file_name); - m_log_file_name = NULL; - } - - m_indexes.clear(); -} - -/** -@return number of indexes parsed from the log record */ - -size_t -truncate_t::indexes() const -{ - return(m_indexes.size()); -} - -/** -Update root page number in SYS_XXXX tables. - -@param trx transaction object -@param table_id table id for which information needs to - be updated. -@param reserve_dict_mutex if TRUE, acquire/release - dict_sys->mutex around call to pars_sql. -@param mark_index_corrupted if true, then mark index corrupted. -@return DB_SUCCESS or error code */ - -dberr_t -truncate_t::update_root_page_no( - trx_t* trx, - table_id_t table_id, - ibool reserve_dict_mutex, - bool mark_index_corrupted) const -{ - indexes_t::const_iterator end = m_indexes.end(); - - dberr_t err = DB_SUCCESS; - - for (indexes_t::const_iterator it = m_indexes.begin(); - it != end; - ++it) { - - pars_info_t* info = pars_info_create(); - - pars_info_add_int4_literal( - info, "page_no", it->m_new_root_page_no); - - pars_info_add_ull_literal(info, "table_id", table_id); - - pars_info_add_ull_literal( - info, "index_id", - (mark_index_corrupted ? 
IB_ID_MAX : it->m_id)); - - err = que_eval_sql( - info, - "PROCEDURE RENUMBER_IDX_PAGE_NO_PROC () IS\n" - "BEGIN\n" - "UPDATE SYS_INDEXES" - " SET PAGE_NO = :page_no\n" - " WHERE TABLE_ID = :table_id" - " AND ID = :index_id;\n" - "END;\n", reserve_dict_mutex, trx); - - if (err != DB_SUCCESS) { - break; - } - } - - return(err); -} - -/** -Check whether a tablespace was truncated during recovery -@param space_id tablespace id to check -@return true if the tablespace was truncated */ - -bool -truncate_t::is_tablespace_truncated(ulint space_id) -{ - tables_t::iterator end = s_tables.end(); - - for (tables_t::iterator it = s_tables.begin(); it != end; ++it) { - - if ((*it)->m_space_id == space_id) { - - return(true); - } - } - - return(false); -} - -/** Was tablespace truncated (on crash before checkpoint). -If the MLOG_TRUNCATE redo-record is still available then tablespace -was truncated and checkpoint is yet to happen. -@param[in] space_id tablespace id to check. -@return true if tablespace is was truncated. */ -bool -truncate_t::was_tablespace_truncated(ulint space_id) -{ - return(s_truncated_tables.find(space_id) != s_truncated_tables.end()); -} - -/** Get the lsn associated with space. -@param[in] space_id tablespace id to check. -@return associated lsn. */ -lsn_t -truncate_t::get_truncated_tablespace_init_lsn(ulint space_id) -{ - ut_ad(was_tablespace_truncated(space_id)); - - return(s_truncated_tables.find(space_id)->second); -} - -/** -Parses log record during recovery -@param start_ptr buffer containing log body to parse -@param end_ptr buffer end - -@return DB_SUCCESS or error code */ - -dberr_t -truncate_t::parse( - byte* start_ptr, - const byte* end_ptr) -{ - /* Parse lsn, space-id, format-flags and tablespace-flags. 
*/ - if (end_ptr < start_ptr + (8 + 4 + 4 + 4)) { - return(DB_FAIL); - } - - m_log_lsn = mach_read_from_8(start_ptr); - start_ptr += 8; - - m_space_id = mach_read_from_4(start_ptr); - start_ptr += 4; - - m_format_flags = mach_read_from_4(start_ptr); - start_ptr += 4; - - m_tablespace_flags = mach_read_from_4(start_ptr); - start_ptr += 4; - - /* Parse table-name. */ - if (end_ptr < start_ptr + (2)) { - return(DB_FAIL); - } - - ulint n_tablename_len = mach_read_from_2(start_ptr); - start_ptr += 2; - - if (n_tablename_len > 0) { - if (end_ptr < start_ptr + n_tablename_len) { - return(DB_FAIL); - } - m_tablename = mem_strdup(reinterpret_cast<char*>(start_ptr)); - ut_ad(m_tablename[n_tablename_len - 1] == 0); - start_ptr += n_tablename_len; - } - - - /* Parse and read old/new table-id, number of indexes */ - if (end_ptr < start_ptr + (8 + 8 + 2 + 2)) { - return(DB_FAIL); - } - - ut_ad(m_indexes.empty()); - - m_old_table_id = mach_read_from_8(start_ptr); - start_ptr += 8; - - m_new_table_id = mach_read_from_8(start_ptr); - start_ptr += 8; - - ulint n_indexes = mach_read_from_2(start_ptr); - start_ptr += 2; - - /* Parse the remote directory from TRUNCATE log record */ - { - ulint n_tabledirpath_len = mach_read_from_2(start_ptr); - start_ptr += 2; - - if (end_ptr < start_ptr + n_tabledirpath_len) { - return(DB_FAIL); - } - - if (n_tabledirpath_len > 0) { - - m_dir_path = mem_strdup(reinterpret_cast<char*>(start_ptr)); - ut_ad(m_dir_path[n_tabledirpath_len - 1] == 0); - start_ptr += n_tabledirpath_len; - } - } - - /* Parse index ids and types from TRUNCATE log record */ - for (ulint i = 0; i < n_indexes; ++i) { - index_t index; - - if (end_ptr < start_ptr + (8 + 4 + 4 + 4)) { - return(DB_FAIL); - } - - index.m_id = mach_read_from_8(start_ptr); - start_ptr += 8; - - index.m_type = mach_read_from_4(start_ptr); - start_ptr += 4; - - index.m_root_page_no = mach_read_from_4(start_ptr); - start_ptr += 4; - - index.m_trx_id_pos = mach_read_from_4(start_ptr); - start_ptr += 4; - - 
if (!(index.m_type & DICT_FTS)) { - m_indexes.push_back(index); - } - } - - ut_ad(!m_indexes.empty()); - - if (FSP_FLAGS_GET_ZIP_SSIZE(m_tablespace_flags)) { - - /* Parse the number of index fields from TRUNCATE log record */ - for (ulint i = 0; i < m_indexes.size(); ++i) { - - if (end_ptr < start_ptr + (2 + 2)) { - return(DB_FAIL); - } - - m_indexes[i].m_n_fields = mach_read_from_2(start_ptr); - start_ptr += 2; - - ulint len = mach_read_from_2(start_ptr); - start_ptr += 2; - - if (end_ptr < start_ptr + len) { - return(DB_FAIL); - } - - index_t& index = m_indexes[i]; - - /* Should be NUL terminated. */ - ut_ad((start_ptr)[len - 1] == 0); - - index_t::fields_t::iterator end; - - end = index.m_fields.end(); - - index.m_fields.insert( - end, start_ptr, &(start_ptr)[len]); - - start_ptr += len; - } - } - - return(DB_SUCCESS); -} - -/** Parse log record from REDO log file during recovery. -@param[in,out] start_ptr buffer containing log body to parse -@param[in] end_ptr buffer end -@param[in] space_id tablespace identifier -@return parsed upto or NULL. */ -byte* -truncate_t::parse_redo_entry( - byte* start_ptr, - const byte* end_ptr, - ulint space_id) -{ - lsn_t lsn; - - /* Parse space-id, lsn */ - if (end_ptr < (start_ptr + 8)) { - return(NULL); - } - - lsn = mach_read_from_8(start_ptr); - start_ptr += 8; - - /* Tablespace can't exist in both state. - (scheduled-for-truncate, was-truncated). */ - if (!is_tablespace_truncated(space_id)) { - - truncated_tables_t::iterator it = - s_truncated_tables.find(space_id); - - if (it == s_truncated_tables.end()) { - s_truncated_tables.insert( - std::pair<ulint, lsn_t>(space_id, lsn)); - } else { - it->second = lsn; - } - } - - return(start_ptr); -} - -/** -Set the truncate log values for a compressed table. 
-@param index index from which recreate infoormation needs to be extracted -@return DB_SUCCESS or error code */ - -dberr_t -truncate_t::index_t::set( - const dict_index_t* index) -{ - /* Get trx-id column position (set only for clustered index) */ - if (dict_index_is_clust(index)) { - m_trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); - ut_ad(m_trx_id_pos > 0); - ut_ad(m_trx_id_pos != ULINT_UNDEFINED); - } else { - m_trx_id_pos = 0; - } - - /* Original logic set this field differently if page is not leaf. - For truncate case this being first page to get created it is - always a leaf page and so we don't need that condition here. */ - m_n_fields = dict_index_get_n_fields(index); - - /* See requirements of page_zip_fields_encode for size. */ - ulint encoded_buf_size = (m_n_fields + 1) * 2; - byte* encoded_buf = UT_NEW_ARRAY_NOKEY(byte, encoded_buf_size); - - if (encoded_buf == NULL) { - return(DB_OUT_OF_MEMORY); - } - - ulint len = page_zip_fields_encode( - m_n_fields, index, m_trx_id_pos, encoded_buf); - ut_a(len <= encoded_buf_size); - - /* Append the encoded fields data. */ - m_fields.insert(m_fields.end(), &encoded_buf[0], &encoded_buf[len]); - - /* NUL terminate the encoded data */ - m_fields.push_back(0); - - UT_DELETE_ARRAY(encoded_buf); - - return(DB_SUCCESS); -} - -/** Create an index for a table. 
-@param[in] table_name table name, for which to create -the index -@param[in] space tablespace -@param[in] page_size page size of the .ibd file -@param[in] index_type type of index to truncate -@param[in] index_id id of index to truncate -@param[in] btr_redo_create_info control info for ::btr_create() -@param[in,out] mtr mini-transaction covering the -create index -@return root page no or FIL_NULL on failure */ -inline ulint -truncate_t::create_index( - const char* table_name, - fil_space_t* space, - ulint index_type, - index_id_t index_id, - const btr_create_t& btr_redo_create_info, - mtr_t* mtr) const -{ - ulint root_page_no = btr_create( - index_type, space, index_id, - NULL, &btr_redo_create_info, mtr); - - if (root_page_no == FIL_NULL) { - - ib::info() << "innodb_force_recovery was set to " - << srv_force_recovery << ". Continuing crash recovery" - " even though we failed to create index " << index_id - << " for compressed table '" << table_name << "' with" - " file " << space->chain.start->name; - } - - return(root_page_no); -} - -/** Check if index has been modified since TRUNCATE log snapshot -was recorded. -@param[in] space tablespace -@param[in] root_page_no index root page number -@return true if modified else false */ -inline -bool -truncate_t::is_index_modified_since_logged( - const fil_space_t* space, - ulint root_page_no) const -{ - dberr_t err; - mtr_t mtr; - - mtr_start(&mtr); - - /* Root page could be in free state if truncate crashed after drop_index - and page was not allocated for any other object. 
*/ - buf_block_t* block= buf_page_get_gen( - page_id_t(space->id, root_page_no), page_size_t(space->flags), - RW_X_LATCH, NULL, - BUF_GET_POSSIBLY_FREED, __FILE__, __LINE__, &mtr, &err); - if (!block) return true; - - page_t* root = buf_block_get_frame(block); - -#ifdef UNIV_DEBUG - /* If the root page has been freed as part of truncate drop_index action - and not yet allocated for any object still the pagelsn > snapshot lsn */ - if (block->page.file_page_was_freed) { - ut_ad(mach_read_from_8(root + FIL_PAGE_LSN) > m_log_lsn); - } -#endif /* UNIV_DEBUG */ - - lsn_t page_lsn = mach_read_from_8(root + FIL_PAGE_LSN); - - mtr_commit(&mtr); - - if (page_lsn > m_log_lsn) { - return(true); - } - - return(false); -} - -/** Drop indexes for a table. -@param[in,out] space tablespace */ -void truncate_t::drop_indexes(fil_space_t* space) const -{ - mtr_t mtr; - - indexes_t::const_iterator end = m_indexes.end(); - const page_size_t page_size(space->flags); - - for (indexes_t::const_iterator it = m_indexes.begin(); - it != end; - ++it) { - - ulint root_page_no = it->m_root_page_no; - - if (is_index_modified_since_logged(space, root_page_no)) { - /* Page has been modified since TRUNCATE log snapshot - was recorded so not safe to drop the index. */ - continue; - } - - mtr_start(&mtr); - - if (space->id != TRX_SYS_SPACE) { - /* Do not log changes for single-table - tablespaces, we are in recovery mode. */ - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - } - - if (root_page_no != FIL_NULL) { - const page_id_t root_page_id(space->id, root_page_no); - - btr_free_if_exists( - root_page_id, page_size, it->m_id, &mtr); - } - - /* If tree is already freed then we might return immediately - in which case we need to release the lock we have acquired - on root_page. 
*/ - mtr_commit(&mtr); - } -} - - -/** Create the indexes for a table -@param[in] table_name table name, for which to create the indexes -@param[in,out] space tablespace -@param[in] format_flags page format flags -@return DB_SUCCESS or error code. */ -inline dberr_t -truncate_t::create_indexes( - const char* table_name, - fil_space_t* space, - ulint format_flags) -{ - mtr_t mtr; - - mtr_start(&mtr); - - if (space->id != TRX_SYS_SPACE) { - /* Do not log changes for single-table tablespaces, we - are in recovery mode. */ - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - } - - /* Create all new index trees with table format, index ids, index - types, number of index fields and index field information taken - out from the TRUNCATE log record. */ - - ulint root_page_no = FIL_NULL; - indexes_t::iterator end = m_indexes.end(); - for (indexes_t::iterator it = m_indexes.begin(); - it != end; - ++it) { - - btr_create_t btr_redo_create_info( - FSP_FLAGS_GET_ZIP_SSIZE(space->flags) - ? &it->m_fields[0] : NULL); - - btr_redo_create_info.format_flags = format_flags; - - if (FSP_FLAGS_GET_ZIP_SSIZE(space->flags)) { - - btr_redo_create_info.n_fields = it->m_n_fields; - /* Skip the NUL appended field */ - btr_redo_create_info.field_len = - it->m_fields.size() - 1; - btr_redo_create_info.trx_id_pos = it->m_trx_id_pos; - } - - root_page_no = create_index( - table_name, space, it->m_type, it->m_id, - btr_redo_create_info, &mtr); - - if (root_page_no == FIL_NULL) { - break; - } - - it->m_new_root_page_no = root_page_no; - } - - mtr_commit(&mtr); - - return(root_page_no == FIL_NULL ? DB_ERROR : DB_SUCCESS); -} - -/** -Write a TRUNCATE log record for fixing up table if truncate crashes. 
-@param start_ptr buffer to write log record -@param end_ptr buffer end -@param space_id space id -@param tablename the table name in the usual databasename/tablename - format of InnoDB -@param flags tablespace flags -@param format_flags page format -@param lsn lsn while logging -@return DB_SUCCESS or error code */ - -dberr_t -truncate_t::write( - byte* start_ptr, - byte* end_ptr, - ulint space_id, - const char* tablename, - ulint flags, - ulint format_flags, - lsn_t lsn) const -{ - if (end_ptr < start_ptr) { - return(DB_FAIL); - } - - /* LSN, Type, Space-ID, format-flag (also know as log_flag. - Stored in page_no field), tablespace flags */ - if (end_ptr < (start_ptr + (8 + 4 + 4 + 4))) { - return(DB_FAIL); - } - - mach_write_to_8(start_ptr, lsn); - start_ptr += 8; - - mach_write_to_4(start_ptr, space_id); - start_ptr += 4; - - mach_write_to_4(start_ptr, format_flags); - start_ptr += 4; - - mach_write_to_4(start_ptr, flags); - start_ptr += 4; - - /* Name of the table. */ - /* Include the NUL in the log record. */ - ulint len = strlen(tablename) + 1; - if (end_ptr < (start_ptr + (len + 2))) { - return(DB_FAIL); - } - - mach_write_to_2(start_ptr, len); - start_ptr += 2; - - memcpy(start_ptr, tablename, len - 1); - start_ptr += len; - - DBUG_EXECUTE_IF("ib_trunc_crash_while_writing_redo_log", - DBUG_SUICIDE();); - - /* Old/New Table-ID, Number of Indexes and Tablespace dir-path-name. */ - /* Write the remote directory of the table into mtr log */ - len = m_dir_path != NULL ? strlen(m_dir_path) + 1 : 0; - if (end_ptr < (start_ptr + (len + 8 + 8 + 2 + 2))) { - return(DB_FAIL); - } - - /* Write out old-table-id. */ - mach_write_to_8(start_ptr, m_old_table_id); - start_ptr += 8; - - /* Write out new-table-id. */ - mach_write_to_8(start_ptr, m_new_table_id); - start_ptr += 8; - - /* Write out the number of indexes. */ - mach_write_to_2(start_ptr, m_indexes.size()); - start_ptr += 2; - - /* Write the length (NUL included) of the .ibd path. 
*/ - mach_write_to_2(start_ptr, len); - start_ptr += 2; - - if (m_dir_path != NULL) { - memcpy(start_ptr, m_dir_path, len - 1); - start_ptr += len; - } - - /* Indexes information (id, type) */ - /* Write index ids, type, root-page-no into mtr log */ - for (ulint i = 0; i < m_indexes.size(); ++i) { - - if (end_ptr < (start_ptr + (8 + 4 + 4 + 4))) { - return(DB_FAIL); - } - - mach_write_to_8(start_ptr, m_indexes[i].m_id); - start_ptr += 8; - - mach_write_to_4(start_ptr, m_indexes[i].m_type); - start_ptr += 4; - - mach_write_to_4(start_ptr, m_indexes[i].m_root_page_no); - start_ptr += 4; - - mach_write_to_4(start_ptr, m_indexes[i].m_trx_id_pos); - start_ptr += 4; - } - - /* If tablespace compressed then field info of each index. */ - if (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { - - for (ulint i = 0; i < m_indexes.size(); ++i) { - - ulint len = m_indexes[i].m_fields.size(); - if (end_ptr < (start_ptr + (len + 2 + 2))) { - return(DB_FAIL); - } - - mach_write_to_2( - start_ptr, m_indexes[i].m_n_fields); - start_ptr += 2; - - mach_write_to_2(start_ptr, len); - start_ptr += 2; - - const byte* ptr = &m_indexes[i].m_fields[0]; - memcpy(start_ptr, ptr, len - 1); - start_ptr += len; - } - } - - return(DB_SUCCESS); -} diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc index 1e560dad7b3..2f66f3636ff 100644 --- a/storage/innobase/row/row0uins.cc +++ b/storage/innobase/row/row0uins.cc @@ -65,7 +65,6 @@ row_undo_ins_remove_clust_rec( /*==========================*/ undo_node_t* node) /*!< in: undo node */ { - btr_cur_t* btr_cur; ibool success; dberr_t err; ulint n_tries = 0; @@ -73,15 +72,27 @@ row_undo_ins_remove_clust_rec( dict_index_t* index = node->pcur.btr_cur.index; bool online; - ut_ad(dict_index_is_clust(index)); + ut_ad(index->is_primary()); ut_ad(node->trx->in_rollback); mtr.start(); if (index->table->is_temporary()) { ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); mtr.set_log_mode(MTR_LOG_NO_REDO); + ut_ad(!dict_index_is_online_ddl(index)); + 
ut_ad(index->table->id >= DICT_HDR_FIRST_ID); + online = false; } else { index->set_modified(mtr); + online = dict_index_is_online_ddl(index); + if (online) { + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + ut_ad(node->trx->dict_operation_lock_mode + != RW_X_LATCH); + ut_ad(node->table->id != DICT_INDEXES_ID); + ut_ad(node->table->id != DICT_COLUMNS_ID); + mtr_s_lock_index(index, &mtr); + } } /* This is similar to row_undo_mod_clust(). The DDL thread may @@ -90,105 +101,70 @@ row_undo_ins_remove_clust_rec( purged. However, we can log the removal out of sync with the B-tree modification. */ - online = dict_index_is_online_ddl(index); - if (online) { - ut_ad(node->trx->dict_operation_lock_mode - != RW_X_LATCH); - ut_ad(node->table->id != DICT_INDEXES_ID); - mtr_s_lock_index(index, &mtr); - } - success = btr_pcur_restore_position( online ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED - : BTR_MODIFY_LEAF, &node->pcur, &mtr); + : (node->rec_type == TRX_UNDO_INSERT_METADATA) + ? BTR_MODIFY_TREE : BTR_MODIFY_LEAF, &node->pcur, &mtr); ut_a(success); - btr_cur = btr_pcur_get_btr_cur(&node->pcur); + rec_t* rec = btr_pcur_get_rec(&node->pcur); - ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur->index) - == node->trx->id); - ut_ad(!rec_get_deleted_flag( - btr_cur_get_rec(btr_cur), - dict_table_is_comp(btr_cur->index->table))); + ut_ad(rec_get_trx_id(rec, index) == node->trx->id); + ut_ad(!rec_get_deleted_flag(rec, index->table->not_redundant()) + || rec_is_alter_metadata(rec, index->table->not_redundant())); + ut_ad(rec_is_metadata(rec, index->table->not_redundant()) + == (node->rec_type == TRX_UNDO_INSERT_METADATA)); if (online && dict_index_is_online_ddl(index)) { - const rec_t* rec = btr_cur_get_rec(btr_cur); mem_heap_t* heap = NULL; const rec_offs* offsets = rec_get_offsets( rec, index, NULL, true, ULINT_UNDEFINED, &heap); row_log_table_delete(rec, index, offsets, NULL); mem_heap_free(heap); - } - - switch (node->table->id) { - case DICT_INDEXES_ID: - ut_ad(!online); - 
ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); - ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); - - dict_drop_index_tree( - btr_pcur_get_rec(&node->pcur), &node->pcur, node->trx, - &mtr); - - mtr.commit(); + } else { + switch (node->table->id) { + case DICT_INDEXES_ID: + ut_ad(!online); + ut_ad(node->trx->dict_operation_lock_mode + == RW_X_LATCH); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); - mtr.start(); + dict_drop_index_tree(rec, &node->pcur, node->trx, + &mtr); + mtr.commit(); - success = btr_pcur_restore_position( - BTR_MODIFY_LEAF, &node->pcur, &mtr); - ut_a(success); - break; - case DICT_COLUMNS_ID: - /* This is rolling back an INSERT into SYS_COLUMNS. - If it was part of an instant ADD COLUMN operation, we - must modify the table definition. At this point, any - corresponding operation to the metadata record will have - been rolled back. */ - ut_ad(!online); - ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); - ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); - const rec_t* rec = btr_pcur_get_rec(&node->pcur); - if (rec_get_n_fields_old(rec) - != DICT_NUM_FIELDS__SYS_COLUMNS) { - break; - } - ulint len; - const byte* data = rec_get_nth_field_old( - rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len); - if (len != 8) { + mtr.start(); + success = btr_pcur_restore_position( + BTR_MODIFY_LEAF, &node->pcur, &mtr); + ut_a(success); break; + case DICT_COLUMNS_ID: + /* This is rolling back an INSERT into SYS_COLUMNS. + If it was part of an instant ALTER TABLE operation, we + must evict the table definition, so that it can be + reloaded after the dictionary operation has been + completed. At this point, any corresponding operation + to the metadata record will have been rolled back. 
*/ + ut_ad(!online); + ut_ad(node->trx->dict_operation_lock_mode + == RW_X_LATCH); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + if (rec_get_n_fields_old(rec) + != DICT_NUM_FIELDS__SYS_COLUMNS) { + break; + } + ulint len; + const byte* data = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len); + if (len != 8) { + break; + } + node->trx->evict_table(mach_read_from_8(data)); } - const table_id_t table_id = mach_read_from_8(data); - data = rec_get_nth_field_old(rec, DICT_FLD__SYS_COLUMNS__POS, - &len); - if (len != 4) { - break; - } - const unsigned pos = mach_read_from_4(data); - if (pos == 0 || pos >= (1U << 16)) { - break; - } - dict_table_t* table = dict_table_open_on_id( - table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); - if (!table) { - break; - } - - dict_index_t* index = dict_table_get_first_index(table); - - if (index && index->is_instant() - && DATA_N_SYS_COLS + 1 + pos == table->n_cols) { - /* This is the rollback of an instant ADD COLUMN. - Remove the column from the dictionary cache, - but keep the system columns. */ - table->rollback_instant(pos); - } - - dict_table_close(table, true, false); } - if (btr_cur_optimistic_delete(btr_cur, 0, &mtr)) { + if (btr_cur_optimistic_delete(&node->pcur.btr_cur, 0, &mtr)) { err = DB_SUCCESS; goto func_exit; } @@ -208,7 +184,8 @@ retry: &node->pcur, &mtr); ut_a(success); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, true, &mtr); + btr_cur_pessimistic_delete(&err, FALSE, &node->pcur.btr_cur, 0, true, + &mtr); /* The delete operation may fail if we have little file space left: TODO: easiest to crash the database @@ -227,29 +204,34 @@ retry: } func_exit: - btr_pcur_commit_specify_mtr(&node->pcur, &mtr); if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) { /* When rolling back the very first instant ADD COLUMN operation, reset the root page to the basic state. 
*/ ut_ad(!index->table->is_temporary()); - mtr.start(); if (page_t* root = btr_root_get(index, &mtr)) { byte* page_type = root + FIL_PAGE_TYPE; ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || mach_read_from_2(page_type) == FIL_PAGE_INDEX); - index->set_modified(mtr); mlog_write_ulint(page_type, FIL_PAGE_INDEX, MLOG_2BYTES, &mtr); byte* instant = PAGE_INSTANT + PAGE_HEADER + root; mlog_write_ulint(instant, page_ptr_get_direction(instant + 1), MLOG_2BYTES, &mtr); + rec_t* infimum = page_get_infimum_rec(root); + rec_t* supremum = page_get_supremum_rec(root); + static const byte str[8 + 8] = "supremuminfimum"; + if (memcmp(infimum, str + 8, 8) + || memcmp(supremum, str, 8)) { + mlog_write_string(infimum, str + 8, 8, &mtr); + mlog_write_string(supremum, str, 8, &mtr); + } } - mtr.commit(); } + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); return(err); } @@ -381,14 +363,10 @@ retry: return(err); } -/***********************************************************//** -Parses the row reference and other info in a fresh insert undo record. */ -static -void -row_undo_ins_parse_undo_rec( -/*========================*/ - undo_node_t* node, /*!< in/out: row undo node */ - ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */ +/** Parse an insert undo record. 
+@param[in,out] node row rollback state +@param[in] dict_locked whether the data dictionary cache is locked */ +static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked) { dict_index_t* clust_index; byte* ptr; @@ -397,18 +375,28 @@ row_undo_ins_parse_undo_rec( ulint dummy; bool dummy_extern; - ut_ad(node); + ut_ad(node->state == UNDO_INSERT_PERSISTENT + || node->state == UNDO_INSERT_TEMPORARY); + ut_ad(node->trx->in_rollback); + ut_ad(trx_undo_roll_ptr_is_insert(node->roll_ptr)); ptr = trx_undo_rec_get_pars(node->undo_rec, &node->rec_type, &dummy, &dummy_extern, &undo_no, &table_id); node->update = NULL; - node->table = dict_table_open_on_id( - table_id, dict_locked, DICT_TABLE_OP_NORMAL); + if (node->state == UNDO_INSERT_PERSISTENT) { + node->table = dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_NORMAL); + } else if (!dict_locked) { + mutex_enter(&dict_sys.mutex); + node->table = dict_sys.get_temporary_table(table_id); + mutex_exit(&dict_sys.mutex); + } else { + node->table = dict_sys.get_temporary_table(table_id); + } - /* Skip the UNDO if we can't find the table or the .ibd file. */ - if (UNIV_UNLIKELY(node->table == NULL)) { - return; + if (!node->table) { + return false; } switch (node->rec_type) { @@ -447,6 +435,7 @@ close_table: connection, instead of doing this rollback. 
*/ dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; + return false; } else { ut_ad(!node->table->skip_alter_undo); clust_index = dict_table_get_first_index(node->table); @@ -478,6 +467,8 @@ close_table: goto close_table; } } + + return true; } /***************************************************************//** @@ -554,18 +545,10 @@ row_undo_ins( que_thr_t* thr) /*!< in: query thread */ { dberr_t err; - ibool dict_locked; + bool dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH; - ut_ad(node->state == UNDO_NODE_INSERT); - ut_ad(node->trx->in_rollback); - ut_ad(trx_undo_roll_ptr_is_insert(node->roll_ptr)); - - dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH; - - row_undo_ins_parse_undo_rec(node, dict_locked); - - if (node->table == NULL) { - return(DB_SUCCESS); + if (!row_undo_ins_parse_undo_rec(node, dict_locked)) { + return DB_SUCCESS; } /* Iterate over all the indexes and undo the insert.*/ @@ -589,26 +572,19 @@ row_undo_ins( break; } - /* fall through */ - case TRX_UNDO_INSERT_METADATA: log_free_check(); if (node->table->id == DICT_INDEXES_ID) { - ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); - + ut_ad(!node->table->is_temporary()); if (!dict_locked) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); } - } - - // FIXME: We need to update the dict_index_t::space and - // page number fields too. 
- err = row_undo_ins_remove_clust_rec(node); - - if (node->table->id == DICT_INDEXES_ID - && !dict_locked) { - - mutex_exit(&dict_sys->mutex); + err = row_undo_ins_remove_clust_rec(node); + if (!dict_locked) { + mutex_exit(&dict_sys.mutex); + } + } else { + err = row_undo_ins_remove_clust_rec(node); } if (err == DB_SUCCESS && node->table->stat_initialized) { @@ -621,13 +597,19 @@ row_undo_ins( /* Do not attempt to update statistics when executing ROLLBACK in the InnoDB SQL interpreter, because in that case we would - already be holding dict_sys->mutex, which + already be holding dict_sys.mutex, which would be acquired when updating statistics. */ if (!dict_locked) { dict_stats_update_if_needed(node->table, *node->trx); } } + break; + + case TRX_UNDO_INSERT_METADATA: + log_free_check(); + ut_ad(!node->table->is_temporary()); + err = row_undo_ins_remove_clust_rec(node); } dict_table_close(node->table, dict_locked, FALSE); diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index 2ac54c4025e..e9551d33c73 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -111,6 +111,9 @@ row_undo_mod_clust_low( ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur_get_index(btr_cur)) == thr_get_trx(thr)->id); + ut_ad(node->ref != &trx_undo_metadata + || node->update->info_bits == REC_INFO_METADATA_ADD + || node->update->info_bits == REC_INFO_METADATA_ALTER); if (mode != BTR_MODIFY_LEAF && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) { @@ -131,6 +134,7 @@ row_undo_mod_clust_low( btr_cur, offsets, offsets_heap, node->update, node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + ut_ad(err != DB_SUCCESS || node->ref != &trx_undo_metadata); } else { big_rec_t* dummy_big_rec; @@ -143,6 +147,52 @@ row_undo_mod_clust_low( node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); ut_a(!dummy_big_rec); + + static const byte + INFIMUM[8] = {'i','n','f','i','m','u','m',0}, + SUPREMUM[8] = {'s','u','p','r','e','m','u','m'}; + + if 
(err == DB_SUCCESS + && node->ref == &trx_undo_metadata + && btr_cur_get_index(btr_cur)->table->instant + && node->update->info_bits == REC_INFO_METADATA_ADD) { + if (page_t* root = btr_root_get( + btr_cur_get_index(btr_cur), mtr)) { + byte* infimum; + byte *supremum; + if (page_is_comp(root)) { + infimum = PAGE_NEW_INFIMUM + root; + supremum = PAGE_NEW_SUPREMUM + root; + } else { + infimum = PAGE_OLD_INFIMUM + root; + supremum = PAGE_OLD_SUPREMUM + root; + } + + ut_ad(!memcmp(infimum, INFIMUM, 8) + == !memcmp(supremum, SUPREMUM, 8)); + + if (memcmp(infimum, INFIMUM, 8)) { + mlog_write_string(infimum, INFIMUM, + 8, mtr); + mlog_write_string(supremum, SUPREMUM, + 8, mtr); + } + } + } + } + + if (err == DB_SUCCESS + && btr_cur_get_index(btr_cur)->table->id == DICT_COLUMNS_ID) { + /* This is rolling back an UPDATE or DELETE on SYS_COLUMNS. + If it was part of an instant ALTER TABLE operation, we + must evict the table definition, so that it can be + reloaded after the dictionary operation has been + completed. At this point, any corresponding operation + to the metadata record will have been rolled back. */ + const dfield_t& table_id = *dtuple_get_nth_field(node->row, 0); + ut_ad(dfield_get_len(&table_id) == 8); + node->trx->evict_table(mach_read_from_8(static_cast<byte*>( + table_id.data))); } return(err); @@ -221,7 +271,7 @@ row_undo_mod_clust( ut_ad(thr_get_trx(thr) == node->trx); ut_ad(node->trx->dict_operation_lock_mode); ut_ad(node->trx->in_rollback); - ut_ad(rw_lock_own_flagged(&dict_operation_lock, + ut_ad(rw_lock_own_flagged(&dict_sys.latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); log_free_check(); @@ -278,7 +328,7 @@ row_undo_mod_clust( } /* Online rebuild cannot be initiated while we are holding - dict_operation_lock and index->lock. (It can be aborted.) */ + dict_sys.latch and index->lock. (It can be aborted.) 
*/ ut_ad(online || !dict_index_is_online_ddl(index)); if (err == DB_SUCCESS && online) { @@ -401,22 +451,49 @@ row_undo_mod_clust( goto mtr_commit_exit; } + ulint trx_id_offset = index->trx_id_offset; ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; - ut_ad(index->n_uniq <= MAX_REF_PARTS); - /* Reserve enough offsets for the PRIMARY KEY and 2 columns - so that we can access DB_TRX_ID, DB_ROLL_PTR. */ + /* Reserve enough offsets for the PRIMARY KEY and + 2 columns so that we can access DB_TRX_ID, DB_ROLL_PTR. */ rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; - rec_offs_init(offsets_); - offsets = rec_get_offsets( - rec, index, offsets_, true, trx_id_pos + 2, &heap); - ulint len; - ulint trx_id_offset = rec_get_nth_field_offs( - offsets, trx_id_pos, &len); - ut_ad(len == DATA_TRX_ID_LEN); + if (trx_id_offset) { +#ifdef UNIV_DEBUG + ut_ad(rec_offs_validate(NULL, index, offsets)); + if (buf_block_get_page_zip( + btr_pcur_get_block(&node->pcur))) { + /* Below, page_zip_write_trx_id_and_roll_ptr() + needs offsets to access DB_TRX_ID,DB_ROLL_PTR. + We already computed offsets for possibly + another record in the clustered index. + Because the PRIMARY KEY is fixed-length, + the offsets for the PRIMARY KEY and + DB_TRX_ID,DB_ROLL_PTR are still valid. + Silence the rec_offs_validate() assertion. 
*/ + rec_offs_make_valid(rec, index, true, offsets); + } +#endif + } else if (rec_is_metadata(rec, *index)) { + ut_ad(!buf_block_get_page_zip(btr_pcur_get_block( + &node->pcur))); + for (unsigned i = index->first_user_field(); i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else { + ut_ad(index->n_uniq <= MAX_REF_PARTS); + rec_offs_init(offsets_); + offsets = rec_get_offsets( + rec, index, offsets_, true, trx_id_pos + 2, + &heap); + ulint len; + trx_id_offset = rec_get_nth_field_offs( + offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + } if (trx_read_trx_id(rec + trx_id_offset) == node->new_trx_id) { ut_ad(!rec_get_deleted_flag( - rec, dict_table_is_comp(node->table))); + rec, dict_table_is_comp(node->table)) + || rec_is_alter_metadata(rec, *index)); index->set_modified(mtr); if (page_zip_des_t* page_zip = buf_block_get_page_zip( btr_pcur_get_block(&node->pcur))) { @@ -438,8 +515,6 @@ mtr_commit_exit: btr_pcur_commit_specify_mtr(pcur, &mtr); func_exit: - node->state = UNDO_NODE_FETCH_NEXT; - if (offsets_heap) { mem_heap_free(offsets_heap); } @@ -852,9 +927,9 @@ row_undo_mod_sec_flag_corrupted( on the data dictionary during normal rollback, we can only mark the index corrupted in the data dictionary cache. TODO: fix this somehow.*/ - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); dict_set_corrupted_index_cache_only(index); - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); break; default: ut_ad(0); @@ -1141,14 +1216,10 @@ row_undo_mod_upd_exist_sec( return(err); } -/***********************************************************//** -Parses the row reference and other info in a modify undo log record. */ -static MY_ATTRIBUTE((nonnull)) -void -row_undo_mod_parse_undo_rec( -/*========================*/ - undo_node_t* node, /*!< in: row undo node */ - ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */ +/** Parse an update undo record. 
+@param[in,out] node row rollback state +@param[in] dict_locked whether the data dictionary cache is locked */ +static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked) { dict_index_t* clust_index; byte* ptr; @@ -1161,19 +1232,28 @@ row_undo_mod_parse_undo_rec( ulint cmpl_info; bool dummy_extern; + ut_ad(node->state == UNDO_UPDATE_PERSISTENT + || node->state == UNDO_UPDATE_TEMPORARY); + ut_ad(node->trx->in_rollback); + ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr)); + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, &dummy_extern, &undo_no, &table_id); node->rec_type = type; - node->table = dict_table_open_on_id( - table_id, dict_locked, DICT_TABLE_OP_NORMAL); - - /* TODO: other fixes associated with DROP TABLE + rollback in the - same table by another user */ + if (node->state == UNDO_UPDATE_PERSISTENT) { + node->table = dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_NORMAL); + } else if (!dict_locked) { + mutex_enter(&dict_sys.mutex); + node->table = dict_sys.get_temporary_table(table_id); + mutex_exit(&dict_sys.mutex); + } else { + node->table = dict_sys.get_temporary_table(table_id); + } - if (node->table == NULL) { - /* Table was dropped */ - return; + if (!node->table) { + return false; } ut_ad(!node->table->skip_alter_undo); @@ -1191,7 +1271,7 @@ close_table: connection, instead of doing this rollback. */ dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; - return; + return false; } clust_index = dict_table_get_first_index(node->table); @@ -1210,16 +1290,21 @@ close_table: ut_ad(!node->ref->info_bits); if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) { - /* This must be an undo log record for a subsequent - instant ALTER TABLE, extending the metadata record. 
*/ - ut_ad(clust_index->is_instant()); - if (node->update->info_bits != REC_INFO_MIN_REC_FLAG) { + if ((node->update->info_bits & ~REC_INFO_DELETED_FLAG) + != REC_INFO_MIN_REC_FLAG) { ut_ad(!"wrong info_bits in undo log record"); goto close_table; } - node->update->info_bits = REC_INFO_METADATA; - const_cast<dtuple_t*>(node->ref)->info_bits - = REC_INFO_METADATA; + /* This must be an undo log record for a subsequent + instant ALTER TABLE, extending the metadata record. */ + ut_ad(clust_index->is_instant()); + ut_ad(clust_index->table->instant + || !(node->update->info_bits & REC_INFO_DELETED_FLAG)); + node->ref = &trx_undo_metadata; + node->update->info_bits = (node->update->info_bits + & REC_INFO_DELETED_FLAG) + ? REC_INFO_METADATA_ALTER + : REC_INFO_METADATA_ADD; } if (!row_undo_search_clust_to_pcur(node)) { @@ -1257,6 +1342,8 @@ close_table: (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) ? NULL : ptr); } + + return true; } /***********************************************************//** @@ -1269,34 +1356,19 @@ row_undo_mod( que_thr_t* thr) /*!< in: query thread */ { dberr_t err; - ibool dict_locked; - - ut_ad(node != NULL); - ut_ad(thr != NULL); - ut_ad(node->state == UNDO_NODE_MODIFY); - ut_ad(node->trx->in_rollback); - ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr)); - - dict_locked = thr_get_trx(thr)->dict_operation_lock_mode == RW_X_LATCH; - ut_ad(thr_get_trx(thr) == node->trx); + const bool dict_locked = node->trx->dict_operation_lock_mode + == RW_X_LATCH; - row_undo_mod_parse_undo_rec(node, dict_locked); - - if (node->table == NULL) { - /* It is already undone, or will be undone by another query - thread, or table was dropped */ - - node->state = UNDO_NODE_FETCH_NEXT; - - return(DB_SUCCESS); + if (!row_undo_mod_parse_undo_rec(node, dict_locked)) { + return DB_SUCCESS; } node->index = dict_table_get_first_index(node->table); ut_ad(dict_index_is_clust(node->index)); if (node->ref->info_bits) { - ut_ad(node->ref->info_bits == REC_INFO_METADATA); + 
ut_ad(node->ref->is_metadata()); goto rollback_clust; } @@ -1347,7 +1419,7 @@ rollback_clust: /* Do not attempt to update statistics when executing ROLLBACK in the InnoDB SQL interpreter, because in that case we would - already be holding dict_sys->mutex, which + already be holding dict_sys.mutex, which would be acquired when updating statistics. */ if (update_statistics && !dict_locked) { dict_stats_update_if_needed(node->table, diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc index 762228503be..a07258910b4 100644 --- a/storage/innobase/row/row0undo.cc +++ b/storage/innobase/row/row0undo.cc @@ -218,7 +218,8 @@ row_undo_search_clust_to_pcur( log, first mark them DATA_MISSING. So we will know if the value gets updated */ if (node->table->n_v_cols - && node->state != UNDO_NODE_INSERT + && (node->state == UNDO_UPDATE_PERSISTENT + || node->state == UNDO_UPDATE_TEMPORARY) && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { for (ulint i = 0; i < dict_table_get_n_v_cols(node->table); i++) { @@ -228,13 +229,15 @@ row_undo_search_clust_to_pcur( } if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { - ut_ad(node->row->info_bits == REC_INFO_MIN_REC_FLAG + ut_ad((node->row->info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_MIN_REC_FLAG || node->row->info_bits == 0); node->undo_row = dtuple_copy(node->row, node->heap); row_upd_replace(node->undo_row, &node->undo_ext, clust_index, node->update, node->heap); } else { - ut_ad((node->row->info_bits == REC_INFO_MIN_REC_FLAG) + ut_ad(((node->row->info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_MIN_REC_FLAG) == (node->rec_type == TRX_UNDO_INSERT_METADATA)); node->undo_row = NULL; node->undo_ext = NULL; @@ -252,6 +255,149 @@ func_exit: return(found); } +/** Try to truncate the undo logs. 
+@param[in,out] trx transaction */ +static void row_undo_try_truncate(trx_t* trx) +{ + if (trx_undo_t* undo = trx->rsegs.m_redo.undo) { + ut_ad(undo->rseg == trx->rsegs.m_redo.rseg); + trx_undo_truncate_end(*undo, trx->undo_no, false); + } + + if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) { + ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg); + trx_undo_truncate_end(*undo, trx->undo_no, true); + } +} + +/** Get the latest undo log record for rollback. +@param[in,out] node rollback context +@return whether an undo log record was fetched */ +static bool row_undo_rec_get(undo_node_t* node) +{ + trx_t* trx = node->trx; + + if (trx->pages_undone) { + trx->pages_undone = 0; + row_undo_try_truncate(trx); + } + + trx_undo_t* undo = NULL; + trx_undo_t* insert = trx->rsegs.m_redo.old_insert; + trx_undo_t* update = trx->rsegs.m_redo.undo; + trx_undo_t* temp = trx->rsegs.m_noredo.undo; + const undo_no_t limit = trx->roll_limit; + + ut_ad(!insert || !update || insert->empty() || update->empty() + || insert->top_undo_no != update->top_undo_no); + ut_ad(!insert || !temp || insert->empty() || temp->empty() + || insert->top_undo_no != temp->top_undo_no); + ut_ad(!update || !temp || update->empty() || temp->empty() + || update->top_undo_no != temp->top_undo_no); + + if (UNIV_LIKELY_NULL(insert) + && !insert->empty() && limit <= insert->top_undo_no) { + undo = insert; + } + + if (update && !update->empty() && update->top_undo_no >= limit) { + if (!undo) { + undo = update; + } else if (undo->top_undo_no < update->top_undo_no) { + undo = update; + } + } + + if (temp && !temp->empty() && temp->top_undo_no >= limit) { + if (!undo) { + undo = temp; + } else if (undo->top_undo_no < temp->top_undo_no) { + undo = temp; + } + } + + if (undo == NULL) { + row_undo_try_truncate(trx); + /* Mark any ROLLBACK TO SAVEPOINT completed, so that + if the transaction object is committed and reused + later, we will default to a full ROLLBACK. 
*/ + trx->roll_limit = 0; + trx->in_rollback = false; + return false; + } + + ut_ad(!undo->empty()); + ut_ad(limit <= undo->top_undo_no); + + node->roll_ptr = trx_undo_build_roll_ptr( + false, undo->rseg->id, undo->top_page_no, undo->top_offset); + + mtr_t mtr; + mtr.start(); + + page_t* undo_page = trx_undo_page_get_s_latched( + page_id_t(undo->rseg->space->id, undo->top_page_no), &mtr); + + ulint offset = undo->top_offset; + + trx_undo_rec_t* prev_rec = trx_undo_get_prev_rec( + undo_page + offset, undo->hdr_page_no, undo->hdr_offset, + true, &mtr); + + if (prev_rec == NULL) { + undo->top_undo_no = IB_ID_MAX; + ut_ad(undo->empty()); + } else { + page_t* prev_rec_page = page_align(prev_rec); + + if (prev_rec_page != undo_page) { + + trx->pages_undone++; + } + + undo->top_page_no = page_get_page_no(prev_rec_page); + undo->top_offset = ulint(prev_rec - prev_rec_page); + undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); + ut_ad(!undo->empty()); + } + + { + const trx_undo_rec_t* undo_rec = undo_page + offset; + node->undo_rec = trx_undo_rec_copy(undo_rec, node->heap); + } + + mtr.commit(); + + switch (trx_undo_rec_get_type(node->undo_rec)) { + case TRX_UNDO_INSERT_METADATA: + /* This record type was introduced in MDEV-11369 + instant ADD COLUMN, which was implemented after + MDEV-12288 removed the insert_undo log. There is no + instant ADD COLUMN for temporary tables. Therefore, + this record can only be present in the main undo log. */ + ut_ad(undo == update); + /* fall through */ + case TRX_UNDO_RENAME_TABLE: + ut_ad(undo == insert || undo == update); + /* fall through */ + case TRX_UNDO_INSERT_REC: + ut_ad(undo == insert || undo == update || undo == temp); + node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS; + node->state = undo == temp + ? UNDO_INSERT_TEMPORARY : UNDO_INSERT_PERSISTENT; + break; + default: + ut_ad(undo == update || undo == temp); + node->state = undo == temp + ? 
UNDO_UPDATE_TEMPORARY : UNDO_UPDATE_PERSISTENT; + break; + } + + trx->undo_no = node->undo_no = trx_undo_rec_get_undo_no( + node->undo_rec); + return true; +} + /***********************************************************//** Fetches an undo log record and does the undo for the recorded operation. If none left, or a partial rollback completed, returns control to the @@ -264,55 +410,47 @@ row_undo( undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { - trx_t* trx = node->trx; - ut_ad(trx->in_rollback); - - if (node->state == UNDO_NODE_FETCH_NEXT) { - - node->undo_rec = trx_roll_pop_top_rec_of_trx( - trx, &node->roll_ptr, node->heap); - - if (!node->undo_rec) { - /* Rollback completed for this query thread */ - thr->run_node = que_node_get_parent(node); - return(DB_SUCCESS); - } + ut_ad(node->trx->in_rollback); - node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); - node->state = trx_undo_roll_ptr_is_insert(node->roll_ptr) - ? UNDO_NODE_INSERT : UNDO_NODE_MODIFY; + if (node->state == UNDO_NODE_FETCH_NEXT && !row_undo_rec_get(node)) { + /* Rollback completed for this query thread */ + thr->run_node = que_node_get_parent(node); + return DB_SUCCESS; } /* Prevent DROP TABLE etc. while we are rolling back this row. If we are doing a TABLE CREATE or some other dictionary operation, - then we already have dict_operation_lock locked in x-mode. Do not + then we already have dict_sys.latch locked in x-mode. Do not try to lock again, because that would cause a hang. 
*/ + trx_t* trx = node->trx; const bool locked_data_dict = (trx->dict_operation_lock_mode == 0); if (locked_data_dict) { - row_mysql_freeze_data_dictionary(trx); } dberr_t err; - if (node->state == UNDO_NODE_INSERT) { - + switch (node->state) { + case UNDO_INSERT_PERSISTENT: + case UNDO_INSERT_TEMPORARY: err = row_undo_ins(node, thr); - - node->state = UNDO_NODE_FETCH_NEXT; - } else { - ut_ad(node->state == UNDO_NODE_MODIFY); + break; + case UNDO_UPDATE_PERSISTENT: + case UNDO_UPDATE_TEMPORARY: err = row_undo_mod(node, thr); + break; + default: + ut_ad(!"wrong state"); + err = DB_CORRUPTION; } if (locked_data_dict) { - row_mysql_unfreeze_data_dictionary(trx); } - /* Do some cleanup */ + node->state = UNDO_NODE_FETCH_NEXT; btr_pcur_close(&(node->pcur)); mem_heap_empty(node->heap); diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index ee432fdddb7..f41ef6fd4af 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -122,7 +122,7 @@ row_upd_changes_first_fields_binary( Checks if index currently is mentioned as a referenced index in a foreign key constraint. -NOTE that since we do not hold dict_operation_lock when leaving the +NOTE that since we do not hold dict_sys.latch when leaving the function, it may be that the referencing table has been dropped when we leave this function: this function is only for heuristic use! @@ -288,7 +288,7 @@ row_upd_check_references_constraints( } /* NOTE that if the thread ends up waiting for a lock - we will release dict_operation_lock temporarily! + we will release dict_sys.latch temporarily! But the inc_fk_checks() protects foreign_table from being dropped while the check is running. */ @@ -394,7 +394,7 @@ wsrep_row_upd_check_foreign_constraints( } /* NOTE that if the thread ends up waiting for a lock - we will release dict_operation_lock temporarily! + we will release dict_sys.latch temporarily! 
But the counter on the table protects 'foreign' from being dropped while the check is running. */ @@ -492,39 +492,6 @@ row_upd_rec_sys_fields_in_recovery( } } -/*********************************************************************//** -Sets the trx id or roll ptr field of a clustered index entry. */ -void -row_upd_index_entry_sys_field( -/*==========================*/ - dtuple_t* entry, /*!< in/out: index entry, where the memory - buffers for sys fields are already allocated: - the function just copies the new values to - them */ - dict_index_t* index, /*!< in: clustered index */ - ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */ - ib_uint64_t val) /*!< in: value to write */ -{ - dfield_t* dfield; - byte* field; - ulint pos; - - ut_ad(dict_index_is_clust(index)); - - pos = dict_index_get_sys_col_pos(index, type); - - dfield = dtuple_get_nth_field(entry, pos); - field = static_cast<byte*>(dfield_get_data(dfield)); - - if (type == DATA_TRX_ID) { - ut_ad(val > 0); - trx_write_trx_id(field, val); - } else { - ut_ad(type == DATA_ROLL_PTR); - trx_write_roll_ptr(field, val); - } -} - /***********************************************************//** Returns TRUE if row update changes size of some field in index or if some field to be updated is stored externally in rec or update. @@ -677,7 +644,7 @@ row_upd_rec_in_place( switch (rec_get_status(rec)) { case REC_STATUS_ORDINARY: break; - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: ut_ad(index->is_instant()); break; case REC_STATUS_NODE_PTR: @@ -728,35 +695,6 @@ row_upd_rec_in_place( } /*********************************************************************//** -Writes into the redo log the values of trx id and roll ptr and enough info -to determine their positions within a clustered index record. 
-@return new pointer to mlog */ -byte* -row_upd_write_sys_vals_to_log( -/*==========================*/ - dict_index_t* index, /*!< in: clustered index */ - trx_id_t trx_id, /*!< in: transaction id */ - roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ - byte* log_ptr,/*!< pointer to a buffer of size > 20 opened - in mlog */ - mtr_t* mtr MY_ATTRIBUTE((unused))) /*!< in: mtr */ -{ - ut_ad(dict_index_is_clust(index)); - ut_ad(mtr); - - log_ptr += mach_write_compressed(log_ptr, - dict_index_get_sys_col_pos( - index, DATA_TRX_ID)); - - trx_write_roll_ptr(log_ptr, roll_ptr); - log_ptr += DATA_ROLL_PTR_LEN; - - log_ptr += mach_u64_write_compressed(log_ptr, trx_id); - - return(log_ptr); -} - -/*********************************************************************//** Parses the log data of system field values. @return log data end or NULL */ byte* @@ -1049,7 +987,6 @@ row_upd_build_difference_binary( ulint len; upd_t* update; ulint n_diff; - ulint trx_id_pos; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; const ulint n_v_fld = dtuple_get_n_v_fields(entry); rec_offs_init(offsets_); @@ -1064,10 +1001,6 @@ row_upd_build_difference_binary( n_diff = 0; - trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); - ut_ad(dict_index_get_sys_col_pos(index, DATA_ROLL_PTR) - == trx_id_pos + 1); - if (!offsets) { offsets = rec_get_offsets(rec, index, offsets_, true, ULINT_UNDEFINED, &heap); @@ -1082,16 +1015,9 @@ row_upd_build_difference_binary( /* NOTE: we compare the fields as binary strings! (No collation) */ - if (no_sys) { - /* TRX_ID */ - if (i == trx_id_pos) { - continue; - } - - /* DB_ROLL_PTR */ - if (i == trx_id_pos + 1) { - continue; - } + if (no_sys && (i == index->db_trx_id() + || i == index->db_roll_ptr())) { + continue; } if (!dfield_is_ext(dfield) @@ -1197,7 +1123,7 @@ of the column and must not be poisoned with the new values. 
@param[in] data 'internally' stored part of the field containing also the reference to the external part @param[in] local_len length of data, in bytes -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] len input - length of prefix to fetch; output: fetched length of the prefix @param[in,out] heap heap where to allocate @@ -1207,14 +1133,14 @@ byte* row_upd_ext_fetch( const byte* data, ulint local_len, - const page_size_t& page_size, + ulint zip_size, ulint* len, mem_heap_t* heap) { byte* buf = static_cast<byte*>(mem_heap_alloc(heap, *len)); *len = btr_copy_externally_stored_field_prefix( - buf, *len, page_size, data, local_len); + buf, *len, zip_size, data, local_len); /* We should never update records containing a half-deleted BLOB. */ ut_a(*len); @@ -1230,7 +1156,7 @@ the given index entry field. @param[in] uf update field @param[in,out] heap memory heap for allocating and copying the new value -@param[in] page_size page size */ +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ static void row_upd_index_replace_new_col_val( @@ -1239,7 +1165,7 @@ row_upd_index_replace_new_col_val( const dict_col_t* col, const upd_field_t* uf, mem_heap_t* heap, - const page_size_t& page_size) + ulint zip_size) { ulint len; const byte* data; @@ -1253,7 +1179,7 @@ row_upd_index_replace_new_col_val( len = dfield_get_len(dfield); data = static_cast<const byte*>(dfield_get_data(dfield)); - if (field->prefix_len > 0) { + if (field && field->prefix_len > 0) { ibool fetch_ext = dfield_is_ext(dfield) && len < (ulint) field->prefix_len + BTR_EXTERN_FIELD_REF_SIZE; @@ -1263,7 +1189,7 @@ row_upd_index_replace_new_col_val( len = field->prefix_len; - data = row_upd_ext_fetch(data, l, page_size, + data = row_upd_ext_fetch(data, l, zip_size, &len, heap); } @@ -1319,6 +1245,57 @@ row_upd_index_replace_new_col_val( } } +/** Apply an update vector to an metadata entry. 
+@param[in,out] entry clustered index metadata record to be updated +@param[in] index index of the entry +@param[in] update update vector built for the entry +@param[in,out] heap memory heap for copying off-page columns */ +static +void +row_upd_index_replace_metadata( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + mem_heap_t* heap) +{ + ut_ad(!index->table->skip_alter_undo); + ut_ad(update->is_alter_metadata()); + ut_ad(entry->info_bits == update->info_bits); + ut_ad(entry->n_fields == ulint(index->n_fields) + 1); + const ulint zip_size = index->table->space->zip_size(); + const ulint first = index->first_user_field(); + ut_d(bool found_mblob = false); + + for (ulint i = upd_get_n_fields(update); i--; ) { + const upd_field_t* uf = upd_get_nth_field(update, i); + ut_ad(!upd_fld_is_virtual_col(uf)); + ut_ad(uf->field_no >= first - 2); + ulint f = uf->field_no; + dfield_t* dfield = dtuple_get_nth_field(entry, f); + + if (f == first) { + ut_d(found_mblob = true); + ut_ad(!dfield_is_null(&uf->new_val)); + ut_ad(dfield_is_ext(dfield)); + ut_ad(dfield_get_len(dfield) == FIELD_REF_SIZE); + ut_ad(!dfield_is_null(dfield)); + dfield_set_data(dfield, uf->new_val.data, + uf->new_val.len); + if (dfield_is_ext(&uf->new_val)) { + dfield_set_ext(dfield); + } + continue; + } + + f -= f > first; + const dict_field_t* field = dict_index_get_nth_field(index, f); + row_upd_index_replace_new_col_val(dfield, field, field->col, + uf, heap, zip_size); + } + + ut_ad(found_mblob); +} + /** Apply an update vector to an index entry. 
@param[in,out] entry index entry to be updated; the clustered index record must be covered by a lock or a page latch to prevent @@ -1334,8 +1311,14 @@ row_upd_index_replace_new_col_vals_index_pos( mem_heap_t* heap) { ut_ad(!index->table->skip_alter_undo); + ut_ad(!entry->is_metadata() || entry->info_bits == update->info_bits); + + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + row_upd_index_replace_metadata(entry, index, update, heap); + return; + } - const page_size_t& page_size = dict_table_page_size(index->table); + const ulint zip_size = index->table->space->zip_size(); dtuple_set_info_bits(entry, update->info_bits); @@ -1361,7 +1344,7 @@ row_upd_index_replace_new_col_vals_index_pos( if (uf) { row_upd_index_replace_new_col_val( dtuple_get_nth_field(entry, i), - field, col, uf, heap, page_size); + field, col, uf, heap, zip_size); } } } @@ -1387,7 +1370,7 @@ row_upd_index_replace_new_col_vals( ulint i; const dict_index_t* clust_index = dict_table_get_first_index(index->table); - const page_size_t& page_size = dict_table_page_size(index->table); + const ulint zip_size = index->table->space->zip_size(); ut_ad(!index->table->skip_alter_undo); @@ -1417,7 +1400,7 @@ row_upd_index_replace_new_col_vals( if (uf) { row_upd_index_replace_new_col_val( dtuple_get_nth_field(entry, i), - field, col, uf, heap, page_size); + field, col, uf, heap, zip_size); } } } @@ -1641,8 +1624,7 @@ row_upd_replace( } if (n_ext_cols) { - *ext = row_ext_create(n_ext_cols, ext_cols, table->flags, row, - heap); + *ext = row_ext_create(n_ext_cols, ext_cols, *table, row, heap); } else { *ext = NULL; } @@ -1750,11 +1732,9 @@ row_upd_changes_ord_field_binary_func( mem_heap_t* temp_heap = NULL; const dfield_t* new_field = &upd_field->new_val; - const page_size_t page_size - = (ext != NULL) - ? ext->page_size - : dict_table_page_size( - index->table); + const ulint zip_size = ext + ? 
ext->zip_size + : index->table->space->zip_size(); ut_ad(dfield->data != NULL && dfield->len > GEO_DATA_HEADER_SIZE); @@ -1771,7 +1751,7 @@ row_upd_changes_ord_field_binary_func( dptr = btr_copy_externally_stored_field( &dlen, dptr, - page_size, + zip_size, flen, temp_heap); } else { @@ -1834,7 +1814,7 @@ row_upd_changes_ord_field_binary_func( dptr = btr_copy_externally_stored_field( &dlen, dptr, - page_size, + zip_size, flen, temp_heap); } else { @@ -2443,7 +2423,7 @@ row_upd_sec_index_entry( #ifdef UNIV_DEBUG mtr_commit(&mtr); mtr_start(&mtr); - ut_ad(btr_validate_index(index, 0, false)); + ut_ad(btr_validate_index(index, 0)); ut_ad(0); #endif /* UNIV_DEBUG */ break; @@ -2574,10 +2554,10 @@ row_upd_sec_step( } #ifdef UNIV_DEBUG -# define row_upd_clust_rec_by_insert_inherit(rec,offsets,entry,update) \ - row_upd_clust_rec_by_insert_inherit_func(rec,offsets,entry,update) +# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \ + row_upd_clust_rec_by_insert_inherit_func(rec,index,offsets,entry,update) #else /* UNIV_DEBUG */ -# define row_upd_clust_rec_by_insert_inherit(rec,offsets,entry,update) \ +# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \ row_upd_clust_rec_by_insert_inherit_func(rec,entry,update) #endif /* UNIV_DEBUG */ /*******************************************************************//** @@ -2592,6 +2572,7 @@ row_upd_clust_rec_by_insert_inherit_func( /*=====================================*/ const rec_t* rec, /*!< in: old record, or NULL */ #ifdef UNIV_DEBUG + dict_index_t* index, /*!< in: index, or NULL */ const rec_offs* offsets,/*!< in: rec_get_offsets(rec), or NULL */ #endif /* UNIV_DEBUG */ dtuple_t* entry, /*!< in/out: updated entry to be @@ -2602,6 +2583,8 @@ row_upd_clust_rec_by_insert_inherit_func( ulint i; ut_ad(!rec == !offsets); + ut_ad(!rec == !index); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); ut_ad(!rec || rec_offs_any_extern(offsets)); for (i = 0; i < 
dtuple_get_n_fields(entry); i++) { @@ -2612,6 +2595,9 @@ row_upd_clust_rec_by_insert_inherit_func( ut_ad(!offsets || !rec_offs_nth_extern(offsets, i) == !dfield_is_ext(dfield) + || (!dict_index_get_nth_field(index, i)->name + && !dfield_is_ext(dfield) + && (dfield_is_null(dfield) || dfield->len == 0)) || upd_get_field_by_field_no(update, i, false)); if (!dfield_is_ext(dfield) || upd_get_field_by_field_no(update, i, false)) { @@ -2712,7 +2698,11 @@ row_upd_clust_rec_by_insert( if (index->is_instant()) entry->trim(*index); ut_ad(dtuple_get_info_bits(entry) == 0); - row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); + { + dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id()); + ut_ad(t->len == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(t->data), trx->id); + } switch (node->state) { default: @@ -2721,7 +2711,7 @@ row_upd_clust_rec_by_insert( /* A lock wait occurred in row_ins_clust_index_entry() in the previous invocation of this function. */ row_upd_clust_rec_by_insert_inherit( - NULL, NULL, entry, node->update); + NULL, NULL, NULL, entry, node->update); break; case UPD_NODE_UPDATE_CLUSTERED: /* This is the first invocation of the function where @@ -2762,7 +2752,8 @@ err_exit: if (rec_offs_any_extern(offsets)) { if (row_upd_clust_rec_by_insert_inherit( - rec, offsets, entry, node->update)) { + rec, index, offsets, + entry, node->update)) { /* The blobs are disowned here, expecting the insert down below to inherit them. But if the insert fails, then this disown will be undone diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc index 10caa0193a7..6167c8daeba 100644 --- a/storage/innobase/srv/srv0conc.cc +++ b/storage/innobase/srv/srv0conc.cc @@ -67,14 +67,12 @@ ulong srv_thread_concurrency = 0; /** Variables tracking the active and waiting threads. 
*/ struct srv_conc_t { - char pad[CACHE_LINE_SIZE - (sizeof(ulint) + sizeof(lint))]; - /** Number of transactions that have declared_to_be_inside_innodb */ - ulint n_active; + MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_counter<ulint> n_active; /** Number of OS threads waiting in the FIFO for permission to enter InnoDB */ - ulint n_waiting; + MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_counter<ulint> n_waiting; }; /* Control variables for tracking concurrency. */ @@ -120,7 +118,7 @@ srv_conc_enter_innodb_with_atomics( for (;;) { ulint sleep_in_us; #ifdef WITH_WSREP - if (trx->is_wsrep() && wsrep_trx_is_aborting(trx->mysql_thd)) { + if (trx->is_wsrep() && wsrep_thd_is_aborting(trx->mysql_thd)) { if (UNIV_UNLIKELY(wsrep_debug)) { ib::info() << "srv_conc_enter due to MUST_ABORT"; @@ -132,8 +130,7 @@ srv_conc_enter_innodb_with_atomics( if (srv_thread_concurrency == 0) { if (notified_mysql) { - my_atomic_addlint(&srv_conc.n_waiting, - ulint(-1)); + srv_conc.n_waiting--; thd_wait_end(trx->mysql_thd); } @@ -141,19 +138,14 @@ srv_conc_enter_innodb_with_atomics( } if (srv_conc.n_active < srv_thread_concurrency) { - ulint n_active; /* Check if there are any free tickets. */ - n_active = my_atomic_addlint( - &srv_conc.n_active, 1) + 1; - - if (n_active <= srv_thread_concurrency) { + if (srv_conc.n_active++ < srv_thread_concurrency) { srv_enter_innodb_with_tickets(trx); if (notified_mysql) { - my_atomic_addlint(&srv_conc.n_waiting, - ulint(-1)); + srv_conc.n_waiting--; thd_wait_end(trx->mysql_thd); } @@ -175,11 +167,11 @@ srv_conc_enter_innodb_with_atomics( /* Since there were no free seats, we relinquish the overbooked ticket. 
*/ - my_atomic_addlint(&srv_conc.n_active, ulint(-1)); + srv_conc.n_active--; } if (!notified_mysql) { - my_atomic_addlint(&srv_conc.n_waiting, 1); + srv_conc.n_waiting++; thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK); @@ -223,7 +215,7 @@ srv_conc_exit_innodb_with_atomics( trx->n_tickets_to_enter_innodb = 0; trx->declared_to_be_inside_innodb = FALSE; - my_atomic_addlint(&srv_conc.n_active, ulint(-1)); + srv_conc.n_active--; } /*********************************************************************//** @@ -257,7 +249,7 @@ srv_conc_force_enter_innodb( return; } - (void) my_atomic_addlint(&srv_conc.n_active, 1); + srv_conc.n_active++; trx->n_tickets_to_enter_innodb = 1; trx->declared_to_be_inside_innodb = TRUE; diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index 0404335574a..c759a5b75fe 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -298,12 +298,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ}, - {"buffer_pages0_read", "buffer", - "Number of page 0 read (innodb_pages0_read)", - static_cast<monitor_type_t>( - MONITOR_EXISTING | MONITOR_DEFAULT_ON), - MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES0_READ}, - {"buffer_index_sec_rec_cluster_reads", "buffer", "Number of secondary record reads triggered cluster read", static_cast<monitor_type_t>( @@ -802,11 +796,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_SAVEPOINT}, - {"trx_rollback_active", "transaction", - "Number of resurrected active transactions rolled back", - MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_ACTIVE}, - {"trx_active_transactions", "transaction", "Number of active transactions", MONITOR_NONE, @@ -1748,11 +1737,6 @@ srv_mon_process_existing_counter( value = stat.n_pages_read; break; - /* innodb_pages0_read */ - case MONITOR_OVLD_PAGES0_READ: - value = srv_stats.page0_read; 
- break; - /* Number of times secondary index lookup triggered cluster lookup */ case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS: value = srv_stats.n_sec_rec_cluster_reads; @@ -1955,7 +1939,7 @@ srv_mon_process_existing_counter( break; case MONITOR_RSEG_HISTORY_LEN: - value = trx_sys.history_size(); + value = trx_sys.rseg_history_len; break; case MONITOR_RSEG_CUR_SIZE: @@ -2052,7 +2036,7 @@ srv_mon_process_existing_counter( #endif /* BTR_CUR_HASH_ADAPT */ case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE: - value = my_atomic_loadlint(&btr_cur_n_non_sea); + value = btr_cur_n_non_sea; break; case MONITOR_OVLD_PAGE_COMPRESS_SAVED: diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 5dd2baaf130..f9d5ede3794 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -59,7 +59,6 @@ Created 10/8/1995 Heikki Tuuri #include "pars0pars.h" #include "que0que.h" #include "row0mysql.h" -#include "row0trunc.h" #include "row0log.h" #include "srv0mon.h" #include "srv0srv.h" @@ -77,10 +76,6 @@ Created 10/8/1995 Heikki Tuuri #include <my_service_manager.h> -#ifdef WITH_WSREP -extern int wsrep_debug; -extern int wsrep_trx_is_aborting(void *thd_ptr); -#endif /* The following is the maximum allowed duration of a lock wait. */ UNIV_INTERN ulong srv_fatal_semaphore_wait_threshold = DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT; @@ -199,8 +194,6 @@ ulong srv_page_size_shift; /** innodb_log_write_ahead_size */ ulong srv_log_write_ahead_size; -page_size_t univ_page_size(0, 0, false); - /** innodb_adaptive_flushing; try to flush dirty pages so as to avoid IO bursts at the checkpoints. */ my_bool srv_adaptive_flushing; @@ -503,10 +496,6 @@ UNIV_INTERN ulong srv_buf_dump_status_frequency; mutex_enter(&srv_sys.mutex); \ } while (0) -/** Test if the system mutex is owned. */ -#define srv_sys_mutex_own() (mutex_own(&srv_sys.mutex) \ - && !srv_read_only_mode) - /** Release the system mutex. 
*/ #define srv_sys_mutex_exit() do { \ mutex_exit(&srv_sys.mutex); \ @@ -603,11 +592,12 @@ struct srv_sys_t{ sys_threads[]->event are covered by srv_sys_t::mutex */ - ulint n_threads_active[SRV_MASTER + 1]; + Atomic_counter<ulint> + n_threads_active[SRV_MASTER + 1]; /*!< number of threads active in a thread class; protected - by both my_atomic_addlint() - and mutex */ + by both std::atomic and + mutex */ srv_stats_t::ulint_ctr_1_t activity_count; /*!< For tracking server @@ -619,7 +609,7 @@ static srv_sys_t srv_sys; /** @return whether the purge coordinator thread is active */ bool purge_sys_t::running() { - return my_atomic_loadlint(&srv_sys.n_threads_active[SRV_PURGE]); + return srv_sys.n_threads_active[SRV_PURGE]; } /** Event to signal srv_monitor_thread. Not protected by a mutex. @@ -822,7 +812,7 @@ srv_reserve_slot( ut_ad(srv_slot_get_type(slot) == type); - my_atomic_addlint(&srv_sys.n_threads_active[type], 1); + srv_sys.n_threads_active[type]++; srv_sys_mutex_exit(); @@ -839,7 +829,7 @@ srv_suspend_thread_low( srv_slot_t* slot) /*!< in/out: thread slot */ { ut_ad(!srv_read_only_mode); - ut_ad(srv_sys_mutex_own()); + ut_ad(mutex_own(&srv_sys.mutex)); ut_ad(slot->in_use); @@ -869,8 +859,7 @@ srv_suspend_thread_low( ut_a(!slot->suspended); slot->suspended = TRUE; - if (lint(my_atomic_addlint(&srv_sys.n_threads_active[type], ulint(-1))) - < 0) { + if (srv_sys.n_threads_active[type]-- == 0) { ut_error; } @@ -927,7 +916,7 @@ srv_resume_thread(srv_slot_t* slot, int64_t sig_count = 0, bool wait = true, ut_ad(slot->suspended); slot->suspended = FALSE; - my_atomic_addlint(&srv_sys.n_threads_active[slot->type], 1); + srv_sys.n_threads_active[slot->type]++; srv_sys_mutex_exit(); return(timeout); } @@ -1157,7 +1146,7 @@ srv_refresh_innodb_monitor_stats(void) #ifdef BTR_CUR_HASH_ADAPT btr_cur_n_sea_old = btr_cur_n_sea; #endif /* BTR_CUR_HASH_ADAPT */ - btr_cur_n_non_sea_old = my_atomic_loadlint(&btr_cur_n_non_sea); + btr_cur_n_non_sea_old = btr_cur_n_non_sea; 
log_refresh_stats(); @@ -1318,16 +1307,16 @@ srv_printf_innodb_monitor( "%.2f hash searches/s, %.2f non-hash searches/s\n", (btr_cur_n_sea - btr_cur_n_sea_old) / time_elapsed, - (my_atomic_loadlint(&btr_cur_n_non_sea) - btr_cur_n_non_sea_old) + (btr_cur_n_non_sea - btr_cur_n_non_sea_old) / time_elapsed); btr_cur_n_sea_old = btr_cur_n_sea; #else /* BTR_CUR_HASH_ADAPT */ fprintf(file, "%.2f non-hash searches/s\n", - (my_atomic_loadlint(&btr_cur_n_non_sea) - btr_cur_n_non_sea_old) + (btr_cur_n_non_sea - btr_cur_n_non_sea_old) / time_elapsed); #endif /* BTR_CUR_HASH_ADAPT */ - btr_cur_n_non_sea_old = my_atomic_loadlint(&btr_cur_n_non_sea); + btr_cur_n_non_sea_old = btr_cur_n_non_sea; fputs("---\n" "LOG\n" @@ -1340,7 +1329,7 @@ srv_printf_innodb_monitor( fprintf(file, "Total large memory allocated " ULINTPF "\n" "Dictionary memory allocated " ULINTPF "\n", - os_total_large_mem_allocated, + ulint{os_total_large_mem_allocated}, dict_sys_get_size()); buf_print_io(file); @@ -1547,7 +1536,6 @@ srv_export_innodb_status(void) export_vars.innodb_pages_created = stat.n_pages_created; export_vars.innodb_pages_read = stat.n_pages_read; - export_vars.innodb_page0_read = srv_stats.page0_read; export_vars.innodb_pages_written = stat.n_pages_written; @@ -1928,11 +1916,11 @@ void srv_active_wake_master_thread_low() { ut_ad(!srv_read_only_mode); - ut_ad(!srv_sys_mutex_own()); + ut_ad(!mutex_own(&srv_sys.mutex)); srv_inc_activity_count(); - if (my_atomic_loadlint(&srv_sys.n_threads_active[SRV_MASTER]) == 0) { + if (srv_sys.n_threads_active[SRV_MASTER] == 0) { srv_slot_t* slot; srv_sys_mutex_enter(); @@ -1954,11 +1942,12 @@ srv_active_wake_master_thread_low() void srv_wake_purge_thread_if_not_active() { - ut_ad(!srv_sys_mutex_own()); + ut_ad(!srv_read_only_mode); + ut_ad(!mutex_own(&srv_sys.mutex)); if (purge_sys.enabled() && !purge_sys.paused() - && !my_atomic_loadlint(&srv_sys.n_threads_active[SRV_PURGE]) - && trx_sys.history_size()) { + && !srv_sys.n_threads_active[SRV_PURGE] + && 
trx_sys.rseg_history_len) { srv_release_threads(SRV_PURGE, 1); } @@ -2026,16 +2015,12 @@ srv_master_evict_from_table_cache( { ulint n_tables_evicted = 0; - rw_lock_x_lock(&dict_operation_lock); - - dict_mutex_enter_for_mysql(); + dict_sys_lock(); n_tables_evicted = dict_make_room_in_cache( innobase_get_table_cache_size(), pct_check); - dict_mutex_exit_for_mysql(); - - rw_lock_x_unlock(&dict_operation_lock); + dict_sys_unlock(); return(n_tables_evicted); } @@ -2444,7 +2429,7 @@ static bool srv_purge_should_exit() return true; /* Slow shutdown was requested. */ - if (const ulint history_size= trx_sys.history_size()) + if (const uint32_t history_size= trx_sys.rseg_history_len) { static time_t progress_time; time_t now= time(NULL); @@ -2453,7 +2438,7 @@ static bool srv_purge_should_exit() progress_time= now; #if defined HAVE_SYSTEMD && !defined EMBEDDED_LIBRARY service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, - "InnoDB: to purge %zu transactions", + "InnoDB: to purge %u transactions", history_size); ib::info() << "to purge " << history_size << " transactions"; #endif @@ -2481,7 +2466,7 @@ static bool srv_task_execute(ut_d(srv_slot_t *slot)) mutex_exit(&srv_sys.tasks_mutex); ut_d(thr->thread_slot = slot); que_run_threads(thr); - my_atomic_addlint(&purge_sys.n_completed, 1); + purge_sys.n_tasks.fetch_sub(1, std::memory_order_release); return true; } @@ -2517,7 +2502,7 @@ DECLARE_THREAD(srv_worker_thread)( slot = srv_reserve_slot(SRV_WORKER); ut_a(srv_n_purge_threads > 1); - ut_a(ulong(my_atomic_loadlint(&srv_sys.n_threads_active[SRV_WORKER])) + ut_a(ulong(srv_sys.n_threads_active[SRV_WORKER]) < srv_n_purge_threads); /* We need to ensure that the worker threads exit after the @@ -2558,17 +2543,17 @@ DECLARE_THREAD(srv_worker_thread)( /** Do the actual purge operation. @param[in,out] n_total_purged total number of purged pages @return length of history list before the last purge batch. 
*/ -static ulint srv_do_purge(ulint* n_total_purged +static uint32_t srv_do_purge(ulint* n_total_purged #ifdef UNIV_DEBUG - , srv_slot_t* slot /*!< purge coordinator */ + , srv_slot_t* slot /*!< purge coordinator */ #endif - ) + ) { ulint n_pages_purged; static ulint count = 0; static ulint n_use_threads = 0; - static ulint rseg_history_len = 0; + static uint32_t rseg_history_len = 0; ulint old_activity_count = srv_get_activity_count(); const ulint n_threads = srv_n_purge_threads; @@ -2586,7 +2571,7 @@ static ulint srv_do_purge(ulint* n_total_purged } do { - if (trx_sys.history_size() > rseg_history_len + if (trx_sys.rseg_history_len > rseg_history_len || (srv_max_purge_lag > 0 && rseg_history_len > srv_max_purge_lag)) { @@ -2615,20 +2600,14 @@ static ulint srv_do_purge(ulint* n_total_purged ut_a(n_use_threads <= n_threads); /* Take a snapshot of the history list before purge. */ - if (!(rseg_history_len = trx_sys.history_size())) { + if (!(rseg_history_len = trx_sys.rseg_history_len)) { break; } - ulint undo_trunc_freq = - purge_sys.undo_trunc.get_rseg_truncate_frequency(); - - ulint rseg_truncate_frequency = ut_min( - static_cast<ulint>(srv_purge_rseg_truncate_frequency), - undo_trunc_freq); - n_pages_purged = trx_purge( n_use_threads, - (++count % rseg_truncate_frequency) == 0 + !(++count % srv_purge_rseg_truncate_frequency) + || purge_sys.truncate.current #ifdef UNIV_DEBUG , slot #endif @@ -2652,7 +2631,7 @@ srv_purge_coordinator_suspend( /*==========================*/ srv_slot_t* slot, /*!< in/out: Purge coordinator thread slot */ - ulint rseg_history_len) /*!< in: history list length + uint32_t rseg_history_len) /*!< in: history list length before last purge */ { ut_ad(!srv_read_only_mode); @@ -2669,7 +2648,7 @@ srv_purge_coordinator_suspend( /* We don't wait right away on the the non-timed wait because we want to signal the thread that wants to suspend purge. 
*/ const bool wait = stop - || rseg_history_len <= trx_sys.history_size(); + || rseg_history_len <= trx_sys.rseg_history_len; const bool timeout = srv_resume_thread( slot, sig_count, wait, stop ? 0 : SRV_PURGE_MAX_TIMEOUT); @@ -2679,12 +2658,12 @@ srv_purge_coordinator_suspend( rw_lock_x_lock(&purge_sys.latch); stop = srv_shutdown_state <= SRV_SHUTDOWN_INITIATED - && purge_sys.paused_latched(); + && purge_sys.paused(); if (!stop) { if (timeout && rseg_history_len < 5000 - && rseg_history_len == trx_sys.history_size()) { + && rseg_history_len == trx_sys.rseg_history_len) { /* No new records were added since the wait started. Simply wait for new records. The magic number 5000 is an @@ -2737,7 +2716,7 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( slot = srv_reserve_slot(SRV_PURGE); - ulint rseg_history_len = trx_sys.history_size(); + uint32_t rseg_history_len = trx_sys.rseg_history_len; do { /* If there are no records to purge or the last @@ -2770,11 +2749,6 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( /* Note that we are shutting down. */ rw_lock_x_lock(&purge_sys.latch); purge_sys.coordinator_shutdown(); - - /* If there are any pending undo-tablespace truncate then clear - it off as we plan to shutdown the purge thread. */ - purge_sys.undo_trunc.clear(); - /* Ensure that the wait in purge_sys_t::stop() will terminate. */ os_event_set(purge_sys.event); @@ -2866,9 +2840,7 @@ srv_purge_wakeup() srv_release_threads(SRV_WORKER, n_workers); } - } while (!my_atomic_loadptr_explicit(reinterpret_cast<void**> - (&srv_running), - MY_MEMORY_ORDER_RELAXED) + } while (!srv_running.load(std::memory_order_relaxed) && (srv_sys.n_threads_active[SRV_WORKER] || srv_sys.n_threads_active[SRV_PURGE])); } @@ -2882,41 +2854,6 @@ void srv_purge_shutdown() } while (srv_sys.sys_threads[SRV_PURGE_SLOT].in_use); } -/** Check if tablespace is being truncated. 
-(Ignore system-tablespace as we don't re-create the tablespace -and so some of the action that are suppressed by this function -for independent tablespace are not applicable to system-tablespace). -@param space_id space_id to check for truncate action -@return true if being truncated, false if not being - truncated or tablespace is system-tablespace. */ -bool -srv_is_tablespace_truncated(ulint space_id) -{ - if (is_system_tablespace(space_id)) { - return(false); - } - - return(truncate_t::is_tablespace_truncated(space_id) - || undo::Truncate::is_tablespace_truncated(space_id)); - -} - -/** Check if tablespace was truncated. -@param[in] space space object to check for truncate action -@return true if tablespace was truncated and we still have an active -MLOG_TRUNCATE REDO log record. */ -bool -srv_was_tablespace_truncated(const fil_space_t* space) -{ - if (space == NULL) { - ut_ad(0); - return(false); - } - - return (!is_system_tablespace(space->id) - && truncate_t::was_tablespace_truncated(space->id)); -} - #ifdef UNIV_DEBUG static ulint get_first_slot(srv_thread_type type) { diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index b88aa5f07c4..edae63c0e47 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -73,7 +73,6 @@ Created 2/16/1996 Heikki Tuuri #include "srv0start.h" #include "srv0srv.h" #include "btr0defragment.h" -#include "row0trunc.h" #include "mysql/service_wsrep.h" /* wsrep_recovery */ #include "trx0rseg.h" #include "os0proc.h" @@ -96,7 +95,6 @@ Created 2/16/1996 Heikki Tuuri #include "row0upd.h" #include "row0row.h" #include "row0mysql.h" -#include "row0trunc.h" #include "btr0pcur.h" #include "os0event.h" #include "zlib.h" @@ -677,9 +675,19 @@ static bool srv_undo_tablespace_open(const char* name, ulint space_id, fil_set_max_space_id_if_bigger(space_id); - fil_space_t* space = fil_space_create( - undo_name, space_id, FSP_FLAGS_PAGE_SSIZE(), - FIL_TYPE_TABLESPACE, NULL); + ulint 
fsp_flags; + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE()); + break; + default: + fsp_flags = FSP_FLAGS_PAGE_SSIZE(); + } + + fil_space_t* space = fil_space_create(undo_name, space_id, fsp_flags, + FIL_TYPE_TABLESPACE, NULL); ut_a(fil_validate()); ut_a(space); @@ -782,8 +790,6 @@ srv_check_undo_redo_logs_exists() return(DB_SUCCESS); } -undo::undo_spaces_t undo::Truncate::s_fix_up_spaces; - /** Open the configured number of dedicated undo tablespaces. @param[in] create_new_db whether the database is being initialized @return DB_SUCCESS or error code */ @@ -865,47 +871,9 @@ srv_undo_tablespaces_init(bool create_new_db) prev_space_id = srv_undo_space_id_start - 1; break; case SRV_OPERATION_NORMAL: - if (create_new_db) { - break; - } - /* fall through */ case SRV_OPERATION_RESTORE_ROLLBACK_XA: case SRV_OPERATION_RESTORE: case SRV_OPERATION_RESTORE_EXPORT: - ut_ad(!create_new_db); - - /* Check if any of the UNDO tablespace needs fix-up because - server crashed while truncate was active on UNDO tablespace.*/ - for (i = 0; i < n_undo_tablespaces; ++i) { - - undo::Truncate undo_trunc; - - if (undo_trunc.needs_fix_up(undo_tablespace_ids[i])) { - - char name[OS_FILE_MAX_PATH]; - - snprintf(name, sizeof(name), - "%s%cundo%03zu", - srv_undo_dir, OS_PATH_SEPARATOR, - undo_tablespace_ids[i]); - - os_file_delete(innodb_data_file_key, name); - - err = srv_undo_tablespace_create( - name, - SRV_UNDO_TABLESPACE_SIZE_IN_PAGES); - - if (err != DB_SUCCESS) { - ib::error() << "Could not fix-up undo " - " tablespace truncate '" - << name << "'."; - return(err); - } - - undo::Truncate::s_fix_up_spaces.push_back( - undo_tablespace_ids[i]); - } - } break; } @@ -1010,64 +978,6 @@ srv_undo_tablespaces_init(bool create_new_db) } } - if (!undo::Truncate::s_fix_up_spaces.empty()) { - - /* Step-1: Initialize the tablespace header and rsegs 
header. */ - mtr_t mtr; - - mtr_start(&mtr); - /* Turn off REDO logging. We are in server start mode and fixing - UNDO tablespace even before REDO log is read. Let's say we - do REDO logging here then this REDO log record will be applied - as part of the current recovery process. We surely don't need - that as this is fix-up action parallel to REDO logging. */ - mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); - buf_block_t* sys_header = trx_sysf_get(&mtr); - if (!sys_header) { - mtr.commit(); - return DB_CORRUPTION; - } - - for (undo::undo_spaces_t::const_iterator it - = undo::Truncate::s_fix_up_spaces.begin(); - it != undo::Truncate::s_fix_up_spaces.end(); - ++it) { - - undo::Truncate::add_space_to_trunc_list(*it); - - fil_space_t* space = fil_space_get(*it); - - fsp_header_init(space, - SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, - &mtr); - - for (ulint i = 0; i < TRX_SYS_N_RSEGS; i++) { - if (trx_sysf_rseg_get_space(sys_header, i) - == *it) { - trx_rseg_header_create( - space, i, sys_header, &mtr); - } - } - - undo::Truncate::clear_trunc_list(); - } - mtr_commit(&mtr); - - /* Step-2: Flush the dirty pages from the buffer pool. */ - for (undo::undo_spaces_t::const_iterator it - = undo::Truncate::s_fix_up_spaces.begin(); - it != undo::Truncate::s_fix_up_spaces.end(); - ++it) { - FlushObserver dummy(fil_system.sys_space, NULL, NULL); - buf_LRU_flush_or_remove_pages(TRX_SYS_SPACE, &dummy); - FlushObserver dummy2(fil_space_get(*it), NULL, NULL); - buf_LRU_flush_or_remove_pages(*it, &dummy2); - - /* Remove the truncate redo log file. */ - undo::done(*it); - } - } - return(DB_SUCCESS); } @@ -1198,11 +1108,11 @@ srv_shutdown_all_bg_threads() ut_ad(!srv_read_only_mode); /* e. 
Exit the i/o threads */ - if (recv_sys->flush_start != NULL) { - os_event_set(recv_sys->flush_start); + if (recv_sys.flush_start != NULL) { + os_event_set(recv_sys.flush_start); } - if (recv_sys->flush_end != NULL) { - os_event_set(recv_sys->flush_end); + if (recv_sys.flush_end != NULL) { + os_event_set(recv_sys.flush_end); } os_event_set(buf_flush_event); @@ -1296,9 +1206,7 @@ srv_prepare_to_delete_redo_log_files( ulint pending_io = 0; ulint count = 0; - if ((log_sys.log.format & ~LOG_HEADER_FORMAT_ENCRYPTED) - != LOG_HEADER_FORMAT_CURRENT - || log_sys.log.subformat != 2) { + if (log_sys.log.subformat != 2) { srv_log_file_size = 0; } @@ -1317,12 +1225,10 @@ srv_prepare_to_delete_redo_log_files( { ib::info info; - if (srv_log_file_size == 0) { - info << ((log_sys.log.format - & ~LOG_HEADER_FORMAT_ENCRYPTED) - != LOG_HEADER_FORMAT_10_4 - ? "Upgrading redo log: " - : "Downgrading redo log: "); + if (srv_log_file_size == 0 + || (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED) + != log_t::FORMAT_10_4) { + info << "Upgrading redo log: "; } else if (n_files != srv_n_log_files || srv_log_file_size != srv_log_file_size_requested) { @@ -1627,7 +1533,7 @@ dberr_t srv_start(bool create_new_db) #endif /* UNIV_DEBUG */ log_sys.create(); - recv_sys_init(); + recv_sys.create(); lock_sys.create(srv_lock_table_size); /* Create i/o-handler threads: */ @@ -1655,7 +1561,7 @@ dberr_t srv_start(bool create_new_db) #ifdef UNIV_LINUX /* Wait for the setpriority() call to finish. */ - os_event_wait(recv_sys->flush_end); + os_event_wait(recv_sys.flush_end); #endif /* UNIV_LINUX */ srv_start_state_set(SRV_START_STATE_IO); } @@ -1669,7 +1575,7 @@ dberr_t srv_start(bool create_new_db) if (err != DB_SUCCESS) { return(srv_init_abort(DB_ERROR)); } - recv_sys_debug_free(); + recv_sys.debug_free(); } /* Open or create the data files. 
*/ @@ -1906,7 +1812,7 @@ files_checked: ulint ibuf_root = btr_create( DICT_CLUSTERED | DICT_IBUF, fil_system.sys_space, - DICT_IBUF_ID_MIN, dict_ind_redundant, NULL, &mtr); + DICT_IBUF_ID_MIN, dict_ind_redundant, &mtr); mtr_commit(&mtr); @@ -1945,28 +1851,20 @@ files_checked: return(srv_init_abort(err)); } } else { - /* Invalidate the buffer pool to ensure that we reread - the page that we read above, during recovery. - Note that this is not as heavy weight as it seems. At - this point there will be only ONE page in the buf_LRU - and there must be no page in the buf_flush list. */ - buf_pool_invalidate(); - - /* Scan and locate truncate log files. Parsed located files - and add table to truncate information to central vector for - truncate fix-up action post recovery. */ - err = TruncateLogParser::scan_and_parse(srv_log_group_home_dir); - if (err != DB_SUCCESS) { + /* Work around the bug that we were performing a dirty read of + at least the TRX_SYS page into the buffer pool above, without + reading or applying any redo logs. - return(srv_init_abort(DB_ERROR)); - } + MDEV-19229 FIXME: Remove the dirty reads and this call. + Add an assertion that the buffer pool is empty. 
*/ + buf_pool_invalidate(); /* We always try to do a recovery, even if the database had been shut down normally: this is the normal startup path */ err = recv_recovery_from_checkpoint_start(flushed_lsn); - recv_sys->dblwr.pages.clear(); + recv_sys.dblwr.pages.clear(); if (err != DB_SUCCESS) { return(srv_init_abort(err)); @@ -1999,8 +1897,8 @@ files_checked: recv_apply_hashed_log_recs(true); - if (recv_sys->found_corrupt_log - || recv_sys->found_corrupt_fs) { + if (recv_sys.found_corrupt_log + || recv_sys.found_corrupt_fs) { return(srv_init_abort(DB_CORRUPTION)); } @@ -2023,7 +1921,7 @@ files_checked: /* New data file(s) were added */ mtr.start(); buf_block_t* block = buf_page_get( - page_id_t(0, 0), univ_page_size, + page_id_t(0, 0), 0, RW_SX_LATCH, &mtr); ulint size = mach_read_from_4( FSP_HEADER_OFFSET + FSP_SIZE @@ -2047,8 +1945,7 @@ files_checked: #ifdef UNIV_DEBUG { mtr.start(); - buf_block_t* block = buf_page_get(page_id_t(0, 0), - univ_page_size, + buf_block_t* block = buf_page_get(page_id_t(0, 0), 0, RW_S_LATCH, &mtr); ut_ad(mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET + block->frame) @@ -2140,9 +2037,8 @@ files_checked: && srv_n_log_files_found == srv_n_log_files && log_sys.log.format == (srv_encrypt_log - ? LOG_HEADER_FORMAT_CURRENT - | LOG_HEADER_FORMAT_ENCRYPTED - : LOG_HEADER_FORMAT_CURRENT) + ? log_t::FORMAT_ENC_10_4 + : log_t::FORMAT_10_4) && log_sys.log.subformat == 2) { /* No need to add or remove encryption, upgrade, downgrade, or resize. */ @@ -2201,8 +2097,9 @@ files_checked: } /* Validate a few system page types that were left - uninitialized by older versions of MySQL. */ - if (!high_level_read_only) { + uninitialized before MySQL or MariaDB 5.5. 
*/ + if (!high_level_read_only + && !fil_system.sys_space->full_crc32()) { buf_block_t* block; mtr.start(); /* Bitmap page types will be reset in @@ -2210,24 +2107,24 @@ files_checked: block = buf_page_get( page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO), - univ_page_size, RW_X_LATCH, &mtr); + 0, RW_X_LATCH, &mtr); fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr); /* Already MySQL 3.23.53 initialized FSP_IBUF_TREE_ROOT_PAGE_NO to FIL_PAGE_INDEX. No need to reset that one. */ block = buf_page_get( page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), - univ_page_size, RW_X_LATCH, &mtr); + 0, RW_X_LATCH, &mtr); fil_block_check_type(*block, FIL_PAGE_TYPE_TRX_SYS, &mtr); block = buf_page_get( page_id_t(TRX_SYS_SPACE, FSP_FIRST_RSEG_PAGE_NO), - univ_page_size, RW_X_LATCH, &mtr); + 0, RW_X_LATCH, &mtr); fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr); block = buf_page_get( page_id_t(TRX_SYS_SPACE, FSP_DICT_HDR_PAGE_NO), - univ_page_size, RW_X_LATCH, &mtr); + 0, RW_X_LATCH, &mtr); fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr); mtr.commit(); } @@ -2237,17 +2134,18 @@ files_checked: The data dictionary latch should guarantee that there is at most one data dictionary transaction active at a time. */ if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { + /* If the following call is ever removed, the + first-time ha_innobase::open() must hold (or + acquire and release) a table lock that + conflicts with trx_resurrect_table_locks(), to + ensure that any recovered incomplete ALTER TABLE + will have been rolled back. Otherwise, + dict_table_t::instant could be cleared by rollback + invoking dict_index_t::clear_instant_alter() while + open table handles exist in client connections. */ trx_rollback_recovered(false); } - /* Fix-up truncate of tables in the system tablespace - if server crashed while truncate was active. The non- - system tables are done after tablespace discovery. Do - this now because this procedure assumes that no pages - have changed since redo recovery. 
Tablespace discovery - can do updates to pages in the system tablespace.*/ - err = truncate_t::fixup_tables_in_system_tablespace(); - if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) { /* Open or Create SYS_TABLESPACES and SYS_DATAFILES so that tablespace names and other metadata can be @@ -2274,10 +2172,6 @@ files_checked: dict_check_tablespaces_and_store_max_id(); } - /* Fix-up truncate of table if server crashed while truncate - was active. */ - err = truncate_t::fixup_tables_in_non_system_tablespace(); - if (err != DB_SUCCESS) { return(srv_init_abort(err)); } @@ -2474,7 +2368,7 @@ skip_monitors: Create the dump/load thread only when not running with --wsrep-recover. */ - if (!wsrep_recovery) { + if (!get_wsrep_recovery()) { #endif /* WITH_WSREP */ /* Create the buffer pool dump/load thread */ @@ -2533,9 +2427,7 @@ void srv_shutdown_bg_undo_sources() /** Shut down InnoDB. */ void innodb_shutdown() { - ut_ad(!my_atomic_loadptr_explicit(reinterpret_cast<void**> - (&srv_running), - MY_MEMORY_ORDER_RELAXED)); + ut_ad(!srv_running.load(std::memory_order_relaxed)); ut_ad(!srv_undo_sources); switch (srv_operation) { @@ -2579,7 +2471,7 @@ void innodb_shutdown() } ut_ad(dict_stats_event || !srv_was_started || srv_read_only_mode); - ut_ad(dict_sys || !srv_was_started); + ut_ad(dict_sys.is_initialised() || !srv_was_started); ut_ad(trx_sys.is_initialised() || !srv_was_started); ut_ad(buf_dblwr || !srv_was_started || srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); @@ -2608,7 +2500,7 @@ void innodb_shutdown() and closing the data dictionary. */ #ifdef BTR_CUR_HASH_ADAPT - if (dict_sys) { + if (dict_sys.is_initialised()) { btr_search_disable(); } #endif /* BTR_CUR_HASH_ADAPT */ @@ -2629,7 +2521,7 @@ void innodb_shutdown() mutex_free(&srv_misc_tmpfile_mutex); } - dict_close(); + dict_sys.close(); btr_search_sys_free(); /* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside @@ -2642,7 +2534,7 @@ void innodb_shutdown() /* 4. 
Free all allocated memory */ pars_lexer_close(); - recv_sys_close(); + recv_sys.close(); ut_ad(buf_pool_ptr || !srv_was_started); if (buf_pool_ptr) { diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc index 65f9353ae77..7d66581998d 100644 --- a/storage/innobase/sync/sync0arr.cc +++ b/storage/innobase/sync/sync0arr.cc @@ -76,8 +76,8 @@ keep the global wait array for the sake of diagnostics and also to avoid infinite wait The error_monitor thread scans the global wait array to signal any waiting threads who have missed the signal. */ -typedef SyncArrayMutex::MutexType WaitMutex; -typedef BlockSyncArrayMutex::MutexType BlockWaitMutex; +typedef TTASEventMutex<GenericPolicy> WaitMutex; +typedef TTASEventMutex<BlockMutexPolicy> BlockWaitMutex; /** The latch types that use the sync array. */ union sync_object_t { @@ -501,7 +501,7 @@ sync_array_cell_print( WaitMutex* mutex = cell->latch.mutex; const WaitMutex::MutexPolicy& policy = mutex->policy(); #ifdef UNIV_DEBUG - const char* name = policy.get_enter_filename(); + const char* name = policy.context.get_enter_filename(); if (name == NULL) { /* The mutex might have been released. */ name = "NULL"; @@ -520,7 +520,7 @@ sync_array_cell_print( mutex->state() #ifdef UNIV_DEBUG ,name, - policy.get_enter_line() + policy.context.get_enter_line() #endif /* UNIV_DEBUG */ ); } @@ -530,7 +530,7 @@ sync_array_cell_print( const BlockWaitMutex::MutexPolicy& policy = mutex->policy(); #ifdef UNIV_DEBUG - const char* name = policy.get_enter_filename(); + const char* name = policy.context.get_enter_filename(); if (name == NULL) { /* The mutex might have been released. 
*/ name = "NULL"; @@ -548,7 +548,7 @@ sync_array_cell_print( (ulong) mutex->state() #ifdef UNIV_DEBUG ,name, - (ulong) policy.get_enter_line() + (ulong) policy.context.get_enter_line() #endif /* UNIV_DEBUG */ ); } else if (type == RW_LOCK_X @@ -593,8 +593,8 @@ sync_array_cell_print( #endif "\n", rw_lock_get_reader_count(rwlock), - my_atomic_load32_explicit(&rwlock->waiters, MY_MEMORY_ORDER_RELAXED), - my_atomic_load32_explicit(&rwlock->lock_word, MY_MEMORY_ORDER_RELAXED), + uint32_t{rwlock->waiters}, + int32_t{rwlock->lock_word}, innobase_basename(rwlock->last_x_file_name), rwlock->last_x_line #if 0 /* JAN: TODO: FIX LATER */ @@ -740,7 +740,7 @@ sync_array_detect_deadlock( const WaitMutex::MutexPolicy& policy = mutex->policy(); if (mutex->state() != MUTEX_STATE_UNLOCKED) { - thread = policy.get_thread_id(); + thread = policy.context.get_thread_id(); /* Note that mutex->thread_id above may be also OS_THREAD_ID_UNDEFINED, because the @@ -755,7 +755,7 @@ sync_array_detect_deadlock( if (ret) { const char* name; - name = policy.get_enter_filename(); + name = policy.context.get_enter_filename(); if (name == NULL) { /* The mutex might have been @@ -767,7 +767,7 @@ sync_array_detect_deadlock( << "Mutex " << mutex << " owned by" " thread " << os_thread_pf(thread) << " file " << name << " line " - << policy.get_enter_line(); + << policy.context.get_enter_line(); sync_array_cell_print(stderr, cell); @@ -787,7 +787,7 @@ sync_array_detect_deadlock( mutex->policy(); if (mutex->state() != MUTEX_STATE_UNLOCKED) { - thread = policy.get_thread_id(); + thread = policy.context.get_thread_id(); /* Note that mutex->thread_id above may be also OS_THREAD_ID_UNDEFINED, because the @@ -802,7 +802,7 @@ sync_array_detect_deadlock( if (ret) { const char* name; - name = policy.get_enter_filename(); + name = policy.context.get_enter_filename(); if (name == NULL) { /* The mutex might have been @@ -814,7 +814,7 @@ sync_array_detect_deadlock( << "Mutex " << mutex << " owned by" " thread " << 
os_thread_pf(thread) << " file " << name << " line " - << policy.get_enter_line(); + << policy.context.get_enter_line(); return(true); @@ -972,7 +972,7 @@ sync_array_print_long_waits_low( ulint i; /* For huge tables, skip the check during CHECK TABLE etc... */ - if (fatal_timeout > SRV_SEMAPHORE_WAIT_EXTENSION) { + if (btr_validate_index_running) { return(false); } @@ -1381,9 +1381,9 @@ sync_arr_fill_sys_semphore_waits_table( //fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->set_notnull(); OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_READERS], rw_lock_get_reader_count(rwlock))); OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG], - my_atomic_load32_explicit(&rwlock->waiters, MY_MEMORY_ORDER_RELAXED))); + rwlock->waiters)); OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD], - my_atomic_load32_explicit(&rwlock->lock_word, MY_MEMORY_ORDER_RELAXED))); + rwlock->lock_word)); OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE], innobase_basename(rwlock->last_x_file_name))); OK(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->store(rwlock->last_x_line, true)); fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->set_notnull(); diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc index d71bc9cda90..fea94cc05f9 100644 --- a/storage/innobase/sync/sync0rw.cc +++ b/storage/innobase/sync/sync0rw.cc @@ -202,9 +202,6 @@ rw_lock_create_func( new(lock) rw_lock_t(); #endif /* UNIV_DEBUG */ - /* If this is the very first time a synchronization object is - created, then the following call initializes the sync system. 
*/ - lock->lock_word = X_LOCK_DECR; lock->waiters = 0; @@ -238,9 +235,7 @@ rw_lock_create_func( lock->is_block_lock = 0; mutex_enter(&rw_lock_list_mutex); - UT_LIST_ADD_FIRST(rw_lock_list, lock); - mutex_exit(&rw_lock_list_mutex); } @@ -254,8 +249,7 @@ rw_lock_free_func( rw_lock_t* lock) /*!< in/out: rw-lock */ { ut_ad(rw_lock_validate(lock)); - ut_a(my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED) == X_LOCK_DECR); + ut_a(lock->lock_word == X_LOCK_DECR); mutex_enter(&rw_lock_list_mutex); @@ -300,8 +294,7 @@ lock_loop: HMT_low(); ulint j = i; while (i < srv_n_spin_wait_rounds && - my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED) <= 0) { + lock->lock_word <= 0) { ut_delay(srv_spin_wait_delay); i++; } @@ -341,7 +334,7 @@ lock_loop: /* Set waiters before checking lock_word to ensure wake-up signal is sent. This may lead to some unnecessary signals. */ - my_atomic_fas32_explicit(&lock->waiters, 1, MY_MEMORY_ORDER_ACQUIRE); + lock->waiters.exchange(1, std::memory_order_acquire); if (rw_lock_s_lock_low(lock, pass, file_name, line)) { @@ -419,10 +412,10 @@ rw_lock_x_lock_wait_func( sync_array_t* sync_arr; int64_t count_os_wait = 0; - ut_ad(my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) <= threshold); + ut_ad(lock->lock_word <= threshold); HMT_low(); - while (my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) < threshold) { + while (lock->lock_word < threshold) { ut_delay(srv_spin_wait_delay); if (i < srv_n_spin_wait_rounds) { @@ -441,8 +434,7 @@ rw_lock_x_lock_wait_func( i = 0; /* Check lock_word to ensure wake-up isn't missed.*/ - if (my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) < threshold) { - + if (lock->lock_word < threshold) { ++count_os_wait; /* Add debug info as it is needed to detect possible @@ -531,18 +523,15 @@ rw_lock_x_lock_low( file_name, line); } else { - int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); + 
int32_t lock_word = lock->lock_word; /* At least one X lock by this thread already exists. Add another. */ if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) { - my_atomic_add32_explicit(&lock->lock_word, -X_LOCK_DECR, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_sub(X_LOCK_DECR); } else { ut_ad(lock_word <= -X_LOCK_DECR); - my_atomic_add32_explicit(&lock->lock_word, -1, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_sub(1); } } @@ -614,10 +603,10 @@ rw_lock_sx_lock_low( read and write to the lock_word. */ #ifdef UNIV_DEBUG - int32_t lock_word = + auto lock_word = #endif - my_atomic_add32_explicit(&lock->lock_word, -X_LOCK_HALF_DECR, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_sub(X_LOCK_HALF_DECR, + std::memory_order_relaxed); ut_ad((lock_word == 0) || ((lock_word <= -X_LOCK_DECR) @@ -692,7 +681,7 @@ lock_loop: HMT_low(); ulint j = i; while (i < srv_n_spin_wait_rounds - && my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) <= X_LOCK_HALF_DECR) { + && lock->lock_word <= X_LOCK_HALF_DECR) { ut_delay(srv_spin_wait_delay); i++; } @@ -717,7 +706,7 @@ lock_loop: /* Waiters must be set before checking lock_word, to ensure signal is sent. This could lead to a few unnecessary wake-up signals. */ - my_atomic_fas32_explicit(&lock->waiters, 1, MY_MEMORY_ORDER_ACQUIRE); + lock->waiters.exchange(1, std::memory_order_acquire); if (rw_lock_x_lock_low(lock, pass, file_name, line)) { sync_array_free_cell(sync_arr, cell); @@ -797,7 +786,7 @@ lock_loop: /* Spin waiting for the lock_word to become free */ ulint j = i; while (i < srv_n_spin_wait_rounds - && my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED) <= X_LOCK_HALF_DECR) { + && lock->lock_word <= X_LOCK_HALF_DECR) { ut_delay(srv_spin_wait_delay); i++; } @@ -821,7 +810,7 @@ lock_loop: /* Waiters must be set before checking lock_word, to ensure signal is sent. This could lead to a few unnecessary wake-up signals. 
*/ - my_atomic_fas32_explicit(&lock->waiters, 1, MY_MEMORY_ORDER_ACQUIRE); + lock->waiters.exchange(1, std::memory_order_acquire); if (rw_lock_sx_lock_low(lock, pass, file_name, line)) { @@ -859,15 +848,11 @@ rw_lock_validate( /*=============*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - int32_t lock_word; - ut_ad(lock); - lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = lock->lock_word; - ut_ad(my_atomic_load32_explicit(const_cast<int32_t*>(&lock->waiters), - MY_MEMORY_ORDER_RELAXED) < 2); + ut_ad(lock->waiters < 2); ut_ad(lock_word > -(2 * X_LOCK_DECR)); ut_ad(lock_word <= X_LOCK_DECR); @@ -930,8 +915,7 @@ rw_lock_add_debug_info( rw_lock_debug_mutex_exit(); if (pass == 0 && lock_type != RW_LOCK_X_WAIT) { - int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = lock->lock_word; /* Recursive x while holding SX (lock_type == RW_LOCK_X && lock_word == -X_LOCK_HALF_DECR) @@ -1117,11 +1101,11 @@ rw_lock_list_print_info( count++; - if (my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), MY_MEMORY_ORDER_RELAXED) != X_LOCK_DECR) { + if (lock->lock_word != X_LOCK_DECR) { fprintf(file, "RW-LOCK: %p ", (void*) lock); - if (int32_t waiters= my_atomic_load32_explicit(const_cast<int32_t*>(&lock->waiters), MY_MEMORY_ORDER_RELAXED)) { + if (int32_t waiters= lock->waiters) { fprintf(file, " (%d waiters)\n", waiters); } else { putc('\n', file); @@ -1185,10 +1169,10 @@ rw_lock_debug_print( fprintf(f, "\n"); } -/** Print where it was locked from +/** Print the rw-lock information. @return the string representation */ std::string -rw_lock_t::locked_from() const +rw_lock_t::to_string() const { /* Note: For X locks it can be locked form multiple places because the same thread can call X lock recursively. 
*/ @@ -1198,6 +1182,11 @@ rw_lock_t::locked_from() const ut_ad(rw_lock_validate(this)); + msg << "RW-LATCH: " + << "thread id " << os_thread_pf(os_thread_get_curr_id()) + << " addr: " << this + << " Locked from: "; + rw_lock_debug_mutex_enter(); for (rw_lock_debug_t* info = UT_LIST_GET_FIRST(debug_list); @@ -1220,19 +1209,4 @@ rw_lock_t::locked_from() const return(msg.str()); } - -/** Print the rw-lock information. -@return the string representation */ -std::string -rw_lock_t::to_string() const -{ - std::ostringstream msg; - - msg << "RW-LATCH: " - << "thread id " << os_thread_pf(os_thread_get_curr_id()) - << " addr: " << this - << " Locked from: " << locked_from().c_str(); - - return(msg.str()); -} #endif /* UNIV_DEBUG */ diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index defa74885b6..b95a2ac565c 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -166,8 +166,6 @@ void purge_sys_t::create() ut_ad(event); m_paused= 0; query= purge_graph_build(); - n_submitted= 0; - n_completed= 0; next_stored= false; rseg= NULL; page_no= 0; @@ -176,7 +174,8 @@ void purge_sys_t::create() hdr_offset= 0; rw_lock_create(trx_purge_latch_key, &latch, SYNC_PURGE_LATCH); mutex_create(LATCH_ID_PURGE_SYS_PQ, &pq_mutex); - undo_trunc.create(); + truncate.current= NULL; + truncate.last= NULL; } /** Close the purge subsystem on shutdown. 
*/ @@ -185,7 +184,8 @@ void purge_sys_t::close() ut_ad(this == &purge_sys); if (!event) return; - m_enabled= false; + ut_ad(!enabled()); + ut_ad(n_tasks.load(std::memory_order_relaxed) == 0); trx_t* trx = query->trx; que_graph_free(query); ut_ad(!trx->id); @@ -308,7 +308,7 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) rseg->needs_purge = true; } - trx_sys.history_insert(); + trx_sys.rseg_history_len++; if (undo->state == TRX_UNDO_CACHED) { UT_LIST_ADD_FIRST(rseg->undo_cached, undo); @@ -334,7 +334,7 @@ trx_purge_remove_log_hdr( { flst_remove(rseg_hdr + TRX_RSEG_HISTORY, log_hdr + TRX_UNDO_HISTORY_NODE, mtr); - trx_sys.history_remove(); + trx_sys.rseg_history_len--; } /** Free an undo log segment, and remove the header from the history list. @@ -504,308 +504,22 @@ func_exit: goto loop; } -/** UNDO log truncate logger. Needed to track state of truncate during crash. -An auxiliary redo log file undo_<space_id>_trunc.log will created while the -truncate of the UNDO is in progress. This file is required during recovery -to complete the truncate. */ - -namespace undo { - /** Magic Number to indicate truncate action is complete. */ - static const ib_uint32_t s_magic = 76845412; - - /** Populate log file name based on space_id - @param[in] space_id id of the undo tablespace. 
- @return DB_SUCCESS or error code */ - static dberr_t populate_log_file_name( - ulint space_id, - char*& log_file_name) - { - static const char s_log_prefix[] = "undo_"; - static const char s_log_ext[] = "trunc.log"; - - ulint log_file_name_sz = strlen(srv_log_group_home_dir) - + (22 - 1 /* NUL */ - + sizeof s_log_prefix + sizeof s_log_ext); - - log_file_name = new (std::nothrow) char[log_file_name_sz]; - if (log_file_name == 0) { - return(DB_OUT_OF_MEMORY); - } - - memset(log_file_name, 0, log_file_name_sz); - - strcpy(log_file_name, srv_log_group_home_dir); - ulint log_file_name_len = strlen(log_file_name); - - if (log_file_name[log_file_name_len - 1] - != OS_PATH_SEPARATOR) { - - log_file_name[log_file_name_len] - = OS_PATH_SEPARATOR; - log_file_name_len = strlen(log_file_name); - } - - snprintf(log_file_name + log_file_name_len, - log_file_name_sz - log_file_name_len, - "%s" ULINTPF "_%s", s_log_prefix, - space_id, s_log_ext); - - return(DB_SUCCESS); - } - - /** Mark completion of undo truncate action by writing magic number to - the log file and then removing it from the disk. - If we are going to remove it from disk then why write magic number ? - This is to safeguard from unlink (file-system) anomalies that will keep - the link to the file even after unlink action is successfull and - ref-count = 0. - @param[in] space_id id of the undo tablespace to truncate.*/ - void done( - ulint space_id) - { - dberr_t err; - char* log_file_name; - - /* Step-1: Create the log file name using the pre-decided - prefix/suffix and table id of undo tablepsace to truncate. */ - err = populate_log_file_name(space_id, log_file_name); - if (err != DB_SUCCESS) { - return; - } - - /* Step-2: Open log file and write magic number to - indicate done phase. 
*/ - bool ret; - os_file_t handle = - os_file_create_simple_no_error_handling( - innodb_log_file_key, log_file_name, - OS_FILE_OPEN, OS_FILE_READ_WRITE, - srv_read_only_mode, &ret); - - if (!ret) { - os_file_delete(innodb_log_file_key, log_file_name); - delete[] log_file_name; - return; - } - - ulint sz = srv_page_size; - void* buf = ut_zalloc_nokey(sz + srv_page_size); - if (buf == NULL) { - os_file_close(handle); - os_file_delete(innodb_log_file_key, log_file_name); - delete[] log_file_name; - return; - } - - byte* log_buf = static_cast<byte*>( - ut_align(buf, srv_page_size)); - - mach_write_to_4(log_buf, undo::s_magic); - - IORequest request(IORequest::WRITE); - - err = os_file_write( - request, log_file_name, handle, log_buf, 0, sz); - - ut_ad(err == DB_SUCCESS); - - os_file_flush(handle); - os_file_close(handle); - - ut_free(buf); - os_file_delete(innodb_log_file_key, log_file_name); - delete[] log_file_name; - } - - /** Check if TRUNCATE_DDL_LOG file exist. - @param[in] space_id id of the undo tablespace. - @return true if exist else false. */ - bool is_log_present( - ulint space_id) - { - dberr_t err; - char* log_file_name; - - /* Step-1: Populate log file name. */ - err = populate_log_file_name(space_id, log_file_name); - if (err != DB_SUCCESS) { - return(false); - } - - /* Step-2: Check for existence of the file. */ - bool exist; - os_file_type_t type; - os_file_status(log_file_name, &exist, &type); - - /* Step-3: If file exists, check it for presence of magic - number. If found, then delete the file and report file - doesn't exist as presence of magic number suggest that - truncate action was complete. 
*/ - - if (exist) { - bool ret; - os_file_t handle = - os_file_create_simple_no_error_handling( - innodb_log_file_key, log_file_name, - OS_FILE_OPEN, OS_FILE_READ_WRITE, - srv_read_only_mode, &ret); - if (!ret) { - os_file_delete(innodb_log_file_key, - log_file_name); - delete[] log_file_name; - return(false); - } - - ulint sz = srv_page_size; - void* buf = ut_zalloc_nokey(sz + srv_page_size); - if (buf == NULL) { - os_file_close(handle); - os_file_delete(innodb_log_file_key, - log_file_name); - delete[] log_file_name; - return(false); - } - - byte* log_buf = static_cast<byte*>( - ut_align(buf, srv_page_size)); - - IORequest request(IORequest::READ); - - dberr_t err; - - err = os_file_read(request, handle, log_buf, 0, sz); - - os_file_close(handle); - - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - ib::info() - << "Unable to read '" - << log_file_name << "' : " - << err; - - os_file_delete( - innodb_log_file_key, log_file_name); - - ut_free(buf); - - delete[] log_file_name; - - return(false); - } - - ulint magic_no = mach_read_from_4(log_buf); - - ut_free(buf); - - if (magic_no == undo::s_magic) { - /* Found magic number. */ - os_file_delete(innodb_log_file_key, - log_file_name); - delete[] log_file_name; - return(false); - } - } - - delete[] log_file_name; - - return(exist); - } -}; - -/** Iterate over all the UNDO tablespaces and check if any of the UNDO -tablespace qualifies for TRUNCATE (size > threshold). -@param[in,out] undo_trunc undo truncate tracker */ -static -void -trx_purge_mark_undo_for_truncate( - undo::Truncate* undo_trunc) -{ - /* Step-1: If UNDO Tablespace - - already marked for truncate (OR) - - truncate disabled - return immediately else search for qualifying tablespace. */ - if (undo_trunc->is_marked() || !srv_undo_log_truncate) { - return; - } - - /* Step-2: Validation/Qualification checks - a. At-least 2 UNDO tablespaces so even if one UNDO tablespace - is being truncated server can continue to operate. - b. 
At-least 2 persistent UNDO logs (besides the default rseg-0) - b. At-least 1 UNDO tablespace size > threshold. */ - if (srv_undo_tablespaces_active < 2 || srv_undo_logs < 3) { - return; - } - - /* Avoid bias selection and so start the scan from immediate next - of last selected UNDO tablespace for truncate. */ - ulint space_id = undo_trunc->get_scan_start(); - - for (ulint i = 1; i <= srv_undo_tablespaces_active; i++) { - - if (fil_space_get_size(space_id) - > (srv_max_undo_log_size >> srv_page_size_shift)) { - /* Tablespace qualifies for truncate. */ - undo_trunc->mark(space_id); - undo::Truncate::add_space_to_trunc_list(space_id); - break; - } - - space_id = ((space_id + 1) % (srv_undo_tablespaces_active + 1)); - if (space_id == 0) { - /* Note: UNDO tablespace ids starts from 1. */ - ++space_id; - } - } - - /* Couldn't make any selection. */ - if (!undo_trunc->is_marked()) { - return; - } - - DBUG_LOG("undo", - "marking for truncate UNDO tablespace " - << undo_trunc->get_marked_space_id()); - - /* Step-3: Iterate over all the rsegs of selected UNDO tablespace - and mark them temporarily unavailable for allocation.*/ - for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { - if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) { - ut_ad(rseg->is_persistent()); - if (rseg->space->id - == undo_trunc->get_marked_space_id()) { - - /* Once set this rseg will not be allocated - to new booting transaction but we will wait - for existing active transaction to finish. */ - rseg->skip_allocation = true; - undo_trunc->add_rseg_to_trunc(rseg); - } - } - } -} - -undo::undo_spaces_t undo::Truncate::s_spaces_to_truncate; - /** Cleanse purge queue to remove the rseg that reside in undo-tablespace marked for truncate. 
-@param[in,out] undo_trunc undo truncate tracker */ -static -void -trx_purge_cleanse_purge_queue( - undo::Truncate* undo_trunc) +@param[in] space undo tablespace being truncated */ +static void trx_purge_cleanse_purge_queue(const fil_space_t& space) { - mutex_enter(&purge_sys.pq_mutex); typedef std::vector<TrxUndoRsegs> purge_elem_list_t; purge_elem_list_t purge_elem_list; + mutex_enter(&purge_sys.pq_mutex); + /* Remove rseg instances that are in the purge queue before we start truncate of corresponding UNDO truncate. */ while (!purge_sys.purge_queue.empty()) { purge_elem_list.push_back(purge_sys.purge_queue.top()); purge_sys.purge_queue.pop(); } - ut_ad(purge_sys.purge_queue.empty()); for (purge_elem_list_t::iterator it = purge_elem_list.begin(); it != purge_elem_list.end(); @@ -814,9 +528,7 @@ trx_purge_cleanse_purge_queue( for (TrxUndoRsegs::iterator it2 = it->begin(); it2 != it->end(); ++it2) { - - if ((*it2)->space->id - == undo_trunc->get_marked_space_id()) { + if ((*it2)->space == &space) { it->erase(it2); break; } @@ -826,278 +538,285 @@ trx_purge_cleanse_purge_queue( purge_sys.purge_queue.push(*it); } } + mutex_exit(&purge_sys.pq_mutex); } -/** Iterate over selected UNDO tablespace and check if all the rsegs -that resides in the tablespace are free. -@param[in] limit truncate_limit -@param[in,out] undo_trunc undo truncate tracker */ -static -void -trx_purge_initiate_truncate( - const purge_sys_t::iterator& limit, - undo::Truncate* undo_trunc) +/** +Removes unnecessary history data from rollback segments. NOTE that when this +function is called, the caller must not have any latches on undo log pages! +*/ +static void trx_purge_truncate_history() { - /* Step-1: Early check to findout if any of the the UNDO tablespace - is marked for truncate. */ - if (!undo_trunc->is_marked()) { - /* No tablespace marked for truncate yet. */ - return; - } - - /* Step-2: Scan over each rseg and ensure that it doesn't hold any - active undo records. 
*/ - bool all_free = true; - - for (ulint i = 0; i < undo_trunc->rsegs_size() && all_free; ++i) { - - trx_rseg_t* rseg = undo_trunc->get_ith_rseg(i); + ut_ad(purge_sys.head <= purge_sys.tail); + purge_sys_t::iterator& head = purge_sys.head.commit + ? purge_sys.head : purge_sys.tail; - mutex_enter(&rseg->mutex); + if (head.trx_no() >= purge_sys.view.low_limit_no()) { + /* This is sometimes necessary. TODO: find out why. */ + head.reset_trx_no(purge_sys.view.low_limit_no()); + head.undo_no = 0; + } - if (rseg->trx_ref_count > 0) { - /* This rseg is still being held by an active - transaction. */ - all_free = false; - mutex_exit(&rseg->mutex); - continue; + for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { + if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) { + ut_ad(rseg->id == i); + trx_purge_truncate_rseg_history(*rseg, head); } + } - ut_ad(rseg->trx_ref_count == 0); - ut_ad(rseg->skip_allocation); - - ulint size_of_rsegs = rseg->curr_size; - - if (size_of_rsegs == 1) { - mutex_exit(&rseg->mutex); - continue; - } else { - - /* There could be cached undo segment. Check if records - in these segments can be purged. Normal purge history - will not touch these cached segment. */ - ulint cached_undo_size = 0; + if (srv_undo_tablespaces_active < 2) { + return; + } - for (trx_undo_t* undo = - UT_LIST_GET_FIRST(rseg->undo_cached); - undo != NULL && all_free; - undo = UT_LIST_GET_NEXT(undo_list, undo)) { + while (srv_undo_log_truncate && srv_undo_logs >= 3) { + if (!purge_sys.truncate.current) { + const ulint threshold = ulint(srv_max_undo_log_size + >> srv_page_size_shift); + for (ulint i = purge_sys.truncate.last + ? 
purge_sys.truncate.last->id + - srv_undo_space_id_start + : 0, j = i;; ) { + ulint space_id = srv_undo_space_id_start + i; + ut_ad(srv_is_undo_tablespace(space_id)); + + if (fil_space_get_size(space_id) + > threshold) { + purge_sys.truncate.current + = fil_space_get(space_id); + break; + } - if (limit.trx_no() < undo->trx_id) { - all_free = false; - } else { - cached_undo_size += undo->size; + ++i; + i %= srv_undo_tablespaces_active; + if (i == j) { + break; } } + } - ut_ad(size_of_rsegs >= (cached_undo_size + 1)); + if (!purge_sys.truncate.current) { + return; + } - if (size_of_rsegs > (cached_undo_size + 1)) { - /* There are pages besides cached pages that - still hold active data. */ - all_free = false; + const fil_space_t& space = *purge_sys.truncate.current; + /* Undo tablespace always are a single file. */ + ut_a(UT_LIST_GET_LEN(space.chain) == 1); + fil_node_t* file = UT_LIST_GET_FIRST(space.chain); + /* The undo tablespace files are never closed. */ + ut_ad(file->is_open()); + + DBUG_LOG("undo", "marking for truncate: " << file->name); + + for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { + if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) { + ut_ad(rseg->is_persistent()); + if (rseg->space == &space) { + /* Once set, this rseg will + not be allocated to subsequent + transactions, but we will wait + for existing active + transactions to finish. */ + rseg->skip_allocation = true; + } } } - mutex_exit(&rseg->mutex); - } - - if (!all_free) { - /* rseg still holds active data.*/ - return; - } - - - /* Step-3: Start the actual truncate. - a. Remove rseg instance if added to purge queue before we - initiate truncate. - b. 
Execute actual truncate */ - - const ulint space_id = undo_trunc->get_marked_space_id(); - - ib::info() << "Truncating UNDO tablespace " << space_id; - - trx_purge_cleanse_purge_queue(undo_trunc); - - ut_a(srv_is_undo_tablespace(space_id)); - - fil_space_t* space = fil_space_get(space_id); - - if (!space) { -not_found: - ib::error() << "Failed to find UNDO tablespace " << space_id; - return; - } - - /* Flush all to-be-discarded pages of the tablespace. - - During truncation, we do not want any writes to the - to-be-discarded area, because we must set the space->size - early in order to have deterministic page allocation. - - If a log checkpoint was completed at LSN earlier than our - mini-transaction commit and the server was killed, then - discarding the to-be-trimmed pages without flushing would - break crash recovery. So, we cannot avoid the write. */ - { - FlushObserver observer( - space, - UT_LIST_GET_FIRST(purge_sys.query->thrs)->graph->trx, - NULL); - buf_LRU_flush_or_remove_pages(space_id, &observer); - } - - log_free_check(); + for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { + trx_rseg_t* rseg = trx_sys.rseg_array[i]; + if (!rseg || rseg->space != &space) { + continue; + } + mutex_enter(&rseg->mutex); + ut_ad(rseg->skip_allocation); + if (rseg->trx_ref_count) { +not_free: + mutex_exit(&rseg->mutex); + return; + } - /* Adjust the tablespace metadata. */ - space = fil_truncate_prepare(space_id); + if (rseg->curr_size != 1) { + /* Check if all segments are + cached and safe to remove. */ + ulint cached = 0; + + for (trx_undo_t* undo = UT_LIST_GET_FIRST( + rseg->undo_cached); + undo; + undo = UT_LIST_GET_NEXT(undo_list, + undo)) { + if (head.trx_no() < undo->trx_id) { + goto not_free; + } else { + cached += undo->size; + } + } - if (!space) { - goto not_found; - } + ut_ad(rseg->curr_size > cached); - /* Undo tablespace always are a single file. 
*/ - ut_a(UT_LIST_GET_LEN(space->chain) == 1); - fil_node_t* file = UT_LIST_GET_FIRST(space->chain); - /* The undo tablespace files are never closed. */ - ut_ad(file->is_open()); + if (rseg->curr_size > cached + 1) { + goto not_free; + } + } - /* Re-initialize tablespace, in a single mini-transaction. */ - mtr_t mtr; - const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; - mtr.start(); - mtr_x_lock_space(space, &mtr); - fil_truncate_log(space, size, &mtr); - fsp_header_init(space, size, &mtr); - mutex_enter(&fil_system.mutex); - space->size = file->size = size; - mutex_exit(&fil_system.mutex); - - buf_block_t* sys_header = trx_sysf_get(&mtr); - - for (ulint i = 0; i < undo_trunc->rsegs_size(); ++i) { - trx_rseg_t* rseg = undo_trunc->get_ith_rseg(i); - buf_block_t* rblock = trx_rseg_header_create( - space, rseg->id, sys_header, &mtr); - ut_ad(rblock); - rseg->page_no = rblock ? rblock->page.id.page_no() : FIL_NULL; - - /* Before re-initialization ensure that we free the existing - structure. There can't be any active transactions. */ - ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0); - ut_a(UT_LIST_GET_LEN(rseg->old_insert_list) == 0); - - trx_undo_t* next_undo; - - for (trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached); - undo != NULL; - undo = next_undo) { - - next_undo = UT_LIST_GET_NEXT(undo_list, undo); - UT_LIST_REMOVE(rseg->undo_cached, undo); - MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); - ut_free(undo); + mutex_exit(&rseg->mutex); } - UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->old_insert_list, &trx_undo_t::undo_list); - - /* These were written by trx_rseg_header_create(). 
*/ - ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT - + rblock->frame)); - ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE - + rblock->frame)); - - /* Initialize the undo log lists according to the rseg header */ - rseg->curr_size = 1; - rseg->trx_ref_count = 0; - rseg->last_page_no = FIL_NULL; - rseg->last_offset = 0; - rseg->last_commit = 0; - rseg->needs_purge = false; - } + ib::info() << "Truncating " << file->name; + trx_purge_cleanse_purge_queue(space); + + /* Flush all to-be-discarded pages of the tablespace. + + During truncation, we do not want any writes to the + to-be-discarded area, because we must set the space.size + early in order to have deterministic page allocation. + + If a log checkpoint was completed at LSN earlier than our + mini-transaction commit and the server was killed, then + discarding the to-be-trimmed pages without flushing would + break crash recovery. So, we cannot avoid the write. */ + { + FlushObserver observer( + purge_sys.truncate.current, + UT_LIST_GET_FIRST(purge_sys.query->thrs) + ->graph->trx, + NULL); + buf_LRU_flush_or_remove_pages(space.id, &observer); + } - mtr.commit(); - /* Write-ahead the redo log record. */ - log_write_up_to(mtr.commit_lsn(), true); - - /* Trim the file size. */ - os_file_truncate(file->name, file->handle, - os_offset_t(size) << srv_page_size_shift, true); - - /* This is only executed by the srv_purge_coordinator_thread. */ - export_vars.innodb_undo_truncations++; - - /* TODO: PUNCH_HOLE the garbage (with write-ahead logging) */ - - mutex_enter(&fil_system.mutex); - ut_ad(space->stop_new_ops); - ut_ad(space->is_being_truncated); - space->stop_new_ops = false; - space->is_being_truncated = false; - mutex_exit(&fil_system.mutex); - - if (purge_sys.rseg != NULL - && purge_sys.rseg->last_page_no == FIL_NULL) { - /* If purge_sys.rseg is pointing to rseg that was recently - truncated then move to next rseg element. 
- Note: Ideally purge_sys.rseg should be NULL because purge - should complete processing of all the records but there is - purge_batch_size that can force the purge loop to exit before - all the records are purged and in this case purge_sys.rseg - could point to a valid rseg waiting for next purge cycle. */ - purge_sys.next_stored = false; - purge_sys.rseg = NULL; - } + log_free_check(); - DBUG_EXECUTE_IF("ib_undo_trunc", - ib::info() << "ib_undo_trunc"; - log_write_up_to(LSN_MAX, true); - DBUG_SUICIDE();); + /* Adjust the tablespace metadata. */ + if (!fil_truncate_prepare(space.id)) { + ib::error() << "Failed to find UNDO tablespace " + << file->name; + return; + } - /* Completed truncate. Now it is safe to re-use the tablespace. */ - for (ulint i = 0; i < undo_trunc->rsegs_size(); ++i) { - trx_rseg_t* rseg = undo_trunc->get_ith_rseg(i); - rseg->skip_allocation = false; - } + /* Re-initialize tablespace, in a single mini-transaction. */ + mtr_t mtr; + const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; + mtr.start(); + mtr_x_lock_space(purge_sys.truncate.current, &mtr); + fil_truncate_log(purge_sys.truncate.current, size, &mtr); + fsp_header_init(purge_sys.truncate.current, size, &mtr); + mutex_enter(&fil_system.mutex); + purge_sys.truncate.current->size = file->size = size; + mutex_exit(&fil_system.mutex); + + buf_block_t* sys_header = trx_sysf_get(&mtr); + + for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { + trx_rseg_t* rseg = trx_sys.rseg_array[i]; + if (!rseg || rseg->space != &space) { + continue; + } - ib::info() << "Truncated UNDO tablespace " << space_id; + ut_ad(rseg->is_persistent()); + ut_d(const ulint old_page = rseg->page_no); + + buf_block_t* rblock = trx_rseg_header_create( + purge_sys.truncate.current, + rseg->id, sys_header, &mtr); + ut_ad(rblock); + rseg->page_no = rblock + ? rblock->page.id.page_no() : FIL_NULL; + ut_ad(old_page == rseg->page_no); + + /* Before re-initialization ensure that we + free the existing structure. 
There can't be + any active transactions. */ + ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0); + ut_a(UT_LIST_GET_LEN(rseg->old_insert_list) == 0); + + trx_undo_t* next_undo; + + for (trx_undo_t* undo = UT_LIST_GET_FIRST( + rseg->undo_cached); + undo; undo = next_undo) { + + next_undo = UT_LIST_GET_NEXT(undo_list, undo); + UT_LIST_REMOVE(rseg->undo_cached, undo); + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + ut_free(undo); + } - undo_trunc->reset(); - undo::Truncate::clear_trunc_list(); -} + UT_LIST_INIT(rseg->undo_list, + &trx_undo_t::undo_list); + UT_LIST_INIT(rseg->undo_cached, + &trx_undo_t::undo_list); + UT_LIST_INIT(rseg->old_insert_list, + &trx_undo_t::undo_list); + + /* These were written by trx_rseg_header_create(). */ + ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + + rblock->frame)); + ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE + + rblock->frame)); + + /* Initialize the undo log lists according to + the rseg header */ + rseg->curr_size = 1; + rseg->trx_ref_count = 0; + rseg->last_page_no = FIL_NULL; + rseg->last_offset = 0; + rseg->last_commit = 0; + rseg->needs_purge = false; + } -/** -Removes unnecessary history data from rollback segments. NOTE that when this -function is called, the caller must not have any latches on undo log pages! -*/ -static void trx_purge_truncate_history() -{ - ut_ad(purge_sys.head <= purge_sys.tail); - purge_sys_t::iterator& head = purge_sys.head.commit - ? purge_sys.head : purge_sys.tail; + mtr.commit(); + /* Write-ahead the redo log record. */ + log_write_up_to(mtr.commit_lsn(), true); + + /* Trim the file size. */ + os_file_truncate(file->name, file->handle, + os_offset_t(size) << srv_page_size_shift, + true); + + /* This is only executed by srv_purge_coordinator_thread. 
*/ + export_vars.innodb_undo_truncations++; + + /* TODO: PUNCH_HOLE the garbage (with write-ahead logging) */ + mutex_enter(&fil_system.mutex); + ut_ad(&space == purge_sys.truncate.current); + ut_ad(space.stop_new_ops); + ut_ad(space.is_being_truncated); + purge_sys.truncate.current->stop_new_ops = false; + purge_sys.truncate.current->is_being_truncated = false; + mutex_exit(&fil_system.mutex); + + if (purge_sys.rseg != NULL + && purge_sys.rseg->last_page_no == FIL_NULL) { + /* If purge_sys.rseg is pointing to rseg that + was recently truncated then move to next rseg + element. Note: Ideally purge_sys.rseg should + be NULL because purge should complete + processing of all the records but there is + purge_batch_size that can force the purge loop + to exit before all the records are purged and + in this case purge_sys.rseg could point to a + valid rseg waiting for next purge cycle. */ + purge_sys.next_stored = false; + purge_sys.rseg = NULL; + } - if (head.trx_no() >= purge_sys.view.low_limit_no()) { - /* This is sometimes necessary. TODO: find out why. */ - head.reset_trx_no(purge_sys.view.low_limit_no()); - head.undo_no = 0; - } + DBUG_EXECUTE_IF("ib_undo_trunc", + ib::info() << "ib_undo_trunc"; + log_write_up_to(LSN_MAX, true); + DBUG_SUICIDE();); - for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { - if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) { - ut_ad(rseg->id == i); - trx_purge_truncate_rseg_history(*rseg, head); + for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { + if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) { + ut_ad(rseg->is_persistent()); + if (rseg->space == &space) { + rseg->skip_allocation = false; + } + } } - } - /* UNDO tablespace truncate. We will try to truncate as much as we - can (greedy approach). This will ensure when the server is idle we - try and truncate all the UNDO tablespaces. 
*/ - for (ulint i = srv_undo_tablespaces_active; i--; ) { - trx_purge_mark_undo_for_truncate(&purge_sys.undo_trunc); - trx_purge_initiate_truncate(head, &purge_sys.undo_trunc); + ib::info() << "Truncated " << file->name; + purge_sys.truncate.last = purge_sys.truncate.current; + purge_sys.truncate.current = NULL; } } @@ -1511,7 +1230,7 @@ trx_purge_dml_delay(void) if (srv_max_purge_lag > 0) { float ratio; - ratio = float(trx_sys.history_size()) / srv_max_purge_lag; + ratio = float(trx_sys.rseg_history_len) / srv_max_purge_lag; if (ratio > 1.0) { /* If the history list length exceeds the @@ -1537,8 +1256,7 @@ void trx_purge_wait_for_workers_to_complete() { /* Ensure that the work queue empties out. */ - while (my_atomic_loadlint(&purge_sys.n_completed) - != purge_sys.n_submitted) { + while (purge_sys.n_tasks.load(std::memory_order_acquire)) { if (srv_get_task_queue_length() > 0) { srv_release_threads(SRV_WORKER, 1); @@ -1574,9 +1292,8 @@ trx_purge( srv_dml_needed_delay = trx_purge_dml_delay(); - /* The number of tasks submitted should be completed. */ - ut_a(purge_sys.n_submitted - == my_atomic_loadlint(&purge_sys.n_completed)); + /* All submitted tasks should be completed. */ + ut_ad(purge_sys.n_tasks.load(std::memory_order_relaxed) == 0); rw_lock_x_lock(&purge_sys.latch); trx_sys.clone_oldest_view(); @@ -1590,7 +1307,7 @@ trx_purge( /* Fetch the UNDO recs that need to be purged. */ n_pages_handled = trx_purge_attach_undo_recs(n_purge_threads); - purge_sys.n_submitted += n_purge_threads; + purge_sys.n_tasks.store(n_purge_threads - 1, std::memory_order_relaxed); /* Submit tasks to workers queue if using multi-threaded purge. 
*/ for (ulint i = n_purge_threads; --i; ) { @@ -1604,14 +1321,9 @@ trx_purge( ut_d(thr->thread_slot = slot); que_run_threads(thr); - my_atomic_addlint(&purge_sys.n_completed, 1); - - if (n_purge_threads > 1) { - trx_purge_wait_for_workers_to_complete(); - } + trx_purge_wait_for_workers_to_complete(); - ut_a(purge_sys.n_submitted - == my_atomic_loadlint(&purge_sys.n_completed)); + ut_ad(purge_sys.n_tasks.load(std::memory_order_relaxed) == 0); if (truncate) { trx_purge_truncate_history(); @@ -1628,7 +1340,7 @@ void purge_sys_t::stop() { rw_lock_x_lock(&latch); - if (!enabled_latched()) + if (!enabled()) { /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */ ut_ad(!srv_undo_sources); @@ -1638,7 +1350,7 @@ void purge_sys_t::stop() ut_ad(srv_n_purge_threads > 0); - if (0 == my_atomic_add32_explicit(&m_paused, 1, MY_MEMORY_ORDER_RELAXED)) + if (m_paused++ == 0) { /* We need to wakeup the purge thread in case it is suspended, so that it can acknowledge the state change. */ @@ -1672,8 +1384,7 @@ void purge_sys_t::resume() return; } - int32_t paused= my_atomic_add32_explicit(&m_paused, -1, - MY_MEMORY_ORDER_RELAXED); + int32_t paused= m_paused--; ut_a(paused); if (paused == 1) diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index b282bb177af..7316fc15b0f 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -39,9 +39,11 @@ Created 3/26/1996 Heikki Tuuri #include "row0row.h" #include "row0mysql.h" -/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */ +/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA. */ const dtuple_t trx_undo_metadata = { - REC_INFO_METADATA, 0, 0, + /* This also works for REC_INFO_METADATA_ALTER, because the + delete-mark (REC_INFO_DELETED_FLAG) is ignored when searching. 
*/ + REC_INFO_METADATA_ADD, 0, 0, NULL, 0, NULL #ifdef UNIV_DEBUG , DATA_TUPLE_MAGIC_N @@ -228,8 +230,7 @@ trx_undo_log_v_idx( { ut_ad(pos < table->n_v_def); dict_v_col_t* vcol = dict_table_get_nth_v_col(table, pos); - - ulint n_idx = vcol->v_indexes->size(); + ulint n_idx = vcol->n_v_indexes; byte* old_ptr; ut_ad(n_idx > 0); @@ -256,12 +257,7 @@ trx_undo_log_v_idx( ptr += mach_write_compressed(ptr, n_idx); - dict_v_idx_list::iterator it; - - for (it = vcol->v_indexes->begin(); - it != vcol->v_indexes->end(); ++it) { - dict_v_idx_t v_index = *it; - + for (const auto& v_index : vcol->v_indexes) { ptr += mach_write_compressed( ptr, static_cast<ulint>(v_index.index->id)); @@ -503,7 +499,7 @@ trx_undo_page_report_insert( /* Store then the fields required to uniquely determine the record to be inserted in the clustered index */ if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) { - ut_ad(clust_entry->info_bits == REC_INFO_METADATA); + ut_ad(clust_entry->is_metadata()); ut_ad(index->is_instant()); ut_ad(undo_block->frame[first_free + 2] == TRX_UNDO_INSERT_REC); @@ -716,7 +712,7 @@ trx_undo_rec_skip_row_ref( log of an update or delete marking of a clustered index record. @param[out] ext_buf buffer to hold the prefix data and BLOB pointer @param[in] prefix_len prefix size to store in the undo log -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] field an externally stored column @param[in,out] len input: length of field; output: used length of ext_buf @@ -726,13 +722,13 @@ byte* trx_undo_page_fetch_ext( byte* ext_buf, ulint prefix_len, - const page_size_t& page_size, + ulint zip_size, const byte* field, ulint* len) { /* Fetch the BLOB. */ ulint ext_len = btr_copy_externally_stored_field_prefix( - ext_buf, prefix_len, page_size, field, *len); + ext_buf, prefix_len, zip_size, field, *len); /* BLOBs should always be nonempty. */ ut_a(ext_len); /* Append the BLOB pointer to the prefix. 
*/ @@ -750,7 +746,7 @@ available size, or NULL when should not fetch a longer prefix @param[in] prefix_len prefix size to store in the undo log -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] field the locally stored part of the externally stored column @param[in,out] len length of field, in bytes @@ -763,7 +759,7 @@ trx_undo_page_report_modify_ext( byte* ptr, byte* ext_buf, ulint prefix_len, - const page_size_t& page_size, + ulint zip_size, const byte** field, ulint* len, spatial_status_t spatial_status) @@ -805,7 +801,7 @@ trx_undo_page_report_modify_ext( ptr += mach_write_compressed(ptr, *len); *field = trx_undo_page_fetch_ext(ext_buf, prefix_len, - page_size, *field, len); + zip_size, *field, len); ptr += mach_write_compressed(ptr, *len + spatial_len); } else { @@ -818,7 +814,7 @@ trx_undo_page_report_modify_ext( /** Get MBR from a Geometry column stored externally @param[out] mbr MBR to fill -@param[in] pagesize table pagesize +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] field field contain the geometry data @param[in,out] len length of field, in bytes */ @@ -826,17 +822,17 @@ static void trx_undo_get_mbr_from_ext( /*======================*/ - double* mbr, - const page_size_t& page_size, - const byte* field, - ulint* len) + double* mbr, + ulint zip_size, + const byte* field, + ulint* len) { uchar* dptr = NULL; ulint dlen; mem_heap_t* heap = mem_heap_create(100); dptr = btr_copy_externally_stored_field( - &dlen, field, page_size, *len, heap); + &dlen, field, zip_size, *len, heap); if (dlen <= GEO_DATA_HEADER_SIZE) { for (uint i = 0; i < SPDIMS; ++i) { @@ -920,9 +916,9 @@ trx_undo_page_report_modify( /* Store first some general parameters to the undo log */ if (!update) { - ut_ad(!rec_get_deleted_flag(rec, dict_table_is_comp(table))); + ut_ad(!rec_is_delete_marked(rec, dict_table_is_comp(table))); type_cmpl = TRX_UNDO_DEL_MARK_REC; - } else if (rec_get_deleted_flag(rec, 
dict_table_is_comp(table))) { + } else if (rec_is_delete_marked(rec, dict_table_is_comp(table))) { /* In delete-marked records, DB_TRX_ID must always refer to an existing update_undo log record. */ ut_ad(row_get_rec_trx_id(rec, index, offsets)); @@ -951,9 +947,7 @@ trx_undo_page_report_modify( *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table)); /* Store the values of the system columns */ - field = rec_get_nth_field(rec, offsets, - dict_index_get_sys_col_pos( - index, DATA_TRX_ID), &flen); + field = rec_get_nth_field(rec, offsets, index->db_trx_id(), &flen); ut_ad(flen == DATA_TRX_ID_LEN); trx_id = trx_read_trx_id(field); @@ -967,9 +961,7 @@ trx_undo_page_report_modify( } ptr += mach_u64_write_compressed(ptr, trx_id); - field = rec_get_nth_field(rec, offsets, - dict_index_get_sys_col_pos( - index, DATA_ROLL_PTR), &flen); + field = rec_get_nth_field(rec, offsets, index->db_roll_ptr(), &flen); ut_ad(flen == DATA_ROLL_PTR_LEN); ut_ad(memcmp(field, field_ref_zero, DATA_ROLL_PTR_LEN)); @@ -1030,45 +1022,60 @@ trx_undo_page_report_modify( on them */ if (upd_fld_is_virtual_col(fld) && dict_table_get_nth_v_col( - table, pos)->v_indexes->empty()) { + table, pos)->v_indexes.empty()) { n_updated--; } } } + i = 0; + + if (UNIV_UNLIKELY(update->is_alter_metadata())) { + ut_ad(update->n_fields >= 1); + ut_ad(!upd_fld_is_virtual_col(&update->fields[0])); + ut_ad(update->fields[0].field_no + == index->first_user_field()); + ut_ad(!dfield_is_ext(&update->fields[0].new_val)); + ut_ad(!dfield_is_null(&update->fields[0].new_val)); + /* The instant ADD COLUMN metadata record does not + contain the BLOB. Do not write anything for it. 
*/ + i = !rec_is_alter_metadata(rec, *index); + n_updated -= i; + } + ptr += mach_write_compressed(ptr, n_updated); - for (i = 0; i < upd_get_n_fields(update); i++) { + for (; i < upd_get_n_fields(update); i++) { + if (trx_undo_left(undo_block, ptr) < 5) { + return 0; + } + upd_field_t* fld = upd_get_nth_field(update, i); bool is_virtual = upd_fld_is_virtual_col(fld); ulint max_v_log_len = 0; - ulint pos = fld->field_no; - - /* Write field number to undo log */ - if (trx_undo_left(undo_block, ptr) < 5) { - return(0); - } + ulint pos = fld->field_no; + const dict_col_t* col = NULL; if (is_virtual) { /* Skip the non-indexed column, during an online alter table */ if (dict_index_is_online_ddl(index) && dict_table_get_nth_v_col( - table, pos)->v_indexes->empty()) { + table, pos)->v_indexes.empty()) { continue; } /* add REC_MAX_N_FIELDS to mark this is a virtual col */ - pos += REC_MAX_N_FIELDS; - } + ptr += mach_write_compressed( + ptr, pos + REC_MAX_N_FIELDS); - ptr += mach_write_compressed(ptr, pos); + if (trx_undo_left(undo_block, ptr) < 15) { + return 0; + } - /* Save the old value of field */ - if (is_virtual) { ut_ad(fld->field_no < table->n_v_def); ptr = trx_undo_log_v_idx(undo_block, table, @@ -1093,36 +1100,87 @@ trx_undo_page_report_modify( flen = ut_min( flen, max_v_log_len); } + + goto store_len; + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + ut_ad(pos >= index->first_user_field()); + ut_ad(rec_is_metadata(rec, *index)); + + if (rec_is_alter_metadata(rec, *index)) { + ut_ad(update->is_alter_metadata()); + + field = rec_offs_n_fields(offsets) + > pos + && !rec_offs_nth_default( + offsets, pos) + ? 
rec_get_nth_field( + rec, offsets, + pos, &flen) + : index->instant_field_value( + pos - 1, &flen); + + if (pos == index->first_user_field()) { + ut_ad(rec_offs_nth_extern( + offsets, pos)); + ut_ad(flen == FIELD_REF_SIZE); + goto write_field; + } + col = dict_index_get_nth_col(index, + pos - 1); + } else if (!update->is_alter_metadata()) { + goto get_field; + } else { + /* We are converting an ADD COLUMN + metadata record to an ALTER TABLE + metadata record, with BLOB. Subtract + the missing metadata BLOB field. */ + ut_ad(pos > index->first_user_field()); + --pos; + goto get_field; + } } else { +get_field: + col = dict_index_get_nth_col(index, pos); field = rec_get_nth_cfield( rec, index, offsets, pos, &flen); } +write_field: + /* Write field number to undo log */ + ptr += mach_write_compressed(ptr, pos); if (trx_undo_left(undo_block, ptr) < 15) { - return(0); + return 0; } - if (!is_virtual && rec_offs_nth_extern(offsets, pos)) { - const dict_col_t* col - = dict_index_get_nth_col(index, pos); - ulint prefix_len - = dict_max_field_len_store_undo( - table, col); + if (rec_offs_n_fields(offsets) > pos + && rec_offs_nth_extern(offsets, pos)) { + ut_ad(col || pos == index->first_user_field()); + ut_ad(col || update->is_alter_metadata()); + ut_ad(col + || rec_is_alter_metadata(rec, *index)); + ulint prefix_len = col + ? dict_max_field_len_store_undo( + table, col) + : 0; ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE <= sizeof ext_buf); ptr = trx_undo_page_report_modify_ext( ptr, - col->ord_part + col + && col->ord_part && !ignore_prefix && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN ? 
ext_buf : NULL, prefix_len, - dict_table_page_size(table), + table->space->zip_size(), &field, &flen, SPATIAL_UNKNOWN); *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN; } else { +store_len: ptr += mach_write_compressed(ptr, flen); } @@ -1271,6 +1329,8 @@ trx_undo_page_report_modify( table, col); ut_a(prefix_len < sizeof ext_buf); + const ulint zip_size + = table->space->zip_size(); /* If there is a spatial index on it, log its MBR */ @@ -1279,9 +1339,7 @@ trx_undo_page_report_modify( col->mtype)); trx_undo_get_mbr_from_ext( - mbr, - dict_table_page_size( - table), + mbr, zip_size, field, &flen); } @@ -1290,7 +1348,7 @@ trx_undo_page_report_modify( flen < REC_ANTELOPE_MAX_INDEX_COL_LEN && !ignore_prefix ? ext_buf : NULL, prefix_len, - dict_table_page_size(table), + zip_size, &field, &flen, spatial_status); } else { @@ -1487,7 +1545,6 @@ trx_undo_update_rec_get_update( upd_t* update; ulint n_fields; byte* buf; - ulint i; bool first_v_col = true; bool is_undo_log = true; ulint n_skip_field = 0; @@ -1500,7 +1557,7 @@ trx_undo_update_rec_get_update( n_fields = 0; } - update = upd_create(n_fields + 2, heap); + *upd = update = upd_create(n_fields + 2, heap); update->info_bits = info_bits; @@ -1512,9 +1569,7 @@ trx_undo_update_rec_get_update( mach_write_to_6(buf, trx_id); - upd_field_set_field_no(upd_field, - dict_index_get_sys_col_pos(index, DATA_TRX_ID), - index); + upd_field_set_field_no(upd_field, index->db_trx_id(), index); dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN); upd_field = upd_get_nth_field(update, n_fields + 1); @@ -1523,25 +1578,20 @@ trx_undo_update_rec_get_update( trx_write_roll_ptr(buf, roll_ptr); - upd_field_set_field_no( - upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), - index); + upd_field_set_field_no(upd_field, index->db_roll_ptr(), index); dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN); /* Store then the updated ordinary columns to the update vector */ - for (i = 0; i < n_fields; i++) { - + for (ulint i = 0; i < 
n_fields; i++) { const byte* field; ulint len; - ulint field_no; ulint orig_len; - bool is_virtual; upd_field = upd_get_nth_field(update, i); - field_no = mach_read_next_compressed(&ptr); + ulint field_no = mach_read_next_compressed(&ptr); - is_virtual = (field_no >= REC_MAX_N_FIELDS); + const bool is_virtual = (field_no >= REC_MAX_N_FIELDS); if (is_virtual) { /* If new version, we need to check index list to figure @@ -1564,15 +1614,63 @@ trx_undo_update_rec_get_update( } upd_field_set_v_field_no(upd_field, field_no, index); + } else if (UNIV_UNLIKELY((update->info_bits + & ~REC_INFO_DELETED_FLAG) + == REC_INFO_MIN_REC_FLAG)) { + ut_ad(type == TRX_UNDO_UPD_EXIST_REC); + const ulint uf = index->first_user_field(); + ut_ad(field_no >= uf); + + if (update->info_bits != REC_INFO_MIN_REC_FLAG) { + /* Generic instant ALTER TABLE */ + if (field_no == uf) { + upd_field->new_val.type + .metadata_blob_init(); + } else if (field_no >= index->n_fields) { + /* This is reachable during + purge if the table was emptied + and converted to the canonical + format on a later ALTER TABLE. + In this case, + row_purge_upd_exist_or_extern() + would only be interested in + freeing any BLOBs that were + updated, that is, the metadata + BLOB above. Other BLOBs in + the metadata record are never + updated; they are for the + initial DEFAULT values of the + instantly added columns, and + they will never change. + + Note: if the table becomes + empty during ROLLBACK or is + empty during subsequent ALTER + TABLE, and btr_page_empty() is + called to re-create the root + page without the metadata + record, in that case we should + only free the latest version + of BLOBs in the record, + which purge would never touch. 
*/ + field_no = REC_MAX_N_FIELDS; + n_skip_field++; + } else { + dict_col_copy_type( + dict_index_get_nth_col( + index, field_no - 1), + &upd_field->new_val.type); + } + } else { + /* Instant ADD COLUMN...LAST */ + dict_col_copy_type( + dict_index_get_nth_col(index, + field_no), + &upd_field->new_val.type); + } + upd_field->field_no = field_no; } else if (field_no < index->n_fields) { upd_field_set_field_no(upd_field, field_no, index); - } else if (update->info_bits == REC_INFO_MIN_REC_FLAG - && index->is_instant()) { - /* This must be a rollback of a subsequent - instant ADD COLUMN operation. This will be - detected and handled by btr_cur_trim(). */ - upd_field->field_no = field_no; - upd_field->orig_len = 0; } else { ib::error() << "Trying to access update undo rec" " field " << field_no @@ -1605,6 +1703,12 @@ trx_undo_update_rec_get_update( dfield_set_ext(&upd_field->new_val); } + ut_ad(update->info_bits != (REC_INFO_DELETED_FLAG + | REC_INFO_MIN_REC_FLAG) + || field_no != index->first_user_field() + || (upd_field->new_val.ext + && upd_field->new_val.len == FIELD_REF_SIZE)); + if (is_virtual) { upd_field->old_v_val = static_cast<dfield_t*>( mem_heap_alloc( @@ -1622,31 +1726,23 @@ trx_undo_update_rec_get_update( } } - /* In rare scenario, we could have skipped virtual column (as they - are dropped. We will regenerate a update vector and skip them */ - if (n_skip_field > 0) { - ulint n = 0; - ut_ad(n_skip_field <= n_fields); + /* We may have to skip dropped indexed virtual columns. + Also, we may have to trim the update vector of a metadata record + if dict_index_t::clear_instant_alter() was invoked on the table + later, and the number of fields no longer matches. 
*/ - upd_t* new_update = upd_create( - n_fields + 2 - n_skip_field, heap); + if (n_skip_field) { + upd_field_t* d = upd_get_nth_field(update, 0); + const upd_field_t* const end = d + n_fields + 2; - for (i = 0; i < n_fields + 2; i++) { - upd_field = upd_get_nth_field(update, i); - - if (upd_field->field_no == REC_MAX_N_FIELDS) { - continue; + for (const upd_field_t* s = d; s != end; s++) { + if (s->field_no != REC_MAX_N_FIELDS) { + *d++ = *s; } - - upd_field_t* new_upd_field - = upd_get_nth_field(new_update, n); - *new_upd_field = *upd_field; - n++; } - ut_ad(n == n_fields + 2 - n_skip_field); - *upd = new_update; - } else { - *upd = update; + + ut_ad(d + n_skip_field == end); + update->n_fields = d - upd_get_nth_field(update, 0); } return(const_cast<byte*>(ptr)); @@ -1701,8 +1797,11 @@ trx_undo_rec_get_partial_row( if (uf->old_v_val) { continue; } - ulint c = dict_index_get_nth_col(index, uf->field_no)->ind; - *dtuple_get_nth_field(*row, c) = uf->new_val; + const dict_col_t& c = *dict_index_get_nth_col(index, + uf->field_no); + if (!c.is_dropped()) { + *dtuple_get_nth_field(*row, c.ind) = uf->new_val; + } } end_ptr = ptr + mach_read_from_2(ptr); @@ -1713,7 +1812,6 @@ trx_undo_rec_get_partial_row( const byte* field; ulint field_no; const dict_col_t* col; - ulint col_no; ulint len; ulint orig_len; bool is_virtual; @@ -1741,15 +1839,18 @@ trx_undo_rec_get_partial_row( dict_v_col_t* vcol = dict_table_get_nth_v_col( index->table, field_no); col = &vcol->m_col; - col_no = dict_col_get_no(col); dfield = dtuple_get_nth_v_field(*row, vcol->v_pos); dict_col_copy_type( &vcol->m_col, dfield_get_type(dfield)); } else { col = dict_index_get_nth_col(index, field_no); - col_no = dict_col_get_no(col); - dfield = dtuple_get_nth_field(*row, col_no); + + if (col->is_dropped()) { + continue; + } + + dfield = dtuple_get_nth_field(*row, col->ind); ut_ad(dfield->type.mtype == DATA_MISSING || dict_col_type_assert_equal(col, &dfield->type)); @@ -1757,9 +1858,7 @@ 
trx_undo_rec_get_partial_row( || dfield->len == len || (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD)); - dict_col_copy_type( - dict_table_get_nth_col(index->table, col_no), - dfield_get_type(dfield)); + dict_col_copy_type(col, dfield_get_type(dfield)); } dfield_set_data(dfield, field, len); @@ -2380,7 +2479,8 @@ trx_undo_prev_version_build( row_upd_index_replace_new_col_vals(entry, index, update, heap); /* Get number of externally stored columns in updated record */ - const ulint n_ext = dtuple_get_n_ext(entry); + const ulint n_ext = index->is_primary() + ? dtuple_get_n_ext(entry) : 0; buf = static_cast<byte*>(mem_heap_alloc( heap, rec_get_converted_size(index, entry, n_ext))); diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc index f2b108edf39..0e60c6fb745 100644 --- a/storage/innobase/trx/trx0roll.cc +++ b/storage/innobase/trx/trx0roll.cc @@ -44,10 +44,6 @@ Created 3/26/1996 Heikki Tuuri #include "trx0trx.h" #include "trx0undo.h" -/** This many pages must be undone before a truncate is tried within -rollback */ -static const ulint TRX_ROLL_TRUNC_THRESHOLD = 1; - /** true if trx_rollback_all_recovered() thread is active */ bool trx_rollback_is_active; @@ -63,7 +59,7 @@ static bool trx_rollback_finish(trx_t* trx) trx->mod_tables.clear(); bool finished = trx->error_state == DB_SUCCESS; if (UNIV_LIKELY(finished)) { - trx_commit(trx); + trx->commit(); } else { ut_a(trx->error_state == DB_INTERRUPTED); ut_ad(!srv_is_being_started); @@ -88,7 +84,7 @@ static bool trx_rollback_finish(trx_t* trx) ut_free(undo); undo = NULL; } - trx_commit_low(trx, NULL); + trx->commit_low(); } trx->lock.que_state = TRX_QUE_RUNNING; @@ -181,6 +177,11 @@ trx_rollback_to_savepoint( partial rollback requested, or NULL for complete rollback */ { +#ifdef WITH_WSREP + if (!savept && trx->is_wsrep() && wsrep_thd_is_SR(trx->mysql_thd)) { + wsrep_handle_SR_rollback(NULL, trx->mysql_thd); + } +#endif /* WITH_WSREP */ ut_ad(!trx_mutex_own(trx)); 
trx_start_if_not_started_xa(trx, true); @@ -228,7 +229,8 @@ dberr_t trx_rollback_for_mysql(trx_t* trx) trx->will_lock = 0; ut_ad(trx->mysql_thd); #ifdef WITH_WSREP - trx->wsrep = false; + trx->wsrep= false; + trx->lock.was_chosen_as_wsrep_victim= false; #endif return(DB_SUCCESS); @@ -451,11 +453,8 @@ trx_rollback_to_savepoint_for_mysql_low( trx_mark_sql_stat_end(trx); trx->op_info = ""; - #ifdef WITH_WSREP - if (trx->is_wsrep()) { - trx->lock.was_chosen_as_deadlock_victim = false; - } + trx->lock.was_chosen_as_wsrep_victim = false; #endif return(err); } @@ -716,9 +715,9 @@ static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element, void trx_roll_report_progress() { time_t now = time(NULL); - mutex_enter(&recv_sys->mutex); - bool report = recv_sys->report(now); - mutex_exit(&recv_sys->mutex); + mutex_enter(&recv_sys.mutex); + bool report = recv_sys.report(now); + mutex_exit(&recv_sys.mutex); if (report) { trx_roll_count_callback_arg arg; @@ -877,169 +876,6 @@ DECLARE_THREAD(trx_rollback_all_recovered)(void*) OS_THREAD_DUMMY_RETURN; } -/** Try to truncate the undo logs. -@param[in,out] trx transaction */ -static -void -trx_roll_try_truncate(trx_t* trx) -{ - trx->pages_undone = 0; - - undo_no_t undo_no = trx->undo_no; - - if (trx_undo_t* undo = trx->rsegs.m_redo.undo) { - ut_ad(undo->rseg == trx->rsegs.m_redo.rseg); - mutex_enter(&undo->rseg->mutex); - trx_undo_truncate_end(undo, undo_no, false); - mutex_exit(&undo->rseg->mutex); - } - - if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) { - ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg); - mutex_enter(&undo->rseg->mutex); - trx_undo_truncate_end(undo, undo_no, true); - mutex_exit(&undo->rseg->mutex); - } -} - -/***********************************************************************//** -Pops the topmost undo log record in a single undo log and updates the info -about the topmost record in the undo log memory struct. 
-@return undo log record, the page s-latched */ -static -trx_undo_rec_t* -trx_roll_pop_top_rec( -/*=================*/ - trx_t* trx, /*!< in: transaction */ - trx_undo_t* undo, /*!< in: undo log */ - mtr_t* mtr) /*!< in: mtr */ -{ - page_t* undo_page = trx_undo_page_get_s_latched( - page_id_t(undo->rseg->space->id, undo->top_page_no), mtr); - - ulint offset = undo->top_offset; - - trx_undo_rec_t* prev_rec = trx_undo_get_prev_rec( - undo_page + offset, undo->hdr_page_no, undo->hdr_offset, - true, mtr); - - if (prev_rec == NULL) { - undo->top_undo_no = IB_ID_MAX; - ut_ad(undo->empty()); - } else { - page_t* prev_rec_page = page_align(prev_rec); - - if (prev_rec_page != undo_page) { - - trx->pages_undone++; - } - - undo->top_page_no = page_get_page_no(prev_rec_page); - undo->top_offset = ulint(prev_rec - prev_rec_page); - undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); - ut_ad(!undo->empty()); - } - - return(undo_page + offset); -} - -/** Get the last undo log record of a transaction (for rollback). 
-@param[in,out] trx transaction -@param[out] roll_ptr DB_ROLL_PTR to the undo record -@param[in,out] heap memory heap for allocation -@return undo log record copied to heap -@retval NULL if none left or the roll_limit (savepoint) was reached */ -trx_undo_rec_t* -trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) -{ - if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) { - trx_roll_try_truncate(trx); - } - - trx_undo_t* undo = NULL; - trx_undo_t* insert = trx->rsegs.m_redo.old_insert; - trx_undo_t* update = trx->rsegs.m_redo.undo; - trx_undo_t* temp = trx->rsegs.m_noredo.undo; - const undo_no_t limit = trx->roll_limit; - - ut_ad(!insert || !update || insert->empty() || update->empty() - || insert->top_undo_no != update->top_undo_no); - ut_ad(!insert || !temp || insert->empty() || temp->empty() - || insert->top_undo_no != temp->top_undo_no); - ut_ad(!update || !temp || update->empty() || temp->empty() - || update->top_undo_no != temp->top_undo_no); - - if (UNIV_LIKELY_NULL(insert) - && !insert->empty() && limit <= insert->top_undo_no) { - undo = insert; - } - - if (update && !update->empty() && update->top_undo_no >= limit) { - if (!undo) { - undo = update; - } else if (undo->top_undo_no < update->top_undo_no) { - undo = update; - } - } - - if (temp && !temp->empty() && temp->top_undo_no >= limit) { - if (!undo) { - undo = temp; - } else if (undo->top_undo_no < temp->top_undo_no) { - undo = temp; - } - } - - if (undo == NULL) { - trx_roll_try_truncate(trx); - /* Mark any ROLLBACK TO SAVEPOINT completed, so that - if the transaction object is committed and reused - later, we will default to a full ROLLBACK. 
*/ - trx->roll_limit = 0; - trx->in_rollback = false; - return(NULL); - } - - ut_ad(!undo->empty()); - ut_ad(limit <= undo->top_undo_no); - - *roll_ptr = trx_undo_build_roll_ptr( - false, undo->rseg->id, undo->top_page_no, undo->top_offset); - - mtr_t mtr; - mtr.start(); - - trx_undo_rec_t* undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr); - const undo_no_t undo_no = trx_undo_rec_get_undo_no(undo_rec); - switch (trx_undo_rec_get_type(undo_rec)) { - case TRX_UNDO_INSERT_METADATA: - /* This record type was introduced in MDEV-11369 - instant ADD COLUMN, which was implemented after - MDEV-12288 removed the insert_undo log. There is no - instant ADD COLUMN for temporary tables. Therefore, - this record can only be present in the main undo log. */ - ut_ad(undo == update); - /* fall through */ - case TRX_UNDO_RENAME_TABLE: - ut_ad(undo == insert || undo == update); - /* fall through */ - case TRX_UNDO_INSERT_REC: - ut_ad(undo == insert || undo == update || undo == temp); - *roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS; - break; - default: - ut_ad(undo == update || undo == temp); - break; - } - - trx->undo_no = undo_no; - - trx_undo_rec_t* undo_rec_copy = trx_undo_rec_copy(undo_rec, heap); - mtr.commit(); - - return(undo_rec_copy); -} - /****************************************************************//** Builds an undo 'query' graph for a transaction. The actual rollback is performed by executing this query graph like a query subprocedure call. 
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index 46fb2680371..29e6acc773c 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ b/storage/innobase/trx/trx0rseg.cc @@ -53,6 +53,10 @@ trx_rseg_write_wsrep_checkpoint( const XID* xid, mtr_t* mtr) { + DBUG_ASSERT(xid->gtrid_length >= 0); + DBUG_ASSERT(xid->bqual_length >= 0); + DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE); + mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header, uint32_t(xid->formatID), MLOG_4BYTES, mtr); @@ -65,9 +69,15 @@ trx_rseg_write_wsrep_checkpoint( uint32_t(xid->bqual_length), MLOG_4BYTES, mtr); + const ulint xid_length = static_cast<ulint>(xid->gtrid_length + + xid->bqual_length); mlog_write_string(TRX_RSEG_WSREP_XID_DATA + rseg_header, reinterpret_cast<const byte*>(xid->data), - XIDDATASIZE, mtr); + xid_length, mtr); + if (UNIV_LIKELY(xid_length < XIDDATASIZE)) { + mlog_memset(TRX_RSEG_WSREP_XID_DATA + rseg_header + xid_length, + XIDDATASIZE - xid_length, 0, mtr); + } } /** Update the WSREP XID information in rollback segment header. @@ -106,8 +116,9 @@ trx_rseg_clear_wsrep_checkpoint( trx_rsegf_t* rseg_header, mtr_t* mtr) { - mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header, - 0, MLOG_4BYTES, mtr); + mlog_memset(rseg_header + TRX_RSEG_WSREP_XID_INFO, + TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE + - TRX_RSEG_WSREP_XID_INFO, 0, mtr); } static void @@ -275,12 +286,10 @@ void trx_rseg_format_upgrade(trx_rsegf_t* rseg_header, mtr_t* mtr) mlog_write_ulint(rseg_format, 0, MLOG_4BYTES, mtr); /* Clear also possible garbage at the end of the page. Old InnoDB versions did not initialize unused parts of pages. 
*/ - byte* b = rseg_header + TRX_RSEG_MAX_TRX_ID + 8; - ulint len = srv_page_size - - (FIL_PAGE_DATA_END - + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8); - memset(b, 0, len); - mlog_log_string(b, len, mtr); + mlog_memset(TRX_RSEG_MAX_TRX_ID + 8 + rseg_header, + srv_page_size + - (FIL_PAGE_DATA_END + + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8), 0, mtr); } /** Create a rollback segment header. @@ -312,22 +321,17 @@ trx_rseg_header_create( buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); - mlog_write_ulint(TRX_RSEG + TRX_RSEG_FORMAT + block->frame, 0, - MLOG_4BYTES, mtr); + ut_ad(0 == mach_read_from_4(TRX_RSEG_FORMAT + TRX_RSEG + + block->frame)); + ut_ad(0 == mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG + + block->frame)); /* Initialize the history list */ - - mlog_write_ulint(TRX_RSEG + TRX_RSEG_HISTORY_SIZE + block->frame, 0, - MLOG_4BYTES, mtr); - flst_init(TRX_RSEG + TRX_RSEG_HISTORY + block->frame, mtr); - trx_rsegf_t* rsegf = TRX_RSEG + block->frame; + flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr); /* Reset the undo log slots */ - for (ulint i = 0; i < TRX_RSEG_N_SLOTS; i++) { - /* This is generating a lot of redo log. MariaDB 10.4 - introduced MLOG_MEMSET to reduce the redo log volume. 
*/ - trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr); - } + mlog_memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG, + TRX_RSEG_N_SLOTS * 4, 0xff, mtr); if (sys_header) { /* Add the rollback segment info to the free slot in @@ -498,8 +502,8 @@ trx_rseg_mem_restore(trx_rseg_t* rseg, trx_id_t& max_trx_id, mtr_t* mtr) rseg->curr_size = mach_read_from_4(rseg_header + TRX_RSEG_HISTORY_SIZE) + 1 + trx_undo_lists_init(rseg, max_trx_id, rseg_header); - if (ulint len = flst_get_len(rseg_header + TRX_RSEG_HISTORY)) { - trx_sys.history_add(int32(len)); + if (auto len = flst_get_len(rseg_header + TRX_RSEG_HISTORY)) { + trx_sys.rseg_history_len += len; fil_addr_t node_addr = trx_purge_get_log_from_hist( flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr)); @@ -635,10 +639,8 @@ trx_rseg_array_init() /* Finally, clear WSREP XID in TRX_SYS page. */ const buf_block_t* sys = trx_sysf_get(&mtr); - mlog_write_ulint(TRX_SYS + TRX_SYS_WSREP_XID_INFO + - + TRX_SYS_WSREP_XID_MAGIC_N_FLD + sys->frame, - 0, MLOG_4BYTES, &mtr); - + mlog_memset(TRX_SYS + TRX_SYS_WSREP_XID_INFO + sys->frame, + TRX_SYS_WSREP_XID_LEN, 0, &mtr); mtr.commit(); } #endif diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 3b7d6dab4eb..e1d72f14e3a 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -189,10 +189,9 @@ trx_sysf_create( ut_a(ptr <= page + (srv_page_size - FIL_PAGE_DATA_END)); /* Initialize all of the page. This part used to be uninitialized. 
*/ - memset(ptr, 0, srv_page_size - FIL_PAGE_DATA_END + size_t(page - ptr)); - - mlog_log_string(TRX_SYS + page, srv_page_size - FIL_PAGE_DATA_END - - TRX_SYS, mtr); + mlog_memset(block, ptr - page, + srv_page_size - FIL_PAGE_DATA_END + size_t(page - ptr), + 0, mtr); /* Create the first rollback segment in the SYSTEM tablespace */ slot_no = trx_sys_rseg_find_free(block); @@ -212,7 +211,7 @@ trx_sys_t::create() m_initialised = true; mutex_create(LATCH_ID_TRX_SYS, &mutex); UT_LIST_INIT(trx_list, &trx_t::trx_list); - my_atomic_store32(&rseg_history_len, 0); + rseg_history_len= 0; rw_trx_hash.init(); } diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 2e761cd7a16..4814c5517a7 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -200,6 +200,9 @@ struct TrxFactory { lock_trx_lock_list_init(&trx->lock.trx_locks); + UT_LIST_INIT(trx->lock.evicted_tables, + &dict_table_t::table_LRU); + UT_LIST_INIT( trx->trx_savepoints, &trx_named_savept_t::trx_savepoints); @@ -224,6 +227,7 @@ struct TrxFactory { } ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0); UT_DELETE(trx->xid); ut_free(trx->detailed_error); @@ -349,14 +353,13 @@ trx_t *trx_create() { trx_t* trx = trx_pools->get(); - assert_trx_is_free(trx); + trx->assert_freed(); mem_heap_t* heap; ib_alloc_t* alloc; /* We just got trx from pool, it should be non locking */ ut_ad(trx->will_lock == 0); - ut_ad(trx->state == TRX_STATE_NOT_STARTED); ut_ad(!trx->rw_trx_hash_pins); DBUG_LOG("trx", "Create: " << trx); @@ -376,9 +379,10 @@ trx_t *trx_create() ut_ad(trx->lock.n_rec_locks == 0); ut_ad(trx->lock.table_cached == 0); ut_ad(trx->lock.rec_cached == 0); + ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0); #ifdef WITH_WSREP - trx->wsrep_event = NULL; + trx->wsrep_event= NULL; #endif /* WITH_WSREP */ trx_sys.register_trx(trx); @@ -425,11 +429,11 @@ void trx_free(trx_t*& trx) } trx->dict_operation = 
TRX_DICT_OP_NONE; - assert_trx_is_inactive(trx); + ut_ad(!trx->dict_operation_lock_mode); trx_sys.deregister_trx(trx); - assert_trx_is_free(trx); + trx->assert_freed(); trx_sys.rw_trx_hash.put_pins(trx); trx->mysql_thd = 0; @@ -455,6 +459,9 @@ void trx_free(trx_t*& trx) it is operating also on the freed transaction objects. */ MEM_UNDEFINED(&trx->mutex, sizeof trx->mutex); /* For innobase_kill_connection() */ +# ifdef WITH_WSREP + MEM_UNDEFINED(&trx->wsrep, sizeof trx->wsrep); +# endif MEM_UNDEFINED(&trx->state, sizeof trx->state); MEM_UNDEFINED(&trx->mysql_thd, sizeof trx->mysql_thd); #endif @@ -465,6 +472,9 @@ void trx_free(trx_t*& trx) trx_pools->mem_free(trx). */ MEM_MAKE_DEFINED(&trx->mutex, sizeof trx->mutex); /* For innobase_kill_connection() */ +# ifdef WITH_WSREP + MEM_MAKE_DEFINED(&trx->wsrep, sizeof trx->wsrep); +# endif MEM_MAKE_DEFINED(&trx->state, sizeof trx->state); MEM_MAKE_DEFINED(&trx->mysql_thd, sizeof trx->mysql_thd); #endif @@ -622,10 +632,10 @@ trx_resurrect_table_locks( if (dict_table_t* table = dict_table_open_on_id( *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) { if (!table->is_readable()) { - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); dict_table_close(table, TRUE, FALSE); - dict_table_remove_from_cache(table); - mutex_exit(&dict_sys->mutex); + dict_sys.remove(table); + mutex_exit(&dict_sys.mutex); continue; } @@ -832,14 +842,9 @@ static trx_rseg_t* trx_assign_rseg_low() /* Choose a rollback segment evenly distributed between 0 and innodb_undo_logs-1 in a round-robin fashion, skipping those - undo tablespaces that are scheduled for truncation. - - Because rseg_slot is not protected by atomics or any mutex, race - conditions are possible, meaning that multiple transactions - that start modifications concurrently will write their undo - log to the same rollback segment. */ - static ulong rseg_slot; - ulint slot = rseg_slot++ % srv_undo_logs; + undo tablespaces that are scheduled for truncation. 
*/ + static Atomic_counter<unsigned> rseg_slot; + ulong slot = ulong{rseg_slot++} % srv_undo_logs; trx_rseg_t* rseg; #ifdef UNIV_DEBUG @@ -931,11 +936,8 @@ trx_t::assign_temp_rseg() compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS)); /* Choose a temporary rollback segment between 0 and 127 - in a round-robin fashion. Because rseg_slot is not protected by - atomics or any mutex, race conditions are possible, meaning that - multiple transactions that start modifications concurrently - will write their undo log to the same rollback segment. */ - static ulong rseg_slot; + in a round-robin fashion. */ + static Atomic_counter<unsigned> rseg_slot; trx_rseg_t* rseg = trx_sys.temp_rsegs[ rseg_slot++ & (TRX_SYS_N_RSEGS - 1)]; ut_ad(!rseg->is_persistent()); @@ -1266,16 +1268,10 @@ trx_update_mod_tables_timestamp( trx_mod_tables_t::const_iterator end = trx->mod_tables.end(); #ifdef UNIV_DEBUG -# if MYSQL_VERSION_ID >= 100405 -# define dict_sys_mutex dict_sys.mutex -# else -# define dict_sys_mutex dict_sys->mutex -# endif - const bool preserve_tables = !innodb_evict_tables_on_commit_debug || trx->is_recovered /* avoid trouble with XA recovery */ # if 1 /* if dict_stats_exec_sql() were not playing dirty tricks */ - || mutex_own(&dict_sys_mutex) + || mutex_own(&dict_sys.mutex) # else /* this would be more proper way to do it */ || trx->dict_operation_lock_mode || trx->dict_operation # endif @@ -1305,316 +1301,301 @@ trx_update_mod_tables_timestamp( } /* recheck while holding the mutex that blocks table->acquire() */ - mutex_enter(&dict_sys_mutex); + mutex_enter(&dict_sys.mutex); if (!table->get_ref_count()) { -# if MYSQL_VERSION_ID >= 100405 dict_sys.remove(table, true); -# else - dict_table_remove_from_cache_low(table, true); -# endif } - mutex_exit(&dict_sys_mutex); + mutex_exit(&dict_sys.mutex); #endif } trx->mod_tables.clear(); } -/****************************************************************//** -Commits a transaction in memory. 
*/ -static -void -trx_commit_in_memory( -/*=================*/ - trx_t* trx, /*!< in/out: transaction */ - const mtr_t* mtr) /*!< in: mini-transaction of - trx_write_serialisation_history(), or NULL if - the transaction did not modify anything */ +/** Evict a table definition due to the rollback of ALTER TABLE. +@param[in] table_id table identifier */ +void trx_t::evict_table(table_id_t table_id) { - trx->must_flush_log_later = false; - trx->read_view.close(); + ut_ad(in_rollback); - if (trx_is_autocommit_non_locking(trx)) { - ut_ad(trx->id == 0); - ut_ad(trx->read_only); - ut_a(!trx->is_recovered); - ut_ad(trx->rsegs.m_redo.rseg == NULL); - - /* Note: We are asserting without holding the lock mutex. But - that is OK because this transaction is not waiting and cannot - be rolled back and no new locks can (or should) be added - because it is flagged as a non-locking read-only transaction. */ - - ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + dict_table_t* table = dict_table_open_on_id( + table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); + if (!table) { + return; + } - /* This state change is not protected by any mutex, therefore - there is an inherent race here around state transition during - printouts. We ignore this race for the sake of efficiency. - However, the trx_sys_t::mutex will protect the trx_t instance - and it cannot be removed from the trx_list and freed - without first acquiring the trx_sys_t::mutex. */ + if (!table->release()) { + /* This must be a DDL operation that is being rolled + back in an active connection. */ + ut_a(table->get_ref_count() == 1); + ut_ad(!is_recovered); + ut_ad(mysql_thd); + return; + } - ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + /* This table should only be locked by this transaction, if at all. 
*/ + ut_ad(UT_LIST_GET_LEN(table->locks) <= 1); + const bool locked = UT_LIST_GET_LEN(table->locks); + ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this); + dict_sys.remove(table, true, locked); + if (locked) { + UT_LIST_ADD_FIRST(lock.evicted_tables, table); + } +} - MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT); +/** Mark a transaction committed in the main memory data structures. */ +inline void trx_t::commit_in_memory(const mtr_t *mtr) +{ + must_flush_log_later= false; + read_view.close(); - DBUG_LOG("trx", "Autocommit in memory: " << trx); - trx->state = TRX_STATE_NOT_STARTED; - } else { + if (trx_is_autocommit_non_locking(this)) + { + ut_ad(id == 0); + ut_ad(read_only); + ut_a(!is_recovered); + ut_ad(!rsegs.m_redo.rseg); + + /* Note: We are asserting without holding the lock mutex. But + that is OK because this transaction is not waiting and cannot + be rolled back and no new locks can (or should) be added + because it is flagged as a non-locking read-only transaction. */ + ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0); + + /* This state change is not protected by any mutex, therefore + there is an inherent race here around state transition during + printouts. We ignore this race for the sake of efficiency. + However, the trx_sys_t::mutex will protect the trx_t instance + and it cannot be removed from the trx_list and freed + without first acquiring the trx_sys_t::mutex. 
*/ + ut_ad(trx_state_eq(this, TRX_STATE_ACTIVE)); + + MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT); + + DBUG_LOG("trx", "Autocommit in memory: " << this); + state= TRX_STATE_NOT_STARTED; + } + else + { #ifdef UNIV_DEBUG - if (!UT_LIST_GET_LEN(trx->lock.trx_locks)) { - for (lock_list::iterator it - = trx->lock.table_locks.begin(); - it != trx->lock.table_locks.end(); - it++) { - ut_ad(!*it); - } - } + if (!UT_LIST_GET_LEN(lock.trx_locks)) + for (auto l : lock.table_locks) + ut_ad(!l); #endif /* UNIV_DEBUG */ - trx->commit_state(); + commit_state(); - if (trx->id) { - trx_sys.deregister_rw(trx); - - /* Wait for any implicit-to-explicit lock - conversions to cease, so that there will be no - race condition in lock_release(). */ - while (UNIV_UNLIKELY(trx->is_referenced())) { - ut_delay(srv_spin_wait_delay); - } - - trx->release_locks(); - trx->id = 0; - } else { - ut_ad(trx->read_only || !trx->rsegs.m_redo.rseg); - trx->release_locks(); - } - - DEBUG_SYNC_C("after_trx_committed_in_memory"); - - if (trx->read_only || !trx->rsegs.m_redo.rseg) { - MONITOR_INC(MONITOR_TRX_RO_COMMIT); - } else { - trx_update_mod_tables_timestamp(trx); - MONITOR_INC(MONITOR_TRX_RW_COMMIT); - trx->is_recovered = false; - } - } - - ut_ad(!trx->rsegs.m_redo.undo); - - if (trx_rseg_t* rseg = trx->rsegs.m_redo.rseg) { - mutex_enter(&rseg->mutex); - ut_ad(rseg->trx_ref_count > 0); - --rseg->trx_ref_count; - mutex_exit(&rseg->mutex); - - if (trx_undo_t*& insert = trx->rsegs.m_redo.old_insert) { - ut_ad(insert->rseg == rseg); - trx_undo_commit_cleanup(insert, false); - insert = NULL; - } - } - - ut_ad(!trx->rsegs.m_redo.old_insert); + if (id) + { + trx_sys.deregister_rw(this); + + /* Wait for any implicit-to-explicit lock conversions to cease, + so that there will be no race condition in lock_release(). 
*/ + while (UNIV_UNLIKELY(is_referenced())) + ut_delay(srv_spin_wait_delay); + release_locks(); + id= 0; + } + else + { + ut_ad(read_only || !rsegs.m_redo.rseg); + release_locks(); + } - if (mtr != NULL) { - if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) { - ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg); - trx_undo_commit_cleanup(undo, true); - undo = NULL; - } + DEBUG_SYNC_C("after_trx_committed_in_memory"); - /* NOTE that we could possibly make a group commit more - efficient here: call os_thread_yield here to allow also other - trxs to come to commit! */ + if (read_only || !rsegs.m_redo.rseg) + { + MONITOR_INC(MONITOR_TRX_RO_COMMIT); + } + else + { + trx_update_mod_tables_timestamp(this); + MONITOR_INC(MONITOR_TRX_RW_COMMIT); + is_recovered= false; + } - /*-------------------------------------*/ + while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables)) + { + UT_LIST_REMOVE(lock.evicted_tables, table); + dict_mem_table_free(table); + } + } - /* Depending on the my.cnf options, we may now write the log - buffer to the log files, making the transaction durable if - the OS does not crash. We may also flush the log files to - disk, making the transaction durable also at an OS crash or a - power outage. + ut_ad(!rsegs.m_redo.undo); + ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0); - The idea in InnoDB's group commit is that a group of - transactions gather behind a trx doing a physical disk write - to log files, and when that physical write has been completed, - one of those transactions does a write which commits the whole - group. Note that this group commit will only bring benefit if - there are > 2 users in the database. Then at least 2 users can - gather behind one doing the physical log write to disk. 
+ if (trx_rseg_t *rseg= rsegs.m_redo.rseg) + { + mutex_enter(&rseg->mutex); + ut_ad(rseg->trx_ref_count > 0); + --rseg->trx_ref_count; + mutex_exit(&rseg->mutex); - If we are calling trx_commit() under prepare_commit_mutex, we - will delay possible log write and flush to a separate function - trx_commit_complete_for_mysql(), which is only called when the - thread has released the mutex. This is to make the - group commit algorithm to work. Otherwise, the prepare_commit - mutex would serialize all commits and prevent a group of - transactions from gathering. */ - - lsn_t lsn = mtr->commit_lsn(); - - if (lsn == 0) { - /* Nothing to be done. */ - } else if (trx->flush_log_later) { - /* Do nothing yet */ - trx->must_flush_log_later = true; - } else if (srv_flush_log_at_trx_commit == 0) { - /* Do nothing */ - } else { - trx_flush_log_if_needed(lsn, trx); - } + if (trx_undo_t *&insert= rsegs.m_redo.old_insert) + { + ut_ad(insert->rseg == rseg); + trx_undo_commit_cleanup(insert, false); + insert= nullptr; + } + } - trx->commit_lsn = lsn; + ut_ad(!rsegs.m_redo.old_insert); - /* Tell server some activity has happened, since the trx - does changes something. Background utility threads like - master thread, purge thread or page_cleaner thread might - have some work to do. */ - srv_active_wake_master_thread(); - } + if (mtr) + { + if (trx_undo_t *&undo= rsegs.m_noredo.undo) + { + ut_ad(undo->rseg == rsegs.m_noredo.rseg); + trx_undo_commit_cleanup(undo, true); + undo= nullptr; + } - ut_ad(!trx->rsegs.m_noredo.undo); + /* NOTE that we could possibly make a group commit more efficient + here: call os_thread_yield here to allow also other trxs to come + to commit! */ + + /*-------------------------------------*/ + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if the OS + does not crash. We may also flush the log files to disk, making + the transaction durable also at an OS crash or a power outage. 
+ + The idea in InnoDB's group commit is that a group of transactions + gather behind a trx doing a physical disk write to log files, and + when that physical write has been completed, one of those + transactions does a write which commits the whole group. Note that + this group commit will only bring benefit if there are > 2 users + in the database. Then at least 2 users can gather behind one doing + the physical log write to disk. + + If we are calling trx_t::commit() under prepare_commit_mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the mutex. This is to make the group commit + algorithm to work. Otherwise, the prepare_commit mutex would + serialize all commits and prevent a group of transactions from + gathering. */ + + commit_lsn= mtr->commit_lsn(); + if (!commit_lsn) + /* Nothing to be done. */; + else if (flush_log_later) + /* Do nothing yet */ + must_flush_log_later= true; + else if (srv_flush_log_at_trx_commit) + trx_flush_log_if_needed(commit_lsn, this); + + /* Tell server some activity has happened, since the trx does + changes something. Background utility threads like master thread, + purge thread or page_cleaner thread might have some work to do. */ + srv_active_wake_master_thread(); + } - /* Free all savepoints, starting from the first. */ - trx_named_savept_t* savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + ut_ad(!rsegs.m_noredo.undo); - trx_roll_savepoints_free(trx, savep); + /* Free all savepoints, starting from the first. 
*/ + trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints); - if (trx->fts_trx != NULL) { - trx_finalize_for_fts(trx, trx->undo_no != 0); - } + trx_roll_savepoints_free(this, savep); - trx_mutex_enter(trx); - trx->dict_operation = TRX_DICT_OP_NONE; - trx->lock.was_chosen_as_deadlock_victim = false; + if (fts_trx) + trx_finalize_for_fts(this, undo_no != 0); - DBUG_LOG("trx", "Commit in memory: " << trx); - trx->state = TRX_STATE_NOT_STARTED; #ifdef WITH_WSREP - trx->wsrep = false; -#endif - - assert_trx_is_free(trx); + /* Serialization history has been written and the transaction is + committed in memory, which makes this commit ordered. Release commit + order critical section. */ + if (wsrep) + { + wsrep= false; + wsrep_commit_ordered(mysql_thd); + } + lock.was_chosen_as_wsrep_victim= false; +#endif /* WITH_WSREP */ + trx_mutex_enter(this); + dict_operation= TRX_DICT_OP_NONE; - trx_init(trx); + DBUG_LOG("trx", "Commit in memory: " << this); + state= TRX_STATE_NOT_STARTED; - trx_mutex_exit(trx); + assert_freed(); + trx_init(this); + trx_mutex_exit(this); - ut_a(trx->error_state == DB_SUCCESS); - srv_wake_purge_thread_if_not_active(); + ut_a(error_state == DB_SUCCESS); + if (!srv_read_only_mode) + srv_wake_purge_thread_if_not_active(); } -/** Commit a transaction and a mini-transaction. -@param[in,out] trx transaction -@param[in,out] mtr mini-transaction (NULL if no modifications) */ -void trx_commit_low(trx_t* trx, mtr_t* mtr) +/** Commit the transaction in a mini-transaction. +@param mtr mini-transaction (if there are any persistent modifications) */ +void trx_t::commit_low(mtr_t *mtr) { - assert_trx_nonlocking_or_in_list(trx); - ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)); - ut_ad(!mtr || mtr->is_active()); - ut_d(bool aborted = trx->in_rollback - && trx->error_state == DB_DEADLOCK); - ut_ad(!mtr == (aborted || !trx->has_logged_or_recovered())); - ut_ad(!mtr || !aborted); - - /* undo_no is non-zero if we're doing the final commit. 
*/ - if (trx->fts_trx != NULL && trx->undo_no != 0) { - dberr_t error; - - ut_a(!trx_is_autocommit_non_locking(trx)); - - error = fts_commit(trx); - - /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY - instead of dying. This is a possible scenario if there - is a crash between insert to DELETED table committing - and transaction committing. The fix would be able to - return error from this function */ - if (error != DB_SUCCESS && error != DB_DUPLICATE_KEY) { - /* FTS-FIXME: once we can return values from this - function, we should do so and signal an error - instead of just dying. */ - - ut_error; - } - } + assert_trx_nonlocking_or_in_list(this); + ut_ad(!trx_state_eq(this, TRX_STATE_COMMITTED_IN_MEMORY)); + ut_ad(!mtr || mtr->is_active()); + ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK); + ut_ad(!mtr == (aborted || !has_logged_or_recovered())); + ut_ad(!mtr || !aborted); + + /* undo_no is non-zero if we're doing the final commit. */ + if (fts_trx && undo_no) + { + ut_a(!trx_is_autocommit_non_locking(this)); + dberr_t error= fts_commit(this); + /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY instead of + dying. This is a possible scenario if there is a crash between + insert to DELETED table committing and transaction committing. The + fix would be able to return error from this function */ + ut_a(error == DB_SUCCESS || error == DB_DUPLICATE_KEY); + } #ifndef DBUG_OFF - const bool debug_sync = trx->mysql_thd && trx->has_logged_persistent(); + const bool debug_sync= mysql_thd && has_logged_persistent(); #endif - if (mtr != NULL) { - trx_write_serialisation_history(trx, mtr); - - /* The following call commits the mini-transaction, making the - whole transaction committed in the file-based world, at this - log sequence number. The transaction becomes 'durable' when - we write the log to disk, but in the logical sense the commit - in the file-based data structures (undo logs etc.) happens - here. 
- - NOTE that transaction numbers, which are assigned only to - transactions with an update undo log, do not necessarily come - in exactly the same order as commit lsn's, if the transactions - have different rollback segments. To get exactly the same - order we should hold the kernel mutex up to this point, - adding to the contention of the kernel mutex. However, if - a transaction T2 is able to see modifications made by - a transaction T1, T2 will always get a bigger transaction - number and a bigger commit lsn than T1. */ - - /*--------------*/ - mtr_commit(mtr); - - DBUG_EXECUTE_IF("ib_crash_during_trx_commit_in_mem", - if (trx->has_logged()) { - log_write_up_to(mtr->commit_lsn(), - true); - DBUG_SUICIDE(); - }); - /*--------------*/ - } + if (mtr) + { + trx_write_serialisation_history(this, mtr); + + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this log + sequence number. The transaction becomes 'durable' when we write + the log to disk, but in the logical sense the commit in the + file-based data structures (undo logs etc.) happens here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come in + exactly the same order as commit lsn's, if the transactions have + different rollback segments. To get exactly the same order we + should hold the kernel mutex up to this point, adding to the + contention of the kernel mutex. However, if a transaction T2 is + able to see modifications made by a transaction T1, T2 will always + get a bigger transaction number and a bigger commit lsn than T1. */ + + mtr->commit(); + } #ifndef DBUG_OFF - /* In case of this function is called from a stack executing - THD::release_resources -> ... - innobase_connection_close() -> - trx_rollback_for_mysql... -> . - mysql's thd does not seem to have - thd->debug_sync_control defined any longer. 
However the stack - is possible only with a prepared trx not updating any data. - */ - if (debug_sync) { - DEBUG_SYNC_C("before_trx_state_committed_in_memory"); - } + if (debug_sync) + DEBUG_SYNC_C("before_trx_state_committed_in_memory"); #endif - trx_commit_in_memory(trx, mtr); + commit_in_memory(mtr); } -/****************************************************************//** -Commits a transaction. */ -void -trx_commit( -/*=======*/ - trx_t* trx) /*!< in/out: transaction */ -{ - mtr_t* mtr; - mtr_t local_mtr; - - DBUG_EXECUTE_IF("ib_trx_commit_crash_before_trx_commit_start", - DBUG_SUICIDE();); - - if (trx->has_logged_or_recovered()) { - mtr = &local_mtr; - mtr->start(); - } else { - mtr = NULL; - } +void trx_t::commit() +{ + mtr_t *mtr= nullptr; + mtr_t local_mtr; - trx_commit_low(trx, mtr); + if (has_logged_or_recovered()) + { + mtr= &local_mtr; + local_mtr.start(); + } + commit_low(mtr); } /****************************************************************//** @@ -1707,11 +1688,8 @@ trx_commit_step( trx_commit_or_rollback_prepare(trx); trx->lock.que_state = TRX_QUE_COMMITTING; - - trx_commit(trx); - + trx->commit(); ut_ad(trx->lock.wait_thr == NULL); - trx->lock.que_state = TRX_QUE_RUNNING; thr = NULL; @@ -1749,9 +1727,7 @@ trx_commit_for_mysql( case TRX_STATE_PREPARED: case TRX_STATE_PREPARED_RECOVERED: trx->op_info = "committing"; - - trx_commit(trx); - + trx->commit(); MONITOR_DEC(MONITOR_TRX_ACTIVE); trx->op_info = ""; return(DB_SUCCESS); @@ -2235,7 +2211,7 @@ static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element, transaction needs a valid trx->xid for invoking trx_sys_update_wsrep_checkpoint(). */ if (!wsrep_is_wsrep_xid(trx->xid)) -#endif +#endif /* WITH_WSREP */ /* Invalidate the XID, so that subsequent calls will not find it. 
*/ trx->xid->null(); arg->trx= trx; diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index 965e2c911f2..54e3f4de467 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -187,7 +187,7 @@ trx_undo_get_prev_rec_from_prev_page( space = page_get_space_id(undo_page); buf_block_t* block = buf_page_get( - page_id_t(space, prev_page_no), univ_page_size, + page_id_t(space, prev_page_no), 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); @@ -388,7 +388,7 @@ trx_undo_parse_page_init(const byte* ptr, const byte* end_ptr, page_t* page) const ulint type = *ptr++; if (type > TRX_UNDO_UPDATE) { - recv_sys->found_corrupt_log = true; + recv_sys.found_corrupt_log = true; } else if (page) { /* Starting with MDEV-12288 in MariaDB 10.3.1, we use type=0 for the combined insert/update undo log @@ -662,6 +662,10 @@ trx_undo_write_xid( const XID* xid, /*!< in: X/Open XA Transaction Identification */ mtr_t* mtr) /*!< in: mtr */ { + DBUG_ASSERT(xid->gtrid_length >= 0); + DBUG_ASSERT(xid->bqual_length >= 0); + DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE); + mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT, static_cast<ulint>(xid->formatID), MLOG_4BYTES, mtr); @@ -673,10 +677,15 @@ trx_undo_write_xid( mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN, static_cast<ulint>(xid->bqual_length), MLOG_4BYTES, mtr); - + const ulint xid_length = static_cast<ulint>(xid->gtrid_length + + xid->bqual_length); mlog_write_string(log_hdr + TRX_UNDO_XA_XID, reinterpret_cast<const byte*>(xid->data), - XIDDATASIZE, mtr); + xid_length, mtr); + if (UNIV_LIKELY(xid_length < XIDDATASIZE)) { + mlog_memset(log_hdr + TRX_UNDO_XA_XID + xid_length, + XIDDATASIZE - xid_length, 0, mtr); + } } /********************************************************************//** @@ -844,7 +853,7 @@ trx_undo_free_page( TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + undo_page, mtr); fseg_free_page(TRX_UNDO_SEG_HDR + 
TRX_UNDO_FSEG_HEADER + header_page, - rseg->space, page_no, mtr); + rseg->space, page_no, true, mtr); const fil_addr_t last_addr = flst_get_last( TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_page, mtr); @@ -883,54 +892,55 @@ trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr) @param[in,out] undo undo log @param[in] limit all undo logs after this limit will be discarded @param[in] is_temp whether this is temporary undo log */ -void -trx_undo_truncate_end(trx_undo_t* undo, undo_no_t limit, bool is_temp) +void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp) { - ut_ad(mutex_own(&undo->rseg->mutex)); - ut_ad(is_temp == !undo->rseg->is_persistent()); + mtr_t mtr; + ut_ad(is_temp == !undo.rseg->is_persistent()); for (;;) { - mtr_t mtr; mtr.start(); if (is_temp) { mtr.set_log_mode(MTR_LOG_NO_REDO); } trx_undo_rec_t* trunc_here = NULL; + mutex_enter(&undo.rseg->mutex); page_t* undo_page = trx_undo_page_get( - page_id_t(undo->rseg->space->id, undo->last_page_no), + page_id_t(undo.rseg->space->id, undo.last_page_no), &mtr); trx_undo_rec_t* rec = trx_undo_page_get_last_rec( - undo_page, undo->hdr_page_no, undo->hdr_offset); + undo_page, undo.hdr_page_no, undo.hdr_offset); while (rec) { - if (trx_undo_rec_get_undo_no(rec) >= limit) { - /* Truncate at least this record off, maybe - more */ - trunc_here = rec; - } else { - goto function_exit; + if (trx_undo_rec_get_undo_no(rec) < limit) { + goto func_exit; } + /* Truncate at least this record off, maybe more */ + trunc_here = rec; rec = trx_undo_page_get_prev_rec(rec, - undo->hdr_page_no, - undo->hdr_offset); + undo.hdr_page_no, + undo.hdr_offset); } - if (undo->last_page_no == undo->hdr_page_no) { -function_exit: - if (trunc_here) { - mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_FREE, - ulint(trunc_here - undo_page), - MLOG_2BYTES, &mtr); - } - + if (undo.last_page_no != undo.hdr_page_no) { + trx_undo_free_last_page(&undo, &mtr); + mutex_exit(&undo.rseg->mutex); mtr.commit(); - 
return; + continue; + } + +func_exit: + mutex_exit(&undo.rseg->mutex); + + if (trunc_here) { + mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE, + ulint(trunc_here - undo_page), + MLOG_2BYTES, &mtr); } - trx_undo_free_last_page(undo, &mtr); mtr.commit(); + return; } } @@ -1334,7 +1344,7 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo, buf_block_t* block = buf_page_get(page_id_t(undo->rseg->space->id, undo->hdr_page_no), - univ_page_size, RW_X_LATCH, mtr); + 0, RW_X_LATCH, mtr); if (!block) { return NULL; } @@ -1402,7 +1412,7 @@ trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr) if (undo) { return buf_page_get_gen( page_id_t(undo->rseg->space->id, undo->last_page_no), - univ_page_size, RW_X_LATCH, + 0, RW_X_LATCH, buf_pool_is_obsolete(undo->withdraw_clock) ? NULL : undo->guess_block, BUF_GET, __FILE__, __LINE__, mtr, err); @@ -1458,7 +1468,7 @@ trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, if (*undo) { return buf_page_get_gen( page_id_t(rseg->space->id, (*undo)->last_page_no), - univ_page_size, RW_X_LATCH, + 0, RW_X_LATCH, buf_pool_is_obsolete((*undo)->withdraw_clock) ? NULL : (*undo)->guess_block, BUF_GET, __FILE__, __LINE__, mtr, err); diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc index 2c8aa3afe4d..bb2c530a174 100644 --- a/storage/innobase/ut/ut0crc32.cc +++ b/storage/innobase/ut/ut0crc32.cc @@ -474,34 +474,6 @@ ut_crc32_64_sw( *len -= 8; } -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Calculate CRC32 over 64-bit byte string using a software implementation. -The byte string is converted to a 64-bit integer using big endian byte order. 
-@param[in,out] crc crc32 checksum so far when this function is called, -when the function ends it will contain the new checksum -@param[in,out] data data to be checksummed, the pointer will be advanced -with 8 bytes -@param[in,out] len remaining bytes, it will be decremented with 8 */ -inline -void -ut_crc32_64_legacy_big_endian_sw( - uint32_t* crc, - const byte** data, - ulint* len) -{ - uint64_t data_int = *reinterpret_cast<const uint64_t*>(*data); - -#ifndef WORDS_BIGENDIAN - data_int = ut_crc32_swap_byteorder(data_int); -#endif /* WORDS_BIGENDIAN */ - - *crc = ut_crc32_64_low_sw(*crc, data_int); - - *data += 8; - *len -= 8; -} -#endif /* INNODB_BUG_ENDIAN_CRC32 */ - /** Calculates CRC32 in software, without using CPU instructions. @param[in] buf data over which to calculate CRC32 @param[in] len data length @@ -552,57 +524,6 @@ ut_crc32_sw( return(~crc); } -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Calculates CRC32 in software, without using CPU instructions. -This function uses big endian byte ordering when converting byte sequence to -integers. -@param[in] buf data over which to calculate CRC32 -@param[in] len data length -@return CRC-32C (polynomial 0x11EDC6F41) */ -uint32_t ut_crc32_legacy_big_endian(const byte* buf, ulint len) -{ - uint32_t crc = 0xFFFFFFFFU; - - ut_a(ut_crc32_slice8_table_initialized); - - /* Calculate byte-by-byte up to an 8-byte aligned address. After - this consume the input 8-bytes at a time. */ - while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) { - ut_crc32_8_sw(&crc, &buf, &len); - } - - while (len >= 128) { - /* This call is repeated 16 times. 16 * 8 = 128. 
*/ - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - } - - while (len >= 8) { - ut_crc32_64_legacy_big_endian_sw(&crc, &buf, &len); - } - - while (len > 0) { - ut_crc32_8_sw(&crc, &buf, &len); - } - - return(~crc); -} -#endif /* INNODB_BUG_ENDIAN_CRC32 */ - /********************************************************************//** Initializes the data structures used by ut_crc32*(). Does not do any allocations, would not hurt if called twice, but would be pointless. 
*/ @@ -625,9 +546,6 @@ ut_crc32_init() if (features_ecx & 1 << 20) { ut_crc32 = ut_crc32_hw; -#ifdef INNODB_BUG_ENDIAN_CRC32 - ut_crc32_legacy_big_endian = ut_crc32_legacy_big_endian_hw; -#endif /* INNODB_BUG_ENDIAN_CRC32 */ ut_crc32_implementation = "Using SSE2 crc32 instructions"; } #endif diff --git a/storage/innobase/ut/ut0new.cc b/storage/innobase/ut/ut0new.cc index 2a372ca9f63..f47a5112fd7 100644 --- a/storage/innobase/ut/ut0new.cc +++ b/storage/innobase/ut/ut0new.cc @@ -147,7 +147,6 @@ ut_new_boot() "row0merge", "row0mysql", "row0sel", - "row0trunc", "srv0conc", "srv0srv", "srv0start", diff --git a/storage/innobase/ut/ut0rnd.cc b/storage/innobase/ut/ut0rnd.cc index 8265121ef2e..a2e569514cb 100644 --- a/storage/innobase/ut/ut0rnd.cc +++ b/storage/innobase/ut/ut0rnd.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,7 +27,7 @@ Created 5/11/1994 Heikki Tuuri #include "ut0rnd.h" /** Seed value of ut_rnd_gen() */ -int32 ut_rnd_current; +std::atomic<uint32_t> ut_rnd_current; /** These random numbers are used in ut_find_prime */ /*@{*/ diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc index 6d2b84625f7..fc2fbb7f240 100644 --- a/storage/innobase/ut/ut0ut.cc +++ b/storage/innobase/ut/ut0ut.cc @@ -415,8 +415,6 @@ ut_strerr( return("Tablespace already exists"); case DB_TABLESPACE_DELETED: return("Tablespace deleted or being deleted"); - case DB_TABLESPACE_TRUNCATED: - return("Tablespace was truncated"); case DB_TABLESPACE_NOT_FOUND: return("Tablespace not found"); case DB_LOCK_TABLE_FULL: |