diff options
49 files changed, 3660 insertions, 1820 deletions
diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 03533896d0b..8bce6567233 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -590,26 +590,25 @@ std::string filename_to_spacename(const byte *filename, size_t len) /** Report an operation to create, delete, or rename a file during backup. @param[in] space_id tablespace identifier -@param[in] flags tablespace flags (NULL if not create) +@param[in] create whether the file is being created @param[in] name file name (not NUL-terminated) @param[in] len length of name, in bytes @param[in] new_name new file name (NULL if not rename) @param[in] new_len length of new_name, in bytes (0 if NULL) */ -static void backup_file_op(ulint space_id, const byte* flags, +static void backup_file_op(ulint space_id, bool create, const byte* name, ulint len, const byte* new_name, ulint new_len) { - ut_ad(!flags || !new_name); + ut_ad(!create || !new_name); ut_ad(name); ut_ad(len); ut_ad(!new_name == !new_len); pthread_mutex_lock(&backup_mutex); - if (flags) { + if (create) { ddl_tracker.id_to_name[space_id] = filename_to_spacename(name, len); - msg("DDL tracking : create %zu \"%.*s\": %x", - space_id, int(len), name, mach_read_from_4(flags)); + msg("DDL tracking : create %zu \"%.*s\"", space_id, int(len), name); } else if (new_name) { ddl_tracker.id_to_name[space_id] = filename_to_spacename(new_name, new_len); @@ -632,14 +631,14 @@ static void backup_file_op(ulint space_id, const byte* flags, We will abort backup in this case. 
*/ -static void backup_file_op_fail(ulint space_id, const byte* flags, +static void backup_file_op_fail(ulint space_id, bool create, const byte* name, ulint len, const byte* new_name, ulint new_len) { bool fail; - if (flags) { - msg("DDL tracking : create %zu \"%.*s\": %x", - space_id, int(len), name, mach_read_from_4(flags)); + if (create) { + msg("DDL tracking : create %zu \"%.*s\"", + space_id, int(len), name); std::string spacename = filename_to_spacename(name, len); fail = !check_if_skip_table(spacename.c_str()); } diff --git a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result index 0d32ce422a9..55c1bd718ef 100644 --- a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result +++ b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result @@ -136,7 +136,7 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 1 -FOUND 1 /InnoDB: .* started; log sequence number 121397[09]/ in mysqld.1.err +FOUND 1 /InnoDB: .* started; log sequence number 12139[78]\d; transaction id 0/ in mysqld.1.err # Empty 10.2 redo log # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES diff --git a/mysql-test/suite/innodb/r/log_alter_table.result b/mysql-test/suite/innodb/r/log_alter_table.result deleted file mode 100644 index ae021b82e37..00000000000 --- a/mysql-test/suite/innodb/r/log_alter_table.result +++ /dev/null @@ -1,21 +0,0 @@ -# restart -# -# Bug#21801423 INNODB REDO LOG DOES NOT INDICATE WHEN -# FILES ARE CREATED -# -# Bug#21796691 INNODB REDO LOG DOES NOT INDICATE WHEN -# REDO LOGGING IS SKIPPED -# -CREATE TABLE t1 (a INT NOT NULL, b INT UNIQUE) ENGINE=InnoDB; -INSERT INTO t1 VALUES (1,2); -ALTER TABLE t1 ADD PRIMARY KEY(a), LOCK=SHARED, ALGORITHM=INPLACE; -ALTER TABLE 
t1 DROP INDEX b, ADD INDEX (b), LOCK=SHARED; -# Kill the server -# restart: --debug=d,ib_log -FOUND 2 /scan \d+: multi-log rec MLOG_FILE_CREATE2 len \d+ page \d+:0/ in mysqld.1.err -NOT FOUND /scan \d+: log rec MLOG_INDEX_LOAD/ in mysqld.1.err -CHECK TABLE t1; -Table Op Msg_type Msg_text -test.t1 check status OK -# restart -DROP TABLE t1; diff --git a/mysql-test/suite/innodb/r/log_corruption.result b/mysql-test/suite/innodb/r/log_corruption.result index 67a03d53e40..ab33ea1b152 100644 --- a/mysql-test/suite/innodb/r/log_corruption.result +++ b/mysql-test/suite/innodb/r/log_corruption.result @@ -136,7 +136,7 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 1 -FOUND 1 /InnoDB: .* started; log sequence number 121397[09]/ in mysqld.1.err +FOUND 1 /InnoDB: .* started; log sequence number 12139[78]\d; transaction id 0/ in mysqld.1.err # Empty 10.2 redo log # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES diff --git a/mysql-test/suite/innodb/r/log_file_name_debug.result b/mysql-test/suite/innodb/r/log_file_name_debug.result index 1fce4ecb674..4deef6f2785 100644 --- a/mysql-test/suite/innodb/r/log_file_name_debug.result +++ b/mysql-test/suite/innodb/r/log_file_name_debug.result @@ -12,7 +12,7 @@ FOUND 1 /InnoDB: Tablespace 4294967280 was not found at .*, but there were no mo # restart: --debug=d,innodb_log_abort_3,ib_log --innodb-log-files-in-group=2 --innodb-log-file-size=4M SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 1 /srv_prepare_to_delete_redo_log_files: ib_log: MLOG_CHECKPOINT.* written/ in mysqld.1.err +FOUND 1 /srv_prepare_to_delete_redo_log_files: ib_log: FILE_CHECKPOINT.* written/ in mysqld.1.err # restart # restart DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/log_alter_table.opt 
b/mysql-test/suite/innodb/t/log_alter_table.opt deleted file mode 100644 index ef236fcec40..00000000000 --- a/mysql-test/suite/innodb/t/log_alter_table.opt +++ /dev/null @@ -1 +0,0 @@ ---innodb-log-optimize-ddl diff --git a/mysql-test/suite/innodb/t/log_alter_table.test b/mysql-test/suite/innodb/t/log_alter_table.test deleted file mode 100644 index b0669c64f77..00000000000 --- a/mysql-test/suite/innodb/t/log_alter_table.test +++ /dev/null @@ -1,46 +0,0 @@ ---source include/have_innodb.inc ---source include/have_debug.inc - -# Embedded server does not support crashing ---source include/not_embedded.inc - -# start afresh ---source include/restart_mysqld.inc - ---echo # ---echo # Bug#21801423 INNODB REDO LOG DOES NOT INDICATE WHEN ---echo # FILES ARE CREATED ---echo # ---echo # Bug#21796691 INNODB REDO LOG DOES NOT INDICATE WHEN ---echo # REDO LOGGING IS SKIPPED ---echo # ---source include/no_checkpoint_start.inc -CREATE TABLE t1 (a INT NOT NULL, b INT UNIQUE) ENGINE=InnoDB; -# MLOG_INDEX_LOAD will not be emitted for empty tables. Insert a row. -INSERT INTO t1 VALUES (1,2); -# We should get two MLOG_INDEX_LOAD for this. -ALTER TABLE t1 ADD PRIMARY KEY(a), LOCK=SHARED, ALGORITHM=INPLACE; -# And one MLOG_INDEX_LOAD for this. -ALTER TABLE t1 DROP INDEX b, ADD INDEX (b), LOCK=SHARED; - ---let CLEANUP_IF_CHECKPOINT=DROP TABLE t1; ---source include/no_checkpoint_end.inc - ---let $restart_parameters= --debug=d,ib_log ---source include/start_mysqld.inc - -let SEARCH_FILE = $MYSQLTEST_VARDIR/log/mysqld.1.err; -# ensure that we have exactly 2 records there. -let SEARCH_PATTERN=scan \d+: multi-log rec MLOG_FILE_CREATE2 len \d+ page \d+:0; ---source include/search_pattern_in_file.inc -# ensure that we have 0 records there. -let SEARCH_PATTERN=scan \d+: log rec MLOG_INDEX_LOAD; ---source include/search_pattern_in_file.inc - -CHECK TABLE t1; - -# Remove the --debug=d,ib_log setting. 
---let $restart_parameters= ---source include/restart_mysqld.inc - -DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/log_corruption.test b/mysql-test/suite/innodb/t/log_corruption.test index 46318fb37d2..6c2ef5db0bb 100644 --- a/mysql-test/suite/innodb/t/log_corruption.test +++ b/mysql-test/suite/innodb/t/log_corruption.test @@ -424,8 +424,8 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED'); # In encryption.innodb_encrypt_log_corruption, we would convert the # log to encrypted format. Writing an extra log checkpoint before the # redo log conversion would advance the LSN by the size of a -# MLOG_CHECKPOINT record (9 bytes). ---let SEARCH_PATTERN= InnoDB: .* started; log sequence number 121397[09] +# FILE_CHECKPOINT record (12 bytes). +--let SEARCH_PATTERN= InnoDB: .* started; log sequence number 12139[78]\d; transaction id 0 --source include/search_pattern_in_file.inc --echo # Empty 10.2 redo log diff --git a/mysql-test/suite/innodb/t/log_file_name_debug.test b/mysql-test/suite/innodb/t/log_file_name_debug.test index d85fbf08194..fac1a72fe45 100644 --- a/mysql-test/suite/innodb/t/log_file_name_debug.test +++ b/mysql-test/suite/innodb/t/log_file_name_debug.test @@ -39,7 +39,7 @@ SELECT * FROM t1; --source include/restart_mysqld.inc --error ER_UNKNOWN_STORAGE_ENGINE SELECT * FROM t1; ---let SEARCH_PATTERN= srv_prepare_to_delete_redo_log_files: ib_log: MLOG_CHECKPOINT.* written +--let SEARCH_PATTERN= srv_prepare_to_delete_redo_log_files: ib_log: FILE_CHECKPOINT.* written --source include/search_pattern_in_file.inc --let $restart_parameters= diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index e4b2b05734b..0c7e3e38d78 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -438,32 +438,33 @@ btr_page_create( ulint level, /*!< in: the B-tree level of the page */ mtr_t* mtr) /*!< in: mtr */ { - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - byte *index_id= &block->frame[PAGE_HEADER + 
PAGE_INDEX_ID]; + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + + block->frame); - if (UNIV_LIKELY_NULL(page_zip)) { - page_create_zip(block, index, level, 0, mtr); - mach_write_to_8(index_id, index->id); - page_zip_write_header(block, index_id, 8, mtr); - } else { - page_create(block, mtr, dict_table_is_comp(index->table)); - if (index->is_spatial()) { - static_assert(((FIL_PAGE_INDEX & 0xff00) - | byte(FIL_PAGE_RTREE)) - == FIL_PAGE_RTREE, "compatibility"); - mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, - byte(FIL_PAGE_RTREE)); - if (mach_read_from_8(block->frame - + FIL_RTREE_SPLIT_SEQ_NUM)) { - mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, - 8, 0); - } - } - /* Set the level of the new index page */ - mtr->write<2,mtr_t::OPT>(*block, PAGE_HEADER + PAGE_LEVEL - + block->frame, level); - mtr->write<8,mtr_t::OPT>(*block, index_id, index->id); - } + if (UNIV_LIKELY_NULL(page_zip)) + { + mach_write_to_8(index_id, index->id); + page_create_zip(block, index, level, 0, mtr); + } + else + { + page_create(block, mtr, dict_table_is_comp(index->table)); + if (index->is_spatial()) + { + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->frame + FIL_RTREE_SPLIT_SEQ_NUM)) + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::OPT>(*block, + my_assume_aligned<2>(PAGE_HEADER + PAGE_LEVEL + + block->frame), level); + mtr->write<8,mtr_t::OPT>(*block, index_id, index->id); + } } /**************************************************************//** @@ -984,14 +985,12 @@ static void btr_free_root(buf_block_t *block, mtr_t *mtr, bool invalidate) #endif /* UNIV_BTR_DEBUG */ if (invalidate) { - byte *page_index_id= PAGE_HEADER + PAGE_INDEX_ID + block->frame; - if 
(UNIV_LIKELY_NULL(block->page.zip.data)) - { - mach_write_to_8(page_index_id, BTR_FREED_INDEX_ID); - page_zip_write_header(block, page_index_id, 8, mtr); - } - else - mtr->write<8,mtr_t::OPT>(*block, page_index_id, BTR_FREED_INDEX_ID); + constexpr uint16_t field= PAGE_HEADER + PAGE_INDEX_ID; + + byte *page_index_id= my_assume_aligned<2>(field + block->frame); + if (mtr->write<8,mtr_t::OPT>(*block, page_index_id, BTR_FREED_INDEX_ID) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<2>(&block->page.zip.data[field], page_index_id, 8); } /* Free the entire segment in small steps. */ @@ -1120,16 +1119,17 @@ btr_create( buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); } - byte* page_index_id = PAGE_HEADER + PAGE_INDEX_ID + block->frame; + ut_ad(!page_has_siblings(block->frame)); + + constexpr uint16_t field = PAGE_HEADER + PAGE_INDEX_ID; + + byte* page_index_id = my_assume_aligned<2>(field + block->frame); /* Create a new index page on the allocated segment page */ if (UNIV_LIKELY_NULL(block->page.zip.data)) { - page_create_zip(block, index, 0, 0, mtr); mach_write_to_8(page_index_id, index_id); - page_zip_write_header(block, page_index_id, 8, mtr); - static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); - memset_aligned<8>(FIL_PAGE_PREV + block->page.zip.data, - 0xff, 8); + ut_ad(!page_has_siblings(block->page.zip.data)); + page_create_zip(block, index, 0, 0, mtr); } else { page_create(block, mtr, index->table->not_redundant()); if (index->is_spatial()) { @@ -1150,11 +1150,6 @@ btr_create( mtr->write<8,mtr_t::OPT>(*block, page_index_id, index_id); } - /* Set the next node and previous node fields */ - compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); - compile_time_assert(FIL_NULL == 0xffffffff); - mtr->memset(block, FIL_PAGE_PREV, 8, 0xff); - /* We reset the free bits for the page in a separate mini-transaction to allow creation of several trees in the same mtr, otherwise the latch on a bitmap page would prevent @@ -1781,6 +1776,49 @@ void 
btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr) } } +/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE. +@param[in] index clustered index with instant ALTER TABLE +@param[in] all whether to reset FIL_PAGE_TYPE as well +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) +{ + ut_ad(!index.table->is_temporary()); + ut_ad(index.is_primary()); + if (buf_block_t *root = btr_root_block_get(&index, RW_SX_LATCH, mtr)) + { + byte *page_type= root->frame + FIL_PAGE_TYPE; + if (all) + { + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || + mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mtr->write<2,mtr_t::OPT>(*root, page_type, FIL_PAGE_INDEX); + byte *instant= PAGE_INSTANT + PAGE_HEADER + root->frame; + mtr->write<2,mtr_t::OPT>(*root, instant, + page_ptr_get_direction(instant + 1)); + } + else + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); + static const byte supremuminfimum[8 + 8] = "supremuminfimum"; + uint16_t infimum, supremum; + if (page_is_comp(root->frame)) + { + infimum= PAGE_NEW_INFIMUM; + supremum= PAGE_NEW_SUPREMUM; + } + else + { + infimum= PAGE_OLD_INFIMUM; + supremum= PAGE_OLD_SUPREMUM; + } + ut_ad(!memcmp(&root->frame[infimum], supremuminfimum + 8, 8) == + !memcmp(&root->frame[supremum], supremuminfimum, 8)); + mtr->memcpy<mtr_t::OPT>(*root, &root->frame[infimum], supremuminfimum + 8, + 8); + mtr->memcpy<mtr_t::OPT>(*root, &root->frame[supremum], supremuminfimum, 8); + } +} + /*************************************************************//** Makes tree one level higher by splitting the root, and inserts the tuple. It is assumed that mtr contains an x-latch on the tree. 
@@ -1859,16 +1897,13 @@ btr_root_raise_and_insert( == page_zip_get_size(root_page_zip)); btr_page_create(new_block, new_page_zip, index, level, mtr); - - /* Set the next node and previous node fields of new page */ - if (!page_has_siblings(new_block->frame)) { - ut_ad(index->is_ibuf()); - } else { + if (page_has_siblings(new_block->frame)) { compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); compile_time_assert(FIL_NULL == 0xffffffff); + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); + memset_aligned<8>(new_block->frame + FIL_PAGE_PREV, 0xff, 8); mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff); if (UNIV_LIKELY_NULL(new_page_zip)) { - static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV, 0xff, 8); } @@ -1902,6 +1937,7 @@ btr_root_raise_and_insert( } } + constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID; if (dict_index_is_sec_or_ibuf(index)) { /* In secondary indexes and the change buffer, PAGE_MAX_TRX_ID can be reset on the root page, because @@ -1910,11 +1946,12 @@ btr_root_raise_and_insert( set PAGE_MAX_TRX_ID on all secondary index pages.) */ byte* p = my_assume_aligned<8>( PAGE_HEADER + PAGE_MAX_TRX_ID + root->frame); - if (UNIV_LIKELY_NULL(root->page.zip.data)) { - memset_aligned<8>(p, 0, 8); - page_zip_write_header(root, p, 8, mtr); - } else if (mach_read_from_8(p)) { - mtr->memset(root, PAGE_HEADER + PAGE_MAX_TRX_ID, 8, 0); + if (mach_read_from_8(p)) { + mtr->memset(root, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(root->page.zip.data)) { + memset_aligned<8>(max_trx_id + + root->page.zip.data, 0, 8); + } } } else { /* PAGE_ROOT_AUTO_INC is only present in the clustered index @@ -1922,12 +1959,13 @@ btr_root_raise_and_insert( the field PAGE_MAX_TRX_ID for future use. 
*/ byte* p = my_assume_aligned<8>( PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->frame); - if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { - memset_aligned<8>(p, 0, 8); - page_zip_write_header(new_block, p, 8, mtr); - } else if (mach_read_from_8(p)) { - mtr->memset(new_block, PAGE_HEADER + PAGE_MAX_TRX_ID, - 8, 0); + if (mach_read_from_8(p)) { + mtr->memset(new_block, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + memset_aligned<8>(max_trx_id + + new_block->page.zip.data, + 0, 8); + } } } @@ -2522,37 +2560,15 @@ btr_attach_half_pages( if (direction == FSP_DOWN) { ut_ad(lower_block == new_block); ut_ad(btr_page_get_next(upper_block->frame) == next_page_no); - if (UNIV_UNLIKELY(btr_page_get_prev(lower_block->frame) - == prev_page_no)) { - ut_ad(index->is_ibuf()); - } else { - btr_page_set_prev(lower_block, prev_page_no, mtr); - } + btr_page_set_prev(lower_block, prev_page_no, mtr); } else { ut_ad(upper_block == new_block); ut_ad(btr_page_get_prev(lower_block->frame) == prev_page_no); - if (UNIV_UNLIKELY(btr_page_get_next(upper_block->frame) - == next_page_no)) { - ut_ad(index->is_ibuf()); - } else { - btr_page_set_next(upper_block, next_page_no, mtr); - } + btr_page_set_next(upper_block, next_page_no, mtr); } - if (UNIV_UNLIKELY(btr_page_get_next(lower_block->frame) - == upper_block->page.id.page_no())) { - ut_ad(index->is_ibuf()); - } else { - btr_page_set_next(lower_block, upper_block->page.id.page_no(), - mtr); - } - if (UNIV_UNLIKELY(btr_page_get_prev(upper_block->frame) - == lower_block->page.id.page_no())) { - ut_ad(index->is_ibuf()); - } else { - btr_page_set_prev(upper_block, lower_block->page.id.page_no(), - mtr); - } + btr_page_set_prev(upper_block, lower_block->page.id.page_no(), mtr); + btr_page_set_next(lower_block, upper_block->page.id.page_no(), mtr); } /*************************************************************//** @@ -2838,8 +2854,9 @@ func_start: return(NULL);); /* 2. 
Allocate a new page to the index */ + const uint16_t page_level = btr_page_get_level(page); new_block = btr_page_alloc(cursor->index, hint_page_no, direction, - btr_page_get_level(page), mtr, mtr); + page_level, mtr, mtr); if (!new_block) { return(NULL); @@ -2847,10 +2864,16 @@ func_start: new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); + + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4); + } btr_page_create(new_block, new_page_zip, cursor->index, - btr_page_get_level(page), mtr); + page_level, mtr); /* Only record the leaf level page splits. */ - if (page_is_leaf(page)) { + if (!page_level) { cursor->index->stat_defrag_n_page_split ++; cursor->index->stat_defrag_modified_counter ++; btr_defragment_save_defrag_stats_if_needed(cursor->index); @@ -2895,6 +2918,7 @@ insert_empty: /* 4. Do first the modifications in the tree structure */ + /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! 
*/ btr_attach_half_pages(flags, cursor->index, block, first_rec, new_block, direction, mtr); diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc index cf1fb62bce0..d892c429a1e 100644 --- a/storage/innobase/btr/btr0bulk.cc +++ b/storage/innobase/btr/btr0bulk.cc @@ -82,26 +82,21 @@ PageBulk::init() new_page = buf_block_get_frame(new_block); new_page_no = page_get_page_no(new_page); - byte* index_id = PAGE_HEADER + PAGE_INDEX_ID + new_page; + byte* index_id = my_assume_aligned<2> + (PAGE_HEADER + PAGE_INDEX_ID + new_page); + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8); if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + mach_write_to_8(index_id, m_index->id); page_create_zip(new_block, m_index, m_level, 0, &m_mtr); - static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); - memset_aligned<8>(FIL_PAGE_PREV + new_page, 0xff, 8); - page_zip_write_header(new_block, - FIL_PAGE_PREV + new_page, - 8, &m_mtr); - mach_write_to_8(index_id, m_index->id); - page_zip_write_header(new_block, index_id, 8, &m_mtr); } else { ut_ad(!m_index->is_spatial()); page_create(new_block, &m_mtr, m_index->table->not_redundant()); - compile_time_assert(FIL_PAGE_NEXT - == FIL_PAGE_PREV + 4); - compile_time_assert(FIL_NULL == 0xffffffff); - m_mtr.memset(new_block, FIL_PAGE_PREV, 8, 0xff); + m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff); m_mtr.write<2,mtr_t::OPT>(*new_block, PAGE_HEADER + PAGE_LEVEL + new_page, m_level); @@ -155,22 +150,25 @@ PageBulk::init() /** Insert a record in the page. 
@tparam fmt the page format -@param[in] rec record +@param[in,out] rec record @param[in] offsets record offsets */ template<PageBulk::format fmt> -inline void PageBulk::insertPage(const rec_t *rec, offset_t *offsets) +inline void PageBulk::insertPage(rec_t *rec, offset_t *offsets) { ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED)); ut_ad((fmt != REDUNDANT) == m_is_comp); - + ut_ad(page_align(m_heap_top) == m_page); ut_ad(m_heap); - ulint rec_size= rec_offs_size(offsets); + const ulint rec_size= rec_offs_size(offsets); + const ulint extra_size= rec_offs_extra_size(offsets); + ut_ad(page_align(m_heap_top + rec_size) == m_page); ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec)); #ifdef UNIV_DEBUG /* Check whether records are in order. */ - if (!page_rec_is_infimum_low(page_offset(m_cur_rec))) + if (page_offset(m_cur_rec) != + (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM)) { const rec_t *old_rec = m_cur_rec; offset_t *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf, @@ -181,41 +179,126 @@ inline void PageBulk::insertPage(const rec_t *rec, offset_t *offsets) m_total_data+= rec_size; #endif /* UNIV_DEBUG */ - /* Copy the record payload. */ - rec_t *insert_rec= rec_copy(m_heap_top, rec, offsets); - ut_ad(page_align(insert_rec) == m_page); - rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets); + rec_t* const insert_rec= m_heap_top + extra_size; /* Insert the record in the linked list. 
*/ if (fmt != REDUNDANT) { - rec_t *next_rec= m_page + + const rec_t *next_rec= m_page + page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT)); - mach_write_to_2(insert_rec - REC_NEXT, - static_cast<uint16_t>(next_rec - insert_rec)); if (fmt != COMPRESSED) m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, static_cast<uint16_t>(insert_rec - m_cur_rec)); else + { mach_write_to_2(m_cur_rec - REC_NEXT, static_cast<uint16_t>(insert_rec - m_cur_rec)); - rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK, + memcpy(m_heap_top, rec - extra_size, rec_size); + } + + rec_t * const this_rec= fmt != COMPRESSED + ? const_cast<rec_t*>(rec) : insert_rec; + rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); - rec_set_bit_field_2(insert_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no, + rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no, REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + mach_write_to_2(this_rec - REC_NEXT, + static_cast<uint16_t>(next_rec - insert_rec)); } else { - memcpy(insert_rec - REC_NEXT, m_cur_rec - REC_NEXT, 2); + memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2); m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec)); - rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED, REC_N_OWNED_MASK, - REC_N_OWNED_SHIFT); - rec_set_bit_field_2(insert_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no, + rec_set_bit_field_1(const_cast<rec_t*>(rec), 0, + REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + rec_set_bit_field_2(const_cast<rec_t*>(rec), + PAGE_HEAP_NO_USER_LOW + m_rec_no, REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); } - if (fmt != COMPRESSED) - m_mtr.memcpy(*m_block, page_offset(m_heap_top), rec_offs_size(offsets)); + if (fmt == COMPRESSED) + /* We already wrote the record. Log is written in PageBulk::compress(). */; + else if (page_offset(m_cur_rec) == + (fmt == REDUNDANT ? 
PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM)) + m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size); + else + { + /* Try to copy common prefix from the preceding record. */ + const byte *r= rec - extra_size; + const byte * const insert_rec_end= m_heap_top + rec_size; + byte *b= m_heap_top; + + /* Skip any unchanged prefix of the record. */ + for (; * b == *r; b++, r++); + + ut_ad(b < insert_rec_end); + + const byte *c= m_cur_rec - (rec - r); + const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets), + m_heap_top); + + /* Try to copy any bytes of the preceding record. */ + if (UNIV_LIKELY(c >= m_page && c < c_end)) + { + const byte *cm= c; + byte *bm= b; + const byte *rm= r; + for (; cm < c_end && *rm == *cm; cm++, bm++, rm++); + ut_ad(bm <= insert_rec_end); + size_t len= static_cast<size_t>(rm - r); + ut_ad(!memcmp(r, c, len)); + if (len > 2) + { + memcpy(b, c, len); + m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len); + c= cm; + b= bm; + r= rm; + } + } + + if (c < m_cur_rec) + { + if (!rec_offs_data_size(offsets)) + { +no_data: + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c); + goto rec_done; + } + /* Some header bytes differ. Compare the data separately. */ + const byte *cd= m_cur_rec; + byte *bd= insert_rec; + const byte *rd= rec; + /* Skip any unchanged prefix of the record. */ + for (; *bd == *rd; cd++, bd++, rd++) + if (bd == insert_rec_end) + goto no_data; + + /* Try to copy any data bytes of the preceding record. 
*/ + const byte *cdm= cd; + const byte *rdm= rd; + for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++) + ut_ad(rdm - rd + bd <= insert_rec_end); + size_t len= static_cast<size_t>(rdm - rd); + ut_ad(!memcmp(rd, cd, len)); + if (len > 2) + { + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c); + memcpy(bd, cd, len); + m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len); + c= cdm; + b= rdm - rd + bd; + r= rdm; + } + } + + if (size_t len= static_cast<size_t>(insert_rec_end - b)) + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, len); + } + +rec_done: + ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size)); + rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets); /* Update the member variables. */ ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) - @@ -235,12 +318,25 @@ inline void PageBulk::insertPage(const rec_t *rec, offset_t *offsets) @param[in] offsets record offsets */ inline void PageBulk::insert(const rec_t *rec, offset_t *offsets) { + byte rec_hdr[REC_N_OLD_EXTRA_BYTES]; + static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format"); + if (UNIV_LIKELY_NULL(m_page_zip)) - insertPage<COMPRESSED>(rec, offsets); + insertPage<COMPRESSED>(const_cast<rec_t*>(rec), offsets); else if (m_is_comp) - insertPage<DYNAMIC>(rec, offsets); + { + memcpy(rec_hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES); + insertPage<DYNAMIC>(const_cast<rec_t*>(rec), offsets); + memcpy(const_cast<rec_t*>(rec) - REC_N_NEW_EXTRA_BYTES, rec_hdr, + REC_N_NEW_EXTRA_BYTES); + } else - insertPage<REDUNDANT>(rec, offsets); + { + memcpy(rec_hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES); + insertPage<REDUNDANT>(const_cast<rec_t*>(rec), offsets); + memcpy(const_cast<rec_t*>(rec) - REC_N_OLD_EXTRA_BYTES, rec_hdr, + REC_N_OLD_EXTRA_BYTES); + } } /** Set the number of owned records in the uncompressed page of @@ -283,18 +379,13 @@ inline void PageBulk::finishPage() if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2) { slot-= 
PAGE_DIR_SLOT_SIZE; + mach_write_to_2(slot, offset); if (fmt != COMPRESSED) - { - m_mtr.write<2,mtr_t::OPT>(*m_block, slot, offset); page_rec_set_n_owned<false>(m_block, m_page + offset, count, true, &m_mtr); - } else - { - mach_write_to_2(slot, offset); rec_set_n_owned_zip(m_page + offset, count); - } count= 0; } @@ -321,17 +412,12 @@ inline void PageBulk::finishPage() else slot-= PAGE_DIR_SLOT_SIZE; + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); if (fmt != COMPRESSED) - { - m_mtr.write<2,mtr_t::OPT>(*m_block, slot, PAGE_NEW_SUPREMUM); page_rec_set_n_owned<false>(m_block, m_page + PAGE_NEW_SUPREMUM, count + 1, true, &m_mtr); - } else - { - mach_write_to_2(slot, PAGE_NEW_SUPREMUM); rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1); - } } else { @@ -347,7 +433,7 @@ inline void PageBulk::finishPage() if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2) { slot-= PAGE_DIR_SLOT_SIZE; - m_mtr.write<2,mtr_t::OPT>(*m_block, slot, page_offset(insert_rec)); + mach_write_to_2(slot, page_offset(insert_rec)); page_rec_set_n_owned<false>(m_block, insert_rec, count, false, &m_mtr); count= 0; } @@ -368,31 +454,35 @@ inline void PageBulk::finishPage() else slot-= PAGE_DIR_SLOT_SIZE; - m_mtr.write<2,mtr_t::OPT>(*m_block, slot, PAGE_OLD_SUPREMUM); + mach_write_to_2(slot, PAGE_OLD_SUPREMUM); page_rec_set_n_owned<false>(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1, false, &m_mtr); } - ut_ad(!dict_index_is_spatial(m_index)); + ut_ad(!m_index->is_spatial()); ut_ad(!page_get_instant(m_page)); ut_ad(!mach_read_from_2(PAGE_HEADER + PAGE_N_DIRECTION + m_page)); if (fmt != COMPRESSED) { - m_mtr.write<2,mtr_t::OPT>(*m_block, - PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page, - 1 + static_cast<ulint>(slot0 - slot) / - PAGE_DIR_SLOT_SIZE); - m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_HEAP_TOP + m_page, - static_cast<ulint>(m_heap_top - m_page)); - m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_HEAP + m_page, - (PAGE_HEAP_NO_USER_LOW + m_rec_no) | - uint16_t{fmt != REDUNDANT} << 15); - 
m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no); - m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_LAST_INSERT + m_page, - static_cast<ulint>(m_cur_rec - m_page)); - m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_DIRECTION_B - 1 + m_page, - PAGE_RIGHT); + static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility"); + alignas(8) byte page_header[PAGE_N_RECS + 2]; + mach_write_to_2(page_header + PAGE_N_DIR_SLOTS, + 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE); + mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page); + mach_write_to_2(page_header + PAGE_N_HEAP, + (PAGE_HEAP_NO_USER_LOW + m_rec_no) | + uint16_t{fmt != REDUNDANT} << 15); + memset_aligned<2>(page_header + PAGE_FREE, 0, 4); + static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility"); + mach_write_to_2(page_header + PAGE_LAST_INSERT, m_cur_rec - m_page); + mach_write_to_2(page_header + PAGE_DIRECTION_B - 1, PAGE_RIGHT); + mach_write_to_2(page_header + PAGE_N_DIRECTION, m_rec_no); + memcpy_aligned<2>(page_header + PAGE_N_RECS, + page_header + PAGE_N_DIRECTION, 2); + m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header, + sizeof page_header); + m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot); } else { diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 09972a2786c..22495ddbd88 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -3898,49 +3898,94 @@ static void btr_cur_write_sys( } /** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record. 
-@param[in,out] block clustered index leaf page -@param[in,out] rec clustered index record -@param[in] index clustered index -@param[in] offsets rec_get_offsets(rec, index) -@param[in] trx transaction -@param[in] roll_ptr DB_ROLL_PTR value -@param[in,out] mtr mini-transaction */ -static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t* rec, - dict_index_t* index, const offset_t* offsets, - const trx_t* trx, roll_ptr_t roll_ptr, - mtr_t* mtr) +@param[in,out] block clustered index leaf page +@param[in,out] rec clustered index record +@param[in] index clustered index +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx transaction +@param[in] roll_ptr DB_ROLL_PTR value +@param[in,out] mtr mini-transaction */ +static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec, + dict_index_t *index, const offset_t *offsets, + const trx_t *trx, roll_ptr_t roll_ptr, + mtr_t *mtr) { - ut_ad(index->is_primary()); - ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(index->is_primary()); + ut_ad(rec_offs_validate(rec, index, offsets)); - if (UNIV_LIKELY_NULL(block->page.zip.data)) { - page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, - index->db_trx_id(), - trx->id, roll_ptr, mtr); - } else { - ulint offset = index->trx_id_offset; + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(), + trx->id, roll_ptr, mtr); + return; + } - if (!offset) { - offset = row_get_trx_id_offset(index, offsets); - } + ulint offset= index->trx_id_offset; - compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + if (!offset) + offset= row_get_trx_id_offset(index, offsets); - /* During IMPORT the trx id in the record can be in the - future, if the .ibd file is being imported from another - instance. During IMPORT roll_ptr will be 0. 
*/ - ut_ad(roll_ptr == 0 - || lock_check_trx_id_sanity( - trx_read_trx_id(rec + offset), - rec, index, offsets)); - - trx_write_trx_id(rec + offset, trx->id); - trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); - /* MDEV-12353 FIXME: consider emitting MEMMOVE for the - DB_TRX_ID if it is found in the preceding record */ - mtr->memcpy(*block, page_offset(rec + offset), - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); - } + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + + /* During IMPORT the trx id in the record can be in the future, if + the .ibd file is being imported from another instance. During IMPORT + roll_ptr will be 0. */ + ut_ad(roll_ptr == 0 || + lock_check_trx_id_sanity(trx_read_trx_id(rec + offset), + rec, index, offsets)); + + byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + + trx_write_trx_id(sys, trx->id); + trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr); + + ulint d= 0; + const byte *src= nullptr; + byte *dest= rec + offset; + ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + if (UNIV_LIKELY(index->trx_id_offset)) + { + const rec_t *prev= page_rec_get_prev_const(rec); + if (UNIV_UNLIKELY(prev == rec)) + ut_ad(0); + else if (page_rec_is_infimum(prev)); + else + for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++) + if (src[d] != sys[d]) + break; + if (d > 6 && memcmp(dest, sys, d)) + { + /* We save space by replacing a single record + + WRITE,page_offset(dest),byte[13] + + with two records: + + MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes), + WRITE|0x80,0,byte[13-d] + + The single WRITE record would be x+13 bytes long, with x>2. + The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the + second WRITE would be 1+1+13-d = 15-d bytes. + + The total size is: x+13 versus x+4+15-d = x+19-d bytes. + To save space, we must have d>6, that is, the complete DB_TRX_ID and + the first byte(s) of DB_ROLL_PTR must match the previous record. 
*/ + memcpy(dest, src, d); + mtr->memmove(*block, page_offset(dest), page_offset(src), d); + dest+= d; + len-= d; + /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when + DB_TRX_ID refers to an active transaction. */ + ut_ad(len); + } + else + d= 0; + } + + if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */ + mtr->memcpy<mtr_t::OPT>(*block, dest, sys + d, len); } /*********************************************************************//** @@ -4400,10 +4445,13 @@ void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) { ut_ad(!rec_offs_nth_sql_null(offsets, n)); ut_ad(!index->table->not_redundant()); - mtr->memset(block, - page_offset(rec + rec_get_field_start_offs( - rec, n)), - rec_get_nth_field_size(rec, n), 0); + if (ulint size = rec_get_nth_field_size(rec, n)) { + mtr->memset( + block, + page_offset(rec_get_field_start_offs( + rec, n) + rec), + size, 0); + } ulint l = rec_get_1byte_offs_flag(rec) ? (n + 1) : (n + 1) * 2; byte* b = &rec[-REC_N_OLD_EXTRA_BYTES - l]; @@ -4436,7 +4484,10 @@ void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, byte(*b & ~REC_1BYTE_SQL_NULL_MASK)); } - mtr->memcpy(block, page_offset(data), uf->new_val.data, len); + if (len) { + mtr->memcpy<mtr_t::OPT>(*block, data, uf->new_val.data, + len); + } } if (UNIV_LIKELY_NULL(block->page.zip.data)) { @@ -7855,21 +7906,10 @@ btr_store_big_rec_extern_fields( int err; page_zip_des_t* blob_page_zip; - /* Write FIL_PAGE_TYPE to the redo log - separately, before logging any other - changes to the block, so that the debug - assertions in - recv_parse_or_apply_log_rec_body() can - be made simpler. Before InnoDB Plugin - 1.0.4, the initialization of - FIL_PAGE_TYPE was logged as part of - the mtr_t::memcpy() below. */ - - mtr.write<2>(*block, - block->frame + FIL_PAGE_TYPE, - prev_page_no == FIL_NULL - ? 
FIL_PAGE_TYPE_ZBLOB - : FIL_PAGE_TYPE_ZBLOB2); + mach_write_to_2(block->frame + FIL_PAGE_TYPE, + prev_page_no == FIL_NULL + ? FIL_PAGE_TYPE_ZBLOB + : FIL_PAGE_TYPE_ZBLOB2); c_stream.next_out = block->frame + FIL_PAGE_DATA; @@ -7886,9 +7926,9 @@ btr_store_big_rec_extern_fields( compile_time_assert(FIL_NULL == 0xffffffff); mtr.memset(block, FIL_PAGE_PREV, 8, 0xff); mtr.memcpy(*block, - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, + FIL_PAGE_TYPE, page_zip_get_size(page_zip) - - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + - FIL_PAGE_TYPE - c_stream.avail_out); /* Zero out the unused part of the page. */ if (c_stream.avail_out) { @@ -7966,12 +8006,14 @@ next_zip_page: store_len = extern_len; } - mtr.memcpy(block, - FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE, - (const byte*) - big_rec_vec->fields[i].data - + big_rec_vec->fields[i].len - - extern_len, store_len); + mtr.memcpy<mtr_t::OPT>( + *block, + FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE + + block->frame, + static_cast<const byte*> + (big_rec_vec->fields[i].data) + + big_rec_vec->fields[i].len + - extern_len, store_len); mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN + FIL_PAGE_DATA + block->frame, store_len); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 00abae10d5a..6e2c62a1e57 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -5493,7 +5493,7 @@ release_page: } if (recv_recovery_is_on()) { - recv_recover_page(bpage); + recv_recover_page(space, bpage); } if (uncompressed @@ -5536,27 +5536,13 @@ release_page: ut_ad(buf_pool->n_pend_reads > 0); buf_pool->n_pend_reads--; buf_pool->stat.n_pages_read++; - ut_ad(!uncompressed || !bpage->zip.data - || !recv_recovery_is_on() - || buf_page_can_relocate(bpage)); - mutex_exit(block_mutex); if (uncompressed) { -#if 1 /* MDEV-12353 FIXME: Remove this! 
*/ - if (UNIV_LIKELY_NULL(bpage->zip.data) - && recv_recovery_is_on()) { - rw_lock_x_unlock_gen( - &reinterpret_cast<buf_block_t*>(bpage) - ->lock, BUF_IO_READ); - if (!buf_LRU_free_page(bpage, false)) { - ut_ad(!"could not remove"); - } - goto func_exit; - } -#endif rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ); } + + mutex_exit(block_mutex); } else { /* Write means a flush operation: call the completion routine in the flush system */ @@ -5590,7 +5576,6 @@ release_page: DBUG_PRINT("ib_buf", ("%s page %u:%u", io_type == BUF_IO_READ ? "read" : "wrote", bpage->id.space(), bpage->id.page_no())); -func_exit: mutex_exit(&buf_pool->mutex); return DB_SUCCESS; } diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc index 4ec9a6d7952..fc2a26d8b04 100644 --- a/storage/innobase/fil/fil0crypt.cc +++ b/storage/innobase/fil/fil0crypt.cc @@ -418,9 +418,7 @@ void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr) + fsp_header_get_encryption_offset(block->zip_size()); byte* b = block->frame + offset; - if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) { - mtr->memcpy(block, offset, CRYPT_MAGIC, MAGIC_SZ); - } + mtr->memcpy<mtr_t::OPT>(*block, b, CRYPT_MAGIC, MAGIC_SZ); b += MAGIC_SZ; byte* const start = b; @@ -436,6 +434,8 @@ void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr) b += 4; *b++ = byte(encryption); ut_ad(b - start == 11 + MY_AES_BLOCK_SIZE); + /* We must log also any unchanged bytes, because recovery will + invoke fil_crypt_parse() based on this log record. 
*/ mtr->memcpy(*block, offset + MAGIC_SZ, b - start); } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 0079be0e9f0..0c9b6fdd6dd 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1817,68 +1817,62 @@ fil_create_directory_for_tablename( @param space_id tablespace identifier @param first_page_no first page number in the file @param path file path -@param new_path new file path for type=MLOG_FILE_RENAME2 -@param flags tablespace flags for type=MLOG_FILE_CREATE2 */ -inline void mtr_t::log_file_op(mlog_id_t type, +@param new_path new file path for type=FILE_RENAME */ +inline void mtr_t::log_file_op(mfile_type_t type, ulint space_id, ulint first_page_no, - const char *path, const char *new_path, - ulint flags) + const char *path, const char *new_path) { - ulint len; - - ut_ad(first_page_no == 0 || type == MLOG_FILE_CREATE2); - ut_ad(fil_space_t::is_valid_flags(flags, space_id)); - - /* fil_name_parse() requires that there be at least one path - separator and that the file path end with ".ibd". 
*/ - ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL); - ut_ad(first_page_no /* trimming an undo tablespace */ - || !strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD)); - - set_modified(); - if (m_log_mode != MTR_LOG_ALL) { - return; - } - - byte* log_ptr = log_write_low(type, page_id_t(space_id, first_page_no), - m_log.open(11 + 4 + 2 + 1)); - - if (type == MLOG_FILE_CREATE2) { - mach_write_to_4(log_ptr, flags); - log_ptr += 4; - } - - /* Let us store the strings as null-terminated for easier readability - and handling */ - - len = strlen(path) + 1; - - mach_write_to_2(log_ptr, len); - log_ptr += 2; - m_log.close(log_ptr); - - m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len)); + ut_ad(first_page_no == 0 || type == FILE_CREATE); + ut_ad((new_path != nullptr) == (type == FILE_RENAME)); + ut_ad(!(byte(type) & 15)); + + /* fil_name_parse() requires that there be at least one path + separator and that the file path end with ".ibd". */ + ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL); + ut_ad(first_page_no /* trimming an undo tablespace */ || + !strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD)); + + set_modified(); + if (m_log_mode != MTR_LOG_ALL) + return; + m_last= nullptr; + + const size_t len= strlen(path); + const size_t new_len= type == FILE_RENAME ? 
1 + strlen(new_path) : 0; + ut_ad(len > 0); + byte *const log_ptr= m_log.open(1 + 3/*length*/ + 5/*space_id*/ + + 5/*first_page_no*/); + byte *end= log_ptr + 1; + end= mlog_encode_varint(end, space_id); + end= mlog_encode_varint(end, first_page_no); + if (UNIV_LIKELY(end + len + new_len >= &log_ptr[16])) + { + *log_ptr= type; + size_t total_len= len + new_len + end - log_ptr - 15; + if (total_len >= MIN_3BYTE) + total_len+= 2; + else if (total_len >= MIN_2BYTE) + total_len++; + end= mlog_encode_varint(log_ptr + 1, total_len); + end= mlog_encode_varint(end, space_id); + end= mlog_encode_varint(end, first_page_no); + } + else + { + *log_ptr= type | static_cast<byte>(end + len + new_len - &log_ptr[1]); + ut_ad(*log_ptr & 15); + } - switch (type) { - case MLOG_FILE_RENAME2: - ut_ad(strchr(new_path, OS_PATH_SEPARATOR) != NULL); - len = strlen(new_path) + 1; - log_ptr = m_log.open(2 + len); - ut_a(log_ptr); - mach_write_to_2(log_ptr, len); - log_ptr += 2; - m_log.close(log_ptr); + m_log.close(end); - m_log.push(reinterpret_cast<const byte*>(new_path), - uint32_t(len)); - break; - case MLOG_FILE_NAME: - case MLOG_FILE_DELETE: - case MLOG_FILE_CREATE2: - break; - default: - ut_ad(0); - } + if (type == FILE_RENAME) + { + ut_ad(strchr(new_path, OS_PATH_SEPARATOR)); + m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len + 1)); + m_log.push(reinterpret_cast<const byte*>(new_path), uint32_t(new_len)); + } + else + m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len)); } /** Write redo log for renaming a file. @@ -1897,8 +1891,7 @@ fil_name_write_rename_low( mtr_t* mtr) { ut_ad(!is_predefined_tablespace(space_id)); - mtr->log_file_op(MLOG_FILE_RENAME2, space_id, first_page_no, - old_name, new_name); + mtr->log_file_op(FILE_RENAME, space_id, first_page_no, old_name, new_name); } /** Write redo log for renaming a file. @@ -1918,7 +1911,7 @@ fil_name_write_rename( log_write_up_to(mtr.commit_lsn(), true); } -/** Write MLOG_FILE_NAME for a file. 
+/** Write FILE_MODIFY for a file. @param[in] space_id tablespace id @param[in] first_page_no first page number in the file @param[in] name tablespace file name @@ -1931,9 +1924,10 @@ fil_name_write( const char* name, mtr_t* mtr) { - mtr->log_file_op(MLOG_FILE_NAME, space_id, first_page_no, name); + ut_ad(!is_predefined_tablespace(space_id)); + mtr->log_file_op(FILE_MODIFY, space_id, first_page_no, name); } -/** Write MLOG_FILE_NAME for a file. +/** Write FILE_MODIFY for a file. @param[in] space tablespace @param[in] first_page_no first page number in the file @param[in] file tablespace file @@ -1946,7 +1940,7 @@ fil_name_write( const fil_node_t* file, mtr_t* mtr) { - mtr->log_file_op(MLOG_FILE_NAME, space->id, first_page_no, file->name); + fil_name_write(space->id, first_page_no, file->name, mtr); } /** Replay a file rename operation if possible. @@ -2347,7 +2341,7 @@ fil_delete_tablespace( mtr_t mtr; mtr.start(); - mtr.log_file_op(MLOG_FILE_DELETE, id, 0, path); + mtr.log_file_op(FILE_DELETE, id, 0, path); mtr.commit(); /* Even if we got killed shortly after deleting the tablespace file, the record must have already been @@ -2429,13 +2423,12 @@ fil_space_t* fil_truncate_prepare(ulint space_id) /** Write log about an undo tablespace truncate operation. */ void fil_truncate_log(fil_space_t* space, ulint size, mtr_t* mtr) { - /* Write a MLOG_FILE_CREATE2 record with the new size, so that - recovery and backup will ignore any preceding redo log records - for writing pages that are after the new end of the tablespace. */ - ut_ad(UT_LIST_GET_LEN(space->chain) == 1); - const fil_node_t* file = UT_LIST_GET_FIRST(space->chain); - mtr->log_file_op(MLOG_FILE_CREATE2, space->id, size, file->name, - nullptr, space->flags & ~FSP_FLAGS_MEM_MASK); + /* Write a record with the new size, so that recovery and + backup will ignore any preceding redo log records for writing + pages that are after the new end of the tablespace. 
*/ + ut_ad(UT_LIST_GET_LEN(space->chain) == 1); + const fil_node_t *file= UT_LIST_GET_FIRST(space->chain); + mtr->log_file_op(FILE_CREATE, space->id, size, file->name); } /*******************************************************************//** @@ -2928,9 +2921,7 @@ err_exit: false, true); mtr_t mtr; mtr.start(); - mtr.log_file_op(MLOG_FILE_CREATE2, space_id, 0, node->name, - nullptr, space->flags & ~FSP_FLAGS_MEM_MASK); - fil_name_write(space, 0, node, &mtr); + mtr.log_file_op(FILE_CREATE, space_id, 0, node->name); mtr.commit(); node->find_metadata(file); @@ -4561,7 +4552,7 @@ fil_space_validate_for_mtr_commit( } #endif /* UNIV_DEBUG */ -/** Write a MLOG_FILE_NAME record for a persistent tablespace. +/** Write a FILE_MODIFY record for a persistent tablespace. @param[in] space tablespace @param[in,out] mtr mini-transaction */ static @@ -4591,22 +4582,20 @@ fil_names_dirty( space->max_lsn = log_sys.lsn; } -/** Write MLOG_FILE_NAME records when a non-predefined persistent +/** Write FILE_MODIFY records when a non-predefined persistent tablespace was modified for the first time since the latest fil_names_clear(). -@param[in,out] space tablespace -@param[in,out] mtr mini-transaction */ -void -fil_names_dirty_and_write( - fil_space_t* space, - mtr_t* mtr) +@param[in,out] space tablespace */ +void fil_names_dirty_and_write(fil_space_t* space) { ut_ad(log_mutex_own()); ut_d(fil_space_validate_for_mtr_commit(space)); ut_ad(space->max_lsn == log_sys.lsn); UT_LIST_ADD_LAST(fil_system.named_spaces, space); - fil_names_write(space, mtr); + mtr_t mtr; + mtr.start(); + fil_names_write(space, &mtr); DBUG_EXECUTE_IF("fil_names_write_bogus", { @@ -4614,14 +4603,16 @@ fil_names_dirty_and_write( os_normalize_path(bogus_name); fil_name_write( SRV_SPACE_ID_UPPER_BOUND, 0, - bogus_name, mtr); + bogus_name, &mtr); }); + + mtr.commit_files(); } /** On a log checkpoint, reset fil_names_dirty_and_write() flags -and write out MLOG_FILE_NAME and MLOG_CHECKPOINT if needed. 
+and write out FILE_MODIFY and FILE_CHECKPOINT if needed. @param[in] lsn checkpoint LSN -@param[in] do_write whether to always write MLOG_CHECKPOINT +@param[in] do_write whether to always write FILE_CHECKPOINT @return whether anything was written to the redo log @retval false if no flags were set and nothing written @retval true if anything was written to the redo log */ @@ -4631,7 +4622,7 @@ fil_names_clear( bool do_write) { mtr_t mtr; - ulint mtr_checkpoint_size = LOG_CHECKPOINT_FREE_PER_THREAD; + ulint mtr_checkpoint_size = RECV_SCAN_SIZE - 1; DBUG_EXECUTE_IF( "increase_mtr_checkpoint_size", @@ -4650,6 +4641,14 @@ fil_names_clear( for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.named_spaces); space != NULL; ) { + if (mtr.get_log()->size() + + (3 + 5 + 1) + strlen(space->chain.start->name) + >= mtr_checkpoint_size) { + /* Prevent log parse buffer overflow */ + mtr.commit_files(); + mtr.start(); + } + fil_space_t* next = UT_LIST_GET_NEXT(named_spaces, space); ut_ad(space->max_lsn > 0); @@ -4671,19 +4670,6 @@ fil_names_clear( fil_names_write(space, &mtr); do_write = true; - const mtr_buf_t* mtr_log = mtr_get_log(&mtr); - - /** If the mtr buffer size exceeds the size of - LOG_CHECKPOINT_FREE_PER_THREAD then commit the multi record - mini-transaction, start the new mini-transaction to - avoid the parsing buffer overflow error during recovery. */ - - if (mtr_log->size() > mtr_checkpoint_size) { - ut_ad(mtr_log->size() < (RECV_PARSING_BUF_SIZE / 2)); - mtr.commit_files(); - mtr.start(); - } - space = next; } diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 5fcc58300f1..ede8d4f8c16 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -476,27 +476,29 @@ xdes_get_offset( /** Initialize a file page whose prior contents should be ignored. 
@param[in,out] block buffer pool block */ -void fsp_apply_init_file_page(buf_block_t* block) +void fsp_apply_init_file_page(buf_block_t *block) { - page_t* page = buf_block_get_frame(block); - - memset(page, 0, srv_page_size); - - mach_write_to_4(page + FIL_PAGE_OFFSET, block->page.id.page_no()); - mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, - block->page.id.space()); - - if (page_zip_des_t* page_zip= buf_block_get_page_zip(block)) { - memset(page_zip->data, 0, page_zip_get_size(page_zip)); - static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); - memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET, - page + FIL_PAGE_OFFSET, 4); - static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, - "not perfect alignment"); - memcpy_aligned<2>(page_zip->data - + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, - page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4); - } + memset_aligned<UNIV_PAGE_SIZE_MIN>(block->frame, 0, srv_page_size); + + mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->page.id.page_no()); + if (log_sys.is_physical()) + memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8); + mach_write_to_4(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + block->page.id.space()); + if (page_zip_des_t* page_zip= buf_block_get_page_zip(block)) + { + memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0, + page_zip_get_size(page_zip)); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET, + block->frame + FIL_PAGE_OFFSET, 4); + if (log_sys.is_physical()) + memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4); + } } #ifdef UNIV_DEBUG @@ -577,8 +579,12 @@ void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr) + block->frame, space->id); ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED + 
block->frame)); - mtr->write<4>(*block, FSP_HEADER_OFFSET + FSP_SIZE + block->frame, - size); + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE + + block->frame, size); ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + block->frame)); mtr->write<4,mtr_t::OPT>(*block, FSP_HEADER_OFFSET + FSP_SPACE_FLAGS @@ -636,8 +642,12 @@ fsp_try_extend_data_file_with_pages( success = fil_space_extend(space, page_no + 1); /* The size may be less than we wanted if we ran out of disk space. */ - mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_SIZE + header->frame, - space->size); + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->frame, space->size); space->size_in_header = space->size; return(success); @@ -770,8 +780,12 @@ fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr) space->size_in_header = ut_2pow_round(space->size, (1024 * 1024) / ps); - mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_SIZE + header->frame, - space->size_in_header); + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. 
*/ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->frame, space->size_in_header); return(size_increase); } @@ -1511,8 +1525,7 @@ static void fsp_free_seg_inode( iblock, FSEG_INODE_PAGE_NODE, mtr); } - mtr->write<8>(*iblock, inode + FSEG_ID, 0U); - mtr->write<4>(*iblock, inode + FSEG_MAGIC_N, 0xfa051ce3); + mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0); if (ULINT_UNDEFINED == fsp_seg_inode_page_find_used(iblock->frame, physical_size)) { diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc index 93249aeab54..618eb1881e3 100644 --- a/storage/innobase/fut/fut0lst.cc +++ b/storage/innobase/fut/fut0lst.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -28,6 +28,61 @@ Created 11/28/1995 Heikki Tuuri #include "buf0buf.h" #include "page0page.h" + +/** Write a file address. 
+@param[in] block file page +@param[in,out] faddr file address location +@param[in] page page number +@param[in] boffset byte offset +@param[in,out] mtr mini-transaction */ +static void flst_write_addr(const buf_block_t& block, byte *faddr, + uint32_t page, uint16_t boffset, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_page_flagged(faddr, + MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_SX_FIX)); + ut_a(page == FIL_NULL || boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); + + static_assert(FIL_ADDR_PAGE == 0, "compatibility"); + static_assert(FIL_ADDR_BYTE == 4, "compatibility"); + static_assert(FIL_ADDR_SIZE == 6, "compatibility"); + + const bool same_page= mach_read_from_4(faddr + FIL_ADDR_PAGE) == page; + const bool same_offset= mach_read_from_2(faddr + FIL_ADDR_BYTE) == boffset; + if (same_page) + { + if (!same_offset) + mtr->write<2>(block, faddr + FIL_ADDR_BYTE, boffset); + return; + } + if (same_offset) + mtr->write<4>(block, faddr + FIL_ADDR_PAGE, page); + else + { + alignas(4) byte fil_addr[6]; + mach_write_to_4(fil_addr + FIL_ADDR_PAGE, page); + mach_write_to_2(fil_addr + FIL_ADDR_BYTE, boffset); + mtr->memcpy(block, faddr + FIL_ADDR_PAGE, fil_addr, 6); + } +} + +/** Write 2 null file addresses. +@param[in] b file page +@param[in,out] addr file address to be zeroed out +@param[in,out] mtr mini-transaction */ +static void flst_zero_both(const buf_block_t& b, byte *addr, mtr_t *mtr) +{ + if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL) + mtr->memset(&b, ulint(addr - b.frame) + FIL_ADDR_PAGE, 4, 0xff); + mtr->write<2,mtr_t::OPT>(b, addr + FIL_ADDR_BYTE, 0U); + /* Initialize the other address by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source) + which is 4 bytes, or less than FIL_ADDR_SIZE. */ + memcpy(addr + FIL_ADDR_SIZE, addr, FIL_ADDR_SIZE); + const uint16_t boffset= page_offset(addr); + mtr->memmove(b, boffset + FIL_ADDR_SIZE, boffset, FIL_ADDR_SIZE); +} + /** Add a node to an empty list. 
*/ static void flst_add_to_empty(buf_block_t *base, uint16_t boffset, buf_block_t *add, uint16_t aoffset, mtr_t *mtr) @@ -41,20 +96,22 @@ static void flst_add_to_empty(buf_block_t *base, uint16_t boffset, ut_ad(mtr_memo_contains_page_flagged(mtr, add->frame, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - fil_addr_t addr= { add->page.id.page_no(), aoffset }; + ut_ad(!mach_read_from_4(base->frame + boffset + FLST_LEN)); + mtr->write<1>(*base, base->frame + boffset + (FLST_LEN + 3), 1U); /* Update first and last fields of base node */ - flst_write_addr(*base, base->frame + boffset + FLST_FIRST, addr, mtr); - /* MDEV-12353 TODO: use MEMMOVE record */ - flst_write_addr(*base, base->frame + boffset + FLST_LAST, addr, mtr); + flst_write_addr(*base, base->frame + boffset + FLST_FIRST, + add->page.id.page_no(), aoffset, mtr); + memcpy(base->frame + boffset + FLST_LAST, base->frame + boffset + FLST_FIRST, + FIL_ADDR_SIZE); + /* Initialize FLST_LAST by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source) + which is 4 bytes, or less than FIL_ADDR_SIZE. */ + mtr->memmove(*base, boffset + FLST_LAST, boffset + FLST_FIRST, + FIL_ADDR_SIZE); /* Set prev and next fields of node to add */ - flst_zero_addr(*add, add->frame + aoffset + FLST_PREV, mtr); - flst_zero_addr(*add, add->frame + aoffset + FLST_NEXT, mtr); - - /* Update len of base node */ - ut_ad(!mach_read_from_4(base->frame + boffset + FLST_LEN)); - mtr->write<1>(*base, base->frame + boffset + (FLST_LEN + 3), 1U); + static_assert(FLST_NEXT == FLST_PREV + FIL_ADDR_SIZE, "compatibility"); + flst_zero_both(*add, add->frame + aoffset + FLST_PREV, mtr); } /** Insert a node after another one. 
@@ -85,24 +142,27 @@ static void flst_insert_after(buf_block_t *base, uint16_t boffset, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - fil_addr_t cur_addr= { cur->page.id.page_no(), coffset }; - fil_addr_t add_addr= { add->page.id.page_no(), aoffset }; fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset); - flst_write_addr(*add, add->frame + aoffset + FLST_PREV, cur_addr, mtr); - flst_write_addr(*add, add->frame + aoffset + FLST_NEXT, next_addr, mtr); + flst_write_addr(*add, add->frame + aoffset + FLST_PREV, + cur->page.id.page_no(), coffset, mtr); + flst_write_addr(*add, add->frame + aoffset + FLST_NEXT, + next_addr.page, next_addr.boffset, mtr); if (fil_addr_is_null(next_addr)) - flst_write_addr(*base, base->frame + boffset + FLST_LAST, add_addr, mtr); + flst_write_addr(*base, base->frame + boffset + FLST_LAST, + add->page.id.page_no(), aoffset, mtr); else { buf_block_t *block; flst_node_t *next= fut_get_ptr(add->page.id.space(), add->zip_size(), next_addr, RW_SX_LATCH, mtr, &block); - flst_write_addr(*block, next + FLST_PREV, add_addr, mtr); + flst_write_addr(*block, next + FLST_PREV, + add->page.id.page_no(), aoffset, mtr); } - flst_write_addr(*cur, cur->frame + coffset + FLST_NEXT, add_addr, mtr); + flst_write_addr(*cur, cur->frame + coffset + FLST_NEXT, + add->page.id.page_no(), aoffset, mtr); byte *len= &base->frame[boffset + FLST_LEN]; mtr->write<4>(*base, len, mach_read_from_4(len) + 1); @@ -136,29 +196,45 @@ static void flst_insert_before(buf_block_t *base, uint16_t boffset, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - fil_addr_t cur_addr= { cur->page.id.page_no(), coffset }; - fil_addr_t add_addr= { add->page.id.page_no(), aoffset }; fil_addr_t prev_addr= flst_get_prev_addr(cur->frame + coffset); - flst_write_addr(*add, add->frame + aoffset + FLST_PREV, prev_addr, mtr); - flst_write_addr(*add, add->frame + aoffset + FLST_NEXT, cur_addr, mtr); + flst_write_addr(*add, add->frame + aoffset + FLST_PREV, + prev_addr.page, prev_addr.boffset, 
mtr); + flst_write_addr(*add, add->frame + aoffset + FLST_NEXT, + cur->page.id.page_no(), coffset, mtr); if (fil_addr_is_null(prev_addr)) - flst_write_addr(*base, base->frame + boffset + FLST_FIRST, add_addr, mtr); + flst_write_addr(*base, base->frame + boffset + FLST_FIRST, + add->page.id.page_no(), aoffset, mtr); else { buf_block_t *block; flst_node_t *prev= fut_get_ptr(add->page.id.space(), add->zip_size(), prev_addr, RW_SX_LATCH, mtr, &block); - flst_write_addr(*block, prev + FLST_NEXT, add_addr, mtr); + flst_write_addr(*block, prev + FLST_NEXT, + add->page.id.page_no(), aoffset, mtr); } - flst_write_addr(*cur, cur->frame + coffset + FLST_PREV, add_addr, mtr); + flst_write_addr(*cur, cur->frame + coffset + FLST_PREV, + add->page.id.page_no(), aoffset, mtr); byte *len= &base->frame[boffset + FLST_LEN]; mtr->write<4>(*base, len, mach_read_from_4(len) + 1); } +/** Initialize a list base node. +@param[in] block file page +@param[in,out] base base node +@param[in,out] mtr mini-transaction */ +void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_page_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + mtr->write<4,mtr_t::OPT>(block, base + FLST_LEN, 0U); + static_assert(FLST_LAST == FLST_FIRST + FIL_ADDR_SIZE, "compatibility"); + flst_zero_both(block, base + FLST_FIRST, mtr); +} + /** Append a file list node to a list. @param[in,out] base base node block @param[in] boffset byte offset of the base node @@ -251,7 +327,8 @@ void flst_remove(buf_block_t *base, uint16_t boffset, const fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset); if (fil_addr_is_null(prev_addr)) - flst_write_addr(*base, base->frame + boffset + FLST_FIRST, next_addr, mtr); + flst_write_addr(*base, base->frame + boffset + FLST_FIRST, + next_addr.page, next_addr.boffset, mtr); else { buf_block_t *block= cur; @@ -259,11 +336,13 @@ void flst_remove(buf_block_t *base, uint16_t boffset, ? 
cur->frame + prev_addr.boffset : fut_get_ptr(cur->page.id.space(), cur->zip_size(), prev_addr, RW_SX_LATCH, mtr, &block); - flst_write_addr(*block, prev + FLST_NEXT, next_addr, mtr); + flst_write_addr(*block, prev + FLST_NEXT, + next_addr.page, next_addr.boffset, mtr); } if (fil_addr_is_null(next_addr)) - flst_write_addr(*base, base->frame + boffset + FLST_LAST, prev_addr, mtr); + flst_write_addr(*base, base->frame + boffset + FLST_LAST, + prev_addr.page, prev_addr.boffset, mtr); else { buf_block_t *block= cur; @@ -271,7 +350,8 @@ void flst_remove(buf_block_t *base, uint16_t boffset, ? cur->frame + next_addr.boffset : fut_get_ptr(cur->page.id.space(), cur->zip_size(), next_addr, RW_SX_LATCH, mtr, &block); - flst_write_addr(*block, next + FLST_PREV, prev_addr, mtr); + flst_write_addr(*block, next + FLST_PREV, + prev_addr.page, prev_addr.boffset, mtr); } byte *len= &base->frame[boffset + FLST_LEN]; diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc index 8cb47457415..a4dfb9fa453 100644 --- a/storage/innobase/gis/gis0rtree.cc +++ b/storage/innobase/gis/gis0rtree.cc @@ -300,8 +300,9 @@ rtr_update_mbr_field( memcpy(rec, node_ptr->fields[0].data, DATA_MBR_LEN); page_zip_write_rec(block, rec, index, offsets, 0, mtr); } else { - mtr->memcpy(block, page_offset(rec), - node_ptr->fields[0].data, DATA_MBR_LEN); + mtr->memcpy<mtr_t::OPT>(*block, rec, + node_ptr->fields[0].data, + DATA_MBR_LEN); } if (cursor2) { @@ -895,7 +896,6 @@ rtr_page_split_and_insert( rtr_split_node_t* cur_split_node; rtr_split_node_t* end_split_node; double* buf_pos; - ulint page_level; node_seq_t current_ssn; node_seq_t next_ssn; buf_block_t* root_block; @@ -926,7 +926,6 @@ func_start: block = btr_cur_get_block(cursor); page = buf_block_get_frame(block); page_zip = buf_block_get_page_zip(block); - page_level = btr_page_get_level(page); current_ssn = page_get_ssn_id(page); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); @@ -971,9 +970,19 @@ func_start: /* 
Allocate a new page to the index */ hint_page_no = page_no + 1; + const uint16_t page_level = btr_page_get_level(page); new_block = btr_page_alloc(cursor->index, hint_page_no, FSP_UP, page_level, mtr, mtr); + if (!new_block) { + return NULL; + } + new_page_zip = buf_block_get_page_zip(new_block); + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_block->frame + FIL_PAGE_PREV, 0, 4); + } btr_page_create(new_block, new_page_zip, cursor->index, page_level, mtr); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 0a078cce7df..5b42501d45d 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -18585,7 +18585,7 @@ checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save) mysql_mutex_unlock(&LOCK_global_system_variables); while (log_sys.last_checkpoint_lsn - + SIZE_OF_MLOG_CHECKPOINT + + SIZE_OF_FILE_CHECKPOINT + (log_sys.append_on_checkpoint != NULL ? log_sys.append_on_checkpoint->size() : 0) < log_sys.lsn) { diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 3634e372bd9..c644ce9593f 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -10890,19 +10890,14 @@ ha_innobase::commit_inplace_alter_table( ut_ad(trx->has_logged()); if (mtr.get_log()->size() > 0) { - ut_ad(*mtr.get_log()->front()->begin() - == MLOG_FILE_RENAME2); - - /* Append the MLOG_FILE_RENAME2 + ut_ad((*mtr.get_log()->front()->begin() + & 0xf0) == FILE_RENAME); + /* Append the FILE_RENAME records on checkpoint, as a separate mini-transaction before the one that - contains the MLOG_CHECKPOINT marker. */ - static const byte multi - = MLOG_MULTI_REC_END; - + contains the FILE_CHECKPOINT marker. 
*/ mtr.get_log()->for_each_block(logs); - logs.m_buf.push(&multi, sizeof multi); - + logs.m_buf.push(field_ref_zero, 1); log_append_on_checkpoint(&logs.m_buf); } diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index e0077e1ca42..c59474f7d12 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -276,23 +276,15 @@ btr_page_get_index_id( /*==================*/ const page_t* page) /*!< in: index page */ MY_ATTRIBUTE((warn_unused_result)); -/********************************************************//** -Gets the node level field in an index page. -@param[in] page index page -@return level, leaf level == 0 */ -UNIV_INLINE -ulint -btr_page_get_level(const page_t* page) +/** Read the B-tree or R-tree PAGE_LEVEL. +@param page B-tree or R-tree page +@return number of child page links to reach the leaf level +@retval 0 for leaf pages */ +inline uint16_t btr_page_get_level(const page_t *page) { - ulint level; - - ut_ad(page); - - level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); - - ut_ad(level <= BTR_MAX_NODE_LEVEL); - - return(level); + uint16_t level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); + ut_ad(level <= BTR_MAX_NODE_LEVEL); + return level; } MY_ATTRIBUTE((warn_unused_result)) /** Read FIL_PAGE_NEXT. @@ -403,6 +395,13 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false) @param[in,out] mtr mini-transaction */ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr); +/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE. 
+@param[in] index clustered index with instant ALTER TABLE +@param[in] all whether to reset FIL_PAGE_TYPE as well +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD __attribute__((nonnull)) +void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr); + /*************************************************************//** Makes tree one level higher by splitting the root, and inserts the tuple. It is assumed that mtr contains an x-latch on the tree. diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index 8f8ed344fa4..35ac49dd117 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -49,16 +49,11 @@ inline void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr) { ut_ad(level <= BTR_MAX_NODE_LEVEL); - - byte *page_level= PAGE_HEADER + PAGE_LEVEL + block->frame; - - if (UNIV_LIKELY_NULL(block->page.zip.data)) - { - mach_write_to_2(page_level, level); - page_zip_write_header(block, page_level, 2, mtr); - } - else - mtr->write<2,mtr_t::OPT>(*block, page_level, level); + constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL; + byte *b= my_assume_aligned<2>(&block->frame[field]); + if (mtr->write<2,mtr_t::OPT>(*block, b, level) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<2>(&block->page.zip.data[field], b, 2); } /** Set FIL_PAGE_NEXT. 
@@ -67,14 +62,11 @@ void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr) @param[in,out] mtr mini-transaction */ inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr) { - byte *fil_page_next= block->frame + FIL_PAGE_NEXT; - if (UNIV_LIKELY_NULL(block->page.zip.data)) - { - mach_write_to_4(fil_page_next, next); - page_zip_write_header(block, fil_page_next, 4, mtr); - } - else - mtr->write<4>(*block, fil_page_next, next); + constexpr uint16_t field= FIL_PAGE_NEXT; + byte *b= my_assume_aligned<4>(&block->frame[field]); + if (mtr->write<4,mtr_t::OPT>(*block, b, next) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<4>(&block->page.zip.data[field], b, 4); } /** Set FIL_PAGE_PREV. @@ -83,14 +75,11 @@ inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr) @param[in,out] mtr mini-transaction */ inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr) { - byte *fil_page_prev= block->frame + FIL_PAGE_PREV; - if (UNIV_LIKELY_NULL(block->page.zip.data)) - { - mach_write_to_4(fil_page_prev, prev); - page_zip_write_header(block, fil_page_prev, 4, mtr); - } - else - mtr->write<4>(*block, fil_page_prev, prev); + constexpr uint16_t field= FIL_PAGE_PREV; + byte *b= my_assume_aligned<4>(&block->frame[field]); + if (mtr->write<4,mtr_t::OPT>(*block, b, prev) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<4>(&block->page.zip.data[field], b, 4); } /**************************************************************//** diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h index 12ecddca44f..18ab4cfa4fc 100644 --- a/storage/innobase/include/btr0bulk.h +++ b/storage/innobase/include/btr0bulk.h @@ -109,10 +109,9 @@ private: template<format> inline void finishPage(); /** Insert a record in the page. 
@tparam format the page format - @param[in] rec record + @param[in,out] rec record @param[in] offsets record offsets */ - template<format> inline void insertPage(const rec_t* rec, - offset_t* offsets); + template<format> inline void insertPage(rec_t* rec, offset_t* offsets); public: /** Mark end of insertion to the page. Scan all records to set page diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h index f66c7e3d405..b15d35c4831 100644 --- a/storage/innobase/include/dyn0buf.h +++ b/storage/innobase/include/dyn0buf.h @@ -382,6 +382,9 @@ public: return(m_heap == NULL); } + /** @return whether the buffer is empty */ + bool empty() const { return !back()->m_used; } + private: // Disable copying mtr_buf_t(const mtr_buf_t&); diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 770c2e43a6c..5f5f3204a87 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -149,7 +149,7 @@ struct fil_space_t rw_lock_t latch; /*!< latch protecting the file space storage allocation */ UT_LIST_NODE_T(fil_space_t) named_spaces; - /*!< list of spaces for which MLOG_FILE_NAME + /*!< list of spaces for which FILE_MODIFY records have been issued */ /** Checks that this tablespace in a list of unflushed tablespaces. @return true if in a list */ @@ -641,13 +641,6 @@ extern const char* dot_ext[]; but in the MySQL Embedded Server Library and mysqlbackup it is not the default directory, and we must set the base file path explicitly */ extern const char* fil_path_to_mysql_datadir; - -/* Space address data type; this is intended to be used when -addresses accurate to a byte are stored in file pages. If the page part -of the address is FIL_NULL, the address is considered undefined. 
*/ - -typedef byte fil_faddr_t; /*!< 'type' definition in C: an address - stored in a file page is a string of bytes */ #else # include "univ.i" #endif /* !UNIV_INNOCHECKSUM */ @@ -951,7 +944,7 @@ public: /*!< list of all file spaces */ UT_LIST_BASE_NODE_T(fil_space_t) named_spaces; /*!< list of all file spaces - for which a MLOG_FILE_NAME + for which a FILE_MODIFY record has been written since the latest redo log checkpoint. Protected only by log_sys.mutex. */ @@ -1531,26 +1524,18 @@ void fil_names_dirty( fil_space_t* space); -/** Write MLOG_FILE_NAME records when a non-predefined persistent +/** Write FILE_MODIFY records when a non-predefined persistent tablespace was modified for the first time since the latest fil_names_clear(). -@param[in,out] space tablespace -@param[in,out] mtr mini-transaction */ -void -fil_names_dirty_and_write( - fil_space_t* space, - mtr_t* mtr); +@param[in,out] space tablespace */ +void fil_names_dirty_and_write(fil_space_t* space); -/** Write MLOG_FILE_NAME records if a persistent tablespace was modified +/** Write FILE_MODIFY records if a persistent tablespace was modified for the first time since the latest fil_names_clear(). @param[in,out] space tablespace @param[in,out] mtr mini-transaction -@return whether any MLOG_FILE_NAME record was written */ -inline MY_ATTRIBUTE((warn_unused_result)) -bool -fil_names_write_if_was_clean( - fil_space_t* space, - mtr_t* mtr) +@return whether any FILE_MODIFY record was written */ +inline bool fil_names_write_if_was_clean(fil_space_t* space) { ut_ad(log_mutex_own()); @@ -1563,7 +1548,7 @@ fil_names_write_if_was_clean( space->max_lsn = log_sys.lsn; if (was_clean) { - fil_names_dirty_and_write(space, mtr); + fil_names_dirty_and_write(space); } return(was_clean); @@ -1588,9 +1573,9 @@ inline void fil_space_open_if_needed(fil_space_t* space) } /** On a log checkpoint, reset fil_names_dirty_and_write() flags -and write out MLOG_FILE_NAME and MLOG_CHECKPOINT if needed. 
+and write out FILE_MODIFY and FILE_CHECKPOINT if needed. @param[in] lsn checkpoint LSN -@param[in] do_write whether to always write MLOG_CHECKPOINT +@param[in] do_write whether to always write FILE_CHECKPOINT @return whether anything was written to the redo log @retval false if no flags were set and nothing written @retval true if anything was written to the redo log */ diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 1aeb3867eea..1bde90fdad4 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -612,7 +612,7 @@ inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size) /** Initialize a file page whose prior contents should be ignored. @param[in,out] block buffer pool block */ -void fsp_apply_init_file_page(buf_block_t* block); +void fsp_apply_init_file_page(buf_block_t *block); /** Initialize a file page. @param[in] space tablespace diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h index e9355948599..1ade24cd069 100644 --- a/storage/innobase/include/fut0lst.h +++ b/storage/innobase/include/fut0lst.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2019, MariaDB Corporation. +Copyright (c) 2018, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -78,47 +78,12 @@ inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr) mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff); } -/** Write a null file address. 
-@param[in] b file page -@param[in,out] addr file address to be zeroed out -@param[in,out] mtr mini-transaction */ -inline void flst_zero_addr(const buf_block_t& b, fil_faddr_t *addr, mtr_t *mtr) -{ - if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL) - mtr->memset(&b, ulint(addr - b.frame) + FIL_ADDR_PAGE, 4, 0xff); - mtr->write<2,mtr_t::OPT>(b, addr + FIL_ADDR_BYTE, 0U); -} - -/** Write a file address. -@param[in] block file page -@param[in,out] faddr file address location -@param[in] addr file address to be written out -@param[in,out] mtr mini-transaction */ -inline void flst_write_addr(const buf_block_t& block, fil_faddr_t *faddr, - fil_addr_t addr, mtr_t* mtr) -{ - ut_ad(mtr->memo_contains_page_flagged(faddr, - MTR_MEMO_PAGE_X_FIX - | MTR_MEMO_PAGE_SX_FIX)); - ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); - ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); - - mtr->write<4,mtr_t::OPT>(block, faddr + FIL_ADDR_PAGE, addr.page); - mtr->write<2,mtr_t::OPT>(block, faddr + FIL_ADDR_BYTE, addr.boffset); -} - /** Initialize a list base node. @param[in] block file page @param[in,out] base base node @param[in,out] mtr mini-transaction */ -inline void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr) -{ - ut_ad(mtr->memo_contains_page_flagged(base, MTR_MEMO_PAGE_X_FIX | - MTR_MEMO_PAGE_SX_FIX)); - mtr->write<4,mtr_t::OPT>(block, base + FLST_LEN, 0U); - flst_zero_addr(block, base + FLST_FIRST, mtr); - flst_zero_addr(block, base + FLST_LAST, mtr); -} +void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); /** Append a file list node to a list. 
@param[in,out] base base node block @@ -155,7 +120,7 @@ inline uint32_t flst_get_len(const flst_base_node_t *base) } /** @return a file address */ -inline fil_addr_t flst_read_addr(const fil_faddr_t *faddr) +inline fil_addr_t flst_read_addr(const byte *faddr) { fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE), mach_read_from_2(faddr + FIL_ADDR_BYTE) }; diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 015e22cdfa4..49851cd6929 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -206,7 +206,7 @@ logs_empty_and_mark_files_at_shutdown(void); @param[in] header 0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */ void log_header_read(ulint header); /** Write checkpoint info to the log header and invoke log_mutex_exit(). -@param[in] end_lsn start LSN of the MLOG_CHECKPOINT mini-transaction */ +@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ void log_write_checkpoint_info(lsn_t end_lsn); /** Set extra data to be written to the redo log during checkpoint. 
@@ -499,6 +499,10 @@ struct log_t{ static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31; /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */ static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED; + /** The MariaDB 10.5 physical redo log format */ + static constexpr uint32_t FORMAT_10_5 = 0x50485953; + /** The MariaDB 10.5 physical format (only with innodb_encrypt_log=ON) */ + static constexpr uint32_t FORMAT_ENC_10_5 = FORMAT_10_5 | FORMAT_ENCRYPTED; MY_ALIGNED(CACHE_LINE_SIZE) lsn_t lsn; /*!< log sequence number */ @@ -548,7 +552,7 @@ struct log_t{ struct files { /** number of files */ ulint n_files; - /** format of the redo log: e.g., FORMAT_10_4 */ + /** format of the redo log: e.g., FORMAT_10_5 */ uint32_t format; /** redo log subformat: 0 with separately logged TRUNCATE, 2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */ @@ -586,6 +590,9 @@ struct log_t{ /** @return whether the redo log is encrypted */ bool is_encrypted() const { return format & FORMAT_ENCRYPTED; } + /** @return whether the redo log is in the physical format */ + bool is_physical() const + { return (format & ~FORMAT_ENCRYPTED) == FORMAT_10_5; } /** @return capacity in bytes */ lsn_t capacity() const{ return (file_size - LOG_FILE_HDR_SIZE) * n_files; } /** Calculate the offset of a log sequence number. 
@@ -718,6 +725,8 @@ public: /** @return whether the redo log is encrypted */ bool is_encrypted() const { return(log.is_encrypted()); } + /** @return whether the redo log is in the physical format */ + bool is_physical() const { return log.is_physical(); } bool is_initialised() const { return m_initialised; } diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index bd66f68b5ab..90b6cfe69d8 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -48,8 +48,10 @@ recv_find_max_checkpoint(ulint* max_field) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Apply any buffered redo log to a page that was just read from a data file. +@param[in,out] space tablespace @param[in,out] bpage buffer pool page */ -ATTRIBUTE_COLD void recv_recover_page(buf_page_t* bpage); +ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage) + MY_ATTRIBUTE((nonnull)); /** Start recovering from a redo log checkpoint. @see recv_recovery_from_checkpoint_finish @@ -102,24 +104,21 @@ to wait merging to file pages. @param[in] checkpoint_lsn the LSN of the latest checkpoint @param[in] store whether to store page operations @param[in] apply whether to apply the records -@return whether MLOG_CHECKPOINT record was seen the first time, -or corruption was noticed */ -bool recv_parse_log_recs( - lsn_t checkpoint_lsn, - store_t* store, - bool apply); +@return whether MLOG_CHECKPOINT or FILE_CHECKPOINT record +was seen the first time, or corruption was noticed */ +bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t *store, bool apply); /** Moves the parsing buffer data left to the buffer start */ void recv_sys_justify_left_parsing_buf(); /** Report an operation to create, delete, or rename a file during backup. 
@param[in] space_id tablespace identifier -@param[in] flags tablespace flags (NULL if not create) +@param[in] create whether the file is being created @param[in] name file name (not NUL-terminated) @param[in] len length of name, in bytes @param[in] new_name new file name (NULL if not rename) @param[in] new_len length of new_name, in bytes (0 if NULL) */ -extern void (*log_file_op)(ulint space_id, const byte* flags, +extern void (*log_file_op)(ulint space_id, bool create, const byte* name, ulint len, const byte* new_name, ulint new_len); @@ -134,7 +133,10 @@ struct log_rec_t /** next record */ log_rec_t *next; /** mtr_t::commit_lsn() of the mini-transaction */ - const lsn_t lsn; + lsn_t lsn; + +protected: + void set_lsn(lsn_t end_lsn) { ut_ad(lsn <= end_lsn); lsn= end_lsn; } }; struct recv_dblwr_t { @@ -171,13 +173,17 @@ struct page_recv_t /** log records are being applied on the page */ RECV_BEING_PROCESSED } state= RECV_NOT_PROCESSED; + /** Latest written byte offset when applying the log records. + @see mtr_t::m_last_offset */ + uint16_t last_offset= 1; /** log records for a page */ class recs_t { /** The first log record */ - log_rec_t *head= NULL; + log_rec_t *head= nullptr; /** The last log record */ - log_rec_t *tail= NULL; + log_rec_t *tail= nullptr; + friend struct page_recv_t; public: /** Append a redo log snippet for the page @param recs log snippet */ @@ -190,12 +196,10 @@ struct page_recv_t tail= recs; } - /** Trim old log records for a page. - @param start_lsn oldest log sequence number to preserve - @return whether all the log for the page was trimmed */ - inline bool trim(lsn_t start_lsn); /** @return the last log snippet */ const log_rec_t* last() const { return tail; } + /** @return the last log snippet */ + log_rec_t* last() { return tail; } class iterator { @@ -213,6 +217,10 @@ struct page_recv_t inline void clear(); } log; + /** Trim old log records for a page. 
+ @param start_lsn oldest log sequence number to preserve + @return whether all the log for the page was trimmed */ + inline bool trim(lsn_t start_lsn); /** Ignore any earlier redo log records for this page. */ inline void will_not_read(); /** @return whether the log records for the page are being processed */ @@ -288,7 +296,7 @@ struct recv_sys_t{ (indexed by page_id_t::space() - srv_undo_space_id_start) */ struct trunc { - /** log sequence number of MLOG_FILE_CREATE2, or 0 if none */ + /** log sequence number of FILE_CREATE, or 0 if none */ lsn_t lsn; /** truncated size of the tablespace, or 0 if not truncated */ unsigned pages; @@ -342,8 +350,25 @@ public: const byte* body, const byte* rec_end, lsn_t lsn, lsn_t end_lsn); - /** Clear a fully processed set of stored redo log records. */ - inline void clear(); + /** Register a redo log snippet for a page. + @param page_id page identifier + @param start_lsn start LSN of the mini-transaction + @param lsn @see mtr_t::commit_lsn() + @param l redo log snippet @see log_t::FORMAT_10_5 + @param len length of l, in bytes */ + inline void add(const page_id_t page_id, lsn_t start_lsn, lsn_t lsn, + const byte *l, size_t len); + + /** Parse and register one mini-transaction in log_t::FORMAT_10_5. + @param checkpoint_lsn the log sequence number of the latest checkpoint + @param store whether to store the records + @param apply whether to apply file-level log records + @return whether FILE_CHECKPOINT record was seen the first time, + or corruption was noticed */ + inline bool parse(lsn_t checkpoint_lsn, store_t store, bool apply); + + /** Clear a fully processed set of stored redo log records. */ + inline void clear(); /** Determine whether redo log recovery progress should be reported. 
@param[in] time the current time @@ -362,19 +387,15 @@ public: /** The alloc() memory alignment, in bytes */ static constexpr size_t ALIGNMENT= sizeof(size_t); - /** Get the memory block for storing recv_t and redo log data - @param[in] len length of the data to be stored - @param[in] store_recv whether to store recv_t object + /** Allocate memory for log_rec_t + @param len allocation size, in bytes @return pointer to len bytes of memory (never NULL) */ - inline byte *alloc(size_t len, bool store_recv= false); + inline void *alloc(size_t len, bool store_recv= false); /** Free a redo log snippet. @param data buffer returned by alloc() */ inline void free(const void *data); - /** @return the free length of the latest alloc() block, in bytes */ - inline size_t get_free_len() const; - /** Remove records for a corrupted page. This function should only be called when innodb_force_recovery is set. @param page_id corrupted page identifier */ diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index 06afdbb54bc..71faf119cf0 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -33,82 +33,478 @@ Created 12/7/1995 Heikki Tuuri // Forward declaration struct dict_index_t; +/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */ +constexpr uint32_t MIN_2BYTE= 1 << 7; +/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx) */ +constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14); +/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx) */ +constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21); +/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx) */ +constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28); + +/** Error from mlog_decode_varint() */ +constexpr uint32_t MLOG_DECODE_ERROR= ~0U; + +/** Decode the length of a variable-length encoded integer. 
+@param first first byte of the encoded integer +@return the length, in bytes */ +inline uint8_t mlog_decode_varint_length(byte first) +{ + uint8_t len= 1; + for (; first & 0x80; len++, first<<= 1); + return len; +} + +/** Decode an integer in a redo log record. +@param log redo log record buffer +@return the decoded integer +@retval MLOG_DECODE_ERROR on error */ +inline uint32_t mlog_decode_varint(const byte* log) +{ + uint32_t i= *log; + if (i < MIN_2BYTE) + return i; + if (i < 0xc0) + return MIN_2BYTE + ((i & ~0x80) << 8 | log[1]); + if (i < 0xe0) + return MIN_3BYTE + ((i & ~0xc0) << 16 | uint32_t{log[1]} << 8 | log[2]); + if (i < 0xf0) + return MIN_4BYTE + ((i & ~0xe0) << 24 | uint32_t{log[1]} << 16 | + uint32_t{log[2]} << 8 | log[3]); + if (i == 0xf0) + { + i= uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 | + uint32_t{log[3]} << 8 | log[4]; + if (i <= ~MIN_5BYTE) + return MIN_5BYTE + i; + } + return MLOG_DECODE_ERROR; +} + +/** Encode an integer in a redo log record. +@param log redo log record buffer +@param i the integer to encode +@return end of the encoded integer */ +inline byte *mlog_encode_varint(byte *log, size_t i) +{ + if (i < MIN_2BYTE) + { + } + else if (i < MIN_3BYTE) + { + i-= MIN_2BYTE; + static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility"); + *log++= 0x80 | static_cast<byte>(i >> 8); + } + else if (i < MIN_4BYTE) + { + i-= MIN_3BYTE; + static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility"); + *log++= 0xc0 | static_cast<byte>(i >> 16); + goto last2; + } + else if (i < MIN_5BYTE) + { + i-= MIN_4BYTE; + static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility"); + *log++= 0xe0 | static_cast<byte>(i >> 24); + goto last3; + } + else + { + ut_ad(i < MLOG_DECODE_ERROR); + i-= MIN_5BYTE; + *log++= 0xf0; + *log++= static_cast<byte>(i >> 24); +last3: + *log++= static_cast<byte>(i >> 16); +last2: + *log++= static_cast<byte>(i >> 8); + } + *log++= static_cast<byte>(i); + return log; +} + +/** Determine the length of a log 
record. +@param log start of log record +@param end end of the log record buffer +@return the length of the record, in bytes +@retval 0 if the log extends past the end +@retval MLOG_DECODE_ERROR if the record is corrupted */ +inline uint32_t mlog_decode_len(const byte *log, const byte *end) +{ + ut_ad(log < end); + uint32_t i= *log; + if (!i) + return 0; /* end of mini-transaction */ + if (~i & 15) + return (i & 15) + 1; /* 1..16 bytes */ + if (UNIV_UNLIKELY(++log == end)) + return 0; /* end of buffer */ + i= *log; + if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */ + return 16 + i; + if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */ + { + if (UNIV_UNLIKELY(log + 1 == end)) + return 0; /* end of buffer */ + return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]); + } + if (i < 0xe0) /* 3 additional length bytes: 16528..1065103 bytes */ + { + if (UNIV_UNLIKELY(log + 2 == end)) + return 0; /* end of buffer */ + return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 | + static_cast<uint32_t>(log[1]) << 8 | log[2]); + } + /* 1,065,103 bytes per log record ought to be enough for everyone */ + return MLOG_DECODE_ERROR; +} + /** Write 1, 2, 4, or 8 bytes to a file page. @param[in] block file page @param[in,out] ptr pointer in file page @param[in] val value to write @tparam l number of bytes to write @tparam w write request type -@tparam V type of val */ +@tparam V type of val +@return whether any log was written */ template<unsigned l,mtr_t::write_type w,typename V> -inline void mtr_t::write(const buf_block_t &block, byte *ptr, V val) +inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val) { ut_ad(ut_align_down(ptr, srv_page_size) == block.frame); - ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO || - !block.page.zip.data || - /* written by fil_crypt_rotate_page() or innodb_make_page_dirty()? 
*/ - (w == FORCED && l == 1 && ptr == &block.frame[FIL_PAGE_SPACE_ID]) || - mach_read_from_2(block.frame + FIL_PAGE_TYPE) <= FIL_PAGE_TYPE_ZBLOB2); static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length"); + byte buf[l]; switch (l) { case 1: - if (w == OPT && mach_read_from_1(ptr) == val) return; - ut_ad(w != NORMAL || mach_read_from_1(ptr) != val); ut_ad(val == static_cast<byte>(val)); - *ptr= static_cast<byte>(val); + buf[0]= static_cast<byte>(val); break; case 2: ut_ad(val == static_cast<uint16_t>(val)); - if (w == OPT && mach_read_from_2(ptr) == val) return; - ut_ad(w != NORMAL || mach_read_from_2(ptr) != val); - mach_write_to_2(ptr, static_cast<uint16_t>(val)); + mach_write_to_2(buf, static_cast<uint16_t>(val)); break; case 4: ut_ad(val == static_cast<uint32_t>(val)); - if (w == OPT && mach_read_from_4(ptr) == val) return; - ut_ad(w != NORMAL || mach_read_from_4(ptr) != val); - mach_write_to_4(ptr, static_cast<uint32_t>(val)); + mach_write_to_4(buf, static_cast<uint32_t>(val)); break; case 8: - if (w == OPT && mach_read_from_8(ptr) == val) return; - ut_ad(w != NORMAL || mach_read_from_8(ptr) != val); - mach_write_to_8(ptr, val); + mach_write_to_8(buf, val); break; } + byte *p= static_cast<byte*>(ptr); + const byte *const end= p + l; + if (w != FORCED && m_log_mode == MTR_LOG_ALL) + { + const byte *b= buf; + while (*p++ == *b++) + { + if (p == end) + { + ut_ad(w == OPT); + return false; + } + } + p--; + } + ::memcpy(ptr, buf, l); + memcpy_low(block.page, static_cast<uint16_t> + (ut_align_offset(p, srv_page_size)), p, end - p); + return true; +} + +/** Log an initialization of a string of bytes. +@param[in] b buffer page +@param[in] ofs byte offset from b->frame +@param[in] len length of the data to write +@param[in] val the data byte to write */ +inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val) +{ + ut_ad(len); set_modified(); if (m_log_mode != MTR_LOG_ALL) return; - byte *log_ptr= m_log.open(11 + 2 + (l == 8 ? 
9 : 5)); - if (l == 8) - log_write(block, ptr, static_cast<mlog_id_t>(l), log_ptr, uint64_t{val}); - else - log_write(block, ptr, static_cast<mlog_id_t>(l), log_ptr, - static_cast<uint32_t>(val)); + + static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency"); + size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 2 + 1 : 3 + 1); + byte *l= log_write<MEMSET>(b.page.id, &b.page, lenlen, true, ofs); + l= mlog_encode_varint(l, len); + *l++= val; + m_log.close(l); + m_last_offset= static_cast<uint16_t>(ofs + len); } -/** Write a byte string to a page. +/** Initialize a string of bytes. @param[in,out] b buffer page +@param[in] ofs byte offset from block->frame +@param[in] len length of the data to write +@param[in] val the data byte to write */ +inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val) +{ + ut_ad(ofs <= ulint(srv_page_size)); + ut_ad(ofs + len <= ulint(srv_page_size)); + ::memset(ofs + b->frame, val, len); + memset(*b, ofs, len, val); +} + +/** Log an initialization of a repeating string of bytes. +@param[in] b buffer page +@param[in] ofs byte offset from b->frame +@param[in] len length of the data to write, in bytes +@param[in] str the string to write +@param[in] size size of str, in bytes */ +inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len, + const void *str, size_t size) +{ + ut_ad(size); + ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */ + set_modified(); + if (m_log_mode != MTR_LOG_ALL) + return; + + static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency"); + size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3); + byte *l= log_write<MEMSET>(b.page.id, &b.page, lenlen + size, true, ofs); + l= mlog_encode_varint(l, len); + ::memcpy(l, str, size); + l+= size; + m_log.close(l); + m_last_offset= static_cast<uint16_t>(ofs + len); +} + +/** Initialize a repeating string of bytes. 
+@param[in,out] b buffer page +@param[in] ofs byte offset from b->frame +@param[in] len length of the data to write, in bytes +@param[in] str the string to write +@param[in] size size of str, in bytes */ +inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len, + const void *str, size_t size) +{ + ut_ad(ofs <= ulint(srv_page_size)); + ut_ad(ofs + len <= ulint(srv_page_size)); + ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */ + size_t s= 0; /* number of bytes filled so far */ + while (len - s >= size) + { + ::memcpy(ofs + s + b->frame, str, size); + s+= size; + } + ::memcpy(ofs + s + b->frame, str, len - s); /* tail shorter than size */ + memset(*b, ofs, len, str, size); +} + +/** Log a write of a byte string to a page. +@param[in] b buffer page @param[in] offset byte offset from b->frame @param[in] str the data to write @param[in] len length of the data to write */ -inline -void mtr_t::memcpy(buf_block_t *b, ulint offset, const void *str, ulint len) +inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len) +{ + ut_ad(len); + ut_ad(offset <= ulint(srv_page_size)); + ut_ad(offset + len <= ulint(srv_page_size)); + memcpy_low(b.page, uint16_t(offset), &b.frame[offset], len); +} + +/** Log a write of a byte string to a page.
+@param id page identifier +@param offset byte offset within page +@param data data to be written +@param len length of the data, in bytes */ +inline void mtr_t::memcpy_low(const buf_page_t &bpage, uint16_t offset, + const void *data, size_t len) +{ + ut_ad(len); + set_modified(); + if (m_log_mode != MTR_LOG_ALL) + return; + if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5)) + { + byte *end= log_write<WRITE>(bpage.id, &bpage, len, true, offset); + ::memcpy(end, data, len); + m_log.close(end + len); + } + else + { + m_log.close(log_write<WRITE>(bpage.id, &bpage, len, false, offset)); + m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len)); + } + m_last_offset= static_cast<uint16_t>(offset + len); +} + +/** Log that a string of bytes was copied from the same page. +@param[in] b buffer page +@param[in] d destination offset within the page +@param[in] s source offset within the page +@param[in] len length of the data to copy */ +inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len) { - ::memcpy(b->frame + offset, str, len); - memcpy(*b, offset, len); + ut_ad(d >= 8); + ut_ad(s >= 8); + ut_ad(len); + ut_ad(s <= ulint(srv_page_size)); + ut_ad(s + len <= ulint(srv_page_size)); + ut_ad(s != d); + ut_ad(d <= ulint(srv_page_size)); + ut_ad(d + len <= ulint(srv_page_size)); + + set_modified(); + if (m_log_mode != MTR_LOG_ALL) + return; + static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency"); + size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3); + /* The source offset is encoded relative to the destination offset, + with the sign in the least significant bit. */ + if (s > d) + s= (s - d) << 1; + else + s= (d - s) << 1 | 1; + /* The source offset 0 is not possible. */ + s-= 1 << 1; + size_t slen= (s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 
2 : 3); + byte *l= log_write<MEMMOVE>(b.page.id, &b.page, lenlen + slen, true, d); + l= mlog_encode_varint(l, len); + l= mlog_encode_varint(l, s); + m_log.close(l); + m_last_offset= static_cast<uint16_t>(d + len); +} + +/** +Write a log record. +@tparam type redo log record type +@param id persistent page identifier +@param bpage buffer pool page, or nullptr +@param len number of additional bytes to write +@param alloc whether to allocate the additional bytes +@param offset byte offset, or 0 if the record type does not allow one +@return end of mini-transaction log, minus len */ +template<byte type> +inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage, + size_t len, bool alloc, size_t offset) +{ + static_assert(!(type & 15) && type != RESERVED && type != OPTION && + type <= FILE_CHECKPOINT, "invalid type"); + ut_ad(type >= FILE_CREATE || is_named_space(id.space())); + ut_ad(!bpage || bpage->id == id); + constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE; + constexpr bool have_offset= type == WRITE || type == MEMSET || + type == MEMMOVE; + static_assert(!have_offset || have_len, "consistency"); + ut_ad(have_len || len == 0); + ut_ad(have_len || !alloc); + ut_ad(have_offset || offset == 0); + ut_ad(offset + len <= srv_page_size); + static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency"); + + size_t max_len; + if (!have_len) + max_len= 1 + 5 + 5; + else if (!have_offset) + max_len= m_last == bpage + ? 1 + 3 + : 1 + 3 + 5 + 5; + else if (m_last == bpage && m_last_offset <= offset) + { + /* Encode the offset relative from m_last_offset. */ + offset-= m_last_offset; + max_len= 1 + 3 + 3; + } + else + max_len= 1 + 3 + 5 + 5 + 3; + byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len); + byte *end= log_ptr + 1; + const byte same_page= max_len < 1 + 5 + 5 ? 
0x80 : 0; + if (!same_page) + { + end= mlog_encode_varint(end, id.space()); + end= mlog_encode_varint(end, id.page_no()); + m_last= bpage; + } + if (have_offset) + { + byte* oend= mlog_encode_varint(end, offset); + if (oend + len > &log_ptr[16]) + { + len+= oend - log_ptr - 15; + if (len >= MIN_3BYTE) + len+= 2; + else if (len >= MIN_2BYTE) + len++; + + *log_ptr= type | same_page; + end= mlog_encode_varint(log_ptr + 1, len); + if (!same_page) + { + end= mlog_encode_varint(end, id.space()); + end= mlog_encode_varint(end, id.page_no()); + } + end= mlog_encode_varint(end, offset); + return end; + } + else + end= oend; + } + else if (len >= 3 && end + len > &log_ptr[16]) + { + len+= end - log_ptr - 16; + if (len >= MIN_3BYTE) + len+= 2; + else if (len >= MIN_2BYTE) + len++; + + end= log_ptr; + *end++= type | same_page; + mlog_encode_varint(end, len); + + if (!same_page) + { + end= mlog_encode_varint(end, id.space()); + end= mlog_encode_varint(end, id.page_no()); + } + return end; + } + + ut_ad(end + len >= &log_ptr[1] + !same_page); + ut_ad(end + len <= &log_ptr[16]); + ut_ad(end <= &log_ptr[max_len]); + *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1); + ut_ad(*log_ptr & 15); + return end; } /** Write a byte string to a page. 
-@param[in,out] b ROW_FORMAT=COMPRESSED index page -@param[in] ofs byte offset from b->zip.data +@param[in] b buffer page +@param[in] dest destination within b.frame @param[in] str the data to write -@param[in] len length of the data to write */ -inline -void mtr_t::zmemcpy(buf_page_t *b, ulint offset, const void *str, ulint len) +@param[in] len length of the data to write +@tparam w write request type */ +template<mtr_t::write_type w> +inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str, + ulint len) { - ::memcpy(b->zip.data + offset, str, len); - zmemcpy(*b, offset, len); + ut_ad(ut_align_down(dest, srv_page_size) == b.frame); + char *d= static_cast<char*>(dest); + const char *s= static_cast<const char*>(str); + if (w != FORCED && m_log_mode == MTR_LOG_ALL) + { + ut_ad(len); + const char *const end= d + len; + while (*d++ == *s++) + { + if (d == end) + { + ut_ad(w == OPT); + return; + } + } + s--; + d--; + len= static_cast<ulint>(end - d); + } + ::memcpy(d, s, len); + memcpy(b, ut_align_offset(d, srv_page_size), len); } /** Initialize an entire page. @@ -121,13 +517,37 @@ inline void mtr_t::init(buf_block_t *b) return; } - m_log.close(log_write_low(MLOG_INIT_FILE_PAGE2, b->page.id, m_log.open(11))); + m_log.close(log_write<INIT_PAGE>(b->page.id, &b->page)); + m_last_offset= FIL_PAGE_TYPE; b->page.init_on_flush= true; } +/** Free a page. +@param id page identifier */ +inline void mtr_t::free(const page_id_t id) +{ + if (m_log_mode == MTR_LOG_ALL) + m_log.close(log_write<FREE_PAGE>(id, nullptr)); +} + +/** Partly initialize a B-tree page. 
+@param block B-tree page +@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */ +inline void mtr_t::page_create(const buf_block_t &block, bool comp) +{ + set_modified(); + if (m_log_mode != MTR_LOG_ALL) + return; + byte *l= log_write<INIT_INDEX_PAGE>(block.page.id, &block.page, 1, true); + *l++= comp; + m_log.close(l); + m_last_offset= FIL_PAGE_TYPE; +} + /********************************************************//** -Parses an initial log record written by mtr_t::log_write_low(). +Parses an initial log record written by mlog_write_initial_log_record_low(). @return parsed record end, NULL if not a complete record */ +ATTRIBUTE_COLD /* only used when crash-upgrading */ const byte* mlog_parse_initial_log_record( /*==========================*/ diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index f2f8ee13a2a..49537faa030 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -129,7 +129,7 @@ struct mtr_t { /** Commit a mini-transaction that did not modify any pages, but generated some redo log on a higher level, such as - MLOG_FILE_NAME records and an optional MLOG_CHECKPOINT marker. + FILE_MODIFY records and an optional FILE_CHECKPOINT marker. The caller must invoke log_mutex_enter() and log_mutex_exit(). This is to be used at log_checkpoint(). 
@param checkpoint_lsn the log sequence number of a checkpoint, or 0 */ @@ -171,7 +171,7 @@ struct mtr_t { inline mtr_log_t set_log_mode(mtr_log_t mode); /** Copy the tablespaces associated with the mini-transaction - (needed for generating MLOG_FILE_NAME records) + (needed for generating FILE_MODIFY records) @param[in] mtr mini-transaction that may modify the same set of tablespaces as this one */ void set_spaces(const mtr_t& mtr) @@ -184,7 +184,7 @@ struct mtr_t { } /** Set the tablespace associated with the mini-transaction - (needed for generating a MLOG_FILE_NAME record) + (needed for generating a FILE_MODIFY record) @param[in] space_id user or system tablespace ID @return the tablespace */ fil_space_t* set_named_space_id(ulint space_id) @@ -203,7 +203,7 @@ struct mtr_t { } /** Set the tablespace associated with the mini-transaction - (needed for generating a MLOG_FILE_NAME record) + (needed for generating a FILE_MODIFY record) @param[in] space user or system tablespace */ void set_named_space(fil_space_t* space) { @@ -216,12 +216,12 @@ struct mtr_t { #ifdef UNIV_DEBUG /** Check the tablespace associated with the mini-transaction - (needed for generating a MLOG_FILE_NAME record) + (needed for generating a FILE_MODIFY record) @param[in] space tablespace @return whether the mini-transaction is associated with the space */ bool is_named_space(ulint space) const; /** Check the tablespace associated with the mini-transaction - (needed for generating a MLOG_FILE_NAME record) + (needed for generating a FILE_MODIFY record) @param[in] space tablespace @return whether the mini-transaction is associated with the space */ bool is_named_space(const fil_space_t* space) const; @@ -407,136 +407,124 @@ struct mtr_t { @param[in] val value to write @tparam l number of bytes to write @tparam w write request type - @tparam V type of val */ + @tparam V type of val + @return whether any log was written */ template<unsigned l,write_type w= NORMAL,typename V> - inline void write(const 
buf_block_t &block, byte *ptr, V val) + inline bool write(const buf_block_t &block, void *ptr, V val) MY_ATTRIBUTE((nonnull)); /** Log a write of a byte string to a page. @param[in] b buffer page @param[in] ofs byte offset from b->frame @param[in] len length of the data to write */ - void memcpy(const buf_block_t &b, ulint ofs, ulint len); + inline void memcpy(const buf_block_t &b, ulint ofs, ulint len); /** Write a byte string to a page. @param[in,out] b buffer page - @param[in] offset byte offset from b->frame + @param[in] dest destination within b.frame @param[in] str the data to write - @param[in] len length of the data to write */ - inline void memcpy(buf_block_t *b, ulint offset, const void *str, ulint len); + @param[in] len length of the data to write + @tparam w write request type */ + template<write_type w= NORMAL> + inline void memcpy(const buf_block_t &b, void *dest, const void *str, + ulint len); - /** Write a byte string to a ROW_FORMAT=COMPRESSED page. + /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page. @param[in] b ROW_FORMAT=COMPRESSED index page - @param[in] ofs byte offset from b.zip.data + @param[in] offset byte offset from b.zip.data @param[in] len length of the data to write */ - void zmemcpy(const buf_page_t &b, ulint offset, ulint len); + inline void zmemcpy(const buf_page_t &b, ulint offset, ulint len); /** Write a byte string to a ROW_FORMAT=COMPRESSED page. @param[in,out] b ROW_FORMAT=COMPRESSED index page - @param[in] ofs byte offset from b->zip.data + @param[in] dest destination within b.zip.data @param[in] str the data to write - @param[in] len length of the data to write */ - inline void zmemcpy(buf_page_t *b, ulint offset, const void *str, ulint len); + @param[in] len length of the data to write + @tparam w write request type */ + template<write_type w= NORMAL> + inline void zmemcpy(const buf_page_t &b, void *dest, const void *str, + ulint len); + + /** Log an initialization of a string of bytes. 
+ @param[in] b buffer page + @param[in] ofs byte offset from b->frame + @param[in] len length of the data to write + @param[in] val the data byte to write */ + inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val); /** Initialize a string of bytes. @param[in,out] b buffer page @param[in] ofs byte offset from b->frame @param[in] len length of the data to write @param[in] val the data byte to write */ - void memset(const buf_block_t* b, ulint ofs, ulint len, byte val); + inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val); + + /** Log an initialization of a repeating string of bytes. + @param[in] b buffer page + @param[in] ofs byte offset from b->frame + @param[in] len length of the data to write, in bytes + @param[in] str the string to write + @param[in] size size of str, in bytes */ + inline void memset(const buf_block_t &b, ulint ofs, size_t len, + const void *str, size_t size); + + /** Initialize a repeating string of bytes. + @param[in,out] b buffer page + @param[in] ofs byte offset from b->frame + @param[in] len length of the data to write, in bytes + @param[in] str the string to write + @param[in] size size of str, in bytes */ + inline void memset(const buf_block_t *b, ulint ofs, size_t len, + const void *str, size_t size); + + /** Log that a string of bytes was copied from the same page. + @param[in] b buffer page + @param[in] d destination offset within the page + @param[in] s source offset within the page + @param[in] len length of the data to copy */ + inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len); /** Initialize an entire page. @param[in,out] b buffer page */ void init(buf_block_t *b); /** Free a page. @param id page identifier */ - void free(const page_id_t id) { log_page_write(id, MLOG_INIT_FREE_PAGE); } - + inline void free(const page_id_t id); /** Partly initialize a B-tree page. 
- @param id page identifier + @param block B-tree page @param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */ - void page_create(const page_id_t id, bool comp) - { - set_modified(); - log_page_write(id, comp ? MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE); - } + inline void page_create(const buf_block_t &block, bool comp); /** Write a log record about a file operation. @param type file operation @param space_id tablespace identifier @param first_page_no first page number in the file @param path file path - @param new_path new file path for type=MLOG_FILE_RENAME2 - @param flags tablespace flags for type=MLOG_FILE_CREATE2 */ - inline void log_file_op(mlog_id_t type, ulint space_id, ulint first_page_no, - const char *path, - const char *new_path= nullptr, ulint flags= 0); + @param new_path new file path for type=FILE_RENAME */ + inline void log_file_op(mfile_type_t type, ulint space_id, + ulint first_page_no, const char *path, + const char *new_path= nullptr); private: - /** - Write a complex page operation. - @param id page identifier - @param type type of operation */ - void log_page_write(const page_id_t id, mlog_id_t type) - { - ut_ad(type == MLOG_INIT_FREE_PAGE || type == MLOG_COMP_PAGE_CREATE || - type == MLOG_PAGE_CREATE); - - if (m_log_mode == MTR_LOG_ALL) - m_log.close(log_write_low(type, id, m_log.open(11))); - } + /** Log a write of a byte string to a page. + @param b buffer page + @param offset byte offset within page + @param data data to be written + @param len length of the data, in bytes */ + inline void memcpy_low(const buf_page_t &bpage, uint16_t offset, + const void *data, size_t len); /** Write a log record. 
- @param type redo log record type + @tparam type redo log record type @param id persistent page identifier - @param l current end of mini-transaction log - @return new end of mini-transaction log */ - inline byte *log_write_low(mlog_id_t type, const page_id_t id, byte *l) - { - ut_ad(type <= MLOG_BIGGEST_TYPE); - ut_ad(type == MLOG_FILE_NAME || type == MLOG_FILE_DELETE || - type == MLOG_FILE_CREATE2 || type == MLOG_FILE_RENAME2 || - is_named_space(id.space())); - - *l++= type; - - l+= mach_write_compressed(l, id.space()); - l+= mach_write_compressed(l, id.page_no()); - - ++m_n_log_recs; - return l; - } - - /** - Write a log record for writing 1, 2, 4, or 8 bytes. - @param[in] type number of bytes to write - @param[in] block file page - @param[in] ptr pointer within block.frame - @param[in,out] l log record buffer - @return new end of mini-transaction log */ - byte *log_write_low(mlog_id_t type, const buf_block_t &block, - const byte *ptr, byte *l); - - /** - Write a log record for writing 1, 2, or 4 bytes. - @param[in] block file page - @param[in,out] ptr pointer in file page - @param[in] l number of bytes to write - @param[in,out] log_ptr log record buffer - @param[in] val value to write */ - void log_write(const buf_block_t &block, byte *ptr, mlog_id_t l, - byte *log_ptr, uint32_t val) - MY_ATTRIBUTE((nonnull)); - /** - Write a log record for writing 8 bytes. 
- @param[in] block file page - @param[in,out] ptr pointer in file page - @param[in] l number of bytes to write (8) - @param[in,out] log_ptr log record buffer - @param[in] val value to write */ - void log_write(const buf_block_t &block, byte *ptr, mlog_id_t l, - byte *log_ptr, uint64_t val) - MY_ATTRIBUTE((nonnull)); + @param bpage buffer pool page, or nullptr + @param len number of additional bytes to write + @param alloc whether to allocate the additional bytes + @param offset byte offset, or 0 if the record type does not allow one + @return end of mini-transaction log, minus len */ + template<byte type> + inline byte *log_write(const page_id_t id, const buf_page_t *bpage, + size_t len= 0, bool alloc= false, size_t offset= 0); /** Prepare to write the mini-transaction log to the redo log buffer. @return number of bytes to write in finish_write() */ @@ -563,6 +551,11 @@ private: bool m_commit= false; #endif + /** The page of the most recent m_log record written, or NULL */ + const buf_page_t* m_last; + /** The current byte offset in m_last, or 0 */ + uint16_t m_last_offset; + /** specifies which operations should be logged; default MTR_LOG_ALL */ uint16_t m_log_mode:2; @@ -576,8 +569,6 @@ private: to suppress some read-ahead operations, @see ibuf_inside() */ uint16_t m_inside_ibuf:1; - /** number of m_log records */ - uint16_t m_n_log_recs:11; #ifdef UNIV_DEBUG /** Persistent user tablespace associated with the mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */ diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index e00ae2057df..016d7c768c2 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -204,7 +204,7 @@ mtr_t::set_log_mode(mtr_log_t mode) case MTR_LOG_ALL: /* MTR_LOG_NO_REDO can only be set before generating any redo log records. 
*/ - ut_ad(mode != MTR_LOG_NO_REDO || m_n_log_recs == 0); + ut_ad(mode != MTR_LOG_NO_REDO || m_log.empty()); m_log_mode = mode; return(old_mode); } diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index ef180fb36bc..bdcca691c2e 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -29,6 +29,8 @@ Created 11/26/1995 Heikki Tuuri #ifndef UNIV_INNOCHECKSUM #include "sync0rw.h" +#else +#include "univ.i" #endif /* UNIV_INNOCHECKSUM */ struct mtr_t; @@ -47,6 +49,233 @@ enum mtr_log_t { MTR_LOG_NO_REDO }; +/* +A mini-transaction is a stream of records that is always terminated by +a NUL byte. The first byte of a mini-transaction record is never NUL, +but NUL bytes can occur within mini-transaction records. The first +bytes of each record will explicitly encode the length of the record. +NUL bytes also act as padding in log blocks, that is, there can be +multiple successive NUL bytes between mini-transactions in a redo log +block. + +The first byte of the record would contain a record type, flags, and a +part of length. The optional second byte of the record will contain +more length. (Not needed for short records.) + +Bit 7 of the first byte of a redo log record is the same_page flag. +If same_page=1, the record is referring to the same page as the +previous record. Records that do not refer to data pages but to file +operations are identified by setting the same_page=1 in the very first +record(s) of the mini-transaction. A mini-transaction record that +carries same_page=0 must only be followed by page-oriented records. + +Bits 6..4 of the first byte of a redo log record identify the redo log
The following record types refer to data pages: + + FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE + INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2 + INIT_INDEX_PAGE (2): initialize a B-tree or R-tree page + WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_* + MEMSET (4): extends the 10.4 MLOG_MEMSET record + MEMMOVE (5): copy data within the page (avoids logging redundant data) + RESERVED (6): reserved for future use; a subtype code + (encoded immediately after the length) would be written + to reserve code space for further extensions + OPTION (7): optional record that may be ignored; a subtype code + (encoded immediately after the length) would distinguish actual + usage, such as: + * MDEV-18976 page checksum record + * binlog record + * SQL statement (at the start of statement) + +Bits 3..0 indicate the redo log record length, excluding the first +byte, but including additional length bytes and any other bytes, +such as the optional tablespace identifier and page number. +Values 1..15 represent lengths of 1 to 15 bytes. The special value 0 +indicates that 1 to 3 length bytes will follow to encode the remaining +length that exceeds 16 bytes. + +Additional length bytes if length>16: 0 to 3 bytes +0xxxxxxx for 0 to 127 (total: 16 to 143 bytes) +10xxxxxx xxxxxxxx for 128 to 16511 (total: 144 to 16527) +110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679) +111xxxxx reserved (corrupted record, and file!) 
+ +If same_page=0, the tablespace identifier and page number will use +similar 1-to-5-byte variable-length encoding: +0xxxxxxx for 0 to 127 +10xxxxxx xxxxxxxx for 128 to 16,511 +110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663 +1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx for 2,113,664 to 270,549,119 +11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487 +11111xxx reserved (corrupted record) +Note: Some 5-byte values are reserved, because the tablespace identifier +and page number can only be up to 4,294,967,295. + +If same_page=1 is set in a record that follows a same_page=0 record +in a mini-transaction, the tablespace identifier and page number +fields will be omitted. + +(For some file-oriented records (if same_page=1 for the first records +of a mini-transaction), we will write tablespace identifier using the +same 1-to-5-byte encoding. TBD: describe the exact format of +file-oriented records. With MDEV-14425, we could write file-level log +records to a separate file, not interleaved with page-level redo log +at all. We could reserve the file ib_logfile0 for checkpoint information +and for file-level redo log records.) + +For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated +as corrupted (or reserved for future extension). The type code must +be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier +and page number). If the record length does not match the encoded +lengths of the tablespace identifier and page number, the record will +be treated as corrupted. This allows future expansion of the format. + +If there is a FREE_PAGE record in a mini-transaction, it must be the +only record for that page in the mini-transaction. If there is an +INIT_PAGE record for a page in a mini-transaction, it must be the +first record for that page in the mini-transaction. 
+ +An INIT_INDEX_PAGE must be followed by 1+1 to 5+5 bytes for the page +identifier (unless the same_page flag is set) and a subtype code: +0 for ROW_FORMAT=REDUNDANT and 1 for ROW_FORMAT=COMPACT or DYNAMIC. + +For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset +on the page, relative from the previous offset. If same_page=0, the +"previous offset" is 0. If same_page=1, the "previous offset" is where +the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE or INIT_INDEX_PAGE). +0xxxxxxx for 0 to 127 +10xxxxxx xxxxxxxx for 128 to 16,511 +110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663 +111xxxxx reserved (corrupted record) +If the sum of the "previous offset" and the current offset exceeds the +page size, the record is treated as corrupted. Negative relative offsets +cannot be written. Instead, a record with same_page=0 can be written. + +For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to +3 bytes. If the length+offset exceeds the page size, the record will +be treated as corrupted. + +For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes, +relative to the current offset. The offset 0 is not possible, and +the sign bit is the least significant bit. That is, ++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and +-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...). +The source offset must be within the page size, or else the record +will be treated as corrupted. + +For MEMSET or WRITE, the byte(s) to be written will follow. For +MEMSET, it usually is a single byte, but it could also be a multi-byte +string, which would be copied over and over until the target length is +reached. The length of the remaining bytes is implied by the length +bytes at the start of the record. + +For MEMMOVE, if any bytes follow, the record is treated as corrupted +(future expansion). + +As mentioned at the start of this comment, the type byte 0 would be +special, marking the end of a mini-transaction. 
We could use the +corresponding value 0x80 (with same_page=1) for something special, +such as a future extension when more type codes are needed, or for +encoding rarely needed redo log records. + +Examples: + +INIT could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT), 2 +bytes to follow" and "tablespace ID 0x34", "page number 0x56". +The first byte must be between 0x12 and 0x1a, and the total length of +the record must match the lengths of the encoded tablespace ID and +page number. + +WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning +"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40", +"page number 0x57", "byte offset 0x60", data 0x34,0x56. + +A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23 +0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to +follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78. + +The end of the mini-transaction would be indicated by a NUL byte. +*/ + +/** Redo log record types. These bit patterns (3 bits) will be written +to the redo log file, so the existing codes or their interpretation on +crash recovery must not be changed. */ +enum mrec_type_t +{ + /** Free a page. On recovery, it is unnecessary to read the page. + The next record for the page (if any) must be INIT_PAGE or + INIT_INDEX_PAGE. After this record has been written, the page may be + overwritten with zeros, or discarded or trimmed. */ + FREE_PAGE = 0, + /** Zero-initialize a page. The current byte offset (for subsequent + records) will be reset to FIL_PAGE_TYPE. */ + INIT_PAGE = 0x10, + /** Like INIT_PAGE, but initializing a B-tree or R-tree index page, + including writing the "infimum" and "supremum" pseudo-records. The + current byte offset will be reset to FIL_PAGE_TYPE. The + type code is followed by a subtype byte to specify the ROW_FORMAT: + 0 for ROW_FORMAT=REDUNDANT, 1 for ROW_FORMAT=COMPACT or DYNAMIC. */ + INIT_INDEX_PAGE = 0x20, + /** Write a string of bytes. 
Followed by the byte offset (unsigned, + relative to the current byte offset, encoded in 1 to 3 bytes) and + the bytes to write (at least one). The current byte offset will be + set after the last byte written. */ + WRITE = 0x30, + /** Like WRITE, but before the bytes to write, the data_length-1 + (encoded in 1 to 3 bytes) will be encoded, and it must be more + than the length of the following data bytes to write. + The data byte(s) will be repeatedly copied to the output until + the data_length is reached. */ + MEMSET = 0x40, + /** Like MEMSET, but instead of the bytes to write, a source byte + offset (signed, nonzero, relative to the target byte offset, encoded + in 1 to 3 bytes, with the sign bit in the least significant bit) + will be written. + That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) + and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...). + The source offset and data_length must be within the page size, or + else the record will be treated as corrupted. The data will be + copied from the page as it was at the start of the + mini-transaction. */ + MEMMOVE = 0x50, + /** Reserved for future use. */ + RESERVED = 0x60, + /** Optional record that may be ignored in crash recovery. + A subtype code will be encoded immediately after the length. + Possible subtypes would include a MDEV-18976 page checksum record, + a binlog record, or an SQL statement. */ + OPTION = 0x70 +}; + + +/** Redo log record types for file-level operations. These bit +patterns will be written to redo log files, so the existing codes or +their interpretation on crash recovery must not be changed. */ +enum mfile_type_t +{ + /** Create a file. Followed by tablespace ID and the file name. */ + FILE_CREATE = 0x80, + /** Delete a file. Followed by tablespace ID and the file name. */ + FILE_DELETE = 0x90, + /** Rename a file. Followed by tablespace ID and the old file name, + NUL, and the new file name. */ + FILE_RENAME = 0xa0, + /** Modify a file. 
Followed by tablespace ID and the file name. */ + FILE_MODIFY = 0xb0, +#if 1 /* MDEV-14425 FIXME: Remove this! */ + /** End-of-checkpoint marker. Followed by 2 dummy bytes of page identifier, + 8 bytes of LSN, and padded with a NUL; @see SIZE_OF_FILE_CHECKPOINT. */ + FILE_CHECKPOINT = 0xf0 +#endif +}; + +#if 1 /* MDEV-14425 FIXME: Remove this! */ +/** Size of a FILE_CHECKPOINT record, including the trailing byte to +terminate the mini-transaction. */ +constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1; +#endif + /** @name Log item types The log items are declared 'byte' so that the compiler can warn if val and type parameters are switched in a call to mlog_write. NOTE! @@ -120,9 +349,6 @@ enum mlog_id_t { /** initialize an ibuf bitmap page (used in MariaDB 10.2 and 10.3) */ MLOG_IBUF_BITMAP_INIT = 27, - /** MDEV-12353 WIP: write to a ROW_FORMAT=COMPRESSED page */ - MLOG_ZIP_WRITE_STRING = 29, - /** write a string to a page */ MLOG_WRITE_STRING = 30, diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index 6f57fd38848..46981c777cd 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -178,7 +178,7 @@ the first record in the list of records. */ #define PAGE_DIR FIL_PAGE_DATA_END /* We define a slot in the page directory as two bytes */ -#define PAGE_DIR_SLOT_SIZE 2 +constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2; /* The offset of the physically lower end of the directory, counted from page end, when the page is empty */ @@ -840,15 +840,6 @@ page_rec_is_second_last( const page_t* page) /*!< in: page */ MY_ATTRIBUTE((warn_unused_result)); -/***************************************************************//** -Looks for the record which owns the given record. 
-@return the owner record */ -UNIV_INLINE -rec_t* -page_rec_find_owner_rec( -/*====================*/ - rec_t* rec); /*!< in: the physical record */ - /************************************************************//** Returns the maximum combined size of records which can be inserted on top of record heap. @@ -924,7 +915,7 @@ page_get_instant(const page_t* page); @param[in,out] block buffer block @param[in,out] mtr mini-transaction @param[in] comp set unless ROW_FORMAT=REDUNDANT */ -void page_create(buf_block_t* block, mtr_t* mtr, bool comp); +void page_create(buf_block_t *block, mtr_t *mtr, bool comp); /**********************************************************//** Create a compressed B-tree index page. */ void diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic index 5cc6b4d9d50..8604f088adf 100644 --- a/storage/innobase/include/page0page.ic +++ b/storage/innobase/include/page0page.ic @@ -89,17 +89,14 @@ page_set_ssn_id( node_seq_t ssn_id, /*!< in: transaction id */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - ut_ad(!mtr || mtr_memo_contains_flagged(mtr, block, - MTR_MEMO_PAGE_SX_FIX - | MTR_MEMO_PAGE_X_FIX)); - - byte* ssn = block->frame + FIL_RTREE_SPLIT_SEQ_NUM; - if (UNIV_LIKELY_NULL(page_zip)) { - mach_write_to_8(ssn, ssn_id); - page_zip_write_header(block, ssn, 8, mtr); - } else { - mtr->write<8,mtr_t::OPT>(*block, ssn, ssn_id); - } + ut_ad(mtr_memo_contains_flagged(mtr, block, + MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_zip || page_zip == &block->page.zip); + constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM; + byte *b= my_assume_aligned<2>(&block->frame[field]); + if (mtr->write<8,mtr_t::OPT>(*block, b, ssn_id) && + UNIV_LIKELY_NULL(page_zip)) + memcpy_aligned<2>(&page_zip->data[field], b, 8); } #endif /* !UNIV_INNOCHECKSUM */ @@ -133,15 +130,11 @@ Reset PAGE_LAST_INSERT. 
@param[in,out] mtr mini-transaction */ inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr) { - byte *b= &block->frame[PAGE_HEADER + PAGE_LAST_INSERT]; - - if (UNIV_LIKELY_NULL(block->page.zip.data)) - { - mach_write_to_2(b, 0); - page_zip_write_header(block, b, 2, mtr); - } - else - mtr->write<2,mtr_t::OPT>(*block, b, 0U); + constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT; + byte *b= my_assume_aligned<2>(&block->frame[field]); + if (mtr->write<2,mtr_t::OPT>(*block, b, 0U) && + UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<2>(&block->page.zip.data[field], b, 2); } /***************************************************************//** @@ -576,30 +569,6 @@ page_rec_get_prev( return((rec_t*) page_rec_get_prev_const(rec)); } -/***************************************************************//** -Looks for the record which owns the given record. -@return the owner record */ -UNIV_INLINE -rec_t* -page_rec_find_owner_rec( -/*====================*/ - rec_t* rec) /*!< in: the physical record */ -{ - ut_ad(page_rec_check(rec)); - - if (page_rec_is_comp(rec)) { - while (rec_get_n_owned_new(rec) == 0) { - rec = page_rec_get_next(rec); - } - } else { - while (rec_get_n_owned_old(rec) == 0) { - rec = page_rec_get_next(rec); - } - } - - return(rec); -} - /**********************************************************//** Returns the base extra size of a physical record. This is the size of the fixed header, independent of the record size. diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index c7def1d77fb..8b8a4e5b984 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -230,19 +230,6 @@ page_zip_available( the heap */ MY_ATTRIBUTE((warn_unused_result)); -/**********************************************************************//** -Write data to the uncompressed header portion of a page. The data must -already have been written to the uncompressed page. 
*/ -UNIV_INLINE -void -page_zip_write_header( -/*==================*/ - buf_block_t* block, /*!< in/out: compressed page */ - const byte* str, /*!< in: address on the uncompressed page */ - ulint length, /*!< in: length of the data */ - mtr_t* mtr) /*!< in/out: mini-transaction */ - MY_ATTRIBUTE((nonnull)); - /** Write an entire record to the ROW_FORMAT=COMPRESSED page. The data must already have been written to the uncompressed page. @param[in,out] block ROW_FORMAT=COMPRESSED page @@ -342,17 +329,14 @@ page_zip_parse_write_trx_id( page_zip_des_t* page_zip) MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); -/**********************************************************************//** -Write the "deleted" flag of a record on a compressed page. The flag must -already have been written on the uncompressed page. */ -void -page_zip_rec_set_deleted( -/*=====================*/ - buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */ - const byte* rec, /*!< in: record on the uncompressed page */ - ulint flag, /*!< in: the deleted flag (nonzero=TRUE) */ - mtr_t* mtr) /*!< in,out: mini-transaction */ - MY_ATTRIBUTE((nonnull)); +/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record. +@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in] flag the value of the delete-mark flag +@param[in,out] mtr mini-transaction */ +void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag, + mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); /**********************************************************************//** Insert a record to the dense page directory. 
*/ @@ -360,8 +344,8 @@ void page_zip_dir_insert( /*================*/ page_cur_t* cursor, /*!< in/out: page cursor */ - const byte* free_rec,/*!< in: record from which rec was - allocated, or NULL */ + uint16_t free_rec,/*!< in: record from which rec was + allocated, or 0 */ byte* rec, /*!< in: record to insert */ mtr_t* mtr) /*!< in/out: mini-transaction */ MY_ATTRIBUTE((nonnull(1,3,4))); diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic index 32879109cec..1ca59116407 100644 --- a/storage/innobase/include/page0zip.ic +++ b/storage/innobase/include/page0zip.ic @@ -25,10 +25,7 @@ Compressed page interface Created June 2005 by Marko Makela *******************************************************/ -#include "page0zip.h" -#include "mtr0log.h" #include "page0page.h" -#include "srv0srv.h" /* The format of compressed pages is as follows. @@ -320,29 +317,6 @@ page_zip_des_init( } /**********************************************************************//** -Write data to the uncompressed header portion of a page. The data must -already have been written to the uncompressed page. -However, the data portion of the uncompressed page may differ from -the compressed page when a record is being inserted in -page_cur_insert_rec_zip(). */ -UNIV_INLINE -void -page_zip_write_header( -/*==================*/ - buf_block_t* block, /*!< in/out: compressed page */ - const byte* str, /*!< in: address on the uncompressed page */ - ulint length, /*!< in: length of the data */ - mtr_t* mtr) /*!< in/out: mini-transaction */ -{ - ut_ad(page_align(str) == block->frame); - const uint16_t pos = page_offset(str); - - ut_ad(pos < PAGE_DATA); - ut_ad(pos + length < PAGE_DATA); - mtr->zmemcpy(&block->page, pos, str, length); -} - -/**********************************************************************//** Reset the counters used for filling INFORMATION_SCHEMA.innodb_cmp_per_index. 
*/ UNIV_INLINE diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index bf8d7c958e0..37742bb2008 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -717,7 +717,7 @@ void log_t::files::create(ulint n_files) ut_ad(log_sys.is_initialised()); this->n_files= n_files; - format= srv_encrypt_log ? log_t::FORMAT_ENC_10_4 : log_t::FORMAT_10_4; + format= srv_encrypt_log ? log_t::FORMAT_ENC_10_5 : log_t::FORMAT_10_5; subformat= 2; file_size= srv_log_file_size; lsn= LOG_START_LSN; @@ -745,8 +745,8 @@ log_file_header_flush( ut_ad(log_write_mutex_own()); ut_ad(!recv_no_log_write); ut_a(nth_file < log_sys.log.n_files); - ut_ad(log_sys.log.format == log_t::FORMAT_10_4 - || log_sys.log.format == log_t::FORMAT_ENC_10_4); + ut_ad(log_sys.log.format == log_t::FORMAT_10_5 + || log_sys.log.format == log_t::FORMAT_ENC_10_5); // man 2 open suggests this buffer to be aligned by 512 for O_DIRECT MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE) @@ -1273,14 +1273,14 @@ void log_header_read(ulint header) } /** Write checkpoint info to the log header and invoke log_mutex_exit(). -@param[in] end_lsn start LSN of the MLOG_CHECKPOINT mini-transaction */ +@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ void log_write_checkpoint_info(lsn_t end_lsn) { ut_ad(log_mutex_own()); ut_ad(!srv_read_only_mode); ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn); ut_ad(end_lsn <= log_sys.lsn); - ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn + ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= log_sys.lsn || srv_shutdown_state != SRV_SHUTDOWN_NONE); DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF @@ -1415,23 +1415,23 @@ bool log_checkpoint() ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); if (oldest_lsn - > log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) { + > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) { /* Some log has been written since the previous checkpoint. 
*/ } else if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { - /* MariaDB 10.3 startup expects the redo log file to be + /* MariaDB startup expects the redo log file to be logically empty (not even containing a MLOG_CHECKPOINT record) after a clean shutdown. Perform an extra checkpoint at shutdown. */ } else { /* Do nothing, because nothing was logged (other than - a MLOG_CHECKPOINT marker) since the previous checkpoint. */ + a FILE_CHECKPOINT marker) since the previous checkpoint. */ log_mutex_exit(); return(true); } - /* Repeat the MLOG_FILE_NAME records after the checkpoint, in + /* Repeat the FILE_MODIFY records after the checkpoint, in case some log records between the checkpoint and log_sys.lsn - need them. Finally, write a MLOG_CHECKPOINT marker. Redo log - apply expects to see a MLOG_CHECKPOINT after the checkpoint, + need them. Finally, write a FILE_CHECKPOINT marker. Redo log + apply expects to see a FILE_CHECKPOINT after the checkpoint, except on clean shutdown, where the log will be empty after the checkpoint. 
It is important that we write out the redo log before any @@ -1446,7 +1446,7 @@ bool log_checkpoint() || flush_lsn != end_lsn; if (fil_names_clear(flush_lsn, do_write)) { - ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT); + ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); flush_lsn = log_sys.lsn; } @@ -1794,7 +1794,9 @@ wait_suspend_loop: lsn = log_sys.lsn; - const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn; + const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn + && lsn != log_sys.last_checkpoint_lsn + + SIZE_OF_FILE_CHECKPOINT; ut_ad(lsn >= log_sys.last_checkpoint_lsn); log_mutex_exit(); @@ -1956,7 +1958,7 @@ void log_pad_current_log_block(void) /*===========================*/ { - byte b = MLOG_DUMMY_RECORD; + byte b = 0; ulint pad_length; ulint i; lsn_t lsn; diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 3a9ee20c3dd..9a229d4bb20 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -90,12 +90,14 @@ TRUE means that recovery is running and no operations on the log files are allowed yet: the variable name is misleading. */ bool recv_no_ibuf_operations; +#if 1 /* MDEV-12353: only for parsing old redo log format */ /** The type of the previous parsed redo log record */ static mlog_id_t recv_previous_parsed_rec_type; /** The offset of the previous parsed redo log record */ static ulint recv_previous_parsed_rec_offset; /** The 'multi' flag of the previous parsed redo log record */ static ulint recv_previous_parsed_rec_is_multi; +#endif /** The maximum lsn we see for a page during the recovery process. 
If this is bigger than the lsn we are able to scan up to, that is an indication that @@ -110,7 +112,8 @@ mysql_pfs_key_t recv_writer_thread_key; bool recv_writer_thread_active; -/** Stored physiological log record with byte-oriented start/end LSN */ +/** Stored physiological log record with byte-oriented start/end LSN +(before log_t::FORMAT_10_5) */ struct recv_t : public log_rec_t { /** @@ -171,6 +174,254 @@ struct recv_t : public log_rec_t }; +/** Stored physical log record with logical LSN (@see log_t::FORMAT_10_5) */ +struct log_phys_t : public log_rec_t +{ +#if 1 // MDEV-14425 FIXME: remove this! + /** start LSN of the mini-transaction (not necessarily of this record) */ + const lsn_t start_lsn; +#endif +private: + /** length of the record, in bytes */ + uint16_t len; + + /** @return start of the log records */ + byte *begin() { return reinterpret_cast<byte*>(&len + 1); } + /** @return start of the log records */ + const byte *begin() const { return const_cast<log_phys_t*>(this)->begin(); } + /** @return end of the log records */ + byte *end() { byte *e= begin() + len; ut_ad(!*e); return e; } +public: + /** @return end of the log records */ + const byte *end() const { return const_cast<log_phys_t*>(this)->end(); } + + /** Determine the allocated size of the object. + @param len length of recs, excluding terminating NUL byte + @return the total allocation size */ + static size_t alloc_size(size_t len) + { + return len + 1 + + reinterpret_cast<size_t>(reinterpret_cast<log_phys_t*>(0)->begin()); + } + + /** Constructor. 
+ @param start_lsn start LSN of the mini-transaction + @param lsn mtr_t::commit_lsn() of the mini-transaction + @param recs the first log record for the page in the mini-transaction + @param size length of recs, in bytes, excluding terminating NUL byte */ + log_phys_t(lsn_t start_lsn, lsn_t lsn, const byte *recs, size_t size) : + log_rec_t(lsn), start_lsn(start_lsn), len(static_cast<uint16_t>(size)) + { + ut_ad(start_lsn); + ut_ad(start_lsn < lsn); + ut_ad(len == size); + reinterpret_cast<byte*>(memcpy(begin(), recs, size))[size]= 0; + } + + /** Append a record to the log. + @param recs log to append + @param size size of the log, in bytes + @param lsn the commit LSN of the record */ + void append(const byte *recs, size_t size, lsn_t lsn) + { + ut_ad(start_lsn < lsn); + set_lsn(lsn); + reinterpret_cast<byte*>(memcpy(end(), recs, size))[size]= 0; + len+= static_cast<uint16_t>(size); + } + + /** The status of apply() */ + enum apply_status { + /** The page was not affected */ + APPLIED_NO= 0, + /** The page was modified */ + APPLIED_YES, + /** The page was modified, affecting the encryption parameters */ + APPLIED_TO_ENCRYPTION, + /** The page was modified, affecting the tablespace header */ + APPLIED_TO_FSP_HEADER + }; + + /** Apply log to a page frame. + @param[in,out] block buffer block + @param[in,out] last_offset last byte offset, for same_page records + @return whether any log was applied to the page */ + apply_status apply(const buf_block_t &block, uint16_t &last_offset) const + { + const byte * const recs= begin(); + byte *const frame= block.page.zip.ssize + ? 
block.page.zip.data : block.frame; + const size_t size= block.physical_size(); + apply_status applied= APPLIED_NO; + + for (const byte *l= recs;;) + { + const byte b= *l++; + if (!b) + return applied; + ut_ad((b & 0x70) != RESERVED); + size_t rlen= b & 0xf; + if (!rlen) + { + const size_t lenlen= mlog_decode_varint_length(*l); + const uint32_t addlen= mlog_decode_varint(l); + ut_ad(addlen != MLOG_DECODE_ERROR); + rlen= addlen + 15 - lenlen; + l+= lenlen; + } + if (!(b & 0x80)) + { + /* Skip the page identifier. It has already been validated. */ + size_t idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen < rlen); + ut_ad(mlog_decode_varint(l) == block.page.id.space()); + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen <= rlen); + ut_ad(mlog_decode_varint(l) == block.page.id.page_no()); + l+= idlen; + rlen-= idlen; + last_offset= 0; + } + + switch (b & 0x70) { + case FREE_PAGE: + ut_ad(last_offset == 0); + goto next_not_same_page; + case INIT_PAGE: + if (UNIV_LIKELY(rlen == 0)) + { + memset_aligned<UNIV_ZIP_SIZE_MIN>(frame, 0, size); + mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id.page_no()); + memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8); + mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id.space()); + last_offset= FIL_PAGE_TYPE; + next_after_applying: + if (applied == APPLIED_NO) + applied= APPLIED_YES; + } + else + { + record_corrupted: + if (!srv_force_recovery) + { + recv_sys.found_corrupt_log= true; + return applied; + } + next_not_same_page: + last_offset= 1; /* the next record must not be same_page */ + } + next: + l+= rlen; + continue; + } + + ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) == + block.page.id.page_no()); + ut_ad(mach_read_from_4(frame + FIL_PAGE_SPACE_ID) == + block.page.id.space()); + ut_ad(last_offset <= 1 || last_offset > 8); + ut_ad(last_offset <= size); + + switch (b & 0x70) { + case OPTION: + goto next; + case INIT_INDEX_PAGE: + if 
(UNIV_UNLIKELY(block.page.id.page_no() < 3 || + block.page.zip.ssize) && + !srv_force_recovery) + goto record_corrupted; + if (UNIV_UNLIKELY(rlen != 1 || *l > 1)) + goto record_corrupted; + page_create_low(&block, *l != 0); + last_offset= FIL_PAGE_TYPE; + goto next_after_applying; + case WRITE: + case MEMSET: + case MEMMOVE: + if (UNIV_UNLIKELY(last_offset == 1)) + goto record_corrupted; + const size_t olen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) + goto record_corrupted; + const uint32_t offset= mlog_decode_varint(l); + ut_ad(offset != MLOG_DECODE_ERROR); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + if (UNIV_UNLIKELY(offset >= size)) + goto record_corrupted; + if (UNIV_UNLIKELY(offset + last_offset < 8 || + offset + last_offset >= size)) + goto record_corrupted; + last_offset+= static_cast<uint16_t>(offset); + l+= olen; + rlen-= olen; + size_t llen= rlen; + if ((b & 0x70) == WRITE) + { + if (UNIV_UNLIKELY(rlen + last_offset > size)) + goto record_corrupted; + memcpy(frame + last_offset, l, llen); + if (UNIV_LIKELY(block.page.id.page_no())); + else if (llen == 11 + MY_AES_BLOCK_SIZE && + last_offset == FSP_HEADER_OFFSET + MAGIC_SZ + + fsp_header_get_encryption_offset(block.zip_size())) + applied= APPLIED_TO_ENCRYPTION; + else if (last_offset < FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN + 4 && + last_offset + llen >= FSP_HEADER_OFFSET + FSP_SIZE) + applied= APPLIED_TO_FSP_HEADER; + next_after_applying_write: + ut_ad(llen + last_offset <= size); + last_offset+= static_cast<uint16_t>(llen); + goto next_after_applying; + } + llen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(llen > rlen || llen > 3)) + goto record_corrupted; + const uint32_t len= mlog_decode_varint(l); + ut_ad(len != MLOG_DECODE_ERROR); + if (UNIV_UNLIKELY(len + last_offset > size)) + goto record_corrupted; + l+= llen; + rlen-= llen; + llen= len; + if ((b & 0x70) == MEMSET) + { + ut_ad(rlen < llen); + if (UNIV_UNLIKELY(rlen != 1)) + { 
+ size_t s; + for (s= 0; s < llen; s+= rlen) + memcpy(frame + last_offset + s, l, rlen); + memcpy(frame + last_offset + s, l, llen - s); + } + else + memset(frame + last_offset, *l, llen); + goto next_after_applying_write; + } + const size_t slen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(slen != rlen || slen > 3)) + goto record_corrupted; + uint32_t s= mlog_decode_varint(l); + ut_ad(slen != MLOG_DECODE_ERROR); + if (s & 1) + s= last_offset - (s >> 1) - 1; + else + s= last_offset + (s >> 1) + 1; + if (UNIV_LIKELY(s >= 8 && s + llen <= size)) + { + memmove(frame + last_offset, frame + s, llen); + goto next_after_applying_write; + } + } + goto record_corrupted; + } + } +}; + + #ifndef DBUG_OFF /** Return string name of the redo log record type. @param[in] type record log record enum @@ -180,7 +431,7 @@ static const char* get_mlog_string(mlog_id_t type); /** Tablespace item during recovery */ struct file_name_t { - /** Tablespace file name (MLOG_FILE_NAME) */ + /** Tablespace file name (MLOG_FILE_NAME or FILE_MODIFY) */ std::string name; /** Tablespace object (NULL if not valid or not found) */ fil_space_t* space; @@ -218,16 +469,17 @@ static recv_spaces_t recv_spaces; /** Report an operation to create, delete, or rename a file during backup. @param[in] space_id tablespace identifier -@param[in] flags tablespace flags (NULL if not create) +@param[in] create whether the file is being created @param[in] name file name (not NUL-terminated) @param[in] len length of name, in bytes @param[in] new_name new file name (NULL if not rename) @param[in] new_len length of new_name, in bytes (0 if NULL) */ -void (*log_file_op)(ulint space_id, const byte* flags, +void (*log_file_op)(ulint space_id, bool create, const byte* name, ulint len, const byte* new_name, ulint new_len); -/** Information about initializing page contents during redo log processing */ +/** Information about initializing page contents during redo log processing. +FIXME: Rely on recv_sys.pages! 
*/ class mlog_init_t { public: @@ -358,7 +610,7 @@ inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn) for (recv_sys_t::map::iterator p = pages.lower_bound(page_id); p != pages.end() && p->first.space() == page_id.space();) { recv_sys_t::map::iterator r = p++; - if (r->second.log.trim(lsn)) { + if (r->second.trim(lsn)) { pages.erase(r); } } @@ -373,11 +625,12 @@ inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn) DBUG_VOID_RETURN; } -/** Process a file name from a MLOG_FILE_* record. +/** Process a file name from a MLOG_FILE_* or FILE_* record. @param[in,out] name file name @param[in] len length of the file name @param[in] space_id the tablespace ID -@param[in] deleted whether this is a MLOG_FILE_DELETE record */ +@param[in] deleted whether this is a MLOG_FILE_DELETE + or FILE_DELETE record */ static void fil_name_process(char* name, ulint len, ulint space_id, bool deleted) @@ -395,15 +648,15 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted) scanned before applying any page records for the space_id. 
*/ os_normalize_path(name); - file_name_t fname(std::string(name, len - 1), deleted); - std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.insert( - std::make_pair(space_id, fname)); + const file_name_t fname(std::string(name, len), deleted); + std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.emplace( + space_id, fname); ut_ad(p.first->first == space_id); file_name_t& f = p.first->second; if (deleted) { - /* Got MLOG_FILE_DELETE */ + /* Got MLOG_FILE_DELETE or FILE_DELETE */ if (!p.second && f.status != file_name_t::DELETED) { f.status = file_name_t::DELETED; @@ -414,7 +667,9 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted) } ut_ad(f.space == NULL); - } else if (p.second // the first MLOG_FILE_NAME or MLOG_FILE_RENAME2 + } else if (p.second + /* the first MLOG_FILE_NAME or MLOG_FILE_RENAME2 + or FILE_MODIFY or FILE_RENAME */ || f.name != fname.name) { fil_space_t* space; @@ -451,7 +706,7 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted) case FIL_LOAD_NOT_FOUND: /* No matching tablespace was found; maybe it was renamed, and we will find a subsequent - MLOG_FILE_* record. */ + MLOG_FILE_* or FILE_* record. */ ut_ad(space == NULL); if (srv_force_recovery) { @@ -562,7 +817,7 @@ fil_name_parse( } } - byte* end_ptr = ptr + len; + byte* end_ptr = ptr + len--; switch (type) { default: @@ -603,7 +858,7 @@ fil_name_parse( t.pages = uint32_t(page_id.page_no()); } else if (log_file_op) { log_file_op(page_id.space(), - type == MLOG_FILE_CREATE2 ? 
ptr - 4 : NULL, + type == MLOG_FILE_CREATE2, ptr, len, NULL, 0); } break; @@ -630,6 +885,7 @@ fil_name_parse( corrupt = corrupt || new_len < sizeof "/a.ibd\0" || memcmp(new_name + new_len - 5, DOT_IBD, 5) != 0; + new_len--; if (!corrupt && !memchr(new_name, OS_PATH_SEPARATOR, new_len)) { if (byte* c = static_cast<byte*> @@ -664,7 +920,7 @@ fil_name_parse( page_id.space(), false); if (log_file_op) { - log_file_op(page_id.space(), NULL, + log_file_op(page_id.space(), false, ptr, len, new_name, new_len); } @@ -872,18 +1128,7 @@ void recv_sys_t::debug_free() mutex_exit(&mutex); } -inline size_t recv_sys_t::get_free_len() const -{ - if (const buf_block_t* block= UT_LIST_GET_FIRST(blocks)) - { - if (const size_t used= static_cast<uint16_t>(block->page.access_time)) - return srv_page_size - used; - ut_ad(srv_page_size == 65536); - } - return 0; -} - -inline byte* recv_sys_t::alloc(size_t len, bool store_recv) +inline void *recv_sys_t::alloc(size_t len, bool store_recv) { ut_ad(mutex_own(&mutex)); ut_ad(len); @@ -913,9 +1158,6 @@ create_block: ut_ad(free_offset <= srv_page_size); free_offset+= len; - if (store_recv && free_offset + sizeof(recv_t::data) + 1 > srv_page_size) - goto create_block; - if (free_offset > srv_page_size) goto create_block; @@ -1292,6 +1534,8 @@ recv_find_max_checkpoint(ulint* max_field) case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED: case log_t::FORMAT_10_4: case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_5: + case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED: break; default: ib::error() << "Unsupported redo log format." @@ -1763,40 +2007,6 @@ parse_log: contents can be ignored. We do not write or apply this record yet. 
*/ break; - case MLOG_ZIP_WRITE_STRING: - ut_ad(!page_zip - || !fil_page_get_type(page_zip->data) - || fil_page_get_type(page_zip->data) == FIL_PAGE_INDEX - || fil_page_get_type(page_zip->data) == FIL_PAGE_RTREE); - if (ptr + 4 > end_ptr) { - goto truncated; - } else { - const ulint ofs = mach_read_from_2(ptr); - const ulint len = mach_read_from_2(ptr + 2); - if (ofs < FIL_PAGE_PREV || !len) { - goto corrupted; - } - ptr += 4 + len; - if (ptr > end_ptr) { - goto truncated; - } - if (!page_zip) { - break; - } - ut_ad(ofs + len <= block->zip_size()); - memcpy(page_zip->data + ofs, old_ptr + 4, len); - if (ofs >= FIL_PAGE_TYPE +2 - || ofs + len < FIL_PAGE_TYPE + 2) { - break; - } - /* Ensure that buf_flush_init_for_writing() - will treat the page as an index page, and - not overwrite the compressed page with the - contents of the uncompressed page. */ - memcpy_aligned<2>(&page[FIL_PAGE_TYPE], - &page_zip->data[FIL_PAGE_TYPE], 2); - } - break; case MLOG_WRITE_STRING: ut_ad(!page_zip || fil_page_get_type(page_zip->data) @@ -1875,9 +2085,7 @@ parse_log: default: ib::error() << "Incorrect log record type " << ib::hex(unsigned(type)); -corrupted: recv_sys.found_corrupt_log = true; -truncated: ptr = NULL; } @@ -1891,6 +2099,26 @@ truncated: return(ptr); } +/*******************************************************//** +Calculates the new value for lsn when more data is added to the log. */ +static +lsn_t +recv_calc_lsn_on_data_add( +/*======================*/ + lsn_t lsn, /*!< in: old lsn */ + ib_uint64_t len) /*!< in: this many bytes of data is + added, log block headers not included */ +{ + unsigned frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE; + unsigned payload_size = log_sys.payload_size(); + ut_ad(frag_len < payload_size); + lsn_t lsn_len = len; + lsn_len += (lsn_len + frag_len) / payload_size + * (OS_FILE_LOG_BLOCK_SIZE - payload_size); + + return(lsn + lsn_len); +} + /** Store a redo log record for applying. 
@param type record type @param page_id page identifier @@ -1909,22 +2137,13 @@ inline void recv_sys_t::add(mlog_id_t type, const page_id_t page_id, ut_ad(type != MLOG_DUMMY_RECORD); ut_ad(type != MLOG_CHECKPOINT); ut_ad(type != MLOG_TRUNCATE); + ut_ad(!log_sys.is_physical()); std::pair<map::iterator, bool> p= pages.insert(map::value_type (page_id, page_recv_t())); page_recv_t& recs= p.first->second; ut_ad(p.second == recs.log.empty()); - switch (type) { - case MLOG_INIT_FILE_PAGE2: - case MLOG_ZIP_PAGE_COMPRESS: - case MLOG_INIT_FREE_PAGE: - recs.will_not_read(); - mlog_init.add(page_id, lsn); - default: - break; - } - /* Store the log record body in limited-size chunks, because the heap grows into the buffer pool. */ size_t len= static_cast<size_t>(rec_end - body); @@ -1935,7 +2154,10 @@ inline void recv_sys_t::add(mlog_id_t type, const page_id_t page_id, for (recv_t::data_t *prev= nullptr;;) { - const size_t l= std::min(len, get_free_len() - sizeof(recv_t::data)); + const size_t used= static_cast<uint16_t> + (UT_LIST_GET_FIRST(blocks)->page.access_time); + ut_ad(used || srv_page_size == 65536); + const size_t l= std::min(len, srv_page_size - used - sizeof(recv_t::data)); recv_t::data_t *d= new (alloc(sizeof(recv_t::data) + l)) recv_t::data_t(body, l); if (prev) @@ -1954,16 +2176,30 @@ inline void recv_sys_t::add(mlog_id_t type, const page_id_t page_id, /** Trim old log records for a page. 
@param start_lsn oldest log sequence number to preserve @return whether all the log for the page was trimmed */ -inline bool page_recv_t::recs_t::trim(lsn_t start_lsn) +inline bool page_recv_t::trim(lsn_t start_lsn) { - while (head) + if (log_sys.is_physical()) { - if (head->lsn >= start_lsn) return false; - log_rec_t *next= head->next; - static_cast<const recv_t*>(head)->free(); - head= next; + while (log.head) + { + if (log.head->lsn >= start_lsn) return false; + last_offset= 1; /* the next record must not be same_page */ + log_rec_t *next= log.head->next; + recv_sys.free(log.head); + log.head= next; + } + log.tail= nullptr; + return true; } - tail= nullptr; + + while (log.head) + { + if (log.head->lsn >= start_lsn) return false; + log_rec_t *next= log.head->next; + static_cast<const recv_t*>(log.head)->free(); + log.head= next; + } + log.tail= nullptr; return true; } @@ -1971,6 +2207,17 @@ inline bool page_recv_t::recs_t::trim(lsn_t start_lsn) inline void page_recv_t::recs_t::clear() { ut_ad(mutex_own(&recv_sys.mutex)); + if (log_sys.is_physical()) + { + for (const log_rec_t *l= head; l; ) + { + const log_rec_t *next= l->next; + recv_sys.free(l); + l= next; + } + head= tail= nullptr; + return; + } for (const log_rec_t *l= head; l; ) { const log_rec_t *next= l->next; @@ -1990,6 +2237,501 @@ inline void page_recv_t::will_not_read() } +/** Register a redo log snippet for a page. 
+@param page_id page identifier +@param start_lsn start LSN of the mini-transaction +@param lsn @see mtr_t::commit_lsn() +@param l redo log snippet @see log_t::FORMAT_10_5 +@param len length of l, in bytes */ +inline void recv_sys_t::add(const page_id_t page_id, + lsn_t start_lsn, lsn_t lsn, const byte *l, + size_t len) +{ + ut_ad(mutex_own(&mutex)); + std::pair<map::iterator, bool> p= pages.emplace(map::value_type + (page_id, page_recv_t())); + page_recv_t& recs= p.first->second; + ut_ad(p.second == recs.log.empty()); + + switch (*l & 0x70) { + case FREE_PAGE: case INIT_PAGE: + recs.will_not_read(); + mlog_init.add(page_id, start_lsn); /* FIXME: remove this! */ + /* fall through */ + default: + log_phys_t *tail= static_cast<log_phys_t*>(recs.log.last()); + if (!tail) + break; +#if 1 // MDEV-14425 FIXME: remove this! + if (tail->start_lsn != start_lsn) + break; +#endif + buf_block_t *block= UT_LIST_GET_LAST(blocks); + ut_ad(block); + const size_t used= static_cast<uint16_t>(block->page.access_time - 1) + 1; + ut_ad(used >= ALIGNMENT); + const byte *end= const_cast<const log_phys_t*>(tail)->end(); + if (!((reinterpret_cast<size_t>(end + len) ^ + reinterpret_cast<size_t>(end)) & ~(ALIGNMENT - 1))) + { + /* Use already allocated 'padding' bytes */ +append: + UNIV_MEM_ALLOC(end + 1, len); + /* Append to the preceding record for the page */ + tail->append(l, len, lsn); + return; + } + if (end <= &block->frame[used - ALIGNMENT] || &block->frame[used] >= end) + break; /* Not the last allocated record in the page */ + const size_t new_used= static_cast<size_t>(end - block->frame + len + 1); + ut_ad(new_used > used); + if (new_used > srv_page_size) + break; + block->page.access_time= (block->page.access_time & ~0U << 16) | + ut_calc_align<uint16_t>(static_cast<uint16_t>(new_used), ALIGNMENT); + goto append; + } + recs.log.append(new (alloc(log_phys_t::alloc_size(len))) + log_phys_t(start_lsn, lsn, l, len)); +} + + +/** Parse and register one mini-transaction in
log_t::FORMAT_10_5. +@param checkpoint_lsn the log sequence number of the latest checkpoint +@param store whether to store the records +@param apply whether to apply file-level log records +@return whether FILE_CHECKPOINT record was seen the first time, +or corruption was noticed */ +inline bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t store, bool apply) +{ + const byte *const end= buf + len; +loop: + const byte *const log= buf + recovered_offset; + const lsn_t start_lsn= recovered_lsn; + + /* Check that the entire mini-transaction is included within the buffer */ + const byte *l; + uint32_t rlen; + for (l= log; l < end; l+= rlen) + { + if (!*l) + goto eom_found; + if (UNIV_LIKELY((*l & 0x70) != RESERVED)); + else if (srv_force_recovery) + ib::warn() << "Ignoring unknown log record at LSN " << recovered_lsn; + else + { +malformed: + ib::error() << "Malformed log record;" + " set innodb_force_recovery=1 to ignore."; +corrupted: + const size_t trailing_bytes= std::min<size_t>(100, size_t(end - l)); + ib::info() << "Dump from the start of the mini-transaction (LSN=" + << start_lsn << ") to " + << trailing_bytes << " bytes after the record:"; + ut_print_buf(stderr, log, l - log + trailing_bytes); + putc('\n', stderr); + found_corrupt_log= true; + return true; + } + rlen= *l++ & 0xf; + if (l + (rlen ? rlen : 16) >= end) + break; + if (!rlen) + { + rlen= mlog_decode_varint_length(*l); + if (l + rlen >= end) + break; + const uint32_t addlen= mlog_decode_varint(l); + if (UNIV_UNLIKELY(addlen == MLOG_DECODE_ERROR)) + { + ib::error() << "Corrupted record length"; + goto corrupted; + } + rlen= addlen + 15; + } + } + + /* Not the entire mini-transaction was present. 
*/ + return false; + +eom_found: + ut_ad(!*l); + ut_d(const byte *const el= l + 1); + + const lsn_t end_lsn= recv_calc_lsn_on_data_add(start_lsn, l + 1 - log); + if (UNIV_UNLIKELY(end_lsn > scanned_lsn)) + /* The log record filled a log block, and we require that also the + next log block should have been scanned in */ + return false; + + ut_d(std::set<page_id_t> freed); +#if 0 && defined UNIV_DEBUG /* MDEV-21727 FIXME: enable this */ + /* Pages that have been modified in this mini-transaction. + If a mini-transaction writes INIT_PAGE for a page, it should not have + written any log records for the page. Unfortunately, this does not + hold for ROW_FORMAT=COMPRESSED pages, because page_zip_compress() + can be invoked in a pessimistic operation, even after log has + been written for other pages. */ + ut_d(std::set<page_id_t> modified); +#endif + + uint32_t space_id= 0, page_no= 0, last_offset= 0; +#if 1 /* MDEV-14425 FIXME: remove this */ + bool got_page_op= false; +#endif + for (l= log; l < end; l+= rlen) + { + const byte *const recs= l; + const byte b= *l++; + + if (!b) + break; + ut_ad(UNIV_LIKELY(b & 0x70) != RESERVED || srv_force_recovery); + rlen= b & 0xf; + ut_ad(l + rlen < end); + ut_ad(rlen || l + 16 < end); + if (!rlen) + { + const uint32_t lenlen= mlog_decode_varint_length(*l); + ut_ad(l + lenlen < end); + const uint32_t addlen= mlog_decode_varint(l); + ut_ad(addlen != MLOG_DECODE_ERROR); + rlen= addlen + 15 - lenlen; + l+= lenlen; + } + ut_ad(l + rlen < end); + uint32_t idlen; + if ((b & 0x80) && got_page_op) + { + /* This record is for the same page as the previous one. 
*/ + if (UNIV_UNLIKELY((b & 0x70) <= INIT_PAGE)) + { +record_corrupted: + /* FREE_PAGE,INIT_PAGE cannot be with same_page flag */ + if (!srv_force_recovery) + goto malformed; + ib::warn() << "Ignoring malformed log record at LSN " << recovered_lsn; + last_offset= 1; /* the next record must not be same_page */ + continue; + } + goto same_page; + } + last_offset= 0; + idlen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen)) + { +page_id_corrupted: + if (!srv_force_recovery) + { + ib::error() << "Corrupted page identifier at " << recovered_lsn + << "; set innodb_force_recovery=1 to ignore the record."; + goto corrupted; + } + ib::warn() << "Ignoring corrupted page identifier at LSN " + << recovered_lsn; + continue; + } + space_id= mlog_decode_varint(l); + if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR)) + goto page_id_corrupted; + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen)) + goto page_id_corrupted; + page_no= mlog_decode_varint(l); + if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR)) + goto page_id_corrupted; + l+= idlen; + rlen-= idlen; + got_page_op = !(b & 0x80); + if (got_page_op && apply && !is_predefined_tablespace(space_id)) + { + recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id); + if (i != recv_spaces.end() && i->first == space_id); + else if (recovered_lsn < mlog_checkpoint_lsn) + /* We have not seen all records between the checkpoint and + FILE_CHECKPOINT. There should be a FILE_DELETE for this + tablespace later. 
*/ + recv_spaces.emplace_hint(i, space_id, file_name_t("", false)); + else + { + const page_id_t id(space_id, page_no); + if (!srv_force_recovery) + { + ib::error() << "Missing FILE_DELETE or FILE_MODIFY for " << id + << " at " << recovered_lsn + << "; set innodb_force_recovery=1 to ignore the record."; + goto corrupted; + } + ib::warn() << "Ignoring record for " << id << " at " << recovered_lsn; + continue; + } + } +same_page: + DBUG_PRINT("ib_log", + ("scan " LSN_PF ": rec %x len %zu page %u:%u", + recovered_lsn, b, static_cast<size_t>(l + rlen - recs), + space_id, page_no)); + + if (got_page_op) + { + ut_d(const page_id_t id(space_id, page_no)); + ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id)); + ut_ad(freed.find(id) == freed.end()); + switch (b & 0x70) { + case FREE_PAGE: + ut_ad(freed.emplace(id).second); + last_offset= 1; /* the next record must not be same_page */ + goto free_or_init_page; + case INIT_PAGE: + free_or_init_page: + last_offset= FIL_PAGE_TYPE; + if (UNIV_UNLIKELY(rlen != 0)) + goto record_corrupted; + break; + case INIT_INDEX_PAGE: + if (UNIV_UNLIKELY(rlen != 1)) + goto record_corrupted; + last_offset= FIL_PAGE_TYPE; + break; + case RESERVED: + case OPTION: + continue; + case WRITE: + case MEMMOVE: + case MEMSET: + if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1)) + goto record_corrupted; + const uint32_t olen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) + goto record_corrupted; + const uint32_t offset= mlog_decode_varint(l); + ut_ad(offset != MLOG_DECODE_ERROR); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + if (UNIV_UNLIKELY(offset >= srv_page_size)) + goto record_corrupted; + last_offset+= offset; + if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size)) + goto record_corrupted; + l+= olen; + rlen-= olen; + if ((b & 0x70) == WRITE) + { + if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size)) + goto record_corrupted; + if (UNIV_UNLIKELY(page_no == 0) && apply && + 
last_offset <= FSP_HEADER_OFFSET + FSP_SIZE && + last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4) + { + recv_spaces_t::iterator it= recv_spaces.find(space_id); + const uint32_t size= mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + l - last_offset); + if (it == recv_spaces.end()) + ut_ad(!mlog_checkpoint_lsn || space_id == TRX_SYS_SPACE || + srv_is_undo_tablespace(space_id)); + else if (!it->second.space) + it->second.size= size; + fil_space_set_recv_size(space_id, size); + } + last_offset+= rlen; + break; + } + uint32_t llen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(llen > rlen || llen > 3)) + goto record_corrupted; + const uint32_t len= mlog_decode_varint(l); + ut_ad(len != MLOG_DECODE_ERROR); + if (UNIV_UNLIKELY(last_offset + len > srv_page_size)) + goto record_corrupted; + l+= llen; + rlen-= llen; + llen= len; + if ((b & 0x70) == MEMSET) + { + if (UNIV_UNLIKELY(rlen > llen)) + goto record_corrupted; + last_offset+= llen; + break; + } + const uint32_t slen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(slen != rlen || slen > 3)) + goto record_corrupted; + uint32_t s= mlog_decode_varint(l); + ut_ad(slen != MLOG_DECODE_ERROR); + if (s & 1) + s= last_offset - (s >> 1) - 1; + else + s= last_offset + (s >> 1) + 1; + if (UNIV_UNLIKELY(s < 8 || s + llen > srv_page_size)) + goto record_corrupted; + last_offset+= llen; + break; + } +#if 0 && defined UNIV_DEBUG + switch (b & 0x70) { + case RESERVED: + case OPTION: + ut_ad(0); /* we did "continue" earlier */ + break; + case FREE_PAGE: + break; + default: + ut_ad(modified.emplace(id).second || (b & 0x70) != INIT_PAGE); + } +#endif + switch (store) { + case STORE_NO: + continue; + case STORE_IF_EXISTS: + if (!fil_space_get_size(space_id)) + continue; + /* fall through */ + case STORE_YES: + add(page_id_t(space_id, page_no), start_lsn, end_lsn, recs, + static_cast<size_t>(l + rlen - recs)); + } + } +#if 1 /* MDEV-14425 FIXME: this must be in the checkpoint file only! 
*/ + else if (rlen) + { + switch (b & 0xf0) { +# if 1 /* MDEV-14425 FIXME: Remove this! */ + case FILE_CHECKPOINT: + if (space_id == 0 && page_no == 0 && rlen == 8) + { + const lsn_t lsn= mach_read_from_8(l); + + if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) + fprintf(stderr, "FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n", + lsn, lsn != checkpoint_lsn + ? "ignored" + : mlog_checkpoint_lsn ? "reread" : "read", + recovered_lsn); + + DBUG_PRINT("ib_log", ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF, + lsn, lsn != checkpoint_lsn + ? "ignored" + : mlog_checkpoint_lsn ? "reread" : "read", + recovered_lsn)); + + if (lsn == checkpoint_lsn) + { + ut_ad(mlog_checkpoint_lsn <= recovered_lsn); + if (mlog_checkpoint_lsn) + continue; + mlog_checkpoint_lsn= recovered_lsn; + l+= 8; + recovered_offset= l - buf; + return true; + } + continue; + } +# endif + /* fall through */ + default: + if (!srv_force_recovery) + goto malformed; + ib::warn() << "Ignoring malformed log record at LSN " << recovered_lsn; + continue; + case FILE_DELETE: + case FILE_MODIFY: + case FILE_RENAME: + if (UNIV_UNLIKELY(page_no != 0)) + { + file_rec_error: + if (!srv_force_recovery) + { + ib::error() << "Corrupted file-level record;" + " set innodb_force_recovery=1 to ignore."; + goto corrupted; + } + + ib::warn() << "Ignoring corrupted file-level record at LSN " + << recovered_lsn; + continue; + } + /* fall through */ + case FILE_CREATE: + if (UNIV_UNLIKELY(space_id == 0)) + goto file_rec_error; + /* There is no terminating NUL character. Names must end in .ibd. + For FILE_RENAME, there is a NUL between the two file names. */ + const char * const fn= reinterpret_cast<const char*>(l); + const char *fn2= static_cast<const char*>(memchr(fn, 0, rlen)); + + if (UNIV_UNLIKELY((fn2 == nullptr) == ((b & 0xf0) == FILE_RENAME))) + goto file_rec_error; + + const char * const fnend= fn2 ? fn2 : fn + rlen; + const char * const fn2end= fn2 ? 
fn + rlen : nullptr; + + if (fn2) + { + fn2++; + if (memchr(fn2, 0, fn2end - fn2)) + goto file_rec_error; + if (fn2end - fn2 < 4 || memcmp(fn2end - 4, DOT_IBD, 4)) + goto file_rec_error; + } + + if (page_no) + { + if (UNIV_UNLIKELY((b & 0xf0) != FILE_CREATE)) + goto file_rec_error; + /* truncating an undo log tablespace */ + ut_ad(fnend - fn >= 7); + ut_ad(!memcmp(fnend - 7, "undo", 4)); + ut_d(char n[4]; char *end; memcpy(n, fnend - 3, 3); n[3]= 0); + ut_ad(strtoul(n, &end, 10) <= 127); + ut_ad(end == &n[3]); + ut_ad(page_no == SRV_UNDO_TABLESPACE_SIZE_IN_PAGES); + ut_ad(srv_is_undo_tablespace(space_id)); + static_assert(UT_ARR_SIZE(truncated_undo_spaces) == + TRX_SYS_MAX_UNDO_SPACES, "compatibility"); + truncated_undo_spaces[space_id - srv_undo_space_id_start]= + { recovered_lsn, page_no }; + continue; + } + if (is_predefined_tablespace(space_id)) + goto file_rec_error; + if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4)) + goto file_rec_error; + + const char saved_end= fn[rlen]; + const_cast<char&>(fn[rlen])= '\0'; + fil_name_process(const_cast<char*>(fn), fnend - fn, space_id, + (b & 0xf0) == FILE_DELETE); + if (fn2) + fil_name_process(const_cast<char*>(fn2), fn2end - fn2, space_id, + false); + if ((b & 0xf0) < FILE_MODIFY && log_file_op) + log_file_op(space_id, (b & 0xf0) == FILE_CREATE, + l, static_cast<ulint>(fnend - fn), + reinterpret_cast<const byte*>(fn2), + fn2 ? static_cast<ulint>(fn2end - fn2) : 0); + + if (!fn2 || !apply); + else if (!fil_op_replay_rename(space_id, 0, fn, fn2)) + found_corrupt_fs= true; + const_cast<char&>(fn[rlen])= saved_end; + if (UNIV_UNLIKELY(found_corrupt_fs)) + return true; + } + } +#endif + else + goto malformed; + } + + ut_ad(l == el); + recovered_offset= l - buf; + recovered_lsn= end_lsn; + goto loop; +} + + /*********************************************************************//** Copies the log record body from recv to buf. */ static ATTRIBUTE_COLD @@ -2018,13 +2760,15 @@ lsn of a log record. 
@param[in,out] block buffer pool page @param[in,out] mtr mini-transaction @param[in,out] p recovery address +@param[in,out] space tablespace, or NULL if not looked up yet @param[in,out] init page initialization operation, or NULL */ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, const recv_sys_t::map::iterator& p, + fil_space_t* space = NULL, mlog_init_t::init* init = NULL) { page_t* page; - page_zip_des_t* page_zip; + page_zip_des_t* page_zip; ut_ad(mutex_own(&recv_sys.mutex)); ut_ad(recv_sys.apply_log_recs); @@ -2033,12 +2777,15 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, ut_ad(!init || init->lsn); ut_ad(block->page.id == p->first); ut_ad(!p->second.is_being_processed()); + ut_ad(!space || space->id == block->page.id.space()); if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) { ib::info() << "Applying log to page " << block->page.id; } - DBUG_LOG("ib_log", "Applying log to page " << block->page.id); + DBUG_PRINT("ib_log", ("Applying log to page %u:%u", + block->page.id.space(), + block->page.id.page_no())); p->second.state = page_recv_t::RECV_BEING_PROCESSED; @@ -2047,11 +2794,17 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, page = block->frame; page_zip = buf_block_get_page_zip(block); - const lsn_t page_lsn = mach_read_from_8(page + FIL_PAGE_LSN); + byte *frame = UNIV_LIKELY_NULL(block->page.zip.data) + ? block->page.zip.data + : page; + const lsn_t page_lsn = init + ? 0 + : mach_read_from_8(frame + FIL_PAGE_LSN); bool free_page = false; lsn_t start_lsn = 0, end_lsn = 0; ut_d(lsn_t recv_start_lsn = 0); const lsn_t init_lsn = init ? init->lsn : 0; + const bool is_physical = log_sys.is_physical(); for (const log_rec_t* l : p->second.log) { ut_ad(l->lsn); @@ -2065,23 +2818,108 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, ut_d(recv_start_lsn = recv->start_lsn); if (recv->start_lsn < page_lsn) { - /* Ignore this record, because there are later changes - for this page. 
*/ - DBUG_LOG("ib_log", "apply skip " - << get_mlog_string(recv->type) - << " LSN " << recv->start_lsn << " < " - << page_lsn); - } else if (recv->start_lsn < init_lsn) { - DBUG_LOG("ib_log", "init skip " - << get_mlog_string(recv->type) - << " LSN " << recv->start_lsn << " < " - << init_lsn); + /* This record has already been applied. */ + DBUG_PRINT("ib_log", ("apply skip %u:%u LSN " LSN_PF + " < " LSN_PF, + block->page.id.space(), + block->page.id.page_no(), + recv->start_lsn, page_lsn)); + continue; + } + + if (recv->start_lsn < init_lsn) { + DBUG_PRINT("ib_log", ("init skip %s %u:%u LSN " LSN_PF + " < " LSN_PF, + is_physical + ? "?" + : get_mlog_string(recv->type), + block->page.id.space(), + block->page.id.page_no(), + recv->start_lsn, init_lsn)); + continue; + } + + if (is_physical) { + const log_phys_t *f= static_cast<const log_phys_t*>(l); + + if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) { + ib::info() << "apply " << f->start_lsn + << ": " << block->page.id; + } + + DBUG_PRINT("ib_log", ("apply " LSN_PF ": %u:%u", + f->start_lsn, + block->page.id.space(), + block->page.id.page_no())); + + log_phys_t::apply_status a= f->apply( + *block, p->second.last_offset); + + switch (a) { + case log_phys_t::APPLIED_NO: + ut_ad(!mtr.has_modifications()); + free_page = true; + start_lsn = 0; + continue; + case log_phys_t::APPLIED_YES: + goto set_start_lsn; + case log_phys_t::APPLIED_TO_FSP_HEADER: + case log_phys_t::APPLIED_TO_ENCRYPTION: + break; + } + + if (fil_space_t* s = space + ? 
space + : fil_space_acquire(block->page.id.space())) { + switch (a) { + case log_phys_t::APPLIED_TO_FSP_HEADER: + s->flags = mach_read_from_4( + FSP_HEADER_OFFSET + + FSP_SPACE_FLAGS + frame); + s->size_in_header = mach_read_from_4( + FSP_HEADER_OFFSET + FSP_SIZE + + frame); + s->free_limit = mach_read_from_4( + FSP_HEADER_OFFSET + + FSP_FREE_LIMIT + frame); + s->free_len = mach_read_from_4( + FSP_HEADER_OFFSET + FSP_FREE + + FLST_LEN + frame); + break; + default: + byte* b= frame + + fsp_header_get_encryption_offset( + block->zip_size()) + + FSP_HEADER_OFFSET; + if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) { + break; + } + b += MAGIC_SZ; + if (*b != CRYPT_SCHEME_UNENCRYPTED + && *b != CRYPT_SCHEME_1) { + break; + } + if (b[1] != MY_AES_BLOCK_SIZE) { + break; + } + if (b[2 + MY_AES_BLOCK_SIZE + 4 + 4] + > FIL_ENCRYPTION_OFF) { + break; + } + fil_crypt_parse(s, b); + } + + if (s != space) { + s->release(); + } + } } else { if (recv->type == MLOG_INIT_FREE_PAGE) { /* This does not really modify the page. */ + ut_ad(!mtr.has_modifications()); free_page = true; - } else if (!start_lsn) { - start_lsn = recv->start_lsn; + start_lsn = 0; + continue; } if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) { @@ -2130,9 +2968,24 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, FIL_PAGE_LSN + page, 8); } } + +set_start_lsn: + if (!start_lsn) { + start_lsn = recv->start_lsn; + } } if (start_lsn) { + ut_ad(end_lsn >= start_lsn); + mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn); + if (UNIV_LIKELY(frame == block->frame)) { + mach_write_to_8(srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + + frame, end_lsn); + } else { + buf_zip_decompress(block, false); + } + buf_block_modify_clock_inc(block); log_flush_order_mutex_enter(); buf_flush_note_modification(block, start_lsn, end_lsn); @@ -2187,8 +3040,9 @@ ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id) } /** Apply any buffered redo log to a page that was just read from a data file. 
+@param[in,out] space tablespace @param[in,out] bpage buffer pool page */ -void recv_recover_page(buf_page_t* bpage) +void recv_recover_page(fil_space_t* space, buf_page_t* bpage) { mtr_t mtr; mtr.start(); @@ -2211,7 +3065,7 @@ void recv_recover_page(buf_page_t* bpage) recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id); if (p != recv_sys.pages.end() && !p->second.is_being_processed()) { - recv_recover_page(block, mtr, p); + recv_recover_page(block, mtr, p, space); p->second.log.clear(); recv_sys.pages.erase(p); goto func_exit; @@ -2391,7 +3245,7 @@ void recv_apply_hashed_log_recs(bool last_batch) buf_block_dbg_add_level( block, SYNC_NO_ORDER_CHECK); mtr.x_latch_at_savepoint(0, block); - recv_recover_page(block, mtr, p, &i); + recv_recover_page(block, mtr, p, space, &i); ut_ad(mtr.has_committed()); p->second.log.clear(); recv_sys.pages.erase(p); @@ -2560,26 +3414,6 @@ recv_parse_log_rec( return ulint(new_ptr - ptr); } -/*******************************************************//** -Calculates the new value for lsn when more data is added to the log. */ -static -lsn_t -recv_calc_lsn_on_data_add( -/*======================*/ - lsn_t lsn, /*!< in: old lsn */ - ib_uint64_t len) /*!< in: this many bytes of data is - added, log block headers not included */ -{ - unsigned frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE; - unsigned payload_size = log_sys.payload_size(); - ut_ad(frag_len < payload_size); - lsn_t lsn_len = len; - lsn_len += (lsn_len + frag_len) / payload_size - * (OS_FILE_LOG_BLOCK_SIZE - payload_size); - - return(lsn + lsn_len); -} - /** Prints diagnostic info of corrupt log. @param[in] ptr pointer to corrupt log record @param[in] type type of the log record (could be garbage) @@ -2658,10 +3492,18 @@ hash table to wait merging to file pages. 
@param[in] checkpoint_lsn the LSN of the latest checkpoint @param[in] store whether to store page operations @param[in] apply whether to apply the records -@return whether MLOG_CHECKPOINT record was seen the first time, -or corruption was noticed */ -bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t* store, bool apply) +@return whether MLOG_CHECKPOINT or FILE_CHECKPOINT record +was seen the first time, or corruption was noticed */ +bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t *store, bool apply) { + ut_ad(log_mutex_own()); + ut_ad(mutex_own(&recv_sys.mutex)); + ut_ad(recv_sys.parse_start_lsn != 0); + + if (log_sys.is_physical()) { + return recv_sys.parse(checkpoint_lsn, *store, apply); + } + bool single_rec; ulint len; lsn_t new_recovered_lsn; @@ -2672,9 +3514,6 @@ bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t* store, bool apply) const byte* body; const bool last_phase = (*store == STORE_IF_EXISTS); - ut_ad(log_mutex_own()); - ut_ad(mutex_own(&recv_sys.mutex)); - ut_ad(recv_sys.parse_start_lsn != 0); loop: const byte* ptr = recv_sys.buf + recv_sys.recovered_offset; const byte* end_ptr = recv_sys.buf + recv_sys.len; @@ -3087,6 +3926,10 @@ static bool recv_scan_log_recs( const byte* const log_end = log_block + ulint(end_lsn - start_lsn); + const ulint sizeof_checkpoint= log_sys.is_physical() + ? SIZE_OF_FILE_CHECKPOINT + : SIZE_OF_MLOG_CHECKPOINT; + do { ut_ad(!finished); @@ -3132,11 +3975,17 @@ static bool recv_scan_log_recs( scanned_lsn += data_len; - if (data_len == LOG_BLOCK_HDR_SIZE + SIZE_OF_MLOG_CHECKPOINT - && scanned_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT - && log_block[LOG_BLOCK_HDR_SIZE] == MLOG_CHECKPOINT - && checkpoint_lsn == mach_read_from_8(LOG_BLOCK_HDR_SIZE - + 1 + log_block)) { + if (data_len == LOG_BLOCK_HDR_SIZE + sizeof_checkpoint + && scanned_lsn == checkpoint_lsn + sizeof_checkpoint + && log_block[LOG_BLOCK_HDR_SIZE] + == (log_sys.is_physical() + ? 
FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2) + : MLOG_CHECKPOINT) + && checkpoint_lsn == mach_read_from_8( + (log_sys.is_physical() + ? LOG_BLOCK_HDR_SIZE + 1 + 2 + : LOG_BLOCK_HDR_SIZE + 1) + + log_block)) { /* The redo log is logically empty. */ ut_ad(recv_sys.mlog_checkpoint_lsn == 0 || recv_sys.mlog_checkpoint_lsn @@ -3170,8 +4019,7 @@ static bool recv_scan_log_recs( DBUG_EXECUTE_IF( "reduce_recv_parsing_buf", - recv_parsing_buf_size - = (70 * 1024); + recv_parsing_buf_size = RECV_SCAN_SIZE * 2; ); if (recv_sys.len + 4 * OS_FILE_LOG_BLOCK_SIZE @@ -3231,7 +4079,10 @@ static bool recv_scan_log_recs( recv_sys.is_memory_exhausted(store); - if (recv_sys.recovered_offset > recv_parsing_buf_size / 4) { + if (recv_sys.recovered_offset > recv_parsing_buf_size / 4 + || (recv_sys.recovered_offset + && recv_sys.len + >= recv_parsing_buf_size - RECV_SCAN_SIZE)) { /* Move parsing buffer data to the buffer start */ recv_sys_justify_left_parsing_buf(); } @@ -3469,10 +4320,15 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace) are some redo log records for it. */ fil_names_dirty(rs.second.space); } else if (rs.second.name == "") { - ib::error() << "Missing MLOG_FILE_NAME" - " or MLOG_FILE_DELETE" - " before MLOG_CHECKPOINT for tablespace " - << rs.first; + ib::error() << (log_sys.is_physical() + ? "Missing FILE_CREATE, FILE_DELETE" + " or FILE_MODIFY" + " before FILE_CHECKPOINT" + " for tablespace " + : "Missing MLOG_FILE_NAME" + " or MLOG_FILE_DELETE" + " before MLOG_CHECKPOINT" + " for tablespace ") << rs.first; recv_sys.found_corrupt_log = true; return(DB_CORRUPTION); } else { @@ -3576,7 +4432,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) return(DB_ERROR); } - /* Look for MLOG_CHECKPOINT. */ + /* Look for MLOG_CHECKPOINT or FILE_CHECKPOINT. */ recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false); /* The first scan should not have stored or applied any records. 
*/ ut_ad(recv_sys.pages.empty()); @@ -3598,7 +4454,9 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) { log_mutex_exit(); ib::error err; - err << "Missing MLOG_CHECKPOINT"; + err << (log_sys.is_physical() + ? "Missing FILE_CHECKPOINT" + : "Missing MLOG_CHECKPOINT"); if (end_lsn) { err << " at " << end_lsn; } @@ -3624,14 +4482,17 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) /* NOTE: we always do a 'recovery' at startup, but only if there is something wrong we will print a message to the user about recovery: */ + const ulint sizeof_checkpoint= log_sys.is_physical() + ? SIZE_OF_FILE_CHECKPOINT + : SIZE_OF_MLOG_CHECKPOINT; - if (flush_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT + if (flush_lsn == checkpoint_lsn + sizeof_checkpoint && recv_sys.mlog_checkpoint_lsn == checkpoint_lsn) { /* The redo log is logically empty. */ } else if (checkpoint_lsn != flush_lsn) { ut_ad(!srv_log_files_created); - if (checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT < flush_lsn) { + if (checkpoint_lsn + sizeof_checkpoint < flush_lsn) { ib::warn() << "Are you sure you are using the" " right ib_logfiles to start up the database?" " Log sequence number in the ib_logfiles is " @@ -3770,7 +4631,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) log_sys.last_checkpoint_lsn = checkpoint_lsn; if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL) { - /* Write a MLOG_CHECKPOINT marker as the first thing, + /* Write a MLOG_CHECKPOINT or FILE_CHECKPOINT first, before generating any other redo log. This ensures that subsequent crash recovery will be possible even if the server were killed soon after this. 
*/ @@ -3937,9 +4798,6 @@ static const char* get_mlog_string(mlog_id_t type) case MLOG_IBUF_BITMAP_INIT: return("MLOG_IBUF_BITMAP_INIT"); - case MLOG_ZIP_WRITE_STRING: - return("MLOG_ZIP_WRITE_STRING"); - case MLOG_WRITE_STRING: return("MLOG_WRITE_STRING"); diff --git a/storage/innobase/mtr/mtr0log.cc b/storage/innobase/mtr/mtr0log.cc index 63a313ff0b8..fb363f012ce 100644 --- a/storage/innobase/mtr/mtr0log.cc +++ b/storage/innobase/mtr/mtr0log.cc @@ -26,15 +26,14 @@ Created 12/7/1995 Heikki Tuuri #include "mtr0log.h" #include "buf0buf.h" -#include "dict0dict.h" +#include "dict0mem.h" #include "log0recv.h" #include "page0page.h" -#include "buf0dblwr.h" -#include "dict0boot.h" /********************************************************//** -Parses an initial log record written by mtr_t::write_low(). +Parses an initial log record written by mlog_write_initial_log_record_low(). @return parsed record end, NULL if not a complete record */ +ATTRIBUTE_COLD /* only used when crash-upgrading */ const byte* mlog_parse_initial_log_record( /*==========================*/ @@ -196,112 +195,6 @@ mlog_parse_nbytes( return const_cast<byte*>(ptr); } -/** -Write a log record for writing 1, 2, 4, or 8 bytes. -@param[in] type number of bytes to write -@param[in] block file page -@param[in] ptr pointer within block.frame -@param[in,out] l log record buffer -@return new end of mini-transaction log */ -byte *mtr_t::log_write_low(mlog_id_t type, const buf_block_t &block, - const byte *ptr, byte *l) -{ - ut_ad(type == MLOG_1BYTE || type == MLOG_2BYTES || type == MLOG_4BYTES || - type == MLOG_8BYTES); - ut_ad(block.page.state == BUF_BLOCK_FILE_PAGE); - ut_ad(ptr >= block.frame + FIL_PAGE_OFFSET); - ut_ad(ptr + unsigned(type) <= - &block.frame[srv_page_size - FIL_PAGE_DATA_END]); - l= log_write_low(type, block.page.id, l); - mach_write_to_2(l, page_offset(ptr)); - return l + 2; -} - -/** -Write a log record for writing 1, 2, or 4 bytes. 
-@param[in] block file page -@param[in,out] ptr pointer in file page -@param[in] l number of bytes to write -@param[in,out] log_ptr log record buffer -@param[in] val value to write */ -void mtr_t::log_write(const buf_block_t &block, byte *ptr, mlog_id_t l, - byte *log_ptr, uint32_t val) -{ - ut_ad(l == MLOG_1BYTE || l == MLOG_2BYTES || l == MLOG_4BYTES); - log_ptr= log_write_low(l, block, ptr, log_ptr); - log_ptr+= mach_write_compressed(log_ptr, val); - m_log.close(log_ptr); -} - -/** -Write a log record for writing 8 bytes. -@param[in] block file page -@param[in,out] ptr pointer in file page -@param[in] l number of bytes to write -@param[in,out] log_ptr log record buffer -@param[in] val value to write */ -void mtr_t::log_write(const buf_block_t &block, byte *ptr, mlog_id_t l, - byte *log_ptr, uint64_t val) -{ - ut_ad(l == MLOG_8BYTES); - log_ptr= log_write_low(l, block, ptr, log_ptr); - log_ptr+= mach_u64_write_compressed(log_ptr, val); - m_log.close(log_ptr); -} - -/** Log a write of a byte string to a page. -@param[in] b buffer page -@param[in] ofs byte offset from b->frame -@param[in] len length of the data to write */ -void mtr_t::memcpy(const buf_block_t &b, ulint ofs, ulint len) -{ - ut_ad(len); - ut_ad(ofs <= ulint(srv_page_size)); - ut_ad(ofs + len <= ulint(srv_page_size)); - - set_modified(); - if (m_log_mode != MTR_LOG_ALL) - { - ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO); - return; - } - - ut_ad(ofs + len < PAGE_DATA || !b.page.zip.data || - mach_read_from_2(b.frame + FIL_PAGE_TYPE) <= FIL_PAGE_TYPE_ZBLOB2); - - byte *l= log_write_low(MLOG_WRITE_STRING, b.page.id, m_log.open(11 + 2 + 2)); - mach_write_to_2(l, ofs); - mach_write_to_2(l + 2, len); - m_log.close(l + 4); - m_log.push(b.frame + ofs, static_cast<uint32_t>(len)); -} - -/** Write a byte string to a ROW_FORMAT=COMPRESSED page. 
-@param[in] b ROW_FORMAT=COMPRESSED index page -@param[in] ofs byte offset from b.zip.data -@param[in] len length of the data to write */ -void mtr_t::zmemcpy(const buf_page_t &b, ulint offset, ulint len) -{ - ut_ad(page_zip_simple_validate(&b.zip)); - ut_ad(len); - ut_ad(offset + len <= page_zip_get_size(&b.zip)); - ut_ad(mach_read_from_2(b.zip.data + FIL_PAGE_TYPE) == FIL_PAGE_INDEX || - mach_read_from_2(b.zip.data + FIL_PAGE_TYPE) == FIL_PAGE_RTREE); - - set_modified(); - if (m_log_mode != MTR_LOG_ALL) - { - ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO); - return; - } - - byte *l= log_write_low(MLOG_ZIP_WRITE_STRING, b.id, m_log.open(11 + 2 + 2)); - mach_write_to_2(l, offset); - mach_write_to_2(l + 2, len); - m_log.close(l + 4); - m_log.push(b.zip.data + offset, static_cast<uint32_t>(len)); -} - /********************************************************//** Parses a log record written by mtr_t::memcpy(). @return parsed record end, NULL if not a complete record */ @@ -353,34 +246,6 @@ mlog_parse_string( return(ptr + len); } -/** Initialize a string of bytes. 
-@param[in,out] b buffer page -@param[in] ofs byte offset from block->frame -@param[in] len length of the data to write -@param[in] val the data byte to write */ -void mtr_t::memset(const buf_block_t* b, ulint ofs, ulint len, byte val) -{ - ut_ad(len); - ut_ad(ofs <= ulint(srv_page_size)); - ut_ad(ofs + len <= ulint(srv_page_size)); - ut_ad(ofs + len < PAGE_DATA || !b->page.zip.data || - mach_read_from_2(b->frame + FIL_PAGE_TYPE) <= FIL_PAGE_TYPE_ZBLOB2); - ::memset(ofs + b->frame, val, len); - - set_modified(); - if (m_log_mode != MTR_LOG_ALL) - { - ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO); - return; - } - - byte *l= log_write_low(MLOG_MEMSET, b->page.id, m_log.open(11 + 2 + 2 + 1)); - mach_write_to_2(l, ofs); - mach_write_to_2(l + 2, len); - l[4]= val; - m_log.close(l + 5); -} - /********************************************************//** Parses a log record written by mlog_open_and_write_index. @return parsed record end, NULL if not a complete record */ diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index cacdb4878c8..2e907d6b113 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -378,13 +378,15 @@ void mtr_t::start() ut_d(m_start= true); ut_d(m_commit= false); + m_last= nullptr; + m_last_offset= 0; + new(&m_memo) mtr_buf_t(); new(&m_log) mtr_buf_t(); m_made_dirty= false; m_inside_ibuf= false; m_modifications= false; - m_n_log_recs= 0; m_log_mode= MTR_LOG_ALL; ut_d(m_user_space_id= TRX_SYS_SPACE); m_user_space= nullptr; @@ -411,7 +413,7 @@ void mtr_t::commit() ut_ad(!m_modifications || !recv_no_log_write); ut_ad(!m_modifications || m_log_mode != MTR_LOG_NONE); - if (m_modifications && (m_n_log_recs || m_log_mode == MTR_LOG_NO_REDO)) + if (m_modifications && (m_log_mode == MTR_LOG_NO_REDO || !m_log.empty())) { ut_ad(!srv_read_only_mode || m_log_mode == MTR_LOG_NO_REDO); @@ -445,7 +447,7 @@ void mtr_t::commit() /** Commit a mini-transaction that did not modify any pages, 
but generated some redo log on a higher level, such as -MLOG_FILE_NAME records and an optional MLOG_CHECKPOINT marker. +FILE_MODIFY records and an optional FILE_CHECKPOINT marker. The caller must invoke log_mutex_enter() and log_mutex_exit(). This is to be used at log_checkpoint(). @param[in] checkpoint_lsn log checkpoint LSN, or 0 */ @@ -458,23 +460,16 @@ void mtr_t::commit_files(lsn_t checkpoint_lsn) ut_ad(!m_made_dirty); ut_ad(m_memo.size() == 0); ut_ad(!srv_read_only_mode); - ut_ad(checkpoint_lsn || m_n_log_recs > 1); - - switch (m_n_log_recs) { - case 0: - break; - case 1: - *m_log.front()->begin() |= MLOG_SINGLE_REC_FLAG; - break; - default: - *m_log.push<byte*>(1) = MLOG_MULTI_REC_END; - } if (checkpoint_lsn) { - byte* ptr = m_log.push<byte*>(SIZE_OF_MLOG_CHECKPOINT); - compile_time_assert(SIZE_OF_MLOG_CHECKPOINT == 1 + 8); - *ptr = MLOG_CHECKPOINT; - mach_write_to_8(ptr + 1, checkpoint_lsn); + byte* ptr = m_log.push<byte*>(SIZE_OF_FILE_CHECKPOINT); + compile_time_assert(SIZE_OF_FILE_CHECKPOINT == 3 + 8 + 1); + *ptr = FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2); + ::memset(ptr + 1, 0, 2); + mach_write_to_8(ptr + 3, checkpoint_lsn); + ptr[3 + 8] = 0; + } else { + *m_log.push<byte*>(1) = 0; } finish_write(m_log.size()); @@ -482,14 +477,14 @@ void mtr_t::commit_files(lsn_t checkpoint_lsn) if (checkpoint_lsn) { DBUG_PRINT("ib_log", - ("MLOG_CHECKPOINT(" LSN_PF ") written at " LSN_PF, + ("FILE_CHECKPOINT(" LSN_PF ") written at " LSN_PF, checkpoint_lsn, log_sys.lsn)); } } #ifdef UNIV_DEBUG /** Check if a tablespace is associated with the mini-transaction -(needed for generating a MLOG_FILE_NAME record) +(needed for generating a FILE_MODIFY record) @param[in] space tablespace @return whether the mini-transaction is associated with the space */ bool @@ -510,7 +505,7 @@ mtr_t::is_named_space(ulint space) const return(false); } /** Check if a tablespace is associated with the mini-transaction -(needed for generating a MLOG_FILE_NAME record) +(needed for generating 
a FILE_MODIFY record) @param[in] space tablespace @return whether the mini-transaction is associated with the space */ bool mtr_t::is_named_space(const fil_space_t* space) const @@ -618,53 +613,32 @@ inline ulint mtr_t::prepare_write() } ulint len = m_log.size(); - ulint n_recs = m_n_log_recs; ut_ad(len > 0); - ut_ad(n_recs > 0); if (len > srv_log_buffer_size / 2) { log_buffer_extend(ulong((len + 1) * 2)); } - ut_ad(m_n_log_recs == n_recs); - fil_space_t* space = m_user_space; if (space != NULL && is_predefined_tablespace(space->id)) { - /* Omit MLOG_FILE_NAME for predefined tablespaces. */ + /* Omit FILE_MODIFY for predefined tablespaces. */ space = NULL; } log_mutex_enter(); - if (fil_names_write_if_was_clean(space, this)) { - /* This mini-transaction was the first one to modify - this tablespace since the latest checkpoint, so - some MLOG_FILE_NAME records were appended to m_log. */ - ut_ad(m_n_log_recs > n_recs); - *m_log.push<byte*>(1) = MLOG_MULTI_REC_END; + if (fil_names_write_if_was_clean(space)) { len = m_log.size(); } else { /* This was not the first time of dirtying a tablespace since the latest checkpoint. */ - - ut_ad(n_recs == m_n_log_recs); - - if (n_recs <= 1) { - ut_ad(n_recs == 1); - - /* Flag the single log record as the - only record in this mini-transaction. */ - *m_log.front()->begin() |= MLOG_SINGLE_REC_FLAG; - } else { - /* Because this mini-transaction comprises - multiple log records, append MLOG_MULTI_REC_END - at the end. 
*/ - *m_log.push<byte*>(1) = MLOG_MULTI_REC_END; - len++; - } + ut_ad(len == m_log.size()); } + *m_log.push<byte*>(1) = 0; + len++; + /* check and attempt a checkpoint if exceeding capacity */ log_margin_checkpoint_age(len); diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index e3f59187650..d6d908a3163 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -791,6 +791,13 @@ static void rec_set_heap_no(rec_t *rec, ulint heap_no, bool compact) REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); } +static rec_t* +page_cur_parse_insert_rec_zip( + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + offset_t* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr); /*!< in/out: mini-transaction */ /***********************************************************//** Parses a log record of a record insert on a page. @return end of log record or NULL */ @@ -960,9 +967,9 @@ page_cur_parse_insert_rec( /* The redo log record should only have been written after the write was successful. */ if (block->page.zip.data) { - if (!page_cur_insert_rec_zip(&cursor, index, - buf + origin_offset, - offsets, mtr)) { + if (!page_cur_parse_insert_rec_zip(&cursor, index, + buf + origin_offset, + offsets, mtr)) { ut_error; } } else if (!page_cur_insert_rec_low(&cursor, index, @@ -983,60 +990,6 @@ page_cur_parse_insert_rec( return(const_cast<byte*>(ptr + end_seg_len)); } -/** Reset PAGE_DIRECTION and PAGE_N_DIRECTION. 
-@tparam compressed whether the page is in ROW_FORMAT=COMPRESSED -@param[in,out] block index page -@param[in,out] ptr the PAGE_DIRECTION_B field -@param[in,out] mtr mini-transaction */ -template<bool compressed=false> -inline void page_direction_reset(buf_block_t *block, byte *ptr, mtr_t *mtr) -{ - ut_ad(!block->page.zip.data || page_is_comp(block->frame)); - ut_ad(!compressed || block->page.zip.data); - ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + block->frame); - static_assert(PAGE_DIRECTION_B + 1 == PAGE_N_DIRECTION, "adjacent fields"); - - if (compressed) - { - *ptr= PAGE_NO_DIRECTION; /* no instant ALTER bits */ - memset_aligned<2>(ptr + 1, 0, 2); - page_zip_write_header(block, ptr, 3, mtr); - } - else - { - mtr->write<1,mtr_t::OPT>(*block, ptr, (*ptr & ~((1U << 3) - 1)) - | PAGE_NO_DIRECTION); - mtr->write<2,mtr_t::OPT>(*block, ptr + 1, 0U); - } -} - -/** Increment PAGE_N_DIRECTION. -@tparam compressed whether the page is in ROW_FORMAT=COMPRESSED -@param[in,out] block index page -@param[in,out] ptr the PAGE_DIRECTION_B field -@param[in] dir PAGE_RIGHT or PAGE_LEFT -@param[in,out] mtr mini-transaction */ -template<bool compressed=false> -inline void page_direction_increment(buf_block_t *block, byte *ptr, uint dir, - mtr_t *mtr) -{ - ut_ad(!block->page.zip.data || page_is_comp(block->frame)); - ut_ad(!compressed || block->page.zip.data); - ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + block->frame); - ut_ad(dir == PAGE_RIGHT || dir == PAGE_LEFT); - if (compressed) - { - *ptr= static_cast<byte>(dir); - mach_write_to_2(ptr + 1, 1 + mach_read_from_2(ptr + 1)); - page_zip_write_header(block, ptr, 3, mtr); - } - else - { - mtr->write<1,mtr_t::OPT>(*block, ptr, (*ptr & ~((1U << 3) - 1)) | dir); - mtr->write<2>(*block, ptr + 1, 1U + mach_read_from_2(ptr + 1)); - } -} - /** Set the owned records field of the record pointed to by a directory slot. 
@tparam compressed whether to update any ROW_FORMAT=COMPRESSED page as well @@ -1082,7 +1035,8 @@ static void page_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr) rec= page_rec_get_next_const(rec); /* Add a directory slot immediately below this one. */ - byte *n_slots_p= PAGE_N_DIR_SLOTS + PAGE_HEADER + block->frame; + constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER; + byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block->frame); const uint16_t n_slots= mach_read_from_2(n_slots_p); page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*> @@ -1093,12 +1047,13 @@ static void page_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr) const ulint half_owned= n_owned / 2; + mtr->write<2>(*block, n_slots_p, 1U + n_slots); + if (compressed) { /* Log changes to the compressed page header and the dense page directory. */ - mach_write_to_2(n_slots_p, n_slots + 1); - page_zip_write_header(block, n_slots_p, 2, mtr); + memcpy_aligned<2>(&block->page.zip.data[n_slots_f], n_slots_p, 2); mach_write_to_2(slot, page_offset(rec)); page_rec_set_n_owned<true>(block, page_dir_slot_get_rec(slot), half_owned, true, mtr); @@ -1109,8 +1064,9 @@ static void page_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr) } else { - mtr->write<2>(*block, n_slots_p, 1U + n_slots); - mtr->memcpy(*block, page_offset(last_slot), slot - last_slot); + mtr->memmove(*block, page_offset(last_slot), + page_offset(last_slot) + PAGE_DIR_SLOT_SIZE, + slot - last_slot); mtr->write<2>(*block, slot, page_offset(rec)); const bool comp= page_is_comp(block->frame) != 0; page_rec_set_n_owned<false>(block, page_dir_slot_get_rec(slot), half_owned, @@ -1164,22 +1120,20 @@ static void page_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr) block->frame, n_slots - 1); memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot, slot - last_slot); + constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER; + byte *n_slots_p= my_assume_aligned<2> + (n_slots_f + block->frame); + 
mtr->write<2>(*block, n_slots_p, n_slots - 1); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { memset_aligned<2>(last_slot, 0, 2); - mach_write_to_2(PAGE_N_DIR_SLOTS + PAGE_HEADER - + block->frame, n_slots - 1); - page_zip_write_header(block, - PAGE_N_DIR_SLOTS + PAGE_HEADER - + block->frame, 2, mtr); + memcpy_aligned<2>(n_slots_f + block->page.zip.data, + n_slots_p, 2); } else { - mtr->write<2>(*block, - PAGE_N_DIR_SLOTS + PAGE_HEADER - + block->frame, - n_slots - 1); + mtr->memmove(*block, PAGE_DIR_SLOT_SIZE + + page_offset(last_slot), + page_offset(last_slot), slot - last_slot); mtr->write<2>(*block, last_slot, 0U); - mtr->memcpy(*block, page_offset(last_slot) - + PAGE_DIR_SLOT_SIZE, - slot - last_slot); } return; @@ -1245,14 +1199,14 @@ static byte* page_mem_alloc_heap(buf_block_t *block, ulint need, mach_write_to_2(heap_top, top + need); mach_write_to_2(n_heap, h + 1); + mtr->memcpy(*block, PAGE_HEAP_TOP + PAGE_HEADER, 4); if (compressed) { ut_ad(h & 0x8000); - page_zip_write_header(block, heap_top, 4, mtr); + memcpy_aligned<4>(&block->page.zip.data[PAGE_HEAP_TOP + PAGE_HEADER], + heap_top, 4); } - else - mtr->memcpy(*block, PAGE_HEAP_TOP + PAGE_HEADER, 4); compile_time_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2); return &block->frame[top]; @@ -1272,236 +1226,332 @@ page_cur_insert_rec_low( offset_t* offsets,/*!< in/out: rec_get_offsets(rec, index) */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - byte* insert_buf; - ulint rec_size; - rec_t* last_insert; /*!< cursor position at previous - insert */ - rec_t* free_rec; /*!< a free record that was reused, - or NULL */ - rec_t* insert_rec; /*!< inserted record */ - ulint heap_no; /*!< heap number of the inserted - record */ - - rec_t* current_rec = cur->rec; - buf_block_t* block = cur->block; + buf_block_t* block = cur->block; - ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(dict_table_is_comp(index->table) - == (ibool) !!page_is_comp(block->frame)); - 
ut_ad(fil_page_index_page_check(block->frame)); - ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame) - == index->id - || index->is_dummy - || mtr->is_inside_ibuf()); + ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame)); + ut_ad(!!page_is_comp(block->frame) == !!rec_offs_comp(offsets)); + ut_ad(fil_page_index_page_check(block->frame)); + ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame) == + index->id || + index->is_dummy || + mtr->is_inside_ibuf()); - ut_ad(!page_rec_is_supremum(current_rec)); + ut_ad(!page_rec_is_supremum(cur->rec)); - /* We should not write log for ROW_FORMAT=COMPRESSED pages here. */ - ut_ad(mtr->get_log_mode() == MTR_LOG_NONE - || mtr->get_log_mode() == MTR_LOG_NO_REDO - || !(index->table->flags & DICT_TF_MASK_ZIP_SSIZE)); + /* We should not write log for ROW_FORMAT=COMPRESSED pages here. */ + ut_ad(mtr->get_log_mode() != MTR_LOG_ALL || + !(index->table->flags & DICT_TF_MASK_ZIP_SSIZE)); - /* 1. Get the size of the physical record in the page */ - rec_size = rec_offs_size(offsets); + /* 1. Get the size of the physical record in the page */ + const ulint rec_size= rec_offs_size(offsets); #ifdef UNIV_DEBUG_VALGRIND - { - const void* rec_start - = rec - rec_offs_extra_size(offsets); - ulint extra_size - = rec_offs_extra_size(offsets) - - (rec_offs_comp(offsets) - ? REC_N_NEW_EXTRA_BYTES - : REC_N_OLD_EXTRA_BYTES); - - /* All data bytes of the record must be valid. */ - UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); - /* The variable-length header must be valid. */ - UNIV_MEM_ASSERT_RW(rec_start, extra_size); - } + { + const void *rec_start= rec - rec_offs_extra_size(offsets); + ulint extra_size= rec_offs_extra_size(offsets) - + (page_is_comp(block->frame) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. 
*/ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } #endif /* UNIV_DEBUG_VALGRIND */ - /* 2. Try to find suitable space from page memory management */ + /* 2. Try to find suitable space from page memory management */ + ulint heap_no; + byte *insert_buf; + alignas(2) byte hdr[8]; - free_rec = page_header_get_ptr(block->frame, PAGE_FREE); - if (UNIV_LIKELY_NULL(free_rec)) { - /* Try to allocate from the head of the free list. */ - offset_t foffsets_[REC_OFFS_NORMAL_SIZE]; - offset_t* foffsets = foffsets_; - mem_heap_t* heap = NULL; - - rec_offs_init(foffsets_); + if (rec_t* free_rec = page_header_get_ptr(block->frame, PAGE_FREE)) + { + /* Try to reuse the head of PAGE_FREE. */ + offset_t foffsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t *heap= nullptr; + + rec_offs_init(foffsets_); + + offset_t *foffsets= rec_get_offsets(free_rec, index, foffsets_, + page_is_leaf(block->frame), + ULINT_UNDEFINED, &heap); + insert_buf= free_rec - rec_offs_extra_size(foffsets); + const bool too_small= rec_offs_size(foffsets) < rec_size; + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + if (too_small) + goto use_heap; + + byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + + block->frame); + if (page_is_comp(block->frame)) + { + heap_no= rec_get_heap_no_new(free_rec); + const rec_t *next= rec_get_next_ptr(free_rec, true); + mach_write_to_2(hdr, next ? 
page_offset(next) : 0); + } + else + { + heap_no= rec_get_heap_no_old(free_rec); + memcpy(hdr, free_rec - REC_NEXT, 2); + } - foffsets = rec_get_offsets( - free_rec, index, foffsets, page_is_leaf(block->frame), - ULINT_UNDEFINED, &heap); - if (rec_offs_size(foffsets) < rec_size) { - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } + static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility"); + byte *page_garbage = my_assume_aligned<2>(page_free + 2); + ut_ad(mach_read_from_2(page_garbage) >= rec_size); + mach_write_to_2(my_assume_aligned<2>(hdr + 2), + mach_read_from_2(page_garbage) - rec_size); + mtr->memcpy(*block, page_free, hdr, 4); + } + else + { +use_heap: + insert_buf= page_mem_alloc_heap(block, rec_size, &heap_no, mtr); - goto use_heap; - } + if (UNIV_UNLIKELY(!insert_buf)) + return nullptr; + } - insert_buf = free_rec - rec_offs_extra_size(foffsets); + const ulint extra_size= rec_offs_extra_size(offsets); + ut_ad(cur->rec != insert_buf + extra_size); - byte* page_free = my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER - + block->frame); - byte* page_garbage = my_assume_aligned<2>(PAGE_GARBAGE - + PAGE_HEADER - + block->frame); - ut_ad(mach_read_from_2(page_garbage) >= rec_size); - mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) - - rec_size); - if (page_is_comp(block->frame)) { - heap_no = rec_get_heap_no_new(free_rec); - const rec_t* next = rec_get_next_ptr(free_rec, true); - mach_write_to_2(page_free, - next ? 
page_offset(next) : 0); - } else { - heap_no = rec_get_heap_no_old(free_rec); - memcpy(page_free, free_rec - REC_NEXT, 2); - } + const rec_t *next_rec= page_rec_get_next_low(cur->rec, + page_is_comp(block->frame)); - compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2); - mtr->memcpy(*block, PAGE_FREE + PAGE_HEADER, 4); + /* Update page header fields */ + rec_t *last_insert= page_header_get_ptr(block->frame, PAGE_LAST_INSERT); + ut_ad(!last_insert || !page_is_comp(block->frame) || + rec_get_node_ptr_flag(last_insert) == rec_get_node_ptr_flag(rec)); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - } else { -use_heap: - free_rec = NULL; - insert_buf = page_mem_alloc_heap(block, rec_size, &heap_no, - mtr); + static_assert(PAGE_N_RECS - PAGE_LAST_INSERT + 2 == sizeof hdr, + "compatibility"); - if (UNIV_UNLIKELY(insert_buf == NULL)) { - return(NULL); - } - } + /* Write PAGE_LAST_INSERT */ + mach_write_to_2(hdr, page_offset(insert_buf + extra_size)); + static_assert(PAGE_INSTANT - PAGE_LAST_INSERT == 2, "compatibility"); + static_assert(PAGE_DIRECTION_B - PAGE_INSTANT == 1, "compatibility"); + static_assert(PAGE_N_DIRECTION - PAGE_DIRECTION_B == 1, "compat."); + static_assert(PAGE_N_RECS - PAGE_N_DIRECTION == 2, "compatibility"); - /* 3. Create the record */ - insert_rec = rec_copy(insert_buf, rec, offsets); - rec_offs_make_valid(insert_rec, index, page_is_leaf(block->frame), - offsets); + /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */ + memcpy_aligned<2>(hdr + 2, PAGE_HEADER + PAGE_INSTANT + block->frame, + PAGE_N_RECS - PAGE_INSTANT + 2); - /* 4. 
Insert the record in the linked list of records */ - ut_ad(current_rec != insert_rec); + if (!index->is_spatial()) + { + byte *dir= &hdr[PAGE_DIRECTION_B - PAGE_LAST_INSERT]; + byte *n= my_assume_aligned<2>(&hdr[PAGE_N_DIRECTION - PAGE_LAST_INSERT]); + if (UNIV_UNLIKELY(!last_insert)) + { +no_direction: + *dir= (*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION; + memset(n, 0, 2); + } + else if (last_insert == cur->rec && (*dir & ((1U << 3) - 1)) != PAGE_LEFT) + { + *dir= (*dir & ~((1U << 3) - 1)) | PAGE_RIGHT; +inc_dir: + mach_write_to_2(n, mach_read_from_2(n) + 1); + } + else if (next_rec == last_insert && (*dir & ((1U << 3) - 1)) != PAGE_RIGHT) + { + *dir= (*dir & ~((1U << 3) - 1)) | PAGE_LEFT; + goto inc_dir; + } + else + goto no_direction; + } - { - /* next record after current before the insertion */ - if (page_is_comp(block->frame)) { - const rec_t* next_rec = page_rec_get_next_low( - current_rec, true); + /* Update PAGE_N_RECS. */ + mach_write_to_2(hdr + PAGE_N_RECS - PAGE_LAST_INSERT, + mach_read_from_2(hdr + PAGE_N_RECS - PAGE_LAST_INSERT) + 1); + /* Write the header fields in one record. */ + mtr->memcpy(*block, PAGE_LAST_INSERT + PAGE_HEADER + block->frame, + hdr, PAGE_N_RECS - PAGE_LAST_INSERT + 2); + + /* Update the preceding record header, the 'owner' record and + prepare the record to insert. 
*/ + ulint n_owned; + static_assert(sizeof hdr >= REC_N_NEW_EXTRA_BYTES, "compatibility"); + static_assert(sizeof hdr >= REC_N_OLD_EXTRA_BYTES, "compatibility"); + ulint fixed_hdr; + + if (page_is_comp(block->frame)) + { #ifdef UNIV_DEBUG - switch (rec_get_status(current_rec)) { - case REC_STATUS_ORDINARY: - case REC_STATUS_NODE_PTR: - case REC_STATUS_INSTANT: - case REC_STATUS_INFIMUM: - break; - case REC_STATUS_SUPREMUM: - ut_ad(!"wrong status on current_rec"); - } - switch (rec_get_status(insert_rec)) { - case REC_STATUS_ORDINARY: - case REC_STATUS_NODE_PTR: - case REC_STATUS_INSTANT: - break; - case REC_STATUS_INFIMUM: - case REC_STATUS_SUPREMUM: - ut_ad(!"wrong status on insert_rec"); - } - ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + switch (rec_get_status(cur->rec)) { + case REC_STATUS_ORDINARY: + case REC_STATUS_NODE_PTR: + case REC_STATUS_INSTANT: + case REC_STATUS_INFIMUM: + break; + case REC_STATUS_SUPREMUM: + ut_ad(!"wrong status on cur->rec"); + } + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + case REC_STATUS_NODE_PTR: + case REC_STATUS_INSTANT: + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(!"wrong status on rec"); + } + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); #endif - mach_write_to_2(insert_rec - REC_NEXT, - static_cast<uint16_t> - (next_rec - insert_rec)); - mtr->write<2>(*block, current_rec - REC_NEXT, - static_cast<uint16_t> - (insert_rec - current_rec)); - } else { - memcpy(insert_rec - REC_NEXT, current_rec - REC_NEXT, - 2); - mtr->write<2>(*block, current_rec - REC_NEXT, - page_offset(insert_rec)); - } - } - - mtr->write<2>(*block, PAGE_N_RECS + PAGE_HEADER + block->frame, - 1U + page_get_n_recs(block->frame)); + memcpy(hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES); + rec_set_bit_field_1(hdr + REC_N_NEW_EXTRA_BYTES, 0, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + rec_set_bit_field_2(hdr + REC_N_NEW_EXTRA_BYTES, heap_no, + REC_NEW_HEAP_NO, 
REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + const rec_t *insert_rec= insert_buf + extra_size; + mach_write_to_2(REC_N_NEW_EXTRA_BYTES - REC_NEXT + hdr, + static_cast<uint16_t>(next_rec - insert_rec)); + mtr->write<2>(*block, cur->rec - REC_NEXT, + static_cast<uint16_t>(insert_rec - cur->rec)); + while (!(n_owned = rec_get_n_owned_new(next_rec))) + next_rec= page_rec_get_next_low(next_rec, true); + page_rec_set_n_owned<false>(block, const_cast<rec_t*>(next_rec), + n_owned + 1, true, mtr); + fixed_hdr= REC_N_NEW_EXTRA_BYTES; + } + else + { + memcpy(hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES); + rec_set_bit_field_1(hdr + REC_N_OLD_EXTRA_BYTES, 0, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + rec_set_bit_field_2(hdr + REC_N_OLD_EXTRA_BYTES, heap_no, + REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + memcpy(hdr + REC_N_OLD_EXTRA_BYTES - REC_NEXT, cur->rec - REC_NEXT, 2); + mtr->write<2>(*block, cur->rec - REC_NEXT, + page_offset(insert_buf + extra_size)); + while (!(n_owned = rec_get_n_owned_old(next_rec))) + next_rec= page_rec_get_next_low(next_rec, false); + page_rec_set_n_owned<false>(block, const_cast<rec_t*>(next_rec), + n_owned + 1, false, mtr); + fixed_hdr= REC_N_OLD_EXTRA_BYTES; + } - /* 5. Set the n_owned field in the inserted record to zero, - and set the heap_no field */ - if (page_is_comp(block->frame)) { - rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED, - REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); - rec_set_bit_field_2(insert_rec, heap_no, REC_NEW_HEAP_NO, - REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); - } else { - rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED, - REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); - rec_set_bit_field_2(insert_rec, heap_no, REC_OLD_HEAP_NO, - REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); - } + ut_ad(fixed_hdr <= extra_size); + /* Insert the record, possibly copying from the preceding record. 
*/ + const ulint data_size = rec_offs_data_size(offsets); + ut_ad(mtr->has_modifications()); - UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), - rec_offs_size(offsets)); - mtr->memcpy(*block, page_offset(insert_buf), rec_offs_size(offsets)); + if (mtr->get_log_mode() == MTR_LOG_ALL) + { + /* Temporarily write everything to rec, to simplify the code below. */ + byte rec_hdr[REC_N_OLD_EXTRA_BYTES]; + memcpy(rec_hdr, rec - fixed_hdr, fixed_hdr); + memcpy(const_cast<rec_t*>(rec - fixed_hdr), hdr, fixed_hdr); - /* 6. Update the last insertion info in page header */ + byte *b= insert_buf; + const byte *r= rec - extra_size; - last_insert = page_header_get_ptr(block->frame, PAGE_LAST_INSERT); - ut_ad(!last_insert || !page_is_comp(block->frame) - || rec_get_node_ptr_flag(last_insert) - == rec_get_node_ptr_flag(insert_rec)); + /* Skip any unchanged prefix of the record header. */ + for (;; b++, r++) + if (UNIV_UNLIKELY(b == insert_buf + rec_size)) + goto rec_done; + else if (*b != *r) + break; - if (!index->is_spatial()) { - byte* ptr = PAGE_HEADER + PAGE_DIRECTION_B + block->frame; - if (UNIV_UNLIKELY(last_insert == NULL)) { -no_direction: - page_direction_reset(block, ptr, mtr); - } else if (last_insert == current_rec - && page_ptr_get_direction(ptr) != PAGE_LEFT) { - page_direction_increment(block, ptr, PAGE_RIGHT, mtr); - } else if (page_ptr_get_direction(ptr) != PAGE_RIGHT - && page_rec_get_next(insert_rec) == last_insert) { - page_direction_increment(block, ptr, PAGE_LEFT, mtr); - } else { - goto no_direction; - } - } + { + const byte *c= cur->rec - (rec - r); + const byte *c_end= std::min(cur->rec + data_size, + block->frame + srv_page_size); + if (c <= insert_buf && c_end > insert_buf) + c_end= insert_buf; + + /* Try to copy any bytes of the preceding record. 
*/ + if (UNIV_LIKELY(c >= block->frame && c < c_end)) + { + const byte *cm= c; + const byte *rm= r; + while (*rm++ == *cm++) + if (cm == c_end) + break; + rm--, cm--; + ut_ad(rm - r + b <= insert_buf + rec_size); + size_t len= static_cast<size_t>(rm - r); + ut_ad(!memcmp(r, c, len)); + if (len > 2) + { + memcpy(b, c, len); + mtr->memmove(*block, page_offset(b), page_offset(c), len); + c= cm; + b+= rm - r; + r= rm; + } + } + + if (c < cur->rec) + { + if (!data_size) + { +no_data: + mtr->memcpy<mtr_t::FORCED>(*block, b, r, cur->rec - c); + goto rec_done; + } + /* Some header bytes differ. Compare the data separately. */ + byte *bd= insert_buf + extra_size; + const byte *rd= rec; + /* Skip any unchanged prefix of the record payload. */ + for (;; bd++, rd++) + if (bd == insert_buf + rec_size) + goto no_data; + else if (*bd != *rd) + break; + + /* Try to copy any data bytes of the preceding record. */ + const byte * const cd= cur->rec + (rd - rec); + const byte *cdm= cd; + const byte *rdm= rd; + while (*rdm++ == *cdm++) + if (cdm == c_end) + break; + cdm--, rdm--; + ut_ad(rdm - rd + bd <= insert_buf + rec_size); + size_t len= static_cast<size_t>(rdm - rd); + ut_ad(!memcmp(rd, cd, len)); + if (len > 2) + { + mtr->memcpy<mtr_t::FORCED>(*block, b, r, cur->rec - c); + memcpy(bd, cd, len); + mtr->memmove(*block, page_offset(bd), page_offset(cd), len); + c= cdm; + b= rdm - rd + bd; + r= rdm; + } + } + } - mtr->write<2>(*block, PAGE_LAST_INSERT + PAGE_HEADER + block->frame, - page_offset(insert_rec)); + if (size_t len= static_cast<size_t>(insert_buf + rec_size - b)) + mtr->memcpy<mtr_t::FORCED>(*block, b, r, len); +rec_done: + ut_ad(!memcmp(insert_buf, rec - extra_size, rec_size)); - /* 7. It remains to update the owner record. 
*/ - { - rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); - ulint n_owned; - if (page_is_comp(block->frame)) { - n_owned = rec_get_n_owned_new(owner_rec); - page_rec_set_n_owned<false>(block, owner_rec, - n_owned + 1, true, mtr); - } else { - n_owned = rec_get_n_owned_old(owner_rec); - page_rec_set_n_owned<false>(block, owner_rec, - n_owned + 1, false, mtr); - } + /* Restore the record header. */ + memcpy(const_cast<rec_t*>(rec - fixed_hdr), rec_hdr, fixed_hdr); + } + else + { + memcpy(insert_buf, rec - extra_size, extra_size - fixed_hdr); + memcpy(insert_buf + extra_size - fixed_hdr, hdr, fixed_hdr); + memcpy(insert_buf + extra_size, rec, data_size); + } - /* 8. Now we have incremented the n_owned field of the owner - record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, - we have to split the corresponding directory slot in two. */ + /* We have incremented the n_owned field of the owner record. + If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, we have to split the + corresponding directory slot in two. */ - if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { - page_dir_split_slot<false>( - block, - page_dir_find_owner_slot(owner_rec), mtr); - } - } + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) + page_dir_split_slot<false>(block, page_dir_find_owner_slot(next_rec), mtr); - return(insert_rec); + rec_offs_make_valid(insert_buf + extra_size, index, + page_is_leaf(block->frame), offsets); + return insert_buf + extra_size; } /** Add a slot to the dense page directory. 
@@ -1541,8 +1591,8 @@ static inline void page_zip_dir_add_slot(buf_block_t *block, if (const ulint len = ulint(stored - externs)) { memmove(dst, externs, len); - /* TODO: write MEMMOVE record */ - mtr->zmemcpy(block->page, dst - page_zip->data, len); + mtr->memmove(*block, dst - page_zip->data, externs - page_zip->data, + len); } } else @@ -1558,8 +1608,7 @@ static inline void page_zip_dir_add_slot(buf_block_t *block, { byte* dst = stored - PAGE_ZIP_DIR_SLOT_SIZE; memmove(dst, stored, len); - /* TODO: write MEMMOVE record */ - mtr->zmemcpy(block->page, dst - page_zip->data, len); + mtr->memmove(*block, dst - page_zip->data, stored - page_zip->data, len); } } @@ -1584,16 +1633,396 @@ page_cur_insert_rec_zip( offset_t* offsets,/*!< in/out: rec_get_offsets(rec, index) */ mtr_t* mtr) /*!< in/out: mini-transaction */ { + page_zip_des_t * const page_zip= page_cur_get_page_zip(cursor); + ut_ad(page_zip); + ut_ad(rec_offs_validate(rec, index, offsets)); + + ut_ad(index->table->not_redundant()); + ut_ad(page_is_comp(cursor->block->frame)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(fil_page_get_type(cursor->block->frame) == FIL_PAGE_INDEX || + fil_page_get_type(cursor->block->frame) == FIL_PAGE_RTREE); + ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + cursor->block->frame) == + index->id || + index->is_dummy || + mtr->is_inside_ibuf()); + ut_ad(!page_get_instant(cursor->block->frame)); + ut_ad(!page_cur_is_after_last(cursor)); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, cursor->block->frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* 1. Get the size of the physical record in the page */ + const ulint rec_size= rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void *rec_start= rec - rec_offs_extra_size(offsets); + ulint extra_size= rec_offs_extra_size(offsets) - REC_N_NEW_EXTRA_BYTES; + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. 
*/ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + const bool reorg_before_insert= page_has_garbage(cursor->block->frame) && + rec_size > page_get_max_insert_size(cursor->block->frame, 1) && + rec_size <= page_get_max_insert_size_after_reorganize(cursor->block->frame, + 1); + constexpr uint16_t page_free_f= PAGE_FREE + PAGE_HEADER; + byte* const page_free = my_assume_aligned<4>(page_free_f + + cursor->block->frame); + uint16_t free_rec= 0; + + /* 2. Try to find suitable space from page memory management */ + ulint heap_no; + byte *insert_buf; + + if (reorg_before_insert || + !page_zip_available(page_zip, index->is_clust(), rec_size, 1)) + { + /* SET GLOBAL might be executed concurrently. Sample the value once. */ + ulint level= page_zip_level; +#ifdef UNIV_DEBUG + const rec_t * const cursor_rec= page_cur_get_rec(cursor); +#endif /* UNIV_DEBUG */ + + if (page_is_empty(cursor->block->frame)) + { + ut_ad(page_cur_is_before_first(cursor)); + + /* This is an empty page. Recreate to remove the modification log. */ + page_create_zip(cursor->block, index, + page_header_get_field(cursor->block->frame, PAGE_LEVEL), + 0, mtr); + ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE)); + + if (page_zip_available(page_zip, index->is_clust(), rec_size, 1)) + goto use_heap; + + /* The cursor should remain on the page infimum. 
*/ + return nullptr; + } + + if (page_zip->m_nonempty || page_has_garbage(cursor->block->frame)) + { + ulint pos= page_rec_get_n_recs_before(cursor->rec); + + if (!page_zip_reorganize(cursor->block, index, level, mtr, true)) + { + ut_ad(cursor->rec == cursor_rec); + return nullptr; + } + + if (pos) + cursor->rec= page_rec_get_nth(cursor->block->frame, pos); + else + ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->frame)); + + ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE)); + + if (page_zip_available(page_zip, index->is_clust(), rec_size, 1)) + goto use_heap; + } + + /* Try compressing the whole page afterwards. */ + const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NONE); + rec_t *insert_rec= page_cur_insert_rec_low(cursor, index, rec, offsets, + mtr); + mtr->set_log_mode(log_mode); + + if (insert_rec) + { + ulint pos= page_rec_get_n_recs_before(insert_rec); + ut_ad(pos > 0); + + /* We are writing entire page images to the log. Reduce the redo + log volume by reorganizing the page at the same time. */ + if (page_zip_reorganize(cursor->block, index, level, mtr)) + { + /* The page was reorganized: Seek to pos. */ + cursor->rec= pos > 1 + ? page_rec_get_nth(cursor->block->frame, pos - 1) + : cursor->block->frame + PAGE_NEW_INFIMUM; + insert_rec= cursor->block->frame + rec_get_next_offs(cursor->rec, 1); + rec_offs_make_valid(insert_rec, index, + page_is_leaf(cursor->block->frame), offsets); + return insert_rec; + } + + /* Theoretically, we could try one last resort of + page_zip_reorganize() followed by page_zip_available(), but that + would be very unlikely to succeed. (If the full reorganized page + failed to compress, why would it succeed to compress the page, + plus log the insert of this record?) */ + + /* Out of space: restore the page */ + if (!page_zip_decompress(page_zip, cursor->block->frame, false)) + ut_error; /* Memory corrupted? 
*/ + ut_ad(page_validate(cursor->block->frame, index)); + insert_rec= nullptr; + } + return insert_rec; + } + + free_rec= mach_read_from_2(page_free); + if (free_rec) + { + /* Try to allocate from the head of the free list. */ + offset_t foffsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t *heap= nullptr; + + rec_offs_init(foffsets_); + + offset_t *foffsets= rec_get_offsets(cursor->block->frame + free_rec, index, + foffsets_, + page_is_leaf(cursor->block->frame), + ULINT_UNDEFINED, &heap); + insert_buf= cursor->block->frame + free_rec - + rec_offs_extra_size(foffsets); + + if (rec_offs_size(foffsets) < rec_size) + { +too_small: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + free_rec= 0; + goto use_heap; + } + + /* On compressed pages, do not relocate records from + the free list. If extra_size would grow, use the heap. */ + const ssize_t extra_size_diff= lint(rec_offs_extra_size(offsets) - + rec_offs_extra_size(foffsets)); + + if (UNIV_UNLIKELY(extra_size_diff < 0)) + { + /* Add an offset to the extra_size. */ + if (rec_offs_size(foffsets) < rec_size - ssize_t(extra_size_diff)) + goto too_small; + + insert_buf-= extra_size_diff; + } + else if (UNIV_UNLIKELY(extra_size_diff)) + /* Do not allow extra_size to grow */ + goto too_small; + + byte *const free_rec_ptr= cursor->block->frame + free_rec; + heap_no= rec_get_heap_no_new(free_rec_ptr); + int16_t next_rec= mach_read_from_2(free_rec_ptr - REC_NEXT); + /* With innodb_page_size=64k, int16_t would be unsafe to use here, + but that cannot be used with ROW_FORMAT=COMPRESSED. 
*/ + static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility"); + if (next_rec) + { + next_rec+= free_rec; + ut_ad(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} <= next_rec); + ut_ad(static_cast<uint16_t>(next_rec) < srv_page_size); + } + + byte *hdr= my_assume_aligned<4>(&page_zip->data[page_free_f]); + mach_write_to_2(hdr, static_cast<uint16_t>(next_rec)); + const byte *const garbage= my_assume_aligned<2>(page_free + 2); + ut_ad(mach_read_from_2(garbage) >= rec_size); + mach_write_to_2(my_assume_aligned<2>(hdr + 2), + mach_read_from_2(garbage) - rec_size); + static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility"); + mtr->memcpy(*cursor->block, page_free, hdr, 4); + + if (!page_is_leaf(cursor->block->frame)) + { + /* Zero out the node pointer of free_rec, in case it will not be + overwritten by insert_rec. */ + ut_ad(rec_size > REC_NODE_PTR_SIZE); + + if (rec_offs_size(foffsets) > rec_size) + memset(rec_get_end(free_rec_ptr, foffsets) - + REC_NODE_PTR_SIZE, 0, REC_NODE_PTR_SIZE); + } + else if (index->is_clust()) + { + /* Zero out DB_TRX_ID,DB_ROLL_PTR in free_rec, in case they will + not be overwritten by insert_rec. */ + + ulint len; + ulint trx_id_offs= rec_get_nth_field_offs(foffsets, index->db_trx_id(), + &len); + ut_ad(len == DATA_TRX_ID_LEN); + + if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs + + rec_offs_extra_size(foffsets) > rec_size) + memset(free_rec_ptr + trx_id_offs, 0, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ut_ad(free_rec_ptr + trx_id_offs + DATA_TRX_ID_LEN == + rec_get_nth_field(free_rec_ptr, foffsets, index->db_roll_ptr(), + &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + } + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + } + else + { +use_heap: + ut_ad(!free_rec); + insert_buf = page_mem_alloc_heap<true>(cursor->block, rec_size, &heap_no, + mtr); + + if (UNIV_UNLIKELY(!insert_buf)) + return insert_buf; + + page_zip_dir_add_slot(cursor->block, index, mtr); + } + + /* 3. 
Create the record */ + byte *insert_rec= rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, page_is_leaf(cursor->block->frame), + offsets); + + /* 4. Insert the record in the linked list of records */ + ut_ad(cursor->rec != insert_rec); + + /* next record after current before the insertion */ + const rec_t* next_rec = page_rec_get_next_low(cursor->rec, TRUE); + ut_ad(rec_get_status(cursor->rec) <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + + mach_write_to_2(insert_rec - REC_NEXT, static_cast<uint16_t> + (next_rec - insert_rec)); + mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t> + (insert_rec - cursor->rec)); + byte *n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + + cursor->block->frame); + mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs)); + memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs, 2); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + rec_set_bit_field_2(insert_rec, heap_no, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + + /* 6. 
Update the last insertion info in page header */ + byte *last_insert= my_assume_aligned<4>(PAGE_LAST_INSERT + PAGE_HEADER + + page_zip->data); + const uint16_t last_insert_rec= mach_read_from_2(last_insert); + ut_ad(!last_insert_rec || + rec_get_node_ptr_flag(cursor->block->frame + last_insert_rec) == + rec_get_node_ptr_flag(insert_rec)); + mach_write_to_2(last_insert, page_offset(insert_rec)); + + if (!index->is_spatial()) + { + byte *dir= &page_zip->data[PAGE_HEADER + PAGE_DIRECTION_B]; + ut_ad(!(*dir & ~((1U << 3) - 1))); + byte *n= my_assume_aligned<2> + (&page_zip->data[PAGE_HEADER + PAGE_N_DIRECTION]); + if (UNIV_UNLIKELY(!last_insert_rec)) + { +no_direction: + *dir= PAGE_NO_DIRECTION; + memset(n, 0, 2); + } + else if (*dir != PAGE_LEFT && + cursor->block->frame + last_insert_rec == cursor->rec) + { + *dir= PAGE_RIGHT; +inc_dir: + mach_write_to_2(n, mach_read_from_2(n) + 1); + } + else if (*dir != PAGE_RIGHT && page_rec_get_next(insert_rec) == + cursor->block->frame + last_insert_rec) + { + *dir= PAGE_LEFT; + goto inc_dir; + } + else + goto no_direction; + } + + /* Write the header fields in one record. */ + mtr->memcpy(*cursor->block, + my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER + + cursor->block->frame), + my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER + + page_zip->data), + PAGE_N_RECS - PAGE_LAST_INSERT + 2); + + /* 7. It remains to update the owner record. */ + ulint n_owned; + + while (!(n_owned = rec_get_n_owned_new(next_rec))) + next_rec = page_rec_get_next_low(next_rec, true); + + rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1, + REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + + page_zip_dir_insert(cursor, free_rec, insert_rec, mtr); + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. 
*/ + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) + page_dir_split_slot<true>(cursor->block, + page_dir_find_owner_slot(next_rec), mtr); + + page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr); + return insert_rec; +} + +/** Increment PAGE_N_DIRECTION. +@param[in,out] block ROW_FORMAT=COMPRESSED index page +@param[in,out] ptr the PAGE_DIRECTION_B field +@param[in] dir PAGE_RIGHT or PAGE_LEFT */ +static inline void page_direction_increment(buf_block_t *block, byte *ptr, + uint dir) +{ + ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + block->frame); + ut_ad(dir == PAGE_RIGHT || dir == PAGE_LEFT); + block->page.zip.data[PAGE_HEADER + PAGE_DIRECTION_B]= *ptr= dir; + mach_write_to_2(PAGE_HEADER + PAGE_N_DIRECTION + block->frame, + 1U + page_header_get_field(block->frame, PAGE_N_DIRECTION)); + memcpy_aligned<2>(PAGE_HEADER + PAGE_N_DIRECTION + block->frame, + PAGE_HEADER + PAGE_N_DIRECTION + block->page.zip.data, 2); +} + +/***********************************************************//** +Inserts a record next to page cursor on a compressed and uncompressed +page. Returns pointer to inserted record if succeed, i.e., +enough space available, NULL otherwise. +The cursor stays at the same position. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
+ +@return pointer to record if succeed, NULL otherwise */ +static rec_t* +page_cur_parse_insert_rec_zip( + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + offset_t* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ byte* insert_buf; ulint rec_size; page_t* page; /*!< the relevant page */ - rec_t* free_rec; /*!< a free record that was reused, - or NULL */ rec_t* insert_rec; /*!< inserted record */ ulint heap_no; /*!< heap number of the inserted record */ page_zip_des_t* page_zip; + ut_ad(!log_sys.is_physical()); + page_zip = page_cur_get_page_zip(cursor); ut_ad(page_zip); ut_ad(rec_offs_validate(rec, index, offsets)); @@ -1635,6 +2064,9 @@ page_cur_insert_rec_zip( && rec_size > page_get_max_insert_size(page, 1) && rec_size <= page_get_max_insert_size_after_reorganize( page, 1); + constexpr uint16_t page_free_f = PAGE_FREE + PAGE_HEADER; + byte* const page_free = my_assume_aligned<4>(page_free_f + page); + uint16_t free_rec; /* 2. Try to find suitable space from page memory management */ if (!page_zip_available(page_zip, dict_index_is_clust(index), @@ -1646,16 +2078,14 @@ page_cur_insert_rec_zip( rec_t* cursor_rec = page_cur_get_rec(cursor); #endif /* UNIV_DEBUG */ -#if 1 /* MDEV-12353 FIXME: skip this for the physical log format! */ /* If we are not writing compressed page images, we must reorganize the page before attempting the insert. */ - if (recv_recovery_is_on()) { + if (recv_recovery_is_on() && !log_sys.is_physical()) { /* Insert into the uncompressed page only. The page reorganization or creation that we would attempt outside crash recovery would have been covered by a previous redo log record. 
*/ -#endif } else if (page_is_empty(page)) { ut_ad(page_cur_is_before_first(cursor)); @@ -1669,6 +2099,7 @@ page_cur_insert_rec_zip( if (page_zip_available( page_zip, dict_index_is_clust(index), rec_size, 1)) { + free_rec = 0; goto use_heap; } @@ -1700,6 +2131,7 @@ page_cur_insert_rec_zip( rec_size, 1)) { /* After reorganizing, there is space available. */ + free_rec = 0; goto use_heap; } } @@ -1734,14 +2166,12 @@ page_cur_insert_rec_zip( be logged after a successful operation. */ ut_ad(!recv_recovery_is_on()); ut_ad(!index->is_dummy); -#if 1 /* MDEV-12353 FIXME: skip this for the physical log format! */ - } else if (recv_recovery_is_on()) { + } else if (recv_recovery_is_on() && !log_sys.is_physical()) { /* This should be followed by MLOG_ZIP_PAGE_COMPRESS_NO_DATA, which should succeed. */ rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets); -#endif } else { ulint pos = page_rec_get_n_recs_before(insert_rec); ut_ad(pos > 0); @@ -1786,8 +2216,8 @@ page_cur_insert_rec_zip( return(insert_rec); } - free_rec = page_header_get_ptr(page, PAGE_FREE); - if (UNIV_LIKELY_NULL(free_rec)) { + free_rec = mach_read_from_2(page_free); + if (free_rec) { /* Try to allocate from the head of the free list. */ lint extra_size_diff; offset_t foffsets_[REC_OFFS_NORMAL_SIZE]; @@ -1796,8 +2226,8 @@ page_cur_insert_rec_zip( rec_offs_init(foffsets_); - foffsets = rec_get_offsets(free_rec, index, foffsets, - page_rec_is_leaf(free_rec), + foffsets = rec_get_offsets(page + free_rec, index, foffsets, + page_is_leaf(page), ULINT_UNDEFINED, &heap); if (rec_offs_size(foffsets) < rec_size) { too_small: @@ -1805,10 +2235,11 @@ too_small: mem_heap_free(heap); } + free_rec = 0; goto use_heap; } - insert_buf = free_rec - rec_offs_extra_size(foffsets); + insert_buf = page + free_rec - rec_offs_extra_size(foffsets); /* On compressed pages, do not relocate records from the free list. If extra_size would grow, use the heap. 
*/ @@ -1830,16 +2261,27 @@ too_small: goto too_small; } - heap_no = rec_get_heap_no_new(free_rec); - const rec_t* next = rec_get_next_ptr_const(free_rec, true); - mach_write_to_2(PAGE_FREE + PAGE_HEADER + page, - next ? page_offset(next) : 0); - byte* garbage = PAGE_GARBAGE + PAGE_HEADER + page; + heap_no = rec_get_heap_no_new(page + free_rec); + int16_t next_rec = mach_read_from_2(page + free_rec - REC_NEXT); + /* We assume that int16_t is safe to use here. + With innodb_page_size=64k it would be unsafe, + but that cannot be used with ROW_FORMAT=COMPRESSED. */ + static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility"); + if (next_rec) { + next_rec += free_rec; + ut_ad(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} + <= next_rec); + ut_ad(static_cast<uint16_t>(next_rec) < srv_page_size); + } + mtr->write<2>(*cursor->block, page_free, + static_cast<uint16_t>(next_rec)); + byte* garbage = my_assume_aligned<2>(page_free + 2); ut_ad(mach_read_from_2(garbage) >= rec_size); - mach_write_to_2(garbage, mach_read_from_2(garbage) - rec_size); + mtr->write<2>(*cursor->block, garbage, + mach_read_from_2(garbage) - rec_size); compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2); - page_zip_write_header(cursor->block, - PAGE_HEADER + PAGE_FREE + page, 4, mtr); + compile_time_assert(!((PAGE_HEADER + PAGE_FREE) % 4)); + memcpy_aligned<4>(&page_zip->data[page_free_f], page_free, 4); /* TODO: group with PAGE_LAST_INSERT */ if (!page_is_leaf(page)) { @@ -1852,7 +2294,7 @@ too_small: if (rec_offs_extra_size(foffsets) + rec_offs_data_size(foffsets) > rec_size) { - memset(rec_get_end(free_rec, foffsets) + memset(rec_get_end(page + free_rec, foffsets) - REC_NODE_PTR_SIZE, 0, REC_NODE_PTR_SIZE); } @@ -1875,7 +2317,7 @@ too_small: they will not be fully overwritten by insert_rec. 
*/ - memset(free_rec + trx_id_offs, 0, + memset(page + free_rec + trx_id_offs, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); } @@ -1890,7 +2332,7 @@ too_small: } } else { use_heap: - free_rec = NULL; + ut_ad(!free_rec); insert_buf = page_mem_alloc_heap<true>(cursor->block, rec_size, &heap_no, mtr); @@ -1918,9 +2360,10 @@ use_heap: (next_rec - insert_rec)); mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t> (insert_rec - cursor->rec)); - byte* n_recs = PAGE_N_RECS + PAGE_HEADER + page; - mach_write_to_2(n_recs, mach_read_from_2(n_recs) + 1); - page_zip_write_header(cursor->block, n_recs, 2, mtr); + byte* n_recs = my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page); + mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs)); + memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs, + 2); /* 5. Set the n_owned field in the inserted record to zero, and set the heap_no field */ @@ -1935,52 +2378,59 @@ use_heap: page_zip_dir_insert(cursor, free_rec, insert_rec, mtr); /* 6. 
Update the last insertion info in page header */ - byte* last_insert = PAGE_LAST_INSERT + PAGE_HEADER + page; + byte* last_insert = my_assume_aligned<4>(PAGE_LAST_INSERT + PAGE_HEADER + + page); const uint16_t last_insert_rec = mach_read_from_2(last_insert); ut_ad(!last_insert_rec || rec_get_node_ptr_flag(page + last_insert_rec) == rec_get_node_ptr_flag(insert_rec)); - /* TODO: combine with PAGE_DIRECTION changes */ - mach_write_to_2(last_insert, page_offset(insert_rec)); - page_zip_write_header(cursor->block, last_insert, 2, mtr); + /* FIXME: combine with PAGE_DIRECTION changes */ + mtr->write<2>(*cursor->block, last_insert, page_offset(insert_rec)); + memcpy_aligned<4>(&page_zip->data[PAGE_LAST_INSERT + PAGE_HEADER], + last_insert, 2); if (!index->is_spatial()) { byte* ptr = PAGE_HEADER + PAGE_DIRECTION_B + page; if (UNIV_UNLIKELY(!last_insert_rec)) { no_direction: - page_direction_reset<true>(cursor->block, ptr, mtr); + page_zip->data[PAGE_HEADER + PAGE_DIRECTION_B] = *ptr + = PAGE_NO_DIRECTION; + memset_aligned<2>(PAGE_HEADER + PAGE_N_DIRECTION + page, + 0, 2); + memset_aligned<2>(PAGE_HEADER + PAGE_N_DIRECTION + + page_zip->data, 0, 2); } else if (page + last_insert_rec == cursor->rec && page_ptr_get_direction(ptr) != PAGE_LEFT) { - page_direction_increment<true>(cursor->block, ptr, - PAGE_RIGHT, mtr); + page_direction_increment(cursor->block, ptr, + PAGE_RIGHT); } else if (page_ptr_get_direction(ptr) != PAGE_RIGHT && page_rec_get_next(insert_rec) == page + last_insert_rec) { - page_direction_increment<true>(cursor->block, ptr, - PAGE_LEFT, mtr); + page_direction_increment(cursor->block, ptr, + PAGE_LEFT); } else { goto no_direction; } } /* 7. It remains to update the owner record. 
*/ - { - rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); - ulint n_owned; + ulint n_owned; - n_owned = rec_get_n_owned_new(owner_rec); - rec_set_bit_field_1(owner_rec, n_owned + 1, REC_NEW_N_OWNED, - REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + while (!(n_owned = rec_get_n_owned_new(next_rec))) { + next_rec = page_rec_get_next_low(next_rec, true); + } - /* 8. Now we have incremented the n_owned field of the owner - record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, - we have to split the corresponding directory slot in two. */ + rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1, + REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); - if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { - page_dir_split_slot<true>( - page_cur_get_block(cursor), - page_dir_find_owner_slot(owner_rec), mtr); - } + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. 
*/ + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot<true>(page_cur_get_block(cursor), + page_dir_find_owner_slot(next_rec), + mtr); } page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr); @@ -2045,10 +2495,15 @@ page_parse_copy_rec_list_to_created_page( + block->page.zip.data, 0, 2); } - if (!index->is_spatial()) { - page_direction_reset<true>(block, - PAGE_HEADER + PAGE_DIRECTION_B - + block->frame, mtr); + if (index->is_spatial()) { + return rec_end; + } + + block->frame[PAGE_HEADER + PAGE_DIRECTION_B] &= ~((1U << 3) - 1); + block->frame[PAGE_HEADER + PAGE_DIRECTION_B] |= PAGE_NO_DIRECTION; + if (block->page.zip.data) { + block->page.zip.data[PAGE_HEADER + PAGE_DIRECTION_B] + = PAGE_NO_DIRECTION; } return(rec_end); @@ -2364,13 +2819,11 @@ page_cur_delete_rec( ut_ad(page_rec_is_user_rec(current_rec)); if (page_get_n_recs(block->frame) == 1 -#if 1 /* MDEV-12353 TODO: skip this for the physical log format */ /* Empty the page, unless we are applying the redo log during crash recovery. During normal operation, the page_create_empty() gets logged as one of MLOG_PAGE_CREATE, MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */ - && !recv_recovery_is_on() -#endif + && !recv_recovery_is_on() && !log_sys.is_physical() && !rec_is_alter_metadata(current_rec, *index)) { /* Empty the page. 
*/ ut_ad(page_is_leaf(block->frame)); diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc index a1711885bcf..7b7479906cf 100644 --- a/storage/innobase/page/page0page.cc +++ b/storage/innobase/page/page0page.cc @@ -198,17 +198,15 @@ page_set_max_trx_id( mtr_t* mtr) /*!< in/out: mini-transaction, or NULL */ { ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_zip || page_zip == &block->page.zip); static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment"); byte *max_trx_id= my_assume_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->frame); + mtr->write<8>(*block, max_trx_id, trx_id); if (UNIV_LIKELY_NULL(page_zip)) - { - mach_write_to_8(max_trx_id, trx_id); - page_zip_write_header(block, max_trx_id, 8, mtr); - } - else - mtr->write<8>(*block, max_trx_id, trx_id); + memcpy_aligned<8>(&page_zip->data[PAGE_MAX_TRX_ID + PAGE_HEADER], + max_trx_id, 8); } /** Persist the AUTO_INCREMENT value on a clustered index root page. 
@@ -229,17 +227,16 @@ page_set_autoinc( ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - byte *field= PAGE_HEADER + PAGE_ROOT_AUTO_INC + block->frame; + byte *field= my_assume_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + + block->frame); ib_uint64_t old= mach_read_from_8(field); if (old == autoinc || (old > autoinc && !reset)) - /* nothing to update */; - else if (UNIV_LIKELY_NULL(block->page.zip.data)) - { - mach_write_to_8(field, autoinc); - page_zip_write_header(block, field, 8, mtr); - } - else - mtr->write<8>(*block, field, autoinc); + return; /* nothing to update */ + + mtr->write<8>(*block, field, autoinc); + if (UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + block->page.zip.data, + field, 8); } /** The page infimum and supremum of an empty page in ROW_FORMAT=REDUNDANT */ @@ -327,11 +324,11 @@ void page_create_low(const buf_block_t* block, bool comp) @param[in,out] block buffer block @param[in,out] mtr mini-transaction @param[in] comp set unless ROW_FORMAT=REDUNDANT */ -void page_create(buf_block_t* block, mtr_t* mtr, bool comp) +void page_create(buf_block_t *block, mtr_t *mtr, bool comp) { - mtr->page_create(block->page.id, comp); - buf_block_modify_clock_inc(block); - page_create_low(block, comp); + mtr->page_create(*block, comp); + buf_block_modify_clock_inc(block); + page_create_low(block, comp); } /**********************************************************//** @@ -961,14 +958,15 @@ delete_all: buf_block_modify_clock_inc(block); const bool is_leaf = page_is_leaf(block->frame); - byte* last_insert = my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER - + block->frame); + mtr->write<2,mtr_t::OPT>(*block, my_assume_aligned<2> + (PAGE_LAST_INSERT + PAGE_HEADER + + block->frame), 0U); if (UNIV_LIKELY_NULL(page_zip)) { ut_ad(page_is_comp(block->frame)); - memset(last_insert, 0, 2); - page_zip_write_header(block, last_insert, 2, mtr); + memset_aligned<2>(PAGE_LAST_INSERT + 
PAGE_HEADER + + page_zip->data, 0, 2); do { page_cur_t cur; @@ -990,8 +988,6 @@ delete_all: return; } - mtr->write<2,mtr_t::OPT>(*block, last_insert, 0U); - prev_rec = page_rec_get_prev(rec); last_rec = page_rec_get_prev(page_get_supremum_rec(block->frame)); diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index f304616ad9d..c6739f067f4 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -361,6 +361,54 @@ page_zip_dir_get( - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1))); } +/** Write a byte string to a ROW_FORMAT=COMPRESSED page. +@param[in] b ROW_FORMAT=COMPRESSED index page +@param[in] offset byte offset from b.zip.data +@param[in] len length of the data to write */ +inline void mtr_t::zmemcpy(const buf_page_t &b, ulint offset, ulint len) +{ + ut_ad(mach_read_from_2(b.zip.data + FIL_PAGE_TYPE) == FIL_PAGE_INDEX || + mach_read_from_2(b.zip.data + FIL_PAGE_TYPE) == FIL_PAGE_RTREE); + ut_ad(page_zip_simple_validate(&b.zip)); + ut_ad(offset + len <= page_zip_get_size(&b.zip)); + + memcpy_low(b, static_cast<uint16_t>(offset), &b.zip.data[offset], len); + m_last_offset= static_cast<uint16_t>(offset + len); +} + +/** Write a byte string to a ROW_FORMAT=COMPRESSED page. 
+@param[in,out] b ROW_FORMAT=COMPRESSED index page +@param[in] dest destination within b.zip.data +@param[in] str the data to write +@param[in] len length of the data to write +@tparam w write request type */ +template<mtr_t::write_type w> +inline void mtr_t::zmemcpy(const buf_page_t &b, void *dest, const void *str, + ulint len) +{ + byte *d= static_cast<byte*>(dest); + const byte *s= static_cast<const byte*>(str); + ut_ad(d >= b.zip.data + FIL_PAGE_OFFSET); + if (w != FORCED) + { + ut_ad(len); + const byte *const end= d + len; + while (*d++ == *s++) + { + if (d == end) + { + ut_ad(w == OPT); + return; + } + } + s--; + d--; + len= static_cast<ulint>(end - d); + } + ::memcpy(d, s, len); + zmemcpy(b, d - b.zip.data, len); +} + /** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page. @param[in,out] block ROW_FORMAT=COMPRESSED index page @param[in] index the index that the block belongs to @@ -3545,9 +3593,9 @@ page_zip_write_rec_ext( byte* ext_start = ext_end - n_ext * FIELD_REF_SIZE; memmove(ext_start, ext_end, len); - /* TODO: write MEMMOVE record */ - mtr->zmemcpy(block->page, ext_start - - page_zip->data, len); + mtr->memmove(*block, + ext_start - page_zip->data, + ext_end - page_zip->data, len); } } @@ -3783,8 +3831,8 @@ void page_zip_write_rec(buf_block_t *block, const byte *rec, /* Copy the node pointer to the uncompressed area. 
*/ byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1); - mtr->zmemcpy(&block->page, node_ptr - page_zip->data, - rec + len, REC_NODE_PTR_SIZE); + mtr->zmemcpy<mtr_t::OPT>(block->page, node_ptr, + rec + len, REC_NODE_PTR_SIZE); } ut_a(!*data); @@ -3917,8 +3965,8 @@ page_zip_write_blob_ptr( externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE; field += len - BTR_EXTERN_FIELD_REF_SIZE; - mtr->zmemcpy(&block->page, ulint(externs - page_zip->data), - field, BTR_EXTERN_FIELD_REF_SIZE); + mtr->zmemcpy<mtr_t::OPT>(block->page, externs, field, + BTR_EXTERN_FIELD_REF_SIZE); #ifdef UNIV_ZIP_DEBUG ut_a(page_zip_validate(page_zip, page, index)); @@ -4040,8 +4088,7 @@ page_zip_write_node_ptr( #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ compile_time_assert(REC_NODE_PTR_SIZE == 4); mach_write_to_4(field, ptr); - mtr->zmemcpy(&block->page, ulint(storage - page_zip->data), - field, REC_NODE_PTR_SIZE); + mtr->zmemcpy(block->page, storage, field, REC_NODE_PTR_SIZE); } /** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record. 
@@ -4062,9 +4109,6 @@ page_zip_write_trx_id_and_roll_ptr( roll_ptr_t roll_ptr, mtr_t* mtr) { - byte* field; - byte* storage; - ulint len; page_zip_des_t* const page_zip = &block->page.zip; ut_d(const page_t* const page = block->frame); @@ -4084,12 +4128,13 @@ page_zip_write_trx_id_and_roll_ptr( UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; - storage = page_zip_dir_start(page_zip) - - (rec_get_heap_no_new(rec) - 1) - * sys_len; + const ulint heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len; compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); - field = rec_get_nth_field(rec, offsets, trx_id_col, &len); + ulint len; + byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len); ut_ad(len == DATA_TRX_ID_LEN); ut_ad(field + DATA_TRX_ID_LEN == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len)); @@ -4101,8 +4146,47 @@ page_zip_write_trx_id_and_roll_ptr( mach_write_to_6(field, trx_id); compile_time_assert(DATA_ROLL_PTR_LEN == 7); mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr); - mtr->zmemcpy(&block->page, ulint(storage - page_zip->data), - field, sys_len); + len = 0; + if (heap_no > PAGE_HEAP_NO_USER_LOW) { + byte* prev = storage + sys_len; + for (; len < sys_len && prev[len] == field[len]; len++); + if (len > 4) { + /* We save space by replacing a single record + + WRITE,offset(storage),byte[13] + + with up to two records: + + MEMMOVE,offset(storage),len(1 byte),+13(1 byte), + WRITE|0x80,0,byte[13-len] + + The single WRITE record would be x+13 bytes long (x>2). + The MEMMOVE record would be x+1+1 = x+2 bytes, and + the second WRITE would be 1+1+13-len = 15-len bytes. + + The total size is: x+13 versus x+2+15-len = x+17-len. + To save space, we must have len>4. 
*/ + memcpy(storage, prev, len); + mtr->memmove(*block, ulint(storage - page_zip->data), + ulint(storage - page_zip->data) + sys_len, + len); + storage += len; + field += len; + if (UNIV_LIKELY(len < sys_len)) { + goto write; + } + } else { + len = 0; + goto write; + } + } else { +write: + mtr->zmemcpy<mtr_t::OPT>(block->page, storage, field, + sys_len - len); + } +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage - len, field - len, sys_len)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), @@ -4222,9 +4306,8 @@ page_zip_clear_rec( memset(field, 0, REC_NODE_PTR_SIZE); storage -= (heap_no - 1) * REC_NODE_PTR_SIZE; clear_page_zip: - /* TODO: write MEMSET record */ memset(storage, 0, len); - mtr->zmemcpy(block->page, storage - page_zip->data, len); + mtr->memset(*block, storage - page_zip->data, len, 0); } else if (index->is_clust()) { /* Clear trx_id and roll_ptr. On the compressed page, there is an array of these fields immediately before the @@ -4265,33 +4348,24 @@ clear_page_zip: } } -/**********************************************************************//** -Write the "deleted" flag of a record on a compressed page. The flag must -already have been written on the uncompressed page. */ -void -page_zip_rec_set_deleted( -/*=====================*/ - buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */ - const byte* rec, /*!< in: record on the uncompressed page */ - ulint flag, /*!< in: the deleted flag (nonzero=TRUE) */ - mtr_t* mtr) /*!< in,out: mini-transaction */ +/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record. 
+@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in] flag the value of the delete-mark flag +@param[in,out] mtr mini-transaction */ +void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag, + mtr_t *mtr) { - ut_ad(page_align(rec) == block->frame); - page_zip_des_t* const page_zip = &block->page.zip; - byte* slot = page_zip_dir_find(&block->page.zip, page_offset(rec)); - ut_a(slot); - UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); - byte b = *slot; - if (flag) { - b |= (PAGE_ZIP_DIR_SLOT_DEL >> 8); - } else { - b &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); - } - if (b != *slot) { - mtr->zmemcpy(&block->page, slot - page_zip->data, &b, 1); - } + ut_ad(page_align(rec) == block->frame); + byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec)); + byte b= *slot; + if (flag) + b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8); + else + b&= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + mtr->zmemcpy<mtr_t::OPT>(block->page, slot, &b, 1); #ifdef UNIV_ZIP_DEBUG - ut_a(page_zip_validate(page_zip, page_align(rec), NULL)); + ut_a(page_zip_validate(&block->page.zip, block->frame, nullptr)); #endif /* UNIV_ZIP_DEBUG */ } @@ -4306,20 +4380,16 @@ page_zip_rec_set_owned( ulint flag, /*!< in: the owned flag (nonzero=TRUE) */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - ut_ad(page_align(rec) == block->frame); - page_zip_des_t* const page_zip = &block->page.zip; - byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); - ut_a(slot); - UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); - byte b = *slot; - if (flag) { - b |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); - } else { - b &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8); - } - if (b != *slot) { - mtr->zmemcpy(&block->page, slot - page_zip->data, &b, 1); - } + ut_ad(page_align(rec) == block->frame); + page_zip_des_t *const page_zip= &block->page.zip; + byte *slot= page_zip_dir_find(page_zip, page_offset(rec)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + byte 
b= *slot; + if (flag) + b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); + else + b&= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8); + mtr->zmemcpy<mtr_t::OPT>(block->page, slot, &b, 1); } /**********************************************************************//** @@ -4328,8 +4398,8 @@ void page_zip_dir_insert( /*================*/ page_cur_t* cursor, /*!< in/out: page cursor */ - const byte* free_rec,/*!< in: record from which rec was - allocated, or NULL */ + uint16_t free_rec,/*!< in: record from which rec was + allocated, or 0 */ byte* rec, /*!< in: record to insert */ mtr_t* mtr) /*!< in/out: mini-transaction */ { @@ -4371,7 +4441,7 @@ page_zip_dir_insert( n_dense = page_dir_get_n_heap(page_zip->data) - (PAGE_HEAP_NO_USER_LOW + 1U); - if (UNIV_LIKELY_NULL(free_rec)) { + if (UNIV_UNLIKELY(free_rec)) { /* The record was allocated from the free list. Shift the dense directory only up to that slot. Note that in this case, n_dense is actually @@ -4379,8 +4449,8 @@ page_zip_dir_insert( did not increment n_heap. */ ut_ad(rec_get_heap_no_new(rec) < n_dense + 1 + PAGE_HEAP_NO_USER_LOW); - ut_ad(rec >= free_rec); - slot_free = page_zip_dir_find(page_zip, page_offset(free_rec)); + ut_ad(page_offset(rec) >= free_rec); + slot_free = page_zip_dir_find(page_zip, free_rec); ut_ad(slot_free); slot_free += PAGE_ZIP_DIR_SLOT_SIZE; } else { @@ -4394,17 +4464,20 @@ page_zip_dir_insert( - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; } - const ulint slot_len = ulint(slot_rec - slot_free); - /* Shift the dense directory to allocate place for rec. */ - memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free, - slot_len); + if (const ulint slot_len = ulint(slot_rec - slot_free)) { + /* Shift the dense directory to allocate place for rec. */ + memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, + slot_free, slot_len); + mtr->memmove(*cursor->block, (slot_free - page_zip->data) + - PAGE_ZIP_DIR_SLOT_SIZE, + slot_free - page_zip->data, slot_len); + } /* Write the entry for the inserted record. 
The "owned" and "deleted" flags must be zero. */ mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec)); - /* TODO: issue MEMMOVE record to reduce log volume */ - mtr->zmemcpy(cursor->block->page, slot_free - PAGE_ZIP_DIR_SLOT_SIZE - - page_zip->data, PAGE_ZIP_DIR_SLOT_SIZE + slot_len); + mtr->zmemcpy(cursor->block->page, slot_rec - page_zip->data + - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE); } /** Shift the dense page directory and the array of BLOB pointers @@ -4434,12 +4507,13 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec, free ? static_cast<uint16_t>(free - rec) : 0); byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + block->frame); - mach_write_to_2(page_free, page_offset(rec)); + mtr->write<2>(*block, page_free, page_offset(rec)); byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER + block->frame); - mach_write_to_2(garbage, rec_offs_size(offsets) + mach_read_from_2(garbage)); + mtr->write<2>(*block, garbage, rec_offs_size(offsets) + + mach_read_from_2(garbage)); compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2); - page_zip_write_header(block, page_free, 4, mtr); + memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4); byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec)); ut_a(slot_rec); uint16_t n_recs= page_get_n_recs(block->frame); @@ -4448,8 +4522,9 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec, /* This could not be done before page_zip_dir_find(). */ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + block->frame); - mach_write_to_2(page_n_recs, n_recs - 1); - page_zip_write_header(block, page_n_recs, 2, mtr); + mtr->write<2>(*block, page_n_recs, n_recs - 1U); + memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs, + 2); byte *slot_free; @@ -4468,16 +4543,17 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec, const ulint slot_len= slot_rec > slot_free ? 
ulint(slot_rec - slot_free) : 0; if (slot_len) - /* MDEV-12353 TODO: issue MEMMOVE record */ + { memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free, slot_len); + mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE, + slot_free - page_zip->data, slot_len); + } /* Write the entry for the deleted record. The "owned" and "deleted" flags will be cleared. */ mach_write_to_2(slot_free, page_offset(rec)); - - mtr->zmemcpy(block->page, slot_free - page_zip->data, - slot_len + PAGE_ZIP_DIR_SLOT_SIZE); + mtr->zmemcpy(block->page, slot_free - page_zip->data, 2); if (const ulint n_ext= rec_offs_n_extern(offsets)) { @@ -4491,18 +4567,18 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec, byte *externs= page_zip->data + page_zip_get_size(page_zip) - (page_dir_get_n_heap(block->frame) - PAGE_HEAP_NO_USER_LOW) * PAGE_ZIP_CLUST_LEAF_SLOT_SIZE; - byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE; /* Shift and zero fill the array. */ - memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, - ulint(page_zip->n_blobs - n_ext - blob_no) * - BTR_EXTERN_FIELD_REF_SIZE); + if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) * + BTR_EXTERN_FIELD_REF_SIZE) + { + memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len); + mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE, + ext_end - page_zip->data, ext_len); + } memset(ext_end, 0, n_ext * FIELD_REF_SIZE); - /* TODO: use MEMMOVE and MEMSET records to reduce volume */ - const ulint ext_len= ulint(page_zip->n_blobs - blob_no) * FIELD_REF_SIZE; - - mtr->zmemcpy(block->page, ext_end - page_zip->data, ext_len); + mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0); page_zip->n_blobs -= static_cast<unsigned>(n_ext); } diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc index 74c0b51fbbf..701f11992e1 100644 --- a/storage/innobase/row/row0uins.cc +++ b/storage/innobase/row/row0uins.cc @@ -1,7 +1,7 @@ 
/***************************************************************************** Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -206,32 +206,7 @@ func_exit: if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) { /* When rolling back the very first instant ADD COLUMN operation, reset the root page to the basic state. */ - ut_ad(!index->table->is_temporary()); - if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, - &mtr)) { - byte* page_type = root->frame + FIL_PAGE_TYPE; - ut_ad(mach_read_from_2(page_type) - == FIL_PAGE_TYPE_INSTANT - || mach_read_from_2(page_type) - == FIL_PAGE_INDEX); - mtr.write<2,mtr_t::OPT>(*root, page_type, - FIL_PAGE_INDEX); - byte* instant = PAGE_INSTANT + PAGE_HEADER - + root->frame; - mtr.write<2,mtr_t::OPT>( - *root, instant, - page_ptr_get_direction(instant + 1)); - rec_t* infimum = page_get_infimum_rec(root->frame); - rec_t* supremum = page_get_supremum_rec(root->frame); - static const byte str[8 + 8] = "supremuminfimum"; - if (memcmp(infimum, str + 8, 8) - || memcmp(supremum, str, 8)) { - mtr.memcpy(root, page_offset(infimum), - str + 8, 8); - mtr.memcpy(root, page_offset(supremum), - str, 8); - } - } + btr_reset_instant(*index, true, &mtr); } btr_pcur_commit_specify_mtr(&node->pcur, &mtr); diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index 5e5da78503a..f2e2e4e70d9 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -148,37 +148,12 @@ row_undo_mod_clust_low( ut_a(!dummy_big_rec); - static const byte - INFIMUM[8] = {'i','n','f','i','m','u','m',0}, - SUPREMUM[8] = {'s','u','p','r','e','m','u','m'}; - if (err == DB_SUCCESS && node->ref == &trx_undo_metadata && 
btr_cur_get_index(btr_cur)->table->instant && node->update->info_bits == REC_INFO_METADATA_ADD) { - if (buf_block_t* root = btr_root_block_get( - btr_cur_get_index(btr_cur), RW_SX_LATCH, - mtr)) { - uint16_t infimum, supremum; - if (page_is_comp(root->frame)) { - infimum = PAGE_NEW_INFIMUM; - supremum = PAGE_NEW_SUPREMUM; - } else { - infimum = PAGE_OLD_INFIMUM; - supremum = PAGE_OLD_SUPREMUM; - } - - ut_ad(!memcmp(root->frame + infimum, - INFIMUM, 8) - == !memcmp(root->frame + supremum, - SUPREMUM, 8)); - - if (memcmp(root->frame + infimum, INFIMUM, 8)) { - mtr->memcpy(root, infimum, INFIMUM, 8); - mtr->memcpy(root, supremum, SUPREMUM, - 8); - } - } + btr_reset_instant(*btr_cur_get_index(btr_cur), false, + mtr); } } diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 6f2abf96b69..d8378d271ec 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1083,7 +1083,7 @@ srv_prepare_to_delete_redo_log_files( ib::info info; if (srv_log_file_size == 0 || (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED) - != log_t::FORMAT_10_4) { + != log_t::FORMAT_10_5) { info << "Upgrading redo log: "; } else if (n_files != srv_n_log_files || srv_log_file_size @@ -1829,8 +1829,8 @@ files_checked: && srv_n_log_files_found == srv_n_log_files && log_sys.log.format == (srv_encrypt_log - ? log_t::FORMAT_ENC_10_4 - : log_t::FORMAT_10_4) + ? log_t::FORMAT_ENC_10_5 + : log_t::FORMAT_10_5) && log_sys.log.subformat == 2) { /* No need to add or remove encryption, upgrade, downgrade, or resize. */ diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index 546a4c51e03..0a8de8b7fa9 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ b/storage/innobase/trx/trx0rseg.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. 
+Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -72,8 +72,8 @@ trx_rseg_write_wsrep_checkpoint( const ulint xid_length = static_cast<ulint>(xid->gtrid_length + xid->bqual_length); - mtr->memcpy(rseg_header, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA, - xid->data, xid_length); + mtr->memcpy(*rseg_header, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + + rseg_header->frame, xid->data, xid_length); if (UNIV_LIKELY(xid_length < XIDDATASIZE)) { mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length, @@ -738,9 +738,9 @@ void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx, + rseg_header->frame, trx->mysql_log_offset); - if (memcmp(trx->mysql_log_file_name, TRX_RSEG + TRX_RSEG_BINLOG_NAME - + rseg_header->frame, len)) { - mtr->memcpy(rseg_header, TRX_RSEG + TRX_RSEG_BINLOG_NAME, - trx->mysql_log_file_name, len); + void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->frame; + + if (memcmp(trx->mysql_log_file_name, name, len)) { + mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len); } } diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index d6575ba4d49..985cbeba2f9 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -390,12 +390,11 @@ static void trx_undo_page_init(const buf_block_t *undo_block, mtr_t *mtr) compile_time_assert(TRX_UNDO_PAGE_START == 2); compile_time_assert(TRX_UNDO_PAGE_NODE == TRX_UNDO_PAGE_FREE + 2); - /* MDEV-12353 FIXME: write minimal number of bytes in the new encoding */ - mtr->write<4>(*undo_block, TRX_UNDO_PAGE_HDR + undo_block->frame, - TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); - mtr->write<2>(*undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + - undo_block->frame, - TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + alignas(4) byte hdr[6]; + mach_write_to_4(hdr, TRX_UNDO_PAGE_HDR + 
TRX_UNDO_PAGE_HDR_SIZE); + memcpy_aligned<2>(hdr + 4, hdr + 2, 2); + static_assert(TRX_UNDO_PAGE_FREE == 4, "compatibility"); + mtr->memcpy(*undo_block, undo_block->frame + TRX_UNDO_PAGE_HDR, hdr, 6); } /** Look for a free slot for an undo log segment. @@ -501,41 +500,63 @@ trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id, static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id, mtr_t* mtr) { - const uint16_t free= mach_read_from_2(TRX_UNDO_PAGE_HDR + - TRX_UNDO_PAGE_FREE + undo_page->frame); - const uint16_t new_free= free + TRX_UNDO_LOG_OLD_HDR_SIZE; - + /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being + repurposed after upgrading to MariaDB 10.3. */ + byte *undo_type= my_assume_aligned<2> + (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->frame); + ut_ad(mach_read_from_2(undo_type) <= TRX_UNDO_UPDATE); + mtr->write<2,mtr_t::OPT>(*undo_page, undo_type, 0U); + byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + + undo_page->frame); + const uint16_t free= mach_read_from_2(start + 2); + static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE, + "compatibility"); ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100); - mtr->write<2>(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + - undo_page->frame, new_free); - /* MDEV-12353 TODO: use MEMMOVE record */ - mtr->write<2>(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + - undo_page->frame, new_free); - mtr->write<2>(*undo_page, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + - undo_page->frame, TRX_UNDO_ACTIVE); - - mtr->write<2,mtr_t::OPT>(*undo_page, free + TRX_UNDO_NEEDS_PURGE + - undo_page->frame, 1U); - mtr->write<8>(*undo_page, free + TRX_UNDO_TRX_ID + undo_page->frame, trx_id); - mtr->write<2,mtr_t::OPT>(*undo_page, free + TRX_UNDO_LOG_START + - undo_page->frame, new_free); - mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS, - TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0); - - if (uint16_t prev_log= 
mach_read_from_2(TRX_UNDO_SEG_HDR + - TRX_UNDO_LAST_LOG + - undo_page->frame)) - { + mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE); + /* A WRITE of 2 bytes is never longer than a MEMMOVE. + So, WRITE 2+2 bytes is better than WRITE+MEMMOVE. + But, a MEMSET will only be 1+2 bytes, that is, 1 byte shorter! */ + memcpy_aligned<2>(start + 2, start, 2); + mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4, + start, 2); + uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + + undo_page->frame); + alignas(4) byte buf[4]; + mach_write_to_2(buf, TRX_UNDO_ACTIVE); + mach_write_to_2(buf + 2, free); + static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility"); + static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment"); + mtr->memcpy(*undo_page, my_assume_aligned<4> + (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->frame), + buf, 4); + if (prev_log) mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG + undo_page->frame, free); - mtr->write<2>(*undo_page, free + TRX_UNDO_PREV_LOG + undo_page->frame, - prev_log); + mtr->write<8>(*undo_page, free + TRX_UNDO_TRX_ID + undo_page->frame, trx_id); + /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */ + mach_write_to_2(buf, 1); + memcpy_aligned<2>(buf + 2, start, 2); + static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START, + "compatibility"); + mtr->memcpy(*undo_page, free + TRX_UNDO_NEEDS_PURGE + undo_page->frame, + buf, 4); + /* Initialize all fields TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. 
*/ + if (prev_log) + { + mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS, + TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0); + mtr->write<2,mtr_t::OPT>(*undo_page, free + TRX_UNDO_PREV_LOG + + undo_page->frame, prev_log); + static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE, + "compatibility"); + mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0); + static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE + + FLST_NODE_SIZE, "compatibility"); } - - mtr->write<2>(*undo_page, TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + - undo_page->frame, free); - + else + mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS, + TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0); return free; } @@ -563,7 +584,8 @@ static void trx_undo_write_xid(buf_block_t *block, uint16_t offset, static_cast<uint32_t>(xid.bqual_length)); const ulint xid_length= static_cast<ulint>(xid.gtrid_length + xid.bqual_length); - mtr->memcpy(block, offset + TRX_UNDO_XA_XID, xid.data, xid_length); + mtr->memcpy(*block, &block->frame[offset + TRX_UNDO_XA_XID], + xid.data, xid_length); if (UNIV_LIKELY(xid_length < XIDDATASIZE)) mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length, XIDDATASIZE - xid_length, 0); @@ -587,29 +609,6 @@ trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid) memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE); } -/** Add space for the XA XID after an undo log old-style header. 
-@param[in,out] block undo page -@param[in] offset offset of the undo log header -@param[in,out] mtr mini-transaction */ -static void trx_undo_header_add_space_for_xid(buf_block_t *block, ulint offset, - mtr_t *mtr) -{ - uint16_t free= mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + - block->frame); - /* free is now the end offset of the old style undo log header */ - ut_a(free == offset + TRX_UNDO_LOG_OLD_HDR_SIZE); - free += TRX_UNDO_LOG_XA_HDR_SIZE - TRX_UNDO_LOG_OLD_HDR_SIZE; - /* Add space for a XID after the header, update the free offset - fields on the undo log page and in the undo log header */ - - mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block->frame, - free); - /* MDEV-12353 TODO: use MEMMOVE record */ - mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block->frame, - free); - mtr->write<2>(*block, offset + TRX_UNDO_LOG_START + block->frame, free); -} - /** Parse the redo log entry of an undo log page header create. @param[in] ptr redo log record @param[in] end_ptr end of log buffer @@ -1133,8 +1132,6 @@ trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, uint16_t offset = trx_undo_header_create(block, trx->id, mtr); - trx_undo_header_add_space_for_xid(block, offset, mtr); - *undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid, block->page.id.page_no(), offset); if (*undo == NULL) { @@ -1204,17 +1201,6 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo, *pundo = undo; uint16_t offset = trx_undo_header_create(block, trx->id, mtr); - /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being - repurposed after upgrading to MariaDB 10.3. 
*/ - if (ut_d(ulint type =) UNIV_UNLIKELY( - mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE - + block->frame))) { - ut_ad(type == TRX_UNDO_INSERT || type == TRX_UNDO_UPDATE); - mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE - + block->frame, 0U); - } - - trx_undo_header_add_space_for_xid(block, offset, mtr); trx_undo_mem_init_for_reuse(undo, trx->id, trx->xid, offset); |