summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--extra/mariabackup/xtrabackup.cc19
-rw-r--r--mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result2
-rw-r--r--mysql-test/suite/innodb/r/log_alter_table.result21
-rw-r--r--mysql-test/suite/innodb/r/log_corruption.result2
-rw-r--r--mysql-test/suite/innodb/r/log_file_name_debug.result2
-rw-r--r--mysql-test/suite/innodb/t/log_alter_table.opt1
-rw-r--r--mysql-test/suite/innodb/t/log_alter_table.test46
-rw-r--r--mysql-test/suite/innodb/t/log_corruption.test4
-rw-r--r--mysql-test/suite/innodb/t/log_file_name_debug.test2
-rw-r--r--storage/innobase/btr/btr0btr.cc204
-rw-r--r--storage/innobase/btr/btr0bulk.cc218
-rw-r--r--storage/innobase/btr/btr0cur.cc174
-rw-r--r--storage/innobase/buf/buf0buf.cc21
-rw-r--r--storage/innobase/fil/fil0crypt.cc6
-rw-r--r--storage/innobase/fil/fil0fil.cc188
-rw-r--r--storage/innobase/fsp/fsp0fsp.cc69
-rw-r--r--storage/innobase/fut/fut0lst.cc138
-rw-r--r--storage/innobase/gis/gis0rtree.cc17
-rw-r--r--storage/innobase/handler/ha_innodb.cc2
-rw-r--r--storage/innobase/handler/handler0alter.cc15
-rw-r--r--storage/innobase/include/btr0btr.h31
-rw-r--r--storage/innobase/include/btr0btr.ic41
-rw-r--r--storage/innobase/include/btr0bulk.h5
-rw-r--r--storage/innobase/include/dyn0buf.h3
-rw-r--r--storage/innobase/include/fil0fil.h37
-rw-r--r--storage/innobase/include/fsp0fsp.h2
-rw-r--r--storage/innobase/include/fut0lst.h43
-rw-r--r--storage/innobase/include/log0log.h15
-rw-r--r--storage/innobase/include/log0recv.h73
-rw-r--r--storage/innobase/include/mtr0log.h498
-rw-r--r--storage/innobase/include/mtr0mtr.h181
-rw-r--r--storage/innobase/include/mtr0mtr.ic2
-rw-r--r--storage/innobase/include/mtr0types.h232
-rw-r--r--storage/innobase/include/page0page.h13
-rw-r--r--storage/innobase/include/page0page.ic57
-rw-r--r--storage/innobase/include/page0zip.h36
-rw-r--r--storage/innobase/include/page0zip.ic26
-rw-r--r--storage/innobase/log/log0log.cc30
-rw-r--r--storage/innobase/log/log0recv.cc1160
-rw-r--r--storage/innobase/mtr/mtr0log.cc141
-rw-r--r--storage/innobase/mtr/mtr0mtr.cc70
-rw-r--r--storage/innobase/page/page0cur.cc1125
-rw-r--r--storage/innobase/page/page0page.cc46
-rw-r--r--storage/innobase/page/page0zip.cc252
-rw-r--r--storage/innobase/row/row0uins.cc29
-rw-r--r--storage/innobase/row/row0umod.cc29
-rw-r--r--storage/innobase/srv/srv0start.cc6
-rw-r--r--storage/innobase/trx/trx0rseg.cc14
-rw-r--r--storage/innobase/trx/trx0undo.cc132
49 files changed, 3660 insertions, 1820 deletions
diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc
index 03533896d0b..8bce6567233 100644
--- a/extra/mariabackup/xtrabackup.cc
+++ b/extra/mariabackup/xtrabackup.cc
@@ -590,26 +590,25 @@ std::string filename_to_spacename(const byte *filename, size_t len)
/** Report an operation to create, delete, or rename a file during backup.
@param[in] space_id tablespace identifier
-@param[in] flags tablespace flags (NULL if not create)
+@param[in] create whether the file is being created
@param[in] name file name (not NUL-terminated)
@param[in] len length of name, in bytes
@param[in] new_name new file name (NULL if not rename)
@param[in] new_len length of new_name, in bytes (0 if NULL) */
-static void backup_file_op(ulint space_id, const byte* flags,
+static void backup_file_op(ulint space_id, bool create,
const byte* name, ulint len,
const byte* new_name, ulint new_len)
{
- ut_ad(!flags || !new_name);
+ ut_ad(!create || !new_name);
ut_ad(name);
ut_ad(len);
ut_ad(!new_name == !new_len);
pthread_mutex_lock(&backup_mutex);
- if (flags) {
+ if (create) {
ddl_tracker.id_to_name[space_id] = filename_to_spacename(name, len);
- msg("DDL tracking : create %zu \"%.*s\": %x",
- space_id, int(len), name, mach_read_from_4(flags));
+ msg("DDL tracking : create %zu \"%.*s\"", space_id, int(len), name);
}
else if (new_name) {
ddl_tracker.id_to_name[space_id] = filename_to_spacename(new_name, new_len);
@@ -632,14 +631,14 @@ static void backup_file_op(ulint space_id, const byte* flags,
We will abort backup in this case.
*/
-static void backup_file_op_fail(ulint space_id, const byte* flags,
+static void backup_file_op_fail(ulint space_id, bool create,
const byte* name, ulint len,
const byte* new_name, ulint new_len)
{
bool fail;
- if (flags) {
- msg("DDL tracking : create %zu \"%.*s\": %x",
- space_id, int(len), name, mach_read_from_4(flags));
+ if (create) {
+ msg("DDL tracking : create %zu \"%.*s\"",
+ space_id, int(len), name);
std::string spacename = filename_to_spacename(name, len);
fail = !check_if_skip_table(spacename.c_str());
}
diff --git a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result
index 0d32ce422a9..55c1bd718ef 100644
--- a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result
+++ b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result
@@ -136,7 +136,7 @@ WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
COUNT(*)
1
-FOUND 1 /InnoDB: .* started; log sequence number 121397[09]/ in mysqld.1.err
+FOUND 1 /InnoDB: .* started; log sequence number 12139[78]\d; transaction id 0/ in mysqld.1.err
# Empty 10.2 redo log
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
diff --git a/mysql-test/suite/innodb/r/log_alter_table.result b/mysql-test/suite/innodb/r/log_alter_table.result
deleted file mode 100644
index ae021b82e37..00000000000
--- a/mysql-test/suite/innodb/r/log_alter_table.result
+++ /dev/null
@@ -1,21 +0,0 @@
-# restart
-#
-# Bug#21801423 INNODB REDO LOG DOES NOT INDICATE WHEN
-# FILES ARE CREATED
-#
-# Bug#21796691 INNODB REDO LOG DOES NOT INDICATE WHEN
-# REDO LOGGING IS SKIPPED
-#
-CREATE TABLE t1 (a INT NOT NULL, b INT UNIQUE) ENGINE=InnoDB;
-INSERT INTO t1 VALUES (1,2);
-ALTER TABLE t1 ADD PRIMARY KEY(a), LOCK=SHARED, ALGORITHM=INPLACE;
-ALTER TABLE t1 DROP INDEX b, ADD INDEX (b), LOCK=SHARED;
-# Kill the server
-# restart: --debug=d,ib_log
-FOUND 2 /scan \d+: multi-log rec MLOG_FILE_CREATE2 len \d+ page \d+:0/ in mysqld.1.err
-NOT FOUND /scan \d+: log rec MLOG_INDEX_LOAD/ in mysqld.1.err
-CHECK TABLE t1;
-Table Op Msg_type Msg_text
-test.t1 check status OK
-# restart
-DROP TABLE t1;
diff --git a/mysql-test/suite/innodb/r/log_corruption.result b/mysql-test/suite/innodb/r/log_corruption.result
index 67a03d53e40..ab33ea1b152 100644
--- a/mysql-test/suite/innodb/r/log_corruption.result
+++ b/mysql-test/suite/innodb/r/log_corruption.result
@@ -136,7 +136,7 @@ WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
COUNT(*)
1
-FOUND 1 /InnoDB: .* started; log sequence number 121397[09]/ in mysqld.1.err
+FOUND 1 /InnoDB: .* started; log sequence number 12139[78]\d; transaction id 0/ in mysqld.1.err
# Empty 10.2 redo log
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
diff --git a/mysql-test/suite/innodb/r/log_file_name_debug.result b/mysql-test/suite/innodb/r/log_file_name_debug.result
index 1fce4ecb674..4deef6f2785 100644
--- a/mysql-test/suite/innodb/r/log_file_name_debug.result
+++ b/mysql-test/suite/innodb/r/log_file_name_debug.result
@@ -12,7 +12,7 @@ FOUND 1 /InnoDB: Tablespace 4294967280 was not found at .*, but there were no mo
# restart: --debug=d,innodb_log_abort_3,ib_log --innodb-log-files-in-group=2 --innodb-log-file-size=4M
SELECT * FROM t1;
ERROR 42000: Unknown storage engine 'InnoDB'
-FOUND 1 /srv_prepare_to_delete_redo_log_files: ib_log: MLOG_CHECKPOINT.* written/ in mysqld.1.err
+FOUND 1 /srv_prepare_to_delete_redo_log_files: ib_log: FILE_CHECKPOINT.* written/ in mysqld.1.err
# restart
# restart
DROP TABLE t1;
diff --git a/mysql-test/suite/innodb/t/log_alter_table.opt b/mysql-test/suite/innodb/t/log_alter_table.opt
deleted file mode 100644
index ef236fcec40..00000000000
--- a/mysql-test/suite/innodb/t/log_alter_table.opt
+++ /dev/null
@@ -1 +0,0 @@
---innodb-log-optimize-ddl
diff --git a/mysql-test/suite/innodb/t/log_alter_table.test b/mysql-test/suite/innodb/t/log_alter_table.test
deleted file mode 100644
index b0669c64f77..00000000000
--- a/mysql-test/suite/innodb/t/log_alter_table.test
+++ /dev/null
@@ -1,46 +0,0 @@
---source include/have_innodb.inc
---source include/have_debug.inc
-
-# Embedded server does not support crashing
---source include/not_embedded.inc
-
-# start afresh
---source include/restart_mysqld.inc
-
---echo #
---echo # Bug#21801423 INNODB REDO LOG DOES NOT INDICATE WHEN
---echo # FILES ARE CREATED
---echo #
---echo # Bug#21796691 INNODB REDO LOG DOES NOT INDICATE WHEN
---echo # REDO LOGGING IS SKIPPED
---echo #
---source include/no_checkpoint_start.inc
-CREATE TABLE t1 (a INT NOT NULL, b INT UNIQUE) ENGINE=InnoDB;
-# MLOG_INDEX_LOAD will not be emitted for empty tables. Insert a row.
-INSERT INTO t1 VALUES (1,2);
-# We should get two MLOG_INDEX_LOAD for this.
-ALTER TABLE t1 ADD PRIMARY KEY(a), LOCK=SHARED, ALGORITHM=INPLACE;
-# And one MLOG_INDEX_LOAD for this.
-ALTER TABLE t1 DROP INDEX b, ADD INDEX (b), LOCK=SHARED;
-
---let CLEANUP_IF_CHECKPOINT=DROP TABLE t1;
---source include/no_checkpoint_end.inc
-
---let $restart_parameters= --debug=d,ib_log
---source include/start_mysqld.inc
-
-let SEARCH_FILE = $MYSQLTEST_VARDIR/log/mysqld.1.err;
-# ensure that we have exactly 2 records there.
-let SEARCH_PATTERN=scan \d+: multi-log rec MLOG_FILE_CREATE2 len \d+ page \d+:0;
---source include/search_pattern_in_file.inc
-# ensure that we have 0 records there.
-let SEARCH_PATTERN=scan \d+: log rec MLOG_INDEX_LOAD;
---source include/search_pattern_in_file.inc
-
-CHECK TABLE t1;
-
-# Remove the --debug=d,ib_log setting.
---let $restart_parameters=
---source include/restart_mysqld.inc
-
-DROP TABLE t1;
diff --git a/mysql-test/suite/innodb/t/log_corruption.test b/mysql-test/suite/innodb/t/log_corruption.test
index 46318fb37d2..6c2ef5db0bb 100644
--- a/mysql-test/suite/innodb/t/log_corruption.test
+++ b/mysql-test/suite/innodb/t/log_corruption.test
@@ -424,8 +424,8 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED');
# In encryption.innodb_encrypt_log_corruption, we would convert the
# log to encrypted format. Writing an extra log checkpoint before the
# redo log conversion would advance the LSN by the size of a
-# MLOG_CHECKPOINT record (9 bytes).
---let SEARCH_PATTERN= InnoDB: .* started; log sequence number 121397[09]
+# FILE_CHECKPOINT record (12 bytes).
+--let SEARCH_PATTERN= InnoDB: .* started; log sequence number 12139[78]\d; transaction id 0
--source include/search_pattern_in_file.inc
--echo # Empty 10.2 redo log
diff --git a/mysql-test/suite/innodb/t/log_file_name_debug.test b/mysql-test/suite/innodb/t/log_file_name_debug.test
index d85fbf08194..fac1a72fe45 100644
--- a/mysql-test/suite/innodb/t/log_file_name_debug.test
+++ b/mysql-test/suite/innodb/t/log_file_name_debug.test
@@ -39,7 +39,7 @@ SELECT * FROM t1;
--source include/restart_mysqld.inc
--error ER_UNKNOWN_STORAGE_ENGINE
SELECT * FROM t1;
---let SEARCH_PATTERN= srv_prepare_to_delete_redo_log_files: ib_log: MLOG_CHECKPOINT.* written
+--let SEARCH_PATTERN= srv_prepare_to_delete_redo_log_files: ib_log: FILE_CHECKPOINT.* written
--source include/search_pattern_in_file.inc
--let $restart_parameters=
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index e4b2b05734b..0c7e3e38d78 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -438,32 +438,33 @@ btr_page_create(
ulint level, /*!< in: the B-tree level of the page */
mtr_t* mtr) /*!< in: mtr */
{
- ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
- byte *index_id= &block->frame[PAGE_HEADER + PAGE_INDEX_ID];
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID +
+ block->frame);
- if (UNIV_LIKELY_NULL(page_zip)) {
- page_create_zip(block, index, level, 0, mtr);
- mach_write_to_8(index_id, index->id);
- page_zip_write_header(block, index_id, 8, mtr);
- } else {
- page_create(block, mtr, dict_table_is_comp(index->table));
- if (index->is_spatial()) {
- static_assert(((FIL_PAGE_INDEX & 0xff00)
- | byte(FIL_PAGE_RTREE))
- == FIL_PAGE_RTREE, "compatibility");
- mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
- byte(FIL_PAGE_RTREE));
- if (mach_read_from_8(block->frame
- + FIL_RTREE_SPLIT_SEQ_NUM)) {
- mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
- 8, 0);
- }
- }
- /* Set the level of the new index page */
- mtr->write<2,mtr_t::OPT>(*block, PAGE_HEADER + PAGE_LEVEL
- + block->frame, level);
- mtr->write<8,mtr_t::OPT>(*block, index_id, index->id);
- }
+ if (UNIV_LIKELY_NULL(page_zip))
+ {
+ mach_write_to_8(index_id, index->id);
+ page_create_zip(block, index, level, 0, mtr);
+ }
+ else
+ {
+ page_create(block, mtr, dict_table_is_comp(index->table));
+ if (index->is_spatial())
+ {
+ static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+ FIL_PAGE_RTREE, "compatibility");
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ byte(FIL_PAGE_RTREE));
+ if (mach_read_from_8(block->frame + FIL_RTREE_SPLIT_SEQ_NUM))
+ mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
+ }
+ /* Set the level of the new index page */
+ mtr->write<2,mtr_t::OPT>(*block,
+ my_assume_aligned<2>(PAGE_HEADER + PAGE_LEVEL +
+ block->frame), level);
+ mtr->write<8,mtr_t::OPT>(*block, index_id, index->id);
+ }
}
/**************************************************************//**
@@ -984,14 +985,12 @@ static void btr_free_root(buf_block_t *block, mtr_t *mtr, bool invalidate)
#endif /* UNIV_BTR_DEBUG */
if (invalidate)
{
- byte *page_index_id= PAGE_HEADER + PAGE_INDEX_ID + block->frame;
- if (UNIV_LIKELY_NULL(block->page.zip.data))
- {
- mach_write_to_8(page_index_id, BTR_FREED_INDEX_ID);
- page_zip_write_header(block, page_index_id, 8, mtr);
- }
- else
- mtr->write<8,mtr_t::OPT>(*block, page_index_id, BTR_FREED_INDEX_ID);
+ constexpr uint16_t field= PAGE_HEADER + PAGE_INDEX_ID;
+
+ byte *page_index_id= my_assume_aligned<2>(field + block->frame);
+ if (mtr->write<8,mtr_t::OPT>(*block, page_index_id, BTR_FREED_INDEX_ID) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<2>(&block->page.zip.data[field], page_index_id, 8);
}
/* Free the entire segment in small steps. */
@@ -1120,16 +1119,17 @@ btr_create(
buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
}
- byte* page_index_id = PAGE_HEADER + PAGE_INDEX_ID + block->frame;
+ ut_ad(!page_has_siblings(block->frame));
+
+ constexpr uint16_t field = PAGE_HEADER + PAGE_INDEX_ID;
+
+ byte* page_index_id = my_assume_aligned<2>(field + block->frame);
/* Create a new index page on the allocated segment page */
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
- page_create_zip(block, index, 0, 0, mtr);
mach_write_to_8(page_index_id, index_id);
- page_zip_write_header(block, page_index_id, 8, mtr);
- static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
- memset_aligned<8>(FIL_PAGE_PREV + block->page.zip.data,
- 0xff, 8);
+ ut_ad(!page_has_siblings(block->page.zip.data));
+ page_create_zip(block, index, 0, 0, mtr);
} else {
page_create(block, mtr, index->table->not_redundant());
if (index->is_spatial()) {
@@ -1150,11 +1150,6 @@ btr_create(
mtr->write<8,mtr_t::OPT>(*block, page_index_id, index_id);
}
- /* Set the next node and previous node fields */
- compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
- compile_time_assert(FIL_NULL == 0xffffffff);
- mtr->memset(block, FIL_PAGE_PREV, 8, 0xff);
-
/* We reset the free bits for the page in a separate
mini-transaction to allow creation of several trees in the
same mtr, otherwise the latch on a bitmap page would prevent
@@ -1781,6 +1776,49 @@ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
}
}
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE
+@param[in] all whether to reset FIL_PAGE_TYPE as well
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr)
+{
+ ut_ad(!index.table->is_temporary());
+ ut_ad(index.is_primary());
+ if (buf_block_t *root = btr_root_block_get(&index, RW_SX_LATCH, mtr))
+ {
+ byte *page_type= root->frame + FIL_PAGE_TYPE;
+ if (all)
+ {
+ ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT ||
+ mach_read_from_2(page_type) == FIL_PAGE_INDEX);
+ mtr->write<2,mtr_t::OPT>(*root, page_type, FIL_PAGE_INDEX);
+ byte *instant= PAGE_INSTANT + PAGE_HEADER + root->frame;
+ mtr->write<2,mtr_t::OPT>(*root, instant,
+ page_ptr_get_direction(instant + 1));
+ }
+ else
+ ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT);
+ static const byte supremuminfimum[8 + 8] = "supremuminfimum";
+ uint16_t infimum, supremum;
+ if (page_is_comp(root->frame))
+ {
+ infimum= PAGE_NEW_INFIMUM;
+ supremum= PAGE_NEW_SUPREMUM;
+ }
+ else
+ {
+ infimum= PAGE_OLD_INFIMUM;
+ supremum= PAGE_OLD_SUPREMUM;
+ }
+ ut_ad(!memcmp(&root->frame[infimum], supremuminfimum + 8, 8) ==
+ !memcmp(&root->frame[supremum], supremuminfimum, 8));
+ mtr->memcpy<mtr_t::OPT>(*root, &root->frame[infimum], supremuminfimum + 8,
+ 8);
+ mtr->memcpy<mtr_t::OPT>(*root, &root->frame[supremum], supremuminfimum, 8);
+ }
+}
+
/*************************************************************//**
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
@@ -1859,16 +1897,13 @@ btr_root_raise_and_insert(
== page_zip_get_size(root_page_zip));
btr_page_create(new_block, new_page_zip, index, level, mtr);
-
- /* Set the next node and previous node fields of new page */
- if (!page_has_siblings(new_block->frame)) {
- ut_ad(index->is_ibuf());
- } else {
+ if (page_has_siblings(new_block->frame)) {
compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
compile_time_assert(FIL_NULL == 0xffffffff);
+ static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+ memset_aligned<8>(new_block->frame + FIL_PAGE_PREV, 0xff, 8);
mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff);
if (UNIV_LIKELY_NULL(new_page_zip)) {
- static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV,
0xff, 8);
}
@@ -1902,6 +1937,7 @@ btr_root_raise_and_insert(
}
}
+ constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID;
if (dict_index_is_sec_or_ibuf(index)) {
/* In secondary indexes and the change buffer,
PAGE_MAX_TRX_ID can be reset on the root page, because
@@ -1910,11 +1946,12 @@ btr_root_raise_and_insert(
set PAGE_MAX_TRX_ID on all secondary index pages.) */
byte* p = my_assume_aligned<8>(
PAGE_HEADER + PAGE_MAX_TRX_ID + root->frame);
- if (UNIV_LIKELY_NULL(root->page.zip.data)) {
- memset_aligned<8>(p, 0, 8);
- page_zip_write_header(root, p, 8, mtr);
- } else if (mach_read_from_8(p)) {
- mtr->memset(root, PAGE_HEADER + PAGE_MAX_TRX_ID, 8, 0);
+ if (mach_read_from_8(p)) {
+ mtr->memset(root, max_trx_id, 8, 0);
+ if (UNIV_LIKELY_NULL(root->page.zip.data)) {
+ memset_aligned<8>(max_trx_id
+ + root->page.zip.data, 0, 8);
+ }
}
} else {
/* PAGE_ROOT_AUTO_INC is only present in the clustered index
@@ -1922,12 +1959,13 @@ btr_root_raise_and_insert(
the field PAGE_MAX_TRX_ID for future use. */
byte* p = my_assume_aligned<8>(
PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->frame);
- if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
- memset_aligned<8>(p, 0, 8);
- page_zip_write_header(new_block, p, 8, mtr);
- } else if (mach_read_from_8(p)) {
- mtr->memset(new_block, PAGE_HEADER + PAGE_MAX_TRX_ID,
- 8, 0);
+ if (mach_read_from_8(p)) {
+ mtr->memset(new_block, max_trx_id, 8, 0);
+ if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+ memset_aligned<8>(max_trx_id
+ + new_block->page.zip.data,
+ 0, 8);
+ }
}
}
@@ -2522,37 +2560,15 @@ btr_attach_half_pages(
if (direction == FSP_DOWN) {
ut_ad(lower_block == new_block);
ut_ad(btr_page_get_next(upper_block->frame) == next_page_no);
- if (UNIV_UNLIKELY(btr_page_get_prev(lower_block->frame)
- == prev_page_no)) {
- ut_ad(index->is_ibuf());
- } else {
- btr_page_set_prev(lower_block, prev_page_no, mtr);
- }
+ btr_page_set_prev(lower_block, prev_page_no, mtr);
} else {
ut_ad(upper_block == new_block);
ut_ad(btr_page_get_prev(lower_block->frame) == prev_page_no);
- if (UNIV_UNLIKELY(btr_page_get_next(upper_block->frame)
- == next_page_no)) {
- ut_ad(index->is_ibuf());
- } else {
- btr_page_set_next(upper_block, next_page_no, mtr);
- }
+ btr_page_set_next(upper_block, next_page_no, mtr);
}
- if (UNIV_UNLIKELY(btr_page_get_next(lower_block->frame)
- == upper_block->page.id.page_no())) {
- ut_ad(index->is_ibuf());
- } else {
- btr_page_set_next(lower_block, upper_block->page.id.page_no(),
- mtr);
- }
- if (UNIV_UNLIKELY(btr_page_get_prev(upper_block->frame)
- == lower_block->page.id.page_no())) {
- ut_ad(index->is_ibuf());
- } else {
- btr_page_set_prev(upper_block, lower_block->page.id.page_no(),
- mtr);
- }
+ btr_page_set_prev(upper_block, lower_block->page.id.page_no(), mtr);
+ btr_page_set_next(lower_block, upper_block->page.id.page_no(), mtr);
}
/*************************************************************//**
@@ -2838,8 +2854,9 @@ func_start:
return(NULL););
/* 2. Allocate a new page to the index */
+ const uint16_t page_level = btr_page_get_level(page);
new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
- btr_page_get_level(page), mtr, mtr);
+ page_level, mtr, mtr);
if (!new_block) {
return(NULL);
@@ -2847,10 +2864,16 @@ func_start:
new_page = buf_block_get_frame(new_block);
new_page_zip = buf_block_get_page_zip(new_block);
+
+ if (page_level && UNIV_LIKELY_NULL(new_page_zip)) {
+ /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected
+ to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
+ memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4);
+ }
btr_page_create(new_block, new_page_zip, cursor->index,
- btr_page_get_level(page), mtr);
+ page_level, mtr);
/* Only record the leaf level page splits. */
- if (page_is_leaf(page)) {
+ if (!page_level) {
cursor->index->stat_defrag_n_page_split ++;
cursor->index->stat_defrag_modified_counter ++;
btr_defragment_save_defrag_stats_if_needed(cursor->index);
@@ -2895,6 +2918,7 @@ insert_empty:
/* 4. Do first the modifications in the tree structure */
+ /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! */
btr_attach_half_pages(flags, cursor->index, block,
first_rec, new_block, direction, mtr);
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
index cf1fb62bce0..d892c429a1e 100644
--- a/storage/innobase/btr/btr0bulk.cc
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -82,26 +82,21 @@ PageBulk::init()
new_page = buf_block_get_frame(new_block);
new_page_no = page_get_page_no(new_page);
- byte* index_id = PAGE_HEADER + PAGE_INDEX_ID + new_page;
+ byte* index_id = my_assume_aligned<2>
+ (PAGE_HEADER + PAGE_INDEX_ID + new_page);
+ compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8);
if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+ mach_write_to_8(index_id, m_index->id);
page_create_zip(new_block, m_index, m_level, 0,
&m_mtr);
- static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
- memset_aligned<8>(FIL_PAGE_PREV + new_page, 0xff, 8);
- page_zip_write_header(new_block,
- FIL_PAGE_PREV + new_page,
- 8, &m_mtr);
- mach_write_to_8(index_id, m_index->id);
- page_zip_write_header(new_block, index_id, 8, &m_mtr);
} else {
ut_ad(!m_index->is_spatial());
page_create(new_block, &m_mtr,
m_index->table->not_redundant());
- compile_time_assert(FIL_PAGE_NEXT
- == FIL_PAGE_PREV + 4);
- compile_time_assert(FIL_NULL == 0xffffffff);
- m_mtr.memset(new_block, FIL_PAGE_PREV, 8, 0xff);
+ m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff);
m_mtr.write<2,mtr_t::OPT>(*new_block,
PAGE_HEADER + PAGE_LEVEL
+ new_page, m_level);
@@ -155,22 +150,25 @@ PageBulk::init()
/** Insert a record in the page.
@tparam fmt the page format
-@param[in] rec record
+@param[in,out] rec record
@param[in] offsets record offsets */
template<PageBulk::format fmt>
-inline void PageBulk::insertPage(const rec_t *rec, offset_t *offsets)
+inline void PageBulk::insertPage(rec_t *rec, offset_t *offsets)
{
ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
ut_ad((fmt != REDUNDANT) == m_is_comp);
-
+ ut_ad(page_align(m_heap_top) == m_page);
ut_ad(m_heap);
- ulint rec_size= rec_offs_size(offsets);
+ const ulint rec_size= rec_offs_size(offsets);
+ const ulint extra_size= rec_offs_extra_size(offsets);
+ ut_ad(page_align(m_heap_top + rec_size) == m_page);
ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec));
#ifdef UNIV_DEBUG
/* Check whether records are in order. */
- if (!page_rec_is_infimum_low(page_offset(m_cur_rec)))
+ if (page_offset(m_cur_rec) !=
+ (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
{
const rec_t *old_rec = m_cur_rec;
offset_t *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf,
@@ -181,41 +179,126 @@ inline void PageBulk::insertPage(const rec_t *rec, offset_t *offsets)
m_total_data+= rec_size;
#endif /* UNIV_DEBUG */
- /* Copy the record payload. */
- rec_t *insert_rec= rec_copy(m_heap_top, rec, offsets);
- ut_ad(page_align(insert_rec) == m_page);
- rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets);
+ rec_t* const insert_rec= m_heap_top + extra_size;
/* Insert the record in the linked list. */
if (fmt != REDUNDANT)
{
- rec_t *next_rec= m_page +
+ const rec_t *next_rec= m_page +
page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT));
- mach_write_to_2(insert_rec - REC_NEXT,
- static_cast<uint16_t>(next_rec - insert_rec));
if (fmt != COMPRESSED)
m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT,
static_cast<uint16_t>(insert_rec - m_cur_rec));
else
+ {
mach_write_to_2(m_cur_rec - REC_NEXT,
static_cast<uint16_t>(insert_rec - m_cur_rec));
- rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK,
+ memcpy(m_heap_top, rec - extra_size, rec_size);
+ }
+
+ rec_t * const this_rec= fmt != COMPRESSED
+ ? const_cast<rec_t*>(rec) : insert_rec;
+ rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK,
REC_N_OWNED_SHIFT);
- rec_set_bit_field_2(insert_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no,
+ rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no,
REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ mach_write_to_2(this_rec - REC_NEXT,
+ static_cast<uint16_t>(next_rec - insert_rec));
}
else
{
- memcpy(insert_rec - REC_NEXT, m_cur_rec - REC_NEXT, 2);
+ memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2);
m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec));
- rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED, REC_N_OWNED_MASK,
- REC_N_OWNED_SHIFT);
- rec_set_bit_field_2(insert_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no,
+ rec_set_bit_field_1(const_cast<rec_t*>(rec), 0,
+ REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(const_cast<rec_t*>(rec),
+ PAGE_HEAP_NO_USER_LOW + m_rec_no,
REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
}
- if (fmt != COMPRESSED)
- m_mtr.memcpy(*m_block, page_offset(m_heap_top), rec_offs_size(offsets));
+ if (fmt == COMPRESSED)
+ /* We already wrote the record. Log is written in PageBulk::compress(). */;
+ else if (page_offset(m_cur_rec) ==
+ (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+ m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size);
+ else
+ {
+ /* Try to copy common prefix from the preceding record. */
+ const byte *r= rec - extra_size;
+ const byte * const insert_rec_end= m_heap_top + rec_size;
+ byte *b= m_heap_top;
+
+ /* Skip any unchanged prefix of the record. */
+ for (; * b == *r; b++, r++);
+
+ ut_ad(b < insert_rec_end);
+
+ const byte *c= m_cur_rec - (rec - r);
+ const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets),
+ m_heap_top);
+
+ /* Try to copy any bytes of the preceding record. */
+ if (UNIV_LIKELY(c >= m_page && c < c_end))
+ {
+ const byte *cm= c;
+ byte *bm= b;
+ const byte *rm= r;
+ for (; cm < c_end && *rm == *cm; cm++, bm++, rm++);
+ ut_ad(bm <= insert_rec_end);
+ size_t len= static_cast<size_t>(rm - r);
+ ut_ad(!memcmp(r, c, len));
+ if (len > 2)
+ {
+ memcpy(b, c, len);
+ m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len);
+ c= cm;
+ b= bm;
+ r= rm;
+ }
+ }
+
+ if (c < m_cur_rec)
+ {
+ if (!rec_offs_data_size(offsets))
+ {
+no_data:
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c);
+ goto rec_done;
+ }
+ /* Some header bytes differ. Compare the data separately. */
+ const byte *cd= m_cur_rec;
+ byte *bd= insert_rec;
+ const byte *rd= rec;
+ /* Skip any unchanged prefix of the record. */
+ for (; *bd == *rd; cd++, bd++, rd++)
+ if (bd == insert_rec_end)
+ goto no_data;
+
+ /* Try to copy any data bytes of the preceding record. */
+ const byte *cdm= cd;
+ const byte *rdm= rd;
+ for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++)
+ ut_ad(rdm - rd + bd <= insert_rec_end);
+ size_t len= static_cast<size_t>(rdm - rd);
+ ut_ad(!memcmp(rd, cd, len));
+ if (len > 2)
+ {
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c);
+ memcpy(bd, cd, len);
+ m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len);
+ c= cdm;
+ b= rdm - rd + bd;
+ r= rdm;
+ }
+ }
+
+ if (size_t len= static_cast<size_t>(insert_rec_end - b))
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, len);
+ }
+
+rec_done:
+ ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size));
+ rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets);
/* Update the member variables. */
ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) -
@@ -235,12 +318,25 @@ inline void PageBulk::insertPage(const rec_t *rec, offset_t *offsets)
@param[in] offsets record offsets */
inline void PageBulk::insert(const rec_t *rec, offset_t *offsets)
{
+ byte rec_hdr[REC_N_OLD_EXTRA_BYTES];
+ static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format");
+
if (UNIV_LIKELY_NULL(m_page_zip))
- insertPage<COMPRESSED>(rec, offsets);
+ insertPage<COMPRESSED>(const_cast<rec_t*>(rec), offsets);
else if (m_is_comp)
- insertPage<DYNAMIC>(rec, offsets);
+ {
+ memcpy(rec_hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES);
+ insertPage<DYNAMIC>(const_cast<rec_t*>(rec), offsets);
+ memcpy(const_cast<rec_t*>(rec) - REC_N_NEW_EXTRA_BYTES, rec_hdr,
+ REC_N_NEW_EXTRA_BYTES);
+ }
else
- insertPage<REDUNDANT>(rec, offsets);
+ {
+ memcpy(rec_hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES);
+ insertPage<REDUNDANT>(const_cast<rec_t*>(rec), offsets);
+ memcpy(const_cast<rec_t*>(rec) - REC_N_OLD_EXTRA_BYTES, rec_hdr,
+ REC_N_OLD_EXTRA_BYTES);
+ }
}
/** Set the number of owned records in the uncompressed page of
@@ -283,18 +379,13 @@ inline void PageBulk::finishPage()
if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
{
slot-= PAGE_DIR_SLOT_SIZE;
+ mach_write_to_2(slot, offset);
if (fmt != COMPRESSED)
- {
- m_mtr.write<2,mtr_t::OPT>(*m_block, slot, offset);
page_rec_set_n_owned<false>(m_block, m_page + offset, count, true,
&m_mtr);
- }
else
- {
- mach_write_to_2(slot, offset);
rec_set_n_owned_zip(m_page + offset, count);
- }
count= 0;
}
@@ -321,17 +412,12 @@ inline void PageBulk::finishPage()
else
slot-= PAGE_DIR_SLOT_SIZE;
+ mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
if (fmt != COMPRESSED)
- {
- m_mtr.write<2,mtr_t::OPT>(*m_block, slot, PAGE_NEW_SUPREMUM);
page_rec_set_n_owned<false>(m_block, m_page + PAGE_NEW_SUPREMUM,
count + 1, true, &m_mtr);
- }
else
- {
- mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1);
- }
}
else
{
@@ -347,7 +433,7 @@ inline void PageBulk::finishPage()
if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
{
slot-= PAGE_DIR_SLOT_SIZE;
- m_mtr.write<2,mtr_t::OPT>(*m_block, slot, page_offset(insert_rec));
+ mach_write_to_2(slot, page_offset(insert_rec));
page_rec_set_n_owned<false>(m_block, insert_rec, count, false, &m_mtr);
count= 0;
}
@@ -368,31 +454,35 @@ inline void PageBulk::finishPage()
else
slot-= PAGE_DIR_SLOT_SIZE;
- m_mtr.write<2,mtr_t::OPT>(*m_block, slot, PAGE_OLD_SUPREMUM);
+ mach_write_to_2(slot, PAGE_OLD_SUPREMUM);
page_rec_set_n_owned<false>(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1,
false, &m_mtr);
}
- ut_ad(!dict_index_is_spatial(m_index));
+ ut_ad(!m_index->is_spatial());
ut_ad(!page_get_instant(m_page));
ut_ad(!mach_read_from_2(PAGE_HEADER + PAGE_N_DIRECTION + m_page));
if (fmt != COMPRESSED)
{
- m_mtr.write<2,mtr_t::OPT>(*m_block,
- PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page,
- 1 + static_cast<ulint>(slot0 - slot) /
- PAGE_DIR_SLOT_SIZE);
- m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_HEAP_TOP + m_page,
- static_cast<ulint>(m_heap_top - m_page));
- m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_HEAP + m_page,
- (PAGE_HEAP_NO_USER_LOW + m_rec_no) |
- uint16_t{fmt != REDUNDANT} << 15);
- m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
- m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_LAST_INSERT + m_page,
- static_cast<ulint>(m_cur_rec - m_page));
- m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_DIRECTION_B - 1 + m_page,
- PAGE_RIGHT);
+ static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility");
+ alignas(8) byte page_header[PAGE_N_RECS + 2];
+ mach_write_to_2(page_header + PAGE_N_DIR_SLOTS,
+ 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+ mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page);
+ mach_write_to_2(page_header + PAGE_N_HEAP,
+ (PAGE_HEAP_NO_USER_LOW + m_rec_no) |
+ uint16_t{fmt != REDUNDANT} << 15);
+ memset_aligned<2>(page_header + PAGE_FREE, 0, 4);
+ static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+ mach_write_to_2(page_header + PAGE_LAST_INSERT, m_cur_rec - m_page);
+ mach_write_to_2(page_header + PAGE_DIRECTION_B - 1, PAGE_RIGHT);
+ mach_write_to_2(page_header + PAGE_N_DIRECTION, m_rec_no);
+ memcpy_aligned<2>(page_header + PAGE_N_RECS,
+ page_header + PAGE_N_DIRECTION, 2);
+ m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header,
+ sizeof page_header);
+ m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot);
}
else
{
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index 09972a2786c..22495ddbd88 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -3898,49 +3898,94 @@ static void btr_cur_write_sys(
}
/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
-@param[in,out] block clustered index leaf page
-@param[in,out] rec clustered index record
-@param[in] index clustered index
-@param[in] offsets rec_get_offsets(rec, index)
-@param[in] trx transaction
-@param[in] roll_ptr DB_ROLL_PTR value
-@param[in,out] mtr mini-transaction */
-static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t* rec,
- dict_index_t* index, const offset_t* offsets,
- const trx_t* trx, roll_ptr_t roll_ptr,
- mtr_t* mtr)
+@param[in,out] block clustered index leaf page
+@param[in,out] rec clustered index record
+@param[in] index clustered index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] trx transaction
+@param[in] roll_ptr DB_ROLL_PTR value
+@param[in,out] mtr mini-transaction */
+static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
+ dict_index_t *index, const offset_t *offsets,
+ const trx_t *trx, roll_ptr_t roll_ptr,
+ mtr_t *mtr)
{
- ut_ad(index->is_primary());
- ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
- if (UNIV_LIKELY_NULL(block->page.zip.data)) {
- page_zip_write_trx_id_and_roll_ptr(block, rec, offsets,
- index->db_trx_id(),
- trx->id, roll_ptr, mtr);
- } else {
- ulint offset = index->trx_id_offset;
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
+ trx->id, roll_ptr, mtr);
+ return;
+ }
- if (!offset) {
- offset = row_get_trx_id_offset(index, offsets);
- }
+ ulint offset= index->trx_id_offset;
- compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+ if (!offset)
+ offset= row_get_trx_id_offset(index, offsets);
- /* During IMPORT the trx id in the record can be in the
- future, if the .ibd file is being imported from another
- instance. During IMPORT roll_ptr will be 0. */
- ut_ad(roll_ptr == 0
- || lock_check_trx_id_sanity(
- trx_read_trx_id(rec + offset),
- rec, index, offsets));
-
- trx_write_trx_id(rec + offset, trx->id);
- trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr);
- /* MDEV-12353 FIXME: consider emitting MEMMOVE for the
- DB_TRX_ID if it is found in the preceding record */
- mtr->memcpy(*block, page_offset(rec + offset),
- DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
- }
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+
+ /* During IMPORT the trx id in the record can be in the future, if
+ the .ibd file is being imported from another instance. During IMPORT
+ roll_ptr will be 0. */
+ ut_ad(roll_ptr == 0 ||
+ lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
+ rec, index, offsets));
+
+ byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+ trx_write_trx_id(sys, trx->id);
+ trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);
+
+ ulint d= 0;
+ const byte *src= nullptr;
+ byte *dest= rec + offset;
+ ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ if (UNIV_LIKELY(index->trx_id_offset))
+ {
+ const rec_t *prev= page_rec_get_prev_const(rec);
+ if (UNIV_UNLIKELY(prev == rec))
+ ut_ad(0);
+ else if (page_rec_is_infimum(prev));
+ else
+ for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
+ if (src[d] != sys[d])
+ break;
+ if (d > 6 && memcmp(dest, sys, d))
+ {
+ /* We save space by replacing a single record
+
+ WRITE,page_offset(dest),byte[13]
+
+ with two records:
+
+ MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
+ WRITE|0x80,0,byte[13-d]
+
+ The single WRITE record would be x+13 bytes long, with x>2.
+ The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
+ second WRITE would be 1+1+13-d = 15-d bytes.
+
+ The total size is: x+13 versus x+4+15-d = x+19-d bytes.
+ To save space, we must have d>6, that is, the complete DB_TRX_ID and
+ the first byte(s) of DB_ROLL_PTR must match the previous record. */
+ memcpy(dest, src, d);
+ mtr->memmove(*block, page_offset(dest), page_offset(src), d);
+ dest+= d;
+ len-= d;
+ /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
+ DB_TRX_ID refers to an active transaction. */
+ ut_ad(len);
+ }
+ else
+ d= 0;
+ }
+
+ if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */
+ mtr->memcpy<mtr_t::OPT>(*block, dest, sys + d, len);
}
/*********************************************************************//**
@@ -4400,10 +4445,13 @@ void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
ut_ad(!rec_offs_nth_sql_null(offsets, n));
ut_ad(!index->table->not_redundant());
- mtr->memset(block,
- page_offset(rec + rec_get_field_start_offs(
- rec, n)),
- rec_get_nth_field_size(rec, n), 0);
+ if (ulint size = rec_get_nth_field_size(rec, n)) {
+ mtr->memset(
+ block,
+ page_offset(rec_get_field_start_offs(
+ rec, n) + rec),
+ size, 0);
+ }
ulint l = rec_get_1byte_offs_flag(rec)
? (n + 1) : (n + 1) * 2;
byte* b = &rec[-REC_N_OLD_EXTRA_BYTES - l];
@@ -4436,7 +4484,10 @@ void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
byte(*b & ~REC_1BYTE_SQL_NULL_MASK));
}
- mtr->memcpy(block, page_offset(data), uf->new_val.data, len);
+ if (len) {
+ mtr->memcpy<mtr_t::OPT>(*block, data, uf->new_val.data,
+ len);
+ }
}
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
@@ -7855,21 +7906,10 @@ btr_store_big_rec_extern_fields(
int err;
page_zip_des_t* blob_page_zip;
- /* Write FIL_PAGE_TYPE to the redo log
- separately, before logging any other
- changes to the block, so that the debug
- assertions in
- recv_parse_or_apply_log_rec_body() can
- be made simpler. Before InnoDB Plugin
- 1.0.4, the initialization of
- FIL_PAGE_TYPE was logged as part of
- the mtr_t::memcpy() below. */
-
- mtr.write<2>(*block,
- block->frame + FIL_PAGE_TYPE,
- prev_page_no == FIL_NULL
- ? FIL_PAGE_TYPE_ZBLOB
- : FIL_PAGE_TYPE_ZBLOB2);
+ mach_write_to_2(block->frame + FIL_PAGE_TYPE,
+ prev_page_no == FIL_NULL
+ ? FIL_PAGE_TYPE_ZBLOB
+ : FIL_PAGE_TYPE_ZBLOB2);
c_stream.next_out = block->frame
+ FIL_PAGE_DATA;
@@ -7886,9 +7926,9 @@ btr_store_big_rec_extern_fields(
compile_time_assert(FIL_NULL == 0xffffffff);
mtr.memset(block, FIL_PAGE_PREV, 8, 0xff);
mtr.memcpy(*block,
- FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+ FIL_PAGE_TYPE,
page_zip_get_size(page_zip)
- - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ - FIL_PAGE_TYPE
- c_stream.avail_out);
/* Zero out the unused part of the page. */
if (c_stream.avail_out) {
@@ -7966,12 +8006,14 @@ next_zip_page:
store_len = extern_len;
}
- mtr.memcpy(block,
- FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE,
- (const byte*)
- big_rec_vec->fields[i].data
- + big_rec_vec->fields[i].len
- - extern_len, store_len);
+ mtr.memcpy<mtr_t::OPT>(
+ *block,
+ FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
+ + block->frame,
+ static_cast<const byte*>
+ (big_rec_vec->fields[i].data)
+ + big_rec_vec->fields[i].len
+ - extern_len, store_len);
mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
+ FIL_PAGE_DATA + block->frame,
store_len);
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 00abae10d5a..6e2c62a1e57 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -5493,7 +5493,7 @@ release_page:
}
if (recv_recovery_is_on()) {
- recv_recover_page(bpage);
+ recv_recover_page(space, bpage);
}
if (uncompressed
@@ -5536,27 +5536,13 @@ release_page:
ut_ad(buf_pool->n_pend_reads > 0);
buf_pool->n_pend_reads--;
buf_pool->stat.n_pages_read++;
- ut_ad(!uncompressed || !bpage->zip.data
- || !recv_recovery_is_on()
- || buf_page_can_relocate(bpage));
- mutex_exit(block_mutex);
if (uncompressed) {
-#if 1 /* MDEV-12353 FIXME: Remove this! */
- if (UNIV_LIKELY_NULL(bpage->zip.data)
- && recv_recovery_is_on()) {
- rw_lock_x_unlock_gen(
- &reinterpret_cast<buf_block_t*>(bpage)
- ->lock, BUF_IO_READ);
- if (!buf_LRU_free_page(bpage, false)) {
- ut_ad(!"could not remove");
- }
- goto func_exit;
- }
-#endif
rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
BUF_IO_READ);
}
+
+ mutex_exit(block_mutex);
} else {
/* Write means a flush operation: call the completion
routine in the flush system */
@@ -5590,7 +5576,6 @@ release_page:
DBUG_PRINT("ib_buf", ("%s page %u:%u",
io_type == BUF_IO_READ ? "read" : "wrote",
bpage->id.space(), bpage->id.page_no()));
-func_exit:
mutex_exit(&buf_pool->mutex);
return DB_SUCCESS;
}
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
index 4ec9a6d7952..fc2a26d8b04 100644
--- a/storage/innobase/fil/fil0crypt.cc
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -418,9 +418,7 @@ void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr)
+ fsp_header_get_encryption_offset(block->zip_size());
byte* b = block->frame + offset;
- if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) {
- mtr->memcpy(block, offset, CRYPT_MAGIC, MAGIC_SZ);
- }
+ mtr->memcpy<mtr_t::OPT>(*block, b, CRYPT_MAGIC, MAGIC_SZ);
b += MAGIC_SZ;
byte* const start = b;
@@ -436,6 +434,8 @@ void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr)
b += 4;
*b++ = byte(encryption);
ut_ad(b - start == 11 + MY_AES_BLOCK_SIZE);
+ /* We must log also any unchanged bytes, because recovery will
+ invoke fil_crypt_parse() based on this log record. */
mtr->memcpy(*block, offset + MAGIC_SZ, b - start);
}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index 0079be0e9f0..0c9b6fdd6dd 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -1817,68 +1817,62 @@ fil_create_directory_for_tablename(
@param space_id tablespace identifier
@param first_page_no first page number in the file
@param path file path
-@param new_path new file path for type=MLOG_FILE_RENAME2
-@param flags tablespace flags for type=MLOG_FILE_CREATE2 */
-inline void mtr_t::log_file_op(mlog_id_t type,
+@param new_path new file path for type=FILE_RENAME */
+inline void mtr_t::log_file_op(mfile_type_t type,
ulint space_id, ulint first_page_no,
- const char *path, const char *new_path,
- ulint flags)
+ const char *path, const char *new_path)
{
- ulint len;
-
- ut_ad(first_page_no == 0 || type == MLOG_FILE_CREATE2);
- ut_ad(fil_space_t::is_valid_flags(flags, space_id));
-
- /* fil_name_parse() requires that there be at least one path
- separator and that the file path end with ".ibd". */
- ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL);
- ut_ad(first_page_no /* trimming an undo tablespace */
- || !strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
-
- set_modified();
- if (m_log_mode != MTR_LOG_ALL) {
- return;
- }
-
- byte* log_ptr = log_write_low(type, page_id_t(space_id, first_page_no),
- m_log.open(11 + 4 + 2 + 1));
-
- if (type == MLOG_FILE_CREATE2) {
- mach_write_to_4(log_ptr, flags);
- log_ptr += 4;
- }
-
- /* Let us store the strings as null-terminated for easier readability
- and handling */
-
- len = strlen(path) + 1;
-
- mach_write_to_2(log_ptr, len);
- log_ptr += 2;
- m_log.close(log_ptr);
-
- m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len));
+ ut_ad(first_page_no == 0 || type == FILE_CREATE);
+ ut_ad((new_path != nullptr) == (type == FILE_RENAME));
+ ut_ad(!(byte(type) & 15));
+
+ /* fil_name_parse() requires that there be at least one path
+ separator and that the file path end with ".ibd". */
+ ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL);
+ ut_ad(first_page_no /* trimming an undo tablespace */ ||
+ !strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
+
+ set_modified();
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ m_last= nullptr;
+
+ const size_t len= strlen(path);
+ const size_t new_len= type == FILE_RENAME ? 1 + strlen(new_path) : 0;
+ ut_ad(len > 0);
+ byte *const log_ptr= m_log.open(1 + 3/*length*/ + 5/*space_id*/ +
+ 5/*first_page_no*/);
+ byte *end= log_ptr + 1;
+ end= mlog_encode_varint(end, space_id);
+ end= mlog_encode_varint(end, first_page_no);
+ if (UNIV_LIKELY(end + len + new_len >= &log_ptr[16]))
+ {
+ *log_ptr= type;
+ size_t total_len= len + new_len + end - log_ptr - 15;
+ if (total_len >= MIN_3BYTE)
+ total_len+= 2;
+ else if (total_len >= MIN_2BYTE)
+ total_len++;
+ end= mlog_encode_varint(log_ptr + 1, total_len);
+ end= mlog_encode_varint(end, space_id);
+ end= mlog_encode_varint(end, first_page_no);
+ }
+ else
+ {
+ *log_ptr= type | static_cast<byte>(end + len + new_len - &log_ptr[1]);
+ ut_ad(*log_ptr & 15);
+ }
- switch (type) {
- case MLOG_FILE_RENAME2:
- ut_ad(strchr(new_path, OS_PATH_SEPARATOR) != NULL);
- len = strlen(new_path) + 1;
- log_ptr = m_log.open(2 + len);
- ut_a(log_ptr);
- mach_write_to_2(log_ptr, len);
- log_ptr += 2;
- m_log.close(log_ptr);
+ m_log.close(end);
- m_log.push(reinterpret_cast<const byte*>(new_path),
- uint32_t(len));
- break;
- case MLOG_FILE_NAME:
- case MLOG_FILE_DELETE:
- case MLOG_FILE_CREATE2:
- break;
- default:
- ut_ad(0);
- }
+ if (type == FILE_RENAME)
+ {
+ ut_ad(strchr(new_path, OS_PATH_SEPARATOR));
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len + 1));
+ m_log.push(reinterpret_cast<const byte*>(new_path), uint32_t(new_len));
+ }
+ else
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len));
}
/** Write redo log for renaming a file.
@@ -1897,8 +1891,7 @@ fil_name_write_rename_low(
mtr_t* mtr)
{
ut_ad(!is_predefined_tablespace(space_id));
- mtr->log_file_op(MLOG_FILE_RENAME2, space_id, first_page_no,
- old_name, new_name);
+ mtr->log_file_op(FILE_RENAME, space_id, first_page_no, old_name, new_name);
}
/** Write redo log for renaming a file.
@@ -1918,7 +1911,7 @@ fil_name_write_rename(
log_write_up_to(mtr.commit_lsn(), true);
}
-/** Write MLOG_FILE_NAME for a file.
+/** Write FILE_MODIFY for a file.
@param[in] space_id tablespace id
@param[in] first_page_no first page number in the file
@param[in] name tablespace file name
@@ -1931,9 +1924,10 @@ fil_name_write(
const char* name,
mtr_t* mtr)
{
- mtr->log_file_op(MLOG_FILE_NAME, space_id, first_page_no, name);
+ ut_ad(!is_predefined_tablespace(space_id));
+ mtr->log_file_op(FILE_MODIFY, space_id, first_page_no, name);
}
-/** Write MLOG_FILE_NAME for a file.
+/** Write FILE_MODIFY for a file.
@param[in] space tablespace
@param[in] first_page_no first page number in the file
@param[in] file tablespace file
@@ -1946,7 +1940,7 @@ fil_name_write(
const fil_node_t* file,
mtr_t* mtr)
{
- mtr->log_file_op(MLOG_FILE_NAME, space->id, first_page_no, file->name);
+ fil_name_write(space->id, first_page_no, file->name, mtr);
}
/** Replay a file rename operation if possible.
@@ -2347,7 +2341,7 @@ fil_delete_tablespace(
mtr_t mtr;
mtr.start();
- mtr.log_file_op(MLOG_FILE_DELETE, id, 0, path);
+ mtr.log_file_op(FILE_DELETE, id, 0, path);
mtr.commit();
/* Even if we got killed shortly after deleting the
tablespace file, the record must have already been
@@ -2429,13 +2423,12 @@ fil_space_t* fil_truncate_prepare(ulint space_id)
/** Write log about an undo tablespace truncate operation. */
void fil_truncate_log(fil_space_t* space, ulint size, mtr_t* mtr)
{
- /* Write a MLOG_FILE_CREATE2 record with the new size, so that
- recovery and backup will ignore any preceding redo log records
- for writing pages that are after the new end of the tablespace. */
- ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
- const fil_node_t* file = UT_LIST_GET_FIRST(space->chain);
- mtr->log_file_op(MLOG_FILE_CREATE2, space->id, size, file->name,
- nullptr, space->flags & ~FSP_FLAGS_MEM_MASK);
+ /* Write a record with the new size, so that recovery and
+ backup will ignore any preceding redo log records for writing
+ pages that are after the new end of the tablespace. */
+ ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+ const fil_node_t *file= UT_LIST_GET_FIRST(space->chain);
+ mtr->log_file_op(FILE_CREATE, space->id, size, file->name);
}
/*******************************************************************//**
@@ -2928,9 +2921,7 @@ err_exit:
false, true);
mtr_t mtr;
mtr.start();
- mtr.log_file_op(MLOG_FILE_CREATE2, space_id, 0, node->name,
- nullptr, space->flags & ~FSP_FLAGS_MEM_MASK);
- fil_name_write(space, 0, node, &mtr);
+ mtr.log_file_op(FILE_CREATE, space_id, 0, node->name);
mtr.commit();
node->find_metadata(file);
@@ -4561,7 +4552,7 @@ fil_space_validate_for_mtr_commit(
}
#endif /* UNIV_DEBUG */
-/** Write a MLOG_FILE_NAME record for a persistent tablespace.
+/** Write a FILE_MODIFY record for a persistent tablespace.
@param[in] space tablespace
@param[in,out] mtr mini-transaction */
static
@@ -4591,22 +4582,20 @@ fil_names_dirty(
space->max_lsn = log_sys.lsn;
}
-/** Write MLOG_FILE_NAME records when a non-predefined persistent
+/** Write FILE_MODIFY records when a non-predefined persistent
tablespace was modified for the first time since the latest
fil_names_clear().
-@param[in,out] space tablespace
-@param[in,out] mtr mini-transaction */
-void
-fil_names_dirty_and_write(
- fil_space_t* space,
- mtr_t* mtr)
+@param[in,out] space tablespace */
+void fil_names_dirty_and_write(fil_space_t* space)
{
ut_ad(log_mutex_own());
ut_d(fil_space_validate_for_mtr_commit(space));
ut_ad(space->max_lsn == log_sys.lsn);
UT_LIST_ADD_LAST(fil_system.named_spaces, space);
- fil_names_write(space, mtr);
+ mtr_t mtr;
+ mtr.start();
+ fil_names_write(space, &mtr);
DBUG_EXECUTE_IF("fil_names_write_bogus",
{
@@ -4614,14 +4603,16 @@ fil_names_dirty_and_write(
os_normalize_path(bogus_name);
fil_name_write(
SRV_SPACE_ID_UPPER_BOUND, 0,
- bogus_name, mtr);
+ bogus_name, &mtr);
});
+
+ mtr.commit_files();
}
/** On a log checkpoint, reset fil_names_dirty_and_write() flags
-and write out MLOG_FILE_NAME and MLOG_CHECKPOINT if needed.
+and write out FILE_MODIFY and FILE_CHECKPOINT if needed.
@param[in] lsn checkpoint LSN
-@param[in] do_write whether to always write MLOG_CHECKPOINT
+@param[in] do_write whether to always write FILE_CHECKPOINT
@return whether anything was written to the redo log
@retval false if no flags were set and nothing written
@retval true if anything was written to the redo log */
@@ -4631,7 +4622,7 @@ fil_names_clear(
bool do_write)
{
mtr_t mtr;
- ulint mtr_checkpoint_size = LOG_CHECKPOINT_FREE_PER_THREAD;
+ ulint mtr_checkpoint_size = RECV_SCAN_SIZE - 1;
DBUG_EXECUTE_IF(
"increase_mtr_checkpoint_size",
@@ -4650,6 +4641,14 @@ fil_names_clear(
for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.named_spaces);
space != NULL; ) {
+ if (mtr.get_log()->size()
+ + (3 + 5 + 1) + strlen(space->chain.start->name)
+ >= mtr_checkpoint_size) {
+ /* Prevent log parse buffer overflow */
+ mtr.commit_files();
+ mtr.start();
+ }
+
fil_space_t* next = UT_LIST_GET_NEXT(named_spaces, space);
ut_ad(space->max_lsn > 0);
@@ -4671,19 +4670,6 @@ fil_names_clear(
fil_names_write(space, &mtr);
do_write = true;
- const mtr_buf_t* mtr_log = mtr_get_log(&mtr);
-
- /** If the mtr buffer size exceeds the size of
- LOG_CHECKPOINT_FREE_PER_THREAD then commit the multi record
- mini-transaction, start the new mini-transaction to
- avoid the parsing buffer overflow error during recovery. */
-
- if (mtr_log->size() > mtr_checkpoint_size) {
- ut_ad(mtr_log->size() < (RECV_PARSING_BUF_SIZE / 2));
- mtr.commit_files();
- mtr.start();
- }
-
space = next;
}
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 5fcc58300f1..ede8d4f8c16 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -476,27 +476,29 @@ xdes_get_offset(
/** Initialize a file page whose prior contents should be ignored.
@param[in,out] block buffer pool block */
-void fsp_apply_init_file_page(buf_block_t* block)
+void fsp_apply_init_file_page(buf_block_t *block)
{
- page_t* page = buf_block_get_frame(block);
-
- memset(page, 0, srv_page_size);
-
- mach_write_to_4(page + FIL_PAGE_OFFSET, block->page.id.page_no());
- mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
- block->page.id.space());
-
- if (page_zip_des_t* page_zip= buf_block_get_page_zip(block)) {
- memset(page_zip->data, 0, page_zip_get_size(page_zip));
- static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
- memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
- page + FIL_PAGE_OFFSET, 4);
- static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
- "not perfect alignment");
- memcpy_aligned<2>(page_zip->data
- + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
- page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
- }
+ memset_aligned<UNIV_PAGE_SIZE_MIN>(block->frame, 0, srv_page_size);
+
+ mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->page.id.page_no());
+ if (log_sys.is_physical())
+ memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8);
+ mach_write_to_4(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ block->page.id.space());
+ if (page_zip_des_t* page_zip= buf_block_get_page_zip(block))
+ {
+ memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0,
+ page_zip_get_size(page_zip));
+ static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+ memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
+ block->frame + FIL_PAGE_OFFSET, 4);
+ if (log_sys.is_physical())
+ memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+ }
}
#ifdef UNIV_DEBUG
@@ -577,8 +579,12 @@ void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr)
+ block->frame, space->id);
ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED
+ block->frame));
- mtr->write<4>(*block, FSP_HEADER_OFFSET + FSP_SIZE + block->frame,
- size);
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE
+ + block->frame, size);
ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ block->frame));
mtr->write<4,mtr_t::OPT>(*block, FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
@@ -636,8 +642,12 @@ fsp_try_extend_data_file_with_pages(
success = fil_space_extend(space, page_no + 1);
/* The size may be less than we wanted if we ran out of disk space. */
- mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_SIZE + header->frame,
- space->size);
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
+ + header->frame, space->size);
space->size_in_header = space->size;
return(success);
@@ -770,8 +780,12 @@ fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
space->size_in_header = ut_2pow_round(space->size, (1024 * 1024) / ps);
- mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_SIZE + header->frame,
- space->size_in_header);
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
+ + header->frame, space->size_in_header);
return(size_increase);
}
@@ -1511,8 +1525,7 @@ static void fsp_free_seg_inode(
iblock, FSEG_INODE_PAGE_NODE, mtr);
}
- mtr->write<8>(*iblock, inode + FSEG_ID, 0U);
- mtr->write<4>(*iblock, inode + FSEG_MAGIC_N, 0xfa051ce3);
+ mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0);
if (ULINT_UNDEFINED
== fsp_seg_inode_page_find_used(iblock->frame, physical_size)) {
diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc
index 93249aeab54..618eb1881e3 100644
--- a/storage/innobase/fut/fut0lst.cc
+++ b/storage/innobase/fut/fut0lst.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, MariaDB Corporation.
+Copyright (c) 2019, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,6 +28,61 @@ Created 11/28/1995 Heikki Tuuri
#include "buf0buf.h"
#include "page0page.h"
+
+/** Write a file address.
+@param[in] block file page
+@param[in,out] faddr file address location
+@param[in] page page number
+@param[in] boffset byte offset
+@param[in,out] mtr mini-transaction */
+static void flst_write_addr(const buf_block_t& block, byte *faddr,
+ uint32_t page, uint16_t boffset, mtr_t* mtr)
+{
+ ut_ad(mtr->memo_contains_page_flagged(faddr,
+ MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_SX_FIX));
+ ut_a(page == FIL_NULL || boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
+
+ static_assert(FIL_ADDR_PAGE == 0, "compatibility");
+ static_assert(FIL_ADDR_BYTE == 4, "compatibility");
+ static_assert(FIL_ADDR_SIZE == 6, "compatibility");
+
+ const bool same_page= mach_read_from_4(faddr + FIL_ADDR_PAGE) == page;
+ const bool same_offset= mach_read_from_2(faddr + FIL_ADDR_BYTE) == boffset;
+ if (same_page)
+ {
+ if (!same_offset)
+ mtr->write<2>(block, faddr + FIL_ADDR_BYTE, boffset);
+ return;
+ }
+ if (same_offset)
+ mtr->write<4>(block, faddr + FIL_ADDR_PAGE, page);
+ else
+ {
+ alignas(4) byte fil_addr[6];
+ mach_write_to_4(fil_addr + FIL_ADDR_PAGE, page);
+ mach_write_to_2(fil_addr + FIL_ADDR_BYTE, boffset);
+ mtr->memcpy(block, faddr + FIL_ADDR_PAGE, fil_addr, 6);
+ }
+}
+
+/** Write 2 null file addresses.
+@param[in] b file page
+@param[in,out] addr file address to be zeroed out
+@param[in,out] mtr mini-transaction */
+static void flst_zero_both(const buf_block_t& b, byte *addr, mtr_t *mtr)
+{
+ if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL)
+ mtr->memset(&b, ulint(addr - b.frame) + FIL_ADDR_PAGE, 4, 0xff);
+ mtr->write<2,mtr_t::OPT>(b, addr + FIL_ADDR_BYTE, 0U);
+ /* Initialize the other address by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source)
+ which is 4 bytes, or less than FIL_ADDR_SIZE. */
+ memcpy(addr + FIL_ADDR_SIZE, addr, FIL_ADDR_SIZE);
+ const uint16_t boffset= page_offset(addr);
+ mtr->memmove(b, boffset + FIL_ADDR_SIZE, boffset, FIL_ADDR_SIZE);
+}
+
/** Add a node to an empty list. */
static void flst_add_to_empty(buf_block_t *base, uint16_t boffset,
buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
@@ -41,20 +96,22 @@ static void flst_add_to_empty(buf_block_t *base, uint16_t boffset,
ut_ad(mtr_memo_contains_page_flagged(mtr, add->frame,
MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- fil_addr_t addr= { add->page.id.page_no(), aoffset };
+ ut_ad(!mach_read_from_4(base->frame + boffset + FLST_LEN));
+ mtr->write<1>(*base, base->frame + boffset + (FLST_LEN + 3), 1U);
/* Update first and last fields of base node */
- flst_write_addr(*base, base->frame + boffset + FLST_FIRST, addr, mtr);
- /* MDEV-12353 TODO: use MEMMOVE record */
- flst_write_addr(*base, base->frame + boffset + FLST_LAST, addr, mtr);
+ flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ add->page.id.page_no(), aoffset, mtr);
+ memcpy(base->frame + boffset + FLST_LAST, base->frame + boffset + FLST_FIRST,
+ FIL_ADDR_SIZE);
+ /* Initialize FLST_LAST by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source)
+ which is 4 bytes, or less than FIL_ADDR_SIZE. */
+ mtr->memmove(*base, boffset + FLST_LAST, boffset + FLST_FIRST,
+ FIL_ADDR_SIZE);
/* Set prev and next fields of node to add */
- flst_zero_addr(*add, add->frame + aoffset + FLST_PREV, mtr);
- flst_zero_addr(*add, add->frame + aoffset + FLST_NEXT, mtr);
-
- /* Update len of base node */
- ut_ad(!mach_read_from_4(base->frame + boffset + FLST_LEN));
- mtr->write<1>(*base, base->frame + boffset + (FLST_LEN + 3), 1U);
+ static_assert(FLST_NEXT == FLST_PREV + FIL_ADDR_SIZE, "compatibility");
+ flst_zero_both(*add, add->frame + aoffset + FLST_PREV, mtr);
}
/** Insert a node after another one.
@@ -85,24 +142,27 @@ static void flst_insert_after(buf_block_t *base, uint16_t boffset,
MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- fil_addr_t cur_addr= { cur->page.id.page_no(), coffset };
- fil_addr_t add_addr= { add->page.id.page_no(), aoffset };
fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset);
- flst_write_addr(*add, add->frame + aoffset + FLST_PREV, cur_addr, mtr);
- flst_write_addr(*add, add->frame + aoffset + FLST_NEXT, next_addr, mtr);
+ flst_write_addr(*add, add->frame + aoffset + FLST_PREV,
+ cur->page.id.page_no(), coffset, mtr);
+ flst_write_addr(*add, add->frame + aoffset + FLST_NEXT,
+ next_addr.page, next_addr.boffset, mtr);
if (fil_addr_is_null(next_addr))
- flst_write_addr(*base, base->frame + boffset + FLST_LAST, add_addr, mtr);
+ flst_write_addr(*base, base->frame + boffset + FLST_LAST,
+ add->page.id.page_no(), aoffset, mtr);
else
{
buf_block_t *block;
flst_node_t *next= fut_get_ptr(add->page.id.space(), add->zip_size(),
next_addr, RW_SX_LATCH, mtr, &block);
- flst_write_addr(*block, next + FLST_PREV, add_addr, mtr);
+ flst_write_addr(*block, next + FLST_PREV,
+ add->page.id.page_no(), aoffset, mtr);
}
- flst_write_addr(*cur, cur->frame + coffset + FLST_NEXT, add_addr, mtr);
+ flst_write_addr(*cur, cur->frame + coffset + FLST_NEXT,
+ add->page.id.page_no(), aoffset, mtr);
byte *len= &base->frame[boffset + FLST_LEN];
mtr->write<4>(*base, len, mach_read_from_4(len) + 1);
@@ -136,29 +196,45 @@ static void flst_insert_before(buf_block_t *base, uint16_t boffset,
MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- fil_addr_t cur_addr= { cur->page.id.page_no(), coffset };
- fil_addr_t add_addr= { add->page.id.page_no(), aoffset };
fil_addr_t prev_addr= flst_get_prev_addr(cur->frame + coffset);
- flst_write_addr(*add, add->frame + aoffset + FLST_PREV, prev_addr, mtr);
- flst_write_addr(*add, add->frame + aoffset + FLST_NEXT, cur_addr, mtr);
+ flst_write_addr(*add, add->frame + aoffset + FLST_PREV,
+ prev_addr.page, prev_addr.boffset, mtr);
+ flst_write_addr(*add, add->frame + aoffset + FLST_NEXT,
+ cur->page.id.page_no(), coffset, mtr);
if (fil_addr_is_null(prev_addr))
- flst_write_addr(*base, base->frame + boffset + FLST_FIRST, add_addr, mtr);
+ flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ add->page.id.page_no(), aoffset, mtr);
else
{
buf_block_t *block;
flst_node_t *prev= fut_get_ptr(add->page.id.space(), add->zip_size(),
prev_addr, RW_SX_LATCH, mtr, &block);
- flst_write_addr(*block, prev + FLST_NEXT, add_addr, mtr);
+ flst_write_addr(*block, prev + FLST_NEXT,
+ add->page.id.page_no(), aoffset, mtr);
}
- flst_write_addr(*cur, cur->frame + coffset + FLST_PREV, add_addr, mtr);
+ flst_write_addr(*cur, cur->frame + coffset + FLST_PREV,
+ add->page.id.page_no(), aoffset, mtr);
byte *len= &base->frame[boffset + FLST_LEN];
mtr->write<4>(*base, len, mach_read_from_4(len) + 1);
}
+/** Initialize a list base node.
+@param[in] block file page
+@param[in,out] base base node
+@param[in,out] mtr mini-transaction */
+void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
+{
+ ut_ad(mtr->memo_contains_page_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ mtr->write<4,mtr_t::OPT>(block, base + FLST_LEN, 0U);
+ static_assert(FLST_LAST == FLST_FIRST + FIL_ADDR_SIZE, "compatibility");
+ flst_zero_both(block, base + FLST_FIRST, mtr);
+}
+
/** Append a file list node to a list.
@param[in,out] base base node block
@param[in] boffset byte offset of the base node
@@ -251,7 +327,8 @@ void flst_remove(buf_block_t *base, uint16_t boffset,
const fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset);
if (fil_addr_is_null(prev_addr))
- flst_write_addr(*base, base->frame + boffset + FLST_FIRST, next_addr, mtr);
+ flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ next_addr.page, next_addr.boffset, mtr);
else
{
buf_block_t *block= cur;
@@ -259,11 +336,13 @@ void flst_remove(buf_block_t *base, uint16_t boffset,
? cur->frame + prev_addr.boffset
: fut_get_ptr(cur->page.id.space(), cur->zip_size(), prev_addr,
RW_SX_LATCH, mtr, &block);
- flst_write_addr(*block, prev + FLST_NEXT, next_addr, mtr);
+ flst_write_addr(*block, prev + FLST_NEXT,
+ next_addr.page, next_addr.boffset, mtr);
}
if (fil_addr_is_null(next_addr))
- flst_write_addr(*base, base->frame + boffset + FLST_LAST, prev_addr, mtr);
+ flst_write_addr(*base, base->frame + boffset + FLST_LAST,
+ prev_addr.page, prev_addr.boffset, mtr);
else
{
buf_block_t *block= cur;
@@ -271,7 +350,8 @@ void flst_remove(buf_block_t *base, uint16_t boffset,
? cur->frame + next_addr.boffset
: fut_get_ptr(cur->page.id.space(), cur->zip_size(), next_addr,
RW_SX_LATCH, mtr, &block);
- flst_write_addr(*block, next + FLST_PREV, prev_addr, mtr);
+ flst_write_addr(*block, next + FLST_PREV,
+ prev_addr.page, prev_addr.boffset, mtr);
}
byte *len= &base->frame[boffset + FLST_LEN];
diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc
index 8cb47457415..a4dfb9fa453 100644
--- a/storage/innobase/gis/gis0rtree.cc
+++ b/storage/innobase/gis/gis0rtree.cc
@@ -300,8 +300,9 @@ rtr_update_mbr_field(
memcpy(rec, node_ptr->fields[0].data, DATA_MBR_LEN);
page_zip_write_rec(block, rec, index, offsets, 0, mtr);
} else {
- mtr->memcpy(block, page_offset(rec),
- node_ptr->fields[0].data, DATA_MBR_LEN);
+ mtr->memcpy<mtr_t::OPT>(*block, rec,
+ node_ptr->fields[0].data,
+ DATA_MBR_LEN);
}
if (cursor2) {
@@ -895,7 +896,6 @@ rtr_page_split_and_insert(
rtr_split_node_t* cur_split_node;
rtr_split_node_t* end_split_node;
double* buf_pos;
- ulint page_level;
node_seq_t current_ssn;
node_seq_t next_ssn;
buf_block_t* root_block;
@@ -926,7 +926,6 @@ func_start:
block = btr_cur_get_block(cursor);
page = buf_block_get_frame(block);
page_zip = buf_block_get_page_zip(block);
- page_level = btr_page_get_level(page);
current_ssn = page_get_ssn_id(page);
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
@@ -971,9 +970,19 @@ func_start:
/* Allocate a new page to the index */
hint_page_no = page_no + 1;
+ const uint16_t page_level = btr_page_get_level(page);
new_block = btr_page_alloc(cursor->index, hint_page_no, FSP_UP,
page_level, mtr, mtr);
+ if (!new_block) {
+ return NULL;
+ }
+
new_page_zip = buf_block_get_page_zip(new_block);
+ if (page_level && UNIV_LIKELY_NULL(new_page_zip)) {
+ /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected
+ to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
+ memset_aligned<4>(new_block->frame + FIL_PAGE_PREV, 0, 4);
+ }
btr_page_create(new_block, new_page_zip, cursor->index,
page_level, mtr);
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 0a078cce7df..5b42501d45d 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -18585,7 +18585,7 @@ checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
mysql_mutex_unlock(&LOCK_global_system_variables);
while (log_sys.last_checkpoint_lsn
- + SIZE_OF_MLOG_CHECKPOINT
+ + SIZE_OF_FILE_CHECKPOINT
+ (log_sys.append_on_checkpoint != NULL
? log_sys.append_on_checkpoint->size() : 0)
< log_sys.lsn) {
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index 3634e372bd9..c644ce9593f 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -10890,19 +10890,14 @@ ha_innobase::commit_inplace_alter_table(
ut_ad(trx->has_logged());
if (mtr.get_log()->size() > 0) {
- ut_ad(*mtr.get_log()->front()->begin()
- == MLOG_FILE_RENAME2);
-
- /* Append the MLOG_FILE_RENAME2
+ ut_ad((*mtr.get_log()->front()->begin()
+ & 0xf0) == FILE_RENAME);
+ /* Append the FILE_RENAME
records on checkpoint, as a separate
mini-transaction before the one that
- contains the MLOG_CHECKPOINT marker. */
- static const byte multi
- = MLOG_MULTI_REC_END;
-
+ contains the FILE_CHECKPOINT marker. */
mtr.get_log()->for_each_block(logs);
- logs.m_buf.push(&multi, sizeof multi);
-
+ logs.m_buf.push(field_ref_zero, 1);
log_append_on_checkpoint(&logs.m_buf);
}
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index e0077e1ca42..c59474f7d12 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -276,23 +276,15 @@ btr_page_get_index_id(
/*==================*/
const page_t* page) /*!< in: index page */
MY_ATTRIBUTE((warn_unused_result));
-/********************************************************//**
-Gets the node level field in an index page.
-@param[in] page index page
-@return level, leaf level == 0 */
-UNIV_INLINE
-ulint
-btr_page_get_level(const page_t* page)
+/** Read the B-tree or R-tree PAGE_LEVEL.
+@param page B-tree or R-tree page
+@return number of child page links to reach the leaf level
+@retval 0 for leaf pages */
+inline uint16_t btr_page_get_level(const page_t *page)
{
- ulint level;
-
- ut_ad(page);
-
- level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL);
-
- ut_ad(level <= BTR_MAX_NODE_LEVEL);
-
- return(level);
+ uint16_t level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL);
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ return level;
} MY_ATTRIBUTE((warn_unused_result))
/** Read FIL_PAGE_NEXT.
@@ -403,6 +395,13 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
@param[in,out] mtr mini-transaction */
void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE
+@param[in] all whether to reset FIL_PAGE_TYPE as well
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
+
/*************************************************************//**
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
index 8f8ed344fa4..35ac49dd117 100644
--- a/storage/innobase/include/btr0btr.ic
+++ b/storage/innobase/include/btr0btr.ic
@@ -49,16 +49,11 @@ inline
void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
{
ut_ad(level <= BTR_MAX_NODE_LEVEL);
-
- byte *page_level= PAGE_HEADER + PAGE_LEVEL + block->frame;
-
- if (UNIV_LIKELY_NULL(block->page.zip.data))
- {
- mach_write_to_2(page_level, level);
- page_zip_write_header(block, page_level, 2, mtr);
- }
- else
- mtr->write<2,mtr_t::OPT>(*block, page_level, level);
+ constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL;
+ byte *b= my_assume_aligned<2>(&block->frame[field]);
+ if (mtr->write<2,mtr_t::OPT>(*block, b, level) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
}
/** Set FIL_PAGE_NEXT.
@@ -67,14 +62,11 @@ void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
@param[in,out] mtr mini-transaction */
inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
{
- byte *fil_page_next= block->frame + FIL_PAGE_NEXT;
- if (UNIV_LIKELY_NULL(block->page.zip.data))
- {
- mach_write_to_4(fil_page_next, next);
- page_zip_write_header(block, fil_page_next, 4, mtr);
- }
- else
- mtr->write<4>(*block, fil_page_next, next);
+ constexpr uint16_t field= FIL_PAGE_NEXT;
+ byte *b= my_assume_aligned<4>(&block->frame[field]);
+ if (mtr->write<4,mtr_t::OPT>(*block, b, next) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
}
/** Set FIL_PAGE_PREV.
@@ -83,14 +75,11 @@ inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
@param[in,out] mtr mini-transaction */
inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
{
- byte *fil_page_prev= block->frame + FIL_PAGE_PREV;
- if (UNIV_LIKELY_NULL(block->page.zip.data))
- {
- mach_write_to_4(fil_page_prev, prev);
- page_zip_write_header(block, fil_page_prev, 4, mtr);
- }
- else
- mtr->write<4>(*block, fil_page_prev, prev);
+ constexpr uint16_t field= FIL_PAGE_PREV;
+ byte *b= my_assume_aligned<4>(&block->frame[field]);
+ if (mtr->write<4,mtr_t::OPT>(*block, b, prev) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
}
/**************************************************************//**
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
index 12ecddca44f..18ab4cfa4fc 100644
--- a/storage/innobase/include/btr0bulk.h
+++ b/storage/innobase/include/btr0bulk.h
@@ -109,10 +109,9 @@ private:
template<format> inline void finishPage();
/** Insert a record in the page.
@tparam format the page format
- @param[in] rec record
+ @param[in,out] rec record
@param[in] offsets record offsets */
- template<format> inline void insertPage(const rec_t* rec,
- offset_t* offsets);
+ template<format> inline void insertPage(rec_t* rec, offset_t* offsets);
public:
/** Mark end of insertion to the page. Scan all records to set page
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
index f66c7e3d405..b15d35c4831 100644
--- a/storage/innobase/include/dyn0buf.h
+++ b/storage/innobase/include/dyn0buf.h
@@ -382,6 +382,9 @@ public:
return(m_heap == NULL);
}
+ /** @return whether the buffer is empty */
+ bool empty() const { return !back()->m_used; }
+
private:
// Disable copying
mtr_buf_t(const mtr_buf_t&);
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 770c2e43a6c..5f5f3204a87 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -149,7 +149,7 @@ struct fil_space_t
rw_lock_t latch; /*!< latch protecting the file space storage
allocation */
UT_LIST_NODE_T(fil_space_t) named_spaces;
- /*!< list of spaces for which MLOG_FILE_NAME
+ /*!< list of spaces for which FILE_MODIFY
records have been issued */
/** Checks that this tablespace in a list of unflushed tablespaces.
@return true if in a list */
@@ -641,13 +641,6 @@ extern const char* dot_ext[];
but in the MySQL Embedded Server Library and mysqlbackup it is not the default
directory, and we must set the base file path explicitly */
extern const char* fil_path_to_mysql_datadir;
-
-/* Space address data type; this is intended to be used when
-addresses accurate to a byte are stored in file pages. If the page part
-of the address is FIL_NULL, the address is considered undefined. */
-
-typedef byte fil_faddr_t; /*!< 'type' definition in C: an address
- stored in a file page is a string of bytes */
#else
# include "univ.i"
#endif /* !UNIV_INNOCHECKSUM */
@@ -951,7 +944,7 @@ public:
/*!< list of all file spaces */
UT_LIST_BASE_NODE_T(fil_space_t) named_spaces;
/*!< list of all file spaces
- for which a MLOG_FILE_NAME
+ for which a FILE_MODIFY
record has been written since
the latest redo log checkpoint.
Protected only by log_sys.mutex. */
@@ -1531,26 +1524,18 @@ void
fil_names_dirty(
fil_space_t* space);
-/** Write MLOG_FILE_NAME records when a non-predefined persistent
+/** Write FILE_MODIFY records when a non-predefined persistent
tablespace was modified for the first time since the latest
fil_names_clear().
-@param[in,out] space tablespace
-@param[in,out] mtr mini-transaction */
-void
-fil_names_dirty_and_write(
- fil_space_t* space,
- mtr_t* mtr);
+@param[in,out] space tablespace */
+void fil_names_dirty_and_write(fil_space_t* space);
-/** Write MLOG_FILE_NAME records if a persistent tablespace was modified
+/** Write FILE_MODIFY records if a persistent tablespace was modified
for the first time since the latest fil_names_clear().
@param[in,out] space tablespace
@param[in,out] mtr mini-transaction
-@return whether any MLOG_FILE_NAME record was written */
-inline MY_ATTRIBUTE((warn_unused_result))
-bool
-fil_names_write_if_was_clean(
- fil_space_t* space,
- mtr_t* mtr)
+@return whether any FILE_MODIFY record was written */
+inline bool fil_names_write_if_was_clean(fil_space_t* space)
{
ut_ad(log_mutex_own());
@@ -1563,7 +1548,7 @@ fil_names_write_if_was_clean(
space->max_lsn = log_sys.lsn;
if (was_clean) {
- fil_names_dirty_and_write(space, mtr);
+ fil_names_dirty_and_write(space);
}
return(was_clean);
@@ -1588,9 +1573,9 @@ inline void fil_space_open_if_needed(fil_space_t* space)
}
/** On a log checkpoint, reset fil_names_dirty_and_write() flags
-and write out MLOG_FILE_NAME and MLOG_CHECKPOINT if needed.
+and write out FILE_MODIFY and FILE_CHECKPOINT if needed.
@param[in] lsn checkpoint LSN
-@param[in] do_write whether to always write MLOG_CHECKPOINT
+@param[in] do_write whether to always write FILE_CHECKPOINT
@return whether anything was written to the redo log
@retval false if no flags were set and nothing written
@retval true if anything was written to the redo log */
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
index 1aeb3867eea..1bde90fdad4 100644
--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -612,7 +612,7 @@ inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size)
/** Initialize a file page whose prior contents should be ignored.
@param[in,out] block buffer pool block */
-void fsp_apply_init_file_page(buf_block_t* block);
+void fsp_apply_init_file_page(buf_block_t *block);
/** Initialize a file page.
@param[in] space tablespace
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
index e9355948599..1ade24cd069 100644
--- a/storage/innobase/include/fut0lst.h
+++ b/storage/innobase/include/fut0lst.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2019, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -78,47 +78,12 @@ inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
}
-/** Write a null file address.
-@param[in] b file page
-@param[in,out] addr file address to be zeroed out
-@param[in,out] mtr mini-transaction */
-inline void flst_zero_addr(const buf_block_t& b, fil_faddr_t *addr, mtr_t *mtr)
-{
- if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL)
- mtr->memset(&b, ulint(addr - b.frame) + FIL_ADDR_PAGE, 4, 0xff);
- mtr->write<2,mtr_t::OPT>(b, addr + FIL_ADDR_BYTE, 0U);
-}
-
-/** Write a file address.
-@param[in] block file page
-@param[in,out] faddr file address location
-@param[in] addr file address to be written out
-@param[in,out] mtr mini-transaction */
-inline void flst_write_addr(const buf_block_t& block, fil_faddr_t *faddr,
- fil_addr_t addr, mtr_t* mtr)
-{
- ut_ad(mtr->memo_contains_page_flagged(faddr,
- MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_SX_FIX));
- ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
- ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
-
- mtr->write<4,mtr_t::OPT>(block, faddr + FIL_ADDR_PAGE, addr.page);
- mtr->write<2,mtr_t::OPT>(block, faddr + FIL_ADDR_BYTE, addr.boffset);
-}
-
/** Initialize a list base node.
@param[in] block file page
@param[in,out] base base node
@param[in,out] mtr mini-transaction */
-inline void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
-{
- ut_ad(mtr->memo_contains_page_flagged(base, MTR_MEMO_PAGE_X_FIX |
- MTR_MEMO_PAGE_SX_FIX));
- mtr->write<4,mtr_t::OPT>(block, base + FLST_LEN, 0U);
- flst_zero_addr(block, base + FLST_FIRST, mtr);
- flst_zero_addr(block, base + FLST_LAST, mtr);
-}
+void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
/** Append a file list node to a list.
@param[in,out] base base node block
@@ -155,7 +120,7 @@ inline uint32_t flst_get_len(const flst_base_node_t *base)
}
/** @return a file address */
-inline fil_addr_t flst_read_addr(const fil_faddr_t *faddr)
+inline fil_addr_t flst_read_addr(const byte *faddr)
{
fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE),
mach_read_from_2(faddr + FIL_ADDR_BYTE) };
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 015e22cdfa4..49851cd6929 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2009, Google Inc.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -206,7 +206,7 @@ logs_empty_and_mark_files_at_shutdown(void);
@param[in] header 0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */
void log_header_read(ulint header);
/** Write checkpoint info to the log header and invoke log_mutex_exit().
-@param[in] end_lsn start LSN of the MLOG_CHECKPOINT mini-transaction */
+@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */
void log_write_checkpoint_info(lsn_t end_lsn);
/** Set extra data to be written to the redo log during checkpoint.
@@ -499,6 +499,10 @@ struct log_t{
static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31;
/** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */
static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED;
+ /** The MariaDB 10.5 physical redo log format */
+ static constexpr uint32_t FORMAT_10_5 = 0x50485953;
+ /** The MariaDB 10.5 physical format (only with innodb_encrypt_log=ON) */
+ static constexpr uint32_t FORMAT_ENC_10_5 = FORMAT_10_5 | FORMAT_ENCRYPTED;
MY_ALIGNED(CACHE_LINE_SIZE)
lsn_t lsn; /*!< log sequence number */
@@ -548,7 +552,7 @@ struct log_t{
struct files {
/** number of files */
ulint n_files;
- /** format of the redo log: e.g., FORMAT_10_4 */
+ /** format of the redo log: e.g., FORMAT_10_5 */
uint32_t format;
/** redo log subformat: 0 with separately logged TRUNCATE,
2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */
@@ -586,6 +590,9 @@ struct log_t{
/** @return whether the redo log is encrypted */
bool is_encrypted() const { return format & FORMAT_ENCRYPTED; }
+ /** @return whether the redo log is in the physical format */
+ bool is_physical() const
+ { return (format & ~FORMAT_ENCRYPTED) == FORMAT_10_5; }
/** @return capacity in bytes */
lsn_t capacity() const{ return (file_size - LOG_FILE_HDR_SIZE) * n_files; }
/** Calculate the offset of a log sequence number.
@@ -718,6 +725,8 @@ public:
/** @return whether the redo log is encrypted */
bool is_encrypted() const { return(log.is_encrypted()); }
+ /** @return whether the redo log is in the physical format */
+ bool is_physical() const { return log.is_physical(); }
bool is_initialised() const { return m_initialised; }
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index bd66f68b5ab..90b6cfe69d8 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -48,8 +48,10 @@ recv_find_max_checkpoint(ulint* max_field)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Apply any buffered redo log to a page that was just read from a data file.
+@param[in,out] space tablespace
@param[in,out] bpage buffer pool page */
-ATTRIBUTE_COLD void recv_recover_page(buf_page_t* bpage);
+ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
+ MY_ATTRIBUTE((nonnull));
/** Start recovering from a redo log checkpoint.
@see recv_recovery_from_checkpoint_finish
@@ -102,24 +104,21 @@ to wait merging to file pages.
@param[in] checkpoint_lsn the LSN of the latest checkpoint
@param[in] store whether to store page operations
@param[in] apply whether to apply the records
-@return whether MLOG_CHECKPOINT record was seen the first time,
-or corruption was noticed */
-bool recv_parse_log_recs(
- lsn_t checkpoint_lsn,
- store_t* store,
- bool apply);
+@return whether MLOG_CHECKPOINT or FILE_CHECKPOINT record
+was seen the first time, or corruption was noticed */
+bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t *store, bool apply);
/** Moves the parsing buffer data left to the buffer start */
void recv_sys_justify_left_parsing_buf();
/** Report an operation to create, delete, or rename a file during backup.
@param[in] space_id tablespace identifier
-@param[in] flags tablespace flags (NULL if not create)
+@param[in] create whether the file is being created
@param[in] name file name (not NUL-terminated)
@param[in] len length of name, in bytes
@param[in] new_name new file name (NULL if not rename)
@param[in] new_len length of new_name, in bytes (0 if NULL) */
-extern void (*log_file_op)(ulint space_id, const byte* flags,
+extern void (*log_file_op)(ulint space_id, bool create,
const byte* name, ulint len,
const byte* new_name, ulint new_len);
@@ -134,7 +133,10 @@ struct log_rec_t
/** next record */
log_rec_t *next;
/** mtr_t::commit_lsn() of the mini-transaction */
- const lsn_t lsn;
+ lsn_t lsn;
+
+protected:
+ void set_lsn(lsn_t end_lsn) { ut_ad(lsn <= end_lsn); lsn= end_lsn; }
};
struct recv_dblwr_t {
@@ -171,13 +173,17 @@ struct page_recv_t
/** log records are being applied on the page */
RECV_BEING_PROCESSED
} state= RECV_NOT_PROCESSED;
+ /** Latest written byte offset when applying the log records.
+ @see mtr_t::m_last_offset */
+ uint16_t last_offset= 1;
/** log records for a page */
class recs_t
{
/** The first log record */
- log_rec_t *head= NULL;
+ log_rec_t *head= nullptr;
/** The last log record */
- log_rec_t *tail= NULL;
+ log_rec_t *tail= nullptr;
+ friend struct page_recv_t;
public:
/** Append a redo log snippet for the page
@param recs log snippet */
@@ -190,12 +196,10 @@ struct page_recv_t
tail= recs;
}
- /** Trim old log records for a page.
- @param start_lsn oldest log sequence number to preserve
- @return whether all the log for the page was trimmed */
- inline bool trim(lsn_t start_lsn);
/** @return the last log snippet */
const log_rec_t* last() const { return tail; }
+ /** @return the last log snippet */
+ log_rec_t* last() { return tail; }
class iterator
{
@@ -213,6 +217,10 @@ struct page_recv_t
inline void clear();
} log;
+ /** Trim old log records for a page.
+ @param start_lsn oldest log sequence number to preserve
+ @return whether all the log for the page was trimmed */
+ inline bool trim(lsn_t start_lsn);
/** Ignore any earlier redo log records for this page. */
inline void will_not_read();
/** @return whether the log records for the page are being processed */
@@ -288,7 +296,7 @@ struct recv_sys_t{
(indexed by page_id_t::space() - srv_undo_space_id_start) */
struct trunc
{
- /** log sequence number of MLOG_FILE_CREATE2, or 0 if none */
+ /** log sequence number of FILE_CREATE, or 0 if none */
lsn_t lsn;
/** truncated size of the tablespace, or 0 if not truncated */
unsigned pages;
@@ -342,8 +350,25 @@ public:
const byte* body, const byte* rec_end, lsn_t lsn,
lsn_t end_lsn);
- /** Clear a fully processed set of stored redo log records. */
- inline void clear();
+ /** Register a redo log snippet for a page.
+ @param page_id page identifier
+ @param start_lsn start LSN of the mini-transaction
+ @param lsn @see mtr_t::commit_lsn()
+ @param l redo log snippet @see log_t::FORMAT_10_5
+ @param len length of l, in bytes */
+ inline void add(const page_id_t page_id, lsn_t start_lsn, lsn_t lsn,
+ const byte *l, size_t len);
+
+ /** Parse and register one mini-transaction in log_t::FORMAT_10_5.
+ @param checkpoint_lsn the log sequence number of the latest checkpoint
+ @param store whether to store the records
+ @param apply whether to apply file-level log records
+ @return whether FILE_CHECKPOINT record was seen the first time,
+ or corruption was noticed */
+ inline bool parse(lsn_t checkpoint_lsn, store_t store, bool apply);
+
+ /** Clear a fully processed set of stored redo log records. */
+ inline void clear();
/** Determine whether redo log recovery progress should be reported.
@param[in] time the current time
@@ -362,19 +387,15 @@ public:
/** The alloc() memory alignment, in bytes */
static constexpr size_t ALIGNMENT= sizeof(size_t);
- /** Get the memory block for storing recv_t and redo log data
- @param[in] len length of the data to be stored
- @param[in] store_recv whether to store recv_t object
+ /** Allocate memory for log_rec_t
+ @param len allocation size, in bytes
@return pointer to len bytes of memory (never NULL) */
- inline byte *alloc(size_t len, bool store_recv= false);
+ inline void *alloc(size_t len, bool store_recv= false);
/** Free a redo log snippet.
@param data buffer returned by alloc() */
inline void free(const void *data);
- /** @return the free length of the latest alloc() block, in bytes */
- inline size_t get_free_len() const;
-
/** Remove records for a corrupted page.
This function should only be called when innodb_force_recovery is set.
@param page_id corrupted page identifier */
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
index 06afdbb54bc..71faf119cf0 100644
--- a/storage/innobase/include/mtr0log.h
+++ b/storage/innobase/include/mtr0log.h
@@ -33,82 +33,478 @@ Created 12/7/1995 Heikki Tuuri
// Forward declaration
struct dict_index_t;
+/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_2BYTE= 1 << 7;
+/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14);
+/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21);
+/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28);
+
+/** Error from mlog_decode_varint() */
+constexpr uint32_t MLOG_DECODE_ERROR= ~0U;
+
+/** Decode the length of a variable-length encoded integer.
+@param first first byte of the encoded integer
+@return the length, in bytes */
+inline uint8_t mlog_decode_varint_length(byte first)
+{
+ uint8_t len= 1;
+ for (; first & 0x80; len++, first<<= 1);
+ return len;
+}
+
+/** Decode an integer in a redo log record.
+@param log redo log record buffer
+@return the decoded integer
+@retval MLOG_DECODE_ERROR on error */
+inline uint32_t mlog_decode_varint(const byte* log)
+{
+ uint32_t i= *log;
+ if (i < MIN_2BYTE)
+ return i;
+ if (i < 0xc0)
+ return MIN_2BYTE + ((i & ~0x80) << 8 | log[1]);
+ if (i < 0xe0)
+ return MIN_3BYTE + ((i & ~0xc0) << 16 | uint32_t{log[1]} << 8 | log[2]);
+ if (i < 0xf0)
+ return MIN_4BYTE + ((i & ~0xe0) << 24 | uint32_t{log[1]} << 16 |
+ uint32_t{log[2]} << 8 | log[3]);
+ if (i == 0xf0)
+ {
+ i= uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 |
+ uint32_t{log[3]} << 8 | log[4];
+ if (i <= ~MIN_5BYTE)
+ return MIN_5BYTE + i;
+ }
+ return MLOG_DECODE_ERROR;
+}
+
+/** Encode an integer in a redo log record.
+@param log redo log record buffer
+@param i the integer to encode
+@return end of the encoded integer */
+inline byte *mlog_encode_varint(byte *log, size_t i)
+{
+ if (i < MIN_2BYTE)
+ {
+ }
+ else if (i < MIN_3BYTE)
+ {
+ i-= MIN_2BYTE;
+ static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility");
+ *log++= 0x80 | static_cast<byte>(i >> 8);
+ }
+ else if (i < MIN_4BYTE)
+ {
+ i-= MIN_3BYTE;
+ static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility");
+ *log++= 0xc0 | static_cast<byte>(i >> 16);
+ goto last2;
+ }
+ else if (i < MIN_5BYTE)
+ {
+ i-= MIN_4BYTE;
+ static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility");
+ *log++= 0xe0 | static_cast<byte>(i >> 24);
+ goto last3;
+ }
+ else
+ {
+ ut_ad(i < MLOG_DECODE_ERROR);
+ i-= MIN_5BYTE;
+ *log++= 0xf0;
+ *log++= static_cast<byte>(i >> 24);
+last3:
+ *log++= static_cast<byte>(i >> 16);
+last2:
+ *log++= static_cast<byte>(i >> 8);
+ }
+ *log++= static_cast<byte>(i);
+ return log;
+}
+
+/** Determine the length of a log record.
+@param log start of log record
+@param end end of the log record buffer
+@return the length of the record, in bytes
+@retval 0 if the log extends past the end
+@retval MLOG_DECODE_ERROR if the record is corrupted */
+inline uint32_t mlog_decode_len(const byte *log, const byte *end)
+{
+ ut_ad(log < end);
+ uint32_t i= *log;
+ if (!i)
+ return 0; /* end of mini-transaction */
+ if (~i & 15)
+ return (i & 15) + 1; /* 1..16 bytes */
+ if (UNIV_UNLIKELY(++log == end))
+ return 0; /* end of buffer */
+ i= *log;
+ if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */
+ return 16 + i;
+ if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */
+ {
+ if (UNIV_UNLIKELY(log + 1 == end))
+ return 0; /* end of buffer */
+ return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]);
+ }
+ if (i < 0xe0) /* 3 additional length bytes: 16528..1065103 bytes */
+ {
+ if (UNIV_UNLIKELY(log + 2 == end))
+ return 0; /* end of buffer */
+ return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 |
+ static_cast<uint32_t>(log[1]) << 8 | log[2]);
+ }
+ /* 1,065,103 bytes per log record ought to be enough for everyone */
+ return MLOG_DECODE_ERROR;
+}
+
/** Write 1, 2, 4, or 8 bytes to a file page.
@param[in] block file page
@param[in,out] ptr pointer in file page
@param[in] val value to write
@tparam l number of bytes to write
@tparam w write request type
-@tparam V type of val */
+@tparam V type of val
+@return whether any log was written */
template<unsigned l,mtr_t::write_type w,typename V>
-inline void mtr_t::write(const buf_block_t &block, byte *ptr, V val)
+inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
{
ut_ad(ut_align_down(ptr, srv_page_size) == block.frame);
- ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO ||
- !block.page.zip.data ||
- /* written by fil_crypt_rotate_page() or innodb_make_page_dirty()? */
- (w == FORCED && l == 1 && ptr == &block.frame[FIL_PAGE_SPACE_ID]) ||
- mach_read_from_2(block.frame + FIL_PAGE_TYPE) <= FIL_PAGE_TYPE_ZBLOB2);
static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
+ byte buf[l];
switch (l) {
case 1:
- if (w == OPT && mach_read_from_1(ptr) == val) return;
- ut_ad(w != NORMAL || mach_read_from_1(ptr) != val);
ut_ad(val == static_cast<byte>(val));
- *ptr= static_cast<byte>(val);
+ buf[0]= static_cast<byte>(val);
break;
case 2:
ut_ad(val == static_cast<uint16_t>(val));
- if (w == OPT && mach_read_from_2(ptr) == val) return;
- ut_ad(w != NORMAL || mach_read_from_2(ptr) != val);
- mach_write_to_2(ptr, static_cast<uint16_t>(val));
+ mach_write_to_2(buf, static_cast<uint16_t>(val));
break;
case 4:
ut_ad(val == static_cast<uint32_t>(val));
- if (w == OPT && mach_read_from_4(ptr) == val) return;
- ut_ad(w != NORMAL || mach_read_from_4(ptr) != val);
- mach_write_to_4(ptr, static_cast<uint32_t>(val));
+ mach_write_to_4(buf, static_cast<uint32_t>(val));
break;
case 8:
- if (w == OPT && mach_read_from_8(ptr) == val) return;
- ut_ad(w != NORMAL || mach_read_from_8(ptr) != val);
- mach_write_to_8(ptr, val);
+ mach_write_to_8(buf, val);
break;
}
+ byte *p= static_cast<byte*>(ptr);
+ const byte *const end= p + l;
+ if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+ {
+ const byte *b= buf;
+ while (*p++ == *b++)
+ {
+ if (p == end)
+ {
+ ut_ad(w == OPT);
+ return false;
+ }
+ }
+ p--;
+ }
+ ::memcpy(ptr, buf, l);
+ memcpy_low(block.page, static_cast<uint16_t>
+ (ut_align_offset(p, srv_page_size)), p, end - p);
+ return true;
+}
+
+/** Log an initialization of a string of bytes.
+@param[in] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write
+@param[in] val the data byte to write */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
+{
+ ut_ad(len);
set_modified();
if (m_log_mode != MTR_LOG_ALL)
return;
- byte *log_ptr= m_log.open(11 + 2 + (l == 8 ? 9 : 5));
- if (l == 8)
- log_write(block, ptr, static_cast<mlog_id_t>(l), log_ptr, uint64_t{val});
- else
- log_write(block, ptr, static_cast<mlog_id_t>(l), log_ptr,
- static_cast<uint32_t>(val));
+
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 2 + 1 : 3 + 1);
+ byte *l= log_write<MEMSET>(b.page.id, &b.page, lenlen, true, ofs);
+ l= mlog_encode_varint(l, len);
+ *l++= val;
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(ofs + len);
}
-/** Write a byte string to a page.
+/** Initialize a string of bytes.
@param[in,out] b buffer page
+@param[in] ofs byte offset from block->frame
+@param[in] len length of the data to write
+@param[in] val the data byte to write */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
+{
+ ut_ad(ofs <= ulint(srv_page_size));
+ ut_ad(ofs + len <= ulint(srv_page_size));
+ ::memset(ofs + b->frame, val, len);
+ memset(*b, ofs, len, val);
+}
+
+/** Log an initialization of a repeating string of bytes.
+@param[in] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write, in bytes
+@param[in] str the string to write
+@param[in] size size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
+ const void *str, size_t size)
+{
+ ut_ad(size);
+ ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+ set_modified();
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+ byte *l= log_write<MEMSET>(b.page.id, &b.page, lenlen + size, true, ofs);
+ l= mlog_encode_varint(l, len);
+ ::memcpy(l, str, size);
+ l+= size;
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a repeating string of bytes.
+@param[in,out] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write, in bytes
+@param[in] str the string to write
+@param[in] size size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  size_t s= 0;
+  while (s + size <= len) /* repeat the pattern; never write past ofs+len */
+  {
+    ::memcpy(ofs + s + b->frame, str, size);
+    s+= size;
+  }
+  ::memcpy(ofs + s + b->frame, str, len - s); /* copy any partial tail */
+  memset(*b, ofs, len, str, size);
+}
+
+/** Log a write of a byte string to a page.
+@param[in] b buffer page
@param[in] offset byte offset from b->frame
@param[in] str the data to write
@param[in] len length of the data to write */
-inline
-void mtr_t::memcpy(buf_block_t *b, ulint offset, const void *str, ulint len)
+inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+ ut_ad(len);
+ ut_ad(offset <= ulint(srv_page_size));
+ ut_ad(offset + len <= ulint(srv_page_size));
+ memcpy_low(b.page, uint16_t(offset), &b.frame[offset], len);
+}
+
+/** Log a write of a byte string to a page.
+@param id page identifier
+@param offset byte offset within page
+@param data data to be written
+@param len length of the data, in bytes */
+inline void mtr_t::memcpy_low(const buf_page_t &bpage, uint16_t offset,
+ const void *data, size_t len)
+{
+ ut_ad(len);
+ set_modified();
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
+ {
+ byte *end= log_write<WRITE>(bpage.id, &bpage, len, true, offset);
+ ::memcpy(end, data, len);
+ m_log.close(end + len);
+ }
+ else
+ {
+ m_log.close(log_write<WRITE>(bpage.id, &bpage, len, false, offset));
+ m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+ }
+ m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Log that a string of bytes was copied from the same page.
+@param[in] b buffer page
+@param[in] d destination offset within the page
+@param[in] s source offset within the page
+@param[in] len length of the data to copy */
+inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
{
- ::memcpy(b->frame + offset, str, len);
- memcpy(*b, offset, len);
+ ut_ad(d >= 8);
+ ut_ad(s >= 8);
+ ut_ad(len);
+ ut_ad(s <= ulint(srv_page_size));
+ ut_ad(s + len <= ulint(srv_page_size));
+ ut_ad(s != d);
+ ut_ad(d <= ulint(srv_page_size));
+ ut_ad(d + len <= ulint(srv_page_size));
+
+ set_modified();
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+ /* The source offset is encoded relative to the destination offset,
+ with the sign in the least significant bit. */
+ if (s > d)
+ s= (s - d) << 1;
+ else
+ s= (d - s) << 1 | 1;
+ /* The source offset 0 is not possible. */
+ s-= 1 << 1;
+ size_t slen= (s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 2 : 3);
+ byte *l= log_write<MEMMOVE>(b.page.id, &b.page, lenlen + slen, true, d);
+ l= mlog_encode_varint(l, len);
+ l= mlog_encode_varint(l, s);
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(d + len);
+}
+
+/**
+Write a log record.
+@tparam type redo log record type
+@param id persistent page identifier
+@param bpage buffer pool page, or nullptr
+@param len number of additional bytes to write
+@param alloc whether to allocate the additional bytes
+@param offset byte offset, or 0 if the record type does not allow one
+@return end of mini-transaction log, minus len */
+template<byte type>
+inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
+ size_t len, bool alloc, size_t offset)
+{
+ static_assert(!(type & 15) && type != RESERVED && type != OPTION &&
+ type <= FILE_CHECKPOINT, "invalid type");
+ ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
+ ut_ad(!bpage || bpage->id == id);
+ constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE;
+ constexpr bool have_offset= type == WRITE || type == MEMSET ||
+ type == MEMMOVE;
+ static_assert(!have_offset || have_len, "consistency");
+ ut_ad(have_len || len == 0);
+ ut_ad(have_len || !alloc);
+ ut_ad(have_offset || offset == 0);
+ ut_ad(offset + len <= srv_page_size);
+ static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
+
+ size_t max_len;
+ if (!have_len)
+ max_len= 1 + 5 + 5;
+ else if (!have_offset)
+ max_len= m_last == bpage
+ ? 1 + 3
+ : 1 + 3 + 5 + 5;
+ else if (m_last == bpage && m_last_offset <= offset)
+ {
+ /* Encode the offset relative from m_last_offset. */
+ offset-= m_last_offset;
+ max_len= 1 + 3 + 3;
+ }
+ else
+ max_len= 1 + 3 + 5 + 5 + 3;
+ byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len);
+ byte *end= log_ptr + 1;
+ const byte same_page= max_len < 1 + 5 + 5 ? 0x80 : 0;
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ m_last= bpage;
+ }
+ if (have_offset)
+ {
+ byte* oend= mlog_encode_varint(end, offset);
+ if (oend + len > &log_ptr[16])
+ {
+ len+= oend - log_ptr - 15;
+ if (len >= MIN_3BYTE)
+ len+= 2;
+ else if (len >= MIN_2BYTE)
+ len++;
+
+ *log_ptr= type | same_page;
+ end= mlog_encode_varint(log_ptr + 1, len);
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ }
+ end= mlog_encode_varint(end, offset);
+ return end;
+ }
+ else
+ end= oend;
+ }
+ else if (len >= 3 && end + len > &log_ptr[16])
+ {
+ len+= end - log_ptr - 16;
+ if (len >= MIN_3BYTE)
+ len+= 2;
+ else if (len >= MIN_2BYTE)
+ len++;
+
+ end= log_ptr;
+ *end++= type | same_page;
+ mlog_encode_varint(end, len);
+
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ }
+ return end;
+ }
+
+ ut_ad(end + len >= &log_ptr[1] + !same_page);
+ ut_ad(end + len <= &log_ptr[16]);
+ ut_ad(end <= &log_ptr[max_len]);
+ *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1);
+ ut_ad(*log_ptr & 15);
+ return end;
}
/** Write a byte string to a page.
-@param[in,out] b ROW_FORMAT=COMPRESSED index page
-@param[in] ofs byte offset from b->zip.data
+@param[in] b buffer page
+@param[in] dest destination within b.frame
@param[in] str the data to write
-@param[in] len length of the data to write */
-inline
-void mtr_t::zmemcpy(buf_page_t *b, ulint offset, const void *str, ulint len)
+@param[in] len length of the data to write
+@tparam w write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len)
{
- ::memcpy(b->zip.data + offset, str, len);
- zmemcpy(*b, offset, len);
+ ut_ad(ut_align_down(dest, srv_page_size) == b.frame);
+ char *d= static_cast<char*>(dest);
+ const char *s= static_cast<const char*>(str);
+ if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+ {
+ ut_ad(len);
+ const char *const end= d + len;
+ while (*d++ == *s++)
+ {
+ if (d == end)
+ {
+ ut_ad(w == OPT);
+ return;
+ }
+ }
+ s--;
+ d--;
+ len= static_cast<ulint>(end - d);
+ }
+ ::memcpy(d, s, len);
+ memcpy(b, ut_align_offset(d, srv_page_size), len);
}
/** Initialize an entire page.
@@ -121,13 +517,37 @@ inline void mtr_t::init(buf_block_t *b)
return;
}
- m_log.close(log_write_low(MLOG_INIT_FILE_PAGE2, b->page.id, m_log.open(11)));
+ m_log.close(log_write<INIT_PAGE>(b->page.id, &b->page));
+ m_last_offset= FIL_PAGE_TYPE;
b->page.init_on_flush= true;
}
+/** Free a page.
+@param id page identifier */
+inline void mtr_t::free(const page_id_t id)
+{
+ if (m_log_mode == MTR_LOG_ALL)
+ m_log.close(log_write<FREE_PAGE>(id, nullptr));
+}
+
+/** Partly initialize a B-tree page.
+@param block B-tree page
+@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+inline void mtr_t::page_create(const buf_block_t &block, bool comp)
+{
+ set_modified();
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ byte *l= log_write<INIT_INDEX_PAGE>(block.page.id, &block.page, 1, true);
+ *l++= comp;
+ m_log.close(l);
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
/********************************************************//**
-Parses an initial log record written by mtr_t::log_write_low().
+Parses an initial log record written by mlog_write_initial_log_record_low().
@return parsed record end, NULL if not a complete record */
+ATTRIBUTE_COLD /* only used when crash-upgrading */
const byte*
mlog_parse_initial_log_record(
/*==========================*/
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index f2f8ee13a2a..49537faa030 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -129,7 +129,7 @@ struct mtr_t {
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
- MLOG_FILE_NAME records and an optional MLOG_CHECKPOINT marker.
+ FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
The caller must invoke log_mutex_enter() and log_mutex_exit().
This is to be used at log_checkpoint().
@param checkpoint_lsn the log sequence number of a checkpoint, or 0 */
@@ -171,7 +171,7 @@ struct mtr_t {
inline mtr_log_t set_log_mode(mtr_log_t mode);
/** Copy the tablespaces associated with the mini-transaction
- (needed for generating MLOG_FILE_NAME records)
+ (needed for generating FILE_MODIFY records)
@param[in] mtr mini-transaction that may modify
the same set of tablespaces as this one */
void set_spaces(const mtr_t& mtr)
@@ -184,7 +184,7 @@ struct mtr_t {
}
/** Set the tablespace associated with the mini-transaction
- (needed for generating a MLOG_FILE_NAME record)
+ (needed for generating a FILE_MODIFY record)
@param[in] space_id user or system tablespace ID
@return the tablespace */
fil_space_t* set_named_space_id(ulint space_id)
@@ -203,7 +203,7 @@ struct mtr_t {
}
/** Set the tablespace associated with the mini-transaction
- (needed for generating a MLOG_FILE_NAME record)
+ (needed for generating a FILE_MODIFY record)
@param[in] space user or system tablespace */
void set_named_space(fil_space_t* space)
{
@@ -216,12 +216,12 @@ struct mtr_t {
#ifdef UNIV_DEBUG
/** Check the tablespace associated with the mini-transaction
- (needed for generating a MLOG_FILE_NAME record)
+ (needed for generating a FILE_MODIFY record)
@param[in] space tablespace
@return whether the mini-transaction is associated with the space */
bool is_named_space(ulint space) const;
/** Check the tablespace associated with the mini-transaction
- (needed for generating a MLOG_FILE_NAME record)
+ (needed for generating a FILE_MODIFY record)
@param[in] space tablespace
@return whether the mini-transaction is associated with the space */
bool is_named_space(const fil_space_t* space) const;
@@ -407,136 +407,124 @@ struct mtr_t {
@param[in] val value to write
@tparam l number of bytes to write
@tparam w write request type
- @tparam V type of val */
+ @tparam V type of val
+ @return whether any log was written */
template<unsigned l,write_type w= NORMAL,typename V>
- inline void write(const buf_block_t &block, byte *ptr, V val)
+ inline bool write(const buf_block_t &block, void *ptr, V val)
MY_ATTRIBUTE((nonnull));
/** Log a write of a byte string to a page.
@param[in] b buffer page
@param[in] ofs byte offset from b->frame
@param[in] len length of the data to write */
- void memcpy(const buf_block_t &b, ulint ofs, ulint len);
+ inline void memcpy(const buf_block_t &b, ulint ofs, ulint len);
/** Write a byte string to a page.
@param[in,out] b buffer page
- @param[in] offset byte offset from b->frame
+ @param[in] dest destination within b.frame
@param[in] str the data to write
- @param[in] len length of the data to write */
- inline void memcpy(buf_block_t *b, ulint offset, const void *str, ulint len);
+ @param[in] len length of the data to write
+ @tparam w write request type */
+ template<write_type w= NORMAL>
+ inline void memcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len);
- /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+ /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page.
@param[in] b ROW_FORMAT=COMPRESSED index page
- @param[in] ofs byte offset from b.zip.data
+ @param[in] offset byte offset from b.zip.data
@param[in] len length of the data to write */
- void zmemcpy(const buf_page_t &b, ulint offset, ulint len);
+ inline void zmemcpy(const buf_page_t &b, ulint offset, ulint len);
/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
@param[in,out] b ROW_FORMAT=COMPRESSED index page
- @param[in] ofs byte offset from b->zip.data
+ @param[in] dest destination within b.zip.data
@param[in] str the data to write
- @param[in] len length of the data to write */
- inline void zmemcpy(buf_page_t *b, ulint offset, const void *str, ulint len);
+ @param[in] len length of the data to write
+ @tparam w write request type */
+ template<write_type w= NORMAL>
+ inline void zmemcpy(const buf_page_t &b, void *dest, const void *str,
+ ulint len);
+
+ /** Log an initialization of a string of bytes.
+ @param[in] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write
+ @param[in] val the data byte to write */
+ inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val);
/** Initialize a string of bytes.
@param[in,out] b buffer page
@param[in] ofs byte offset from b->frame
@param[in] len length of the data to write
@param[in] val the data byte to write */
- void memset(const buf_block_t* b, ulint ofs, ulint len, byte val);
+ inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val);
+
+ /** Log an initialization of a repeating string of bytes.
+ @param[in] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write, in bytes
+ @param[in] str the string to write
+ @param[in] size size of str, in bytes */
+ inline void memset(const buf_block_t &b, ulint ofs, size_t len,
+ const void *str, size_t size);
+
+ /** Initialize a repeating string of bytes.
+ @param[in,out] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write, in bytes
+ @param[in] str the string to write
+ @param[in] size size of str, in bytes */
+ inline void memset(const buf_block_t *b, ulint ofs, size_t len,
+ const void *str, size_t size);
+
+ /** Log that a string of bytes was copied from the same page.
+ @param[in] b buffer page
+ @param[in] d destination offset within the page
+ @param[in] s source offset within the page
+ @param[in] len length of the data to copy */
+ inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len);
/** Initialize an entire page.
@param[in,out] b buffer page */
void init(buf_block_t *b);
/** Free a page.
@param id page identifier */
- void free(const page_id_t id) { log_page_write(id, MLOG_INIT_FREE_PAGE); }
-
+ inline void free(const page_id_t id);
/** Partly initialize a B-tree page.
- @param id page identifier
+ @param block B-tree page
@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
- void page_create(const page_id_t id, bool comp)
- {
- set_modified();
- log_page_write(id, comp ? MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE);
- }
+ inline void page_create(const buf_block_t &block, bool comp);
/** Write a log record about a file operation.
@param type file operation
@param space_id tablespace identifier
@param first_page_no first page number in the file
@param path file path
- @param new_path new file path for type=MLOG_FILE_RENAME2
- @param flags tablespace flags for type=MLOG_FILE_CREATE2 */
- inline void log_file_op(mlog_id_t type, ulint space_id, ulint first_page_no,
- const char *path,
- const char *new_path= nullptr, ulint flags= 0);
+ @param new_path new file path for type=FILE_RENAME */
+ inline void log_file_op(mfile_type_t type, ulint space_id,
+ ulint first_page_no, const char *path,
+ const char *new_path= nullptr);
private:
- /**
- Write a complex page operation.
- @param id page identifier
- @param type type of operation */
- void log_page_write(const page_id_t id, mlog_id_t type)
- {
- ut_ad(type == MLOG_INIT_FREE_PAGE || type == MLOG_COMP_PAGE_CREATE ||
- type == MLOG_PAGE_CREATE);
-
- if (m_log_mode == MTR_LOG_ALL)
- m_log.close(log_write_low(type, id, m_log.open(11)));
- }
+ /** Log a write of a byte string to a page.
+ @param b buffer page
+ @param offset byte offset within page
+ @param data data to be written
+ @param len length of the data, in bytes */
+ inline void memcpy_low(const buf_page_t &bpage, uint16_t offset,
+ const void *data, size_t len);
/**
Write a log record.
- @param type redo log record type
+ @tparam type redo log record type
@param id persistent page identifier
- @param l current end of mini-transaction log
- @return new end of mini-transaction log */
- inline byte *log_write_low(mlog_id_t type, const page_id_t id, byte *l)
- {
- ut_ad(type <= MLOG_BIGGEST_TYPE);
- ut_ad(type == MLOG_FILE_NAME || type == MLOG_FILE_DELETE ||
- type == MLOG_FILE_CREATE2 || type == MLOG_FILE_RENAME2 ||
- is_named_space(id.space()));
-
- *l++= type;
-
- l+= mach_write_compressed(l, id.space());
- l+= mach_write_compressed(l, id.page_no());
-
- ++m_n_log_recs;
- return l;
- }
-
- /**
- Write a log record for writing 1, 2, 4, or 8 bytes.
- @param[in] type number of bytes to write
- @param[in] block file page
- @param[in] ptr pointer within block.frame
- @param[in,out] l log record buffer
- @return new end of mini-transaction log */
- byte *log_write_low(mlog_id_t type, const buf_block_t &block,
- const byte *ptr, byte *l);
-
- /**
- Write a log record for writing 1, 2, or 4 bytes.
- @param[in] block file page
- @param[in,out] ptr pointer in file page
- @param[in] l number of bytes to write
- @param[in,out] log_ptr log record buffer
- @param[in] val value to write */
- void log_write(const buf_block_t &block, byte *ptr, mlog_id_t l,
- byte *log_ptr, uint32_t val)
- MY_ATTRIBUTE((nonnull));
- /**
- Write a log record for writing 8 bytes.
- @param[in] block file page
- @param[in,out] ptr pointer in file page
- @param[in] l number of bytes to write (8)
- @param[in,out] log_ptr log record buffer
- @param[in] val value to write */
- void log_write(const buf_block_t &block, byte *ptr, mlog_id_t l,
- byte *log_ptr, uint64_t val)
- MY_ATTRIBUTE((nonnull));
+ @param bpage buffer pool page, or nullptr
+ @param len number of additional bytes to write
+ @param alloc whether to allocate the additional bytes
+ @param offset byte offset, or 0 if the record type does not allow one
+ @return end of mini-transaction log, minus len */
+ template<byte type>
+ inline byte *log_write(const page_id_t id, const buf_page_t *bpage,
+ size_t len= 0, bool alloc= false, size_t offset= 0);
/** Prepare to write the mini-transaction log to the redo log buffer.
@return number of bytes to write in finish_write() */
@@ -563,6 +551,11 @@ private:
bool m_commit= false;
#endif
+ /** The page of the most recent m_log record written, or NULL */
+ const buf_page_t* m_last;
+ /** The current byte offset in m_last, or 0 */
+ uint16_t m_last_offset;
+
/** specifies which operations should be logged; default MTR_LOG_ALL */
uint16_t m_log_mode:2;
@@ -576,8 +569,6 @@ private:
to suppress some read-ahead operations, @see ibuf_inside() */
uint16_t m_inside_ibuf:1;
- /** number of m_log records */
- uint16_t m_n_log_recs:11;
#ifdef UNIV_DEBUG
/** Persistent user tablespace associated with the
mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic
index e00ae2057df..016d7c768c2 100644
--- a/storage/innobase/include/mtr0mtr.ic
+++ b/storage/innobase/include/mtr0mtr.ic
@@ -204,7 +204,7 @@ mtr_t::set_log_mode(mtr_log_t mode)
case MTR_LOG_ALL:
/* MTR_LOG_NO_REDO can only be set before generating
any redo log records. */
- ut_ad(mode != MTR_LOG_NO_REDO || m_n_log_recs == 0);
+ ut_ad(mode != MTR_LOG_NO_REDO || m_log.empty());
m_log_mode = mode;
return(old_mode);
}
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
index ef180fb36bc..bdcca691c2e 100644
--- a/storage/innobase/include/mtr0types.h
+++ b/storage/innobase/include/mtr0types.h
@@ -29,6 +29,8 @@ Created 11/26/1995 Heikki Tuuri
#ifndef UNIV_INNOCHECKSUM
#include "sync0rw.h"
+#else
+#include "univ.i"
#endif /* UNIV_INNOCHECKSUM */
struct mtr_t;
@@ -47,6 +49,233 @@ enum mtr_log_t {
MTR_LOG_NO_REDO
};
+/*
+A mini-transaction is a stream of records that is always terminated by
+a NUL byte. The first byte of a mini-transaction record is never NUL,
+but NUL bytes can occur within mini-transaction records. The first
+bytes of each record will explicitly encode the length of the record.
+NUL bytes also acts as padding in log blocks, that is, there can be
+multiple sucessive NUL bytes between mini-transactions in a redo log
+block.
+
+The first byte of the record would contain a record type, flags, and a
+part of length. The optional second byte of the record will contain
+more length. (Not needed for short records.)
+
+Bit 7 of the first byte of a redo log record is the same_page flag.
+If same_page=1, the record is referring to the same page as the
+previous record. Records that do not refer to data pages but to file
+operations are identified by setting the same_page=1 in the very first
+record(s) of the mini-transaction. A mini-transaction record that
+carries same_page=0 must only be followed by page-oriented records.
+
+Bits 6..4 of the first byte of a redo log record identify the redo log
+type. The following record types refer to data pages:
+
+ FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE
+ INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2
+ INIT_INDEX_PAGE (2): initialize a B-tree or R-tree page
+ WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_*
+ MEMSET (4): extends the 10.4 MLOG_MEMSET record
+ MEMMOVE (5): copy data within the page (avoids logging redundant data)
+ RESERVED (6): reserved for future use; a subtype code
+ (encoded immediately after the length) would be written
+ to reserve code space for further extensions
+ OPTION (7): optional record that may be ignored; a subtype code
+ (encoded immediately after the length) would distinguish actual
+ usage, such as:
+ * MDEV-18976 page checksum record
+ * binlog record
+ * SQL statement (at the start of statement)
+
+Bits 3..0 indicate the redo log record length, excluding the first
+byte, but including additional length bytes and any other bytes,
+such as the optional tablespace identifier and page number.
+Values 1..15 represent lengths of 1 to 15 bytes. The special value 0
+indicates that 1 to 3 length bytes will follow to encode the remaining
+length that exceeds 16 bytes.
+
+Additional length bytes if length>16: 0 to 3 bytes
+0xxxxxxx for 0 to 127 (total: 16 to 143 bytes)
+10xxxxxx xxxxxxxx for 128 to 16511 (total: 144 to 16527)
+110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679)
+111xxxxx reserved (corrupted record, and file!)
+
+If same_page=0, the tablespace identifier and page number will use
+similar 1-to-5-byte variable-length encoding:
+0xxxxxxx for 0 to 127
+10xxxxxx xxxxxxxx for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663
+1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx for 2,113,664 to 270,549,119
+11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487
+11111xxx reserved (corrupted record)
+Note: Some 5-byte values are reserved, because the tablespace identifier
+and page number can only be up to 4,294,967,295.
+
+If same_page=1 is set in a record that follows a same_page=0 record
+in a mini-transaction, the tablespace identifier and page number
+fields will be omitted.
+
+(For some file-oriented records (if same_page=1 for the first records
+of a mini-transaction), we will write tablespace identifier using the
+same 1-to-5-byte encoding. TBD: describe the exact format of
+file-oriented records. With MDEV-14425, we could write file-level log
+records to a separate file, not interleaved with page-level redo log
+at all. We could reserve the file ib_logfile0 for checkpoint information
+and for file-level redo log records.)
+
+For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated
+as corrupted (or reserved for future extension). The type code must
+be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier
+and page number). If the record length does not match the encoded
+lengths of the tablespace identifier and page number, the record will
+be treated as corrupted. This allows future expansion of the format.
+
+If there is a FREE_PAGE record in a mini-transaction, it must be the
+only record for that page in the mini-transaction. If there is an
+INIT_PAGE record for a page in a mini-transaction, it must be the
+first record for that page in the mini-transaction.
+
+An INIT_INDEX_PAGE must be followed by 1+1 to 5+5 bytes for the page
+identifier (unless the same_page flag is set) and a subtype code:
+0 for ROW_FORMAT=REDUNDANT and 1 for ROW_FORMAT=COMPACT or DYNAMIC.
+
+For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset
+on the page, relative from the previous offset. If same_page=0, the
+"previous offset" is 0. If same_page=1, the "previous offset" is where
+the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE or INIT_INDEX_PAGE).
+0xxxxxxx for 0 to 127
+10xxxxxx xxxxxxxx for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663
+111xxxxx reserved (corrupted record)
+If the sum of the "previous offset" and the current offset exceeds the
+page size, the record is treated as corrupted. Negative relative offsets
+cannot be written. Instead, a record with same_page=0 can be written.
+
+For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to
+3 bytes. If the length+offset exceeds the page size, the record will
+be treated as corrupted.
+
+For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes,
+relative to the current offset. The offset 0 is not possible, and
+the sign bit is the least significant bit. That is,
++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and
+-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+The source offset must be within the page size, or else the record
+will be treated as corrupted.
+
+For MEMSET or WRITE, the byte(s) to be written will follow. For
+MEMSET, it usually is a single byte, but it could also be a multi-byte
+string, which would be copied over and over until the target length is
+reached. The length of the remaining bytes is implied by the length
+bytes at the start of the record.
+
+For MEMMOVE, if any bytes follow, the record is treated as corrupted
+(future expansion).
+
+As mentioned at the start of this comment, the type byte 0 would be
+special, marking the end of a mini-transaction. We could use the
+corresponding value 0x80 (with same_page=1) for something special,
+such as a future extension when more type codes are needed, or for
+encoding rarely needed redo log records.
+
+Examples:
+
+INIT could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT), 2
+bytes to follow" and "tablespace ID 0x34", "page number 0x56".
+The first byte must be between 0x12 and 0x1a, and the total length of
+the record must match the lengths of the encoded tablespace ID and
+page number.
+
+WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning
+"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40",
+"page number 0x57", "byte offset 0x60", data 0x34,0x56.
+
+A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23
+0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to
+follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78.
+
+The end of the mini-transaction would be indicated by a NUL byte.
+*/
+
+/** Redo log record types. These bit patterns (3 bits) will be written
+to the redo log file, so the existing codes or their interpretation on
+crash recovery must not be changed. */
+enum mrec_type_t
+{
+ /** Free a page. On recovery, it is unnecessary to read the page.
+ The next record for the page (if any) must be INIT_PAGE or
+ INIT_INDEX_PAGE. After this record has been written, the page may be
+ overwritten with zeros, or discarded or trimmed. */
+ FREE_PAGE = 0,
+ /** Zero-initialize a page. The current byte offset (for subsequent
+ records) will be reset to FIL_PAGE_TYPE. */
+ INIT_PAGE = 0x10,
+ /** Like INIT_PAGE, but initializing a B-tree or R-tree index page,
+ including writing the "infimum" and "supremum" pseudo-records. The
+ current byte offset will be reset to FIL_PAGE_TYPE. The
+ type code is followed by a subtype byte to specify the ROW_FORMAT:
+ 0 for ROW_FORMAT=REDUNDANT, 1 for ROW_FORMAT=COMPACT or DYNAMIC. */
+ INIT_INDEX_PAGE = 0x20,
+ /** Write a string of bytes. Followed by the byte offset (unsigned,
+ relative to the current byte offset, encoded in 1 to 3 bytes) and
+ the bytes to write (at least one). The current byte offset will be
+ set after the last byte written. */
+ WRITE = 0x30,
+ /** Like WRITE, but before the bytes to write, the data_length-1
+ (encoded in 1 to 3 bytes) will be encoded, and it must be more
+ than the length of the following data bytes to write.
+ The data byte(s) will be repeatedly copied to the output until
+ the data_length is reached. */
+ MEMSET = 0x40,
+ /** Like MEMSET, but instead of the bytes to write, a source byte
+ offset (signed, nonzero, relative to the target byte offset, encoded
+ in 1 to 3 bytes, with the sign bit in the least significant bit)
+ will be written.
+ That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...)
+ and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+ The source offset and data_length must be within the page size, or
+ else the record will be treated as corrupted. The data will be
+ copied from the page as it was at the start of the
+ mini-transaction. */
+ MEMMOVE = 0x50,
+ /** Reserved for future use. */
+ RESERVED = 0x60,
+ /** Optional record that may be ignored in crash recovery.
+ A subtype code will be encoded immediately after the length.
+ Possible subtypes would include a MDEV-18976 page checksum record,
+ a binlog record, or an SQL statement. */
+ OPTION = 0x70
+};
+
+
+/** Redo log record types for file-level operations. These bit
+patterns will be written to redo log files, so the existing codes or
+their interpretation on crash recovery must not be changed. */
+enum mfile_type_t
+{
+ /** Create a file. Followed by tablespace ID and the file name. */
+ FILE_CREATE = 0x80,
+ /** Delete a file. Followed by tablespace ID and the file name. */
+ FILE_DELETE = 0x90,
+ /** Rename a file. Followed by tablespace ID and the old file name,
+ NUL, and the new file name. */
+ FILE_RENAME = 0xa0,
+ /** Modify a file. Followed by tablespace ID and the file name. */
+ FILE_MODIFY = 0xb0,
+#if 1 /* MDEV-14425 FIXME: Remove this! */
+ /** End-of-checkpoint marker. Followed by 2 dummy bytes of page identifier,
+ 8 bytes of LSN, and padded with a NUL; @see SIZE_OF_FILE_CHECKPOINT. */
+ FILE_CHECKPOINT = 0xf0
+#endif
+};
+
+#if 1 /* MDEV-14425 FIXME: Remove this! */
+/** Size of a FILE_CHECKPOINT record, including the trailing byte to
+terminate the mini-transaction. */
+constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1;
+#endif
+
/** @name Log item types
The log items are declared 'byte' so that the compiler can warn if val
and type parameters are switched in a call to mlog_write. NOTE!
@@ -120,9 +349,6 @@ enum mlog_id_t {
/** initialize an ibuf bitmap page (used in MariaDB 10.2 and 10.3) */
MLOG_IBUF_BITMAP_INIT = 27,
- /** MDEV-12353 WIP: write to a ROW_FORMAT=COMPRESSED page */
- MLOG_ZIP_WRITE_STRING = 29,
-
/** write a string to a page */
MLOG_WRITE_STRING = 30,
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
index 6f57fd38848..46981c777cd 100644
--- a/storage/innobase/include/page0page.h
+++ b/storage/innobase/include/page0page.h
@@ -178,7 +178,7 @@ the first record in the list of records. */
#define PAGE_DIR FIL_PAGE_DATA_END
/* We define a slot in the page directory as two bytes */
-#define PAGE_DIR_SLOT_SIZE 2
+constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2;
/* The offset of the physically lower end of the directory, counted from
page end, when the page is empty */
@@ -840,15 +840,6 @@ page_rec_is_second_last(
const page_t* page) /*!< in: page */
MY_ATTRIBUTE((warn_unused_result));
-/***************************************************************//**
-Looks for the record which owns the given record.
-@return the owner record */
-UNIV_INLINE
-rec_t*
-page_rec_find_owner_rec(
-/*====================*/
- rec_t* rec); /*!< in: the physical record */
-
/************************************************************//**
Returns the maximum combined size of records which can be inserted on top
of record heap.
@@ -924,7 +915,7 @@ page_get_instant(const page_t* page);
@param[in,out] block buffer block
@param[in,out] mtr mini-transaction
@param[in] comp set unless ROW_FORMAT=REDUNDANT */
-void page_create(buf_block_t* block, mtr_t* mtr, bool comp);
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp);
/**********************************************************//**
Create a compressed B-tree index page. */
void
diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic
index 5cc6b4d9d50..8604f088adf 100644
--- a/storage/innobase/include/page0page.ic
+++ b/storage/innobase/include/page0page.ic
@@ -89,17 +89,14 @@ page_set_ssn_id(
node_seq_t ssn_id, /*!< in: transaction id */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- ut_ad(!mtr || mtr_memo_contains_flagged(mtr, block,
- MTR_MEMO_PAGE_SX_FIX
- | MTR_MEMO_PAGE_X_FIX));
-
- byte* ssn = block->frame + FIL_RTREE_SPLIT_SEQ_NUM;
- if (UNIV_LIKELY_NULL(page_zip)) {
- mach_write_to_8(ssn, ssn_id);
- page_zip_write_header(block, ssn, 8, mtr);
- } else {
- mtr->write<8,mtr_t::OPT>(*block, ssn, ssn_id);
- }
+ ut_ad(mtr_memo_contains_flagged(mtr, block,
+ MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_zip || page_zip == &block->page.zip);
+ constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM;
+ byte *b= my_assume_aligned<2>(&block->frame[field]);
+ if (mtr->write<8,mtr_t::OPT>(*block, b, ssn_id) &&
+ UNIV_LIKELY_NULL(page_zip))
+ memcpy_aligned<2>(&page_zip->data[field], b, 8);
}
#endif /* !UNIV_INNOCHECKSUM */
@@ -133,15 +130,11 @@ Reset PAGE_LAST_INSERT.
@param[in,out] mtr mini-transaction */
inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
{
- byte *b= &block->frame[PAGE_HEADER + PAGE_LAST_INSERT];
-
- if (UNIV_LIKELY_NULL(block->page.zip.data))
- {
- mach_write_to_2(b, 0);
- page_zip_write_header(block, b, 2, mtr);
- }
- else
- mtr->write<2,mtr_t::OPT>(*block, b, 0U);
+ constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT;
+ byte *b= my_assume_aligned<2>(&block->frame[field]);
+ if (mtr->write<2,mtr_t::OPT>(*block, b, 0U) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
}
/***************************************************************//**
@@ -576,30 +569,6 @@ page_rec_get_prev(
return((rec_t*) page_rec_get_prev_const(rec));
}
-/***************************************************************//**
-Looks for the record which owns the given record.
-@return the owner record */
-UNIV_INLINE
-rec_t*
-page_rec_find_owner_rec(
-/*====================*/
- rec_t* rec) /*!< in: the physical record */
-{
- ut_ad(page_rec_check(rec));
-
- if (page_rec_is_comp(rec)) {
- while (rec_get_n_owned_new(rec) == 0) {
- rec = page_rec_get_next(rec);
- }
- } else {
- while (rec_get_n_owned_old(rec) == 0) {
- rec = page_rec_get_next(rec);
- }
- }
-
- return(rec);
-}
-
/**********************************************************//**
Returns the base extra size of a physical record. This is the
size of the fixed header, independent of the record size.
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
index c7def1d77fb..8b8a4e5b984 100644
--- a/storage/innobase/include/page0zip.h
+++ b/storage/innobase/include/page0zip.h
@@ -230,19 +230,6 @@ page_zip_available(
the heap */
MY_ATTRIBUTE((warn_unused_result));
-/**********************************************************************//**
-Write data to the uncompressed header portion of a page. The data must
-already have been written to the uncompressed page. */
-UNIV_INLINE
-void
-page_zip_write_header(
-/*==================*/
- buf_block_t* block, /*!< in/out: compressed page */
- const byte* str, /*!< in: address on the uncompressed page */
- ulint length, /*!< in: length of the data */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
-
/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
The data must already have been written to the uncompressed page.
@param[in,out] block ROW_FORMAT=COMPRESSED page
@@ -342,17 +329,14 @@ page_zip_parse_write_trx_id(
page_zip_des_t* page_zip)
MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
-/**********************************************************************//**
-Write the "deleted" flag of a record on a compressed page. The flag must
-already have been written on the uncompressed page. */
-void
-page_zip_rec_set_deleted(
-/*=====================*/
- buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
- const byte* rec, /*!< in: record on the uncompressed page */
- ulint flag, /*!< in: the deleted flag (nonzero=TRUE) */
- mtr_t* mtr) /*!< in,out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in] flag the value of the delete-mark flag
+@param[in,out] mtr mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
/**********************************************************************//**
Insert a record to the dense page directory. */
@@ -360,8 +344,8 @@ void
page_zip_dir_insert(
/*================*/
page_cur_t* cursor, /*!< in/out: page cursor */
- const byte* free_rec,/*!< in: record from which rec was
- allocated, or NULL */
+ uint16_t free_rec,/*!< in: record from which rec was
+ allocated, or 0 */
byte* rec, /*!< in: record to insert */
mtr_t* mtr) /*!< in/out: mini-transaction */
MY_ATTRIBUTE((nonnull(1,3,4)));
diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic
index 32879109cec..1ca59116407 100644
--- a/storage/innobase/include/page0zip.ic
+++ b/storage/innobase/include/page0zip.ic
@@ -25,10 +25,7 @@ Compressed page interface
Created June 2005 by Marko Makela
*******************************************************/
-#include "page0zip.h"
-#include "mtr0log.h"
#include "page0page.h"
-#include "srv0srv.h"
/* The format of compressed pages is as follows.
@@ -320,29 +317,6 @@ page_zip_des_init(
}
/**********************************************************************//**
-Write data to the uncompressed header portion of a page. The data must
-already have been written to the uncompressed page.
-However, the data portion of the uncompressed page may differ from
-the compressed page when a record is being inserted in
-page_cur_insert_rec_zip(). */
-UNIV_INLINE
-void
-page_zip_write_header(
-/*==================*/
- buf_block_t* block, /*!< in/out: compressed page */
- const byte* str, /*!< in: address on the uncompressed page */
- ulint length, /*!< in: length of the data */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ut_ad(page_align(str) == block->frame);
- const uint16_t pos = page_offset(str);
-
- ut_ad(pos < PAGE_DATA);
- ut_ad(pos + length < PAGE_DATA);
- mtr->zmemcpy(&block->page, pos, str, length);
-}
-
-/**********************************************************************//**
Reset the counters used for filling
INFORMATION_SCHEMA.innodb_cmp_per_index. */
UNIV_INLINE
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index bf8d7c958e0..37742bb2008 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -717,7 +717,7 @@ void log_t::files::create(ulint n_files)
ut_ad(log_sys.is_initialised());
this->n_files= n_files;
- format= srv_encrypt_log ? log_t::FORMAT_ENC_10_4 : log_t::FORMAT_10_4;
+ format= srv_encrypt_log ? log_t::FORMAT_ENC_10_5 : log_t::FORMAT_10_5;
subformat= 2;
file_size= srv_log_file_size;
lsn= LOG_START_LSN;
@@ -745,8 +745,8 @@ log_file_header_flush(
ut_ad(log_write_mutex_own());
ut_ad(!recv_no_log_write);
ut_a(nth_file < log_sys.log.n_files);
- ut_ad(log_sys.log.format == log_t::FORMAT_10_4
- || log_sys.log.format == log_t::FORMAT_ENC_10_4);
+ ut_ad(log_sys.log.format == log_t::FORMAT_10_5
+ || log_sys.log.format == log_t::FORMAT_ENC_10_5);
// man 2 open suggests this buffer to be aligned by 512 for O_DIRECT
MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE)
@@ -1273,14 +1273,14 @@ void log_header_read(ulint header)
}
/** Write checkpoint info to the log header and invoke log_mutex_exit().
-@param[in] end_lsn start LSN of the MLOG_CHECKPOINT mini-transaction */
+@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */
void log_write_checkpoint_info(lsn_t end_lsn)
{
ut_ad(log_mutex_own());
ut_ad(!srv_read_only_mode);
ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn);
ut_ad(end_lsn <= log_sys.lsn);
- ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn
+ ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= log_sys.lsn
|| srv_shutdown_state != SRV_SHUTDOWN_NONE);
DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
@@ -1415,23 +1415,23 @@ bool log_checkpoint()
ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
if (oldest_lsn
- > log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
+ > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) {
/* Some log has been written since the previous checkpoint. */
} else if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
- /* MariaDB 10.3 startup expects the redo log file to be
+ /* MariaDB startup expects the redo log file to be
logically empty (not even containing a MLOG_CHECKPOINT record)
after a clean shutdown. Perform an extra checkpoint at
shutdown. */
} else {
/* Do nothing, because nothing was logged (other than
- a MLOG_CHECKPOINT marker) since the previous checkpoint. */
+ a FILE_CHECKPOINT marker) since the previous checkpoint. */
log_mutex_exit();
return(true);
}
- /* Repeat the MLOG_FILE_NAME records after the checkpoint, in
+ /* Repeat the FILE_MODIFY records after the checkpoint, in
case some log records between the checkpoint and log_sys.lsn
- need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
- apply expects to see a MLOG_CHECKPOINT after the checkpoint,
+ need them. Finally, write a FILE_CHECKPOINT marker. Redo log
+ apply expects to see a FILE_CHECKPOINT after the checkpoint,
except on clean shutdown, where the log will be empty after
the checkpoint.
It is important that we write out the redo log before any
@@ -1446,7 +1446,7 @@ bool log_checkpoint()
|| flush_lsn != end_lsn;
if (fil_names_clear(flush_lsn, do_write)) {
- ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT);
+ ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
flush_lsn = log_sys.lsn;
}
@@ -1794,7 +1794,9 @@ wait_suspend_loop:
lsn = log_sys.lsn;
- const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn;
+ const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn
+ && lsn != log_sys.last_checkpoint_lsn
+ + SIZE_OF_FILE_CHECKPOINT;
ut_ad(lsn >= log_sys.last_checkpoint_lsn);
log_mutex_exit();
@@ -1956,7 +1958,7 @@ void
log_pad_current_log_block(void)
/*===========================*/
{
- byte b = MLOG_DUMMY_RECORD;
+ byte b = 0;
ulint pad_length;
ulint i;
lsn_t lsn;
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 3a9ee20c3dd..9a229d4bb20 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -90,12 +90,14 @@ TRUE means that recovery is running and no operations on the log files
are allowed yet: the variable name is misleading. */
bool recv_no_ibuf_operations;
+#if 1 /* MDEV-12353: only for parsing old redo log format */
/** The type of the previous parsed redo log record */
static mlog_id_t recv_previous_parsed_rec_type;
/** The offset of the previous parsed redo log record */
static ulint recv_previous_parsed_rec_offset;
/** The 'multi' flag of the previous parsed redo log record */
static ulint recv_previous_parsed_rec_is_multi;
+#endif
/** The maximum lsn we see for a page during the recovery process. If this
is bigger than the lsn we are able to scan up to, that is an indication that
@@ -110,7 +112,8 @@ mysql_pfs_key_t recv_writer_thread_key;
bool recv_writer_thread_active;
-/** Stored physiological log record with byte-oriented start/end LSN */
+/** Stored physiological log record with byte-oriented start/end LSN
+(before log_t::FORMAT_10_5) */
struct recv_t : public log_rec_t
{
/**
@@ -171,6 +174,254 @@ struct recv_t : public log_rec_t
};
+/** Stored physical log record with logical LSN (@see log_t::FORMAT_10_5) */
+struct log_phys_t : public log_rec_t
+{
+#if 1 // MDEV-14425 FIXME: remove this!
+ /** start LSN of the mini-transaction (not necessarily of this record) */
+ const lsn_t start_lsn;
+#endif
+private:
+ /** length of the record, in bytes */
+ uint16_t len;
+
+ /** @return start of the log records */
+ byte *begin() { return reinterpret_cast<byte*>(&len + 1); }
+ /** @return start of the log records */
+ const byte *begin() const { return const_cast<log_phys_t*>(this)->begin(); }
+ /** @return end of the log records */
+ byte *end() { byte *e= begin() + len; ut_ad(!*e); return e; }
+public:
+ /** @return end of the log records */
+ const byte *end() const { return const_cast<log_phys_t*>(this)->end(); }
+
+ /** Determine the allocated size of the object.
+ @param len length of recs, excluding terminating NUL byte
+ @return the total allocation size */
+ static size_t alloc_size(size_t len)
+ {
+ return len + 1 +
+ reinterpret_cast<size_t>(reinterpret_cast<log_phys_t*>(0)->begin());
+ }
+
+ /** Constructor.
+ @param start_lsn start LSN of the mini-transaction
+ @param lsn mtr_t::commit_lsn() of the mini-transaction
+ @param recs the first log record for the page in the mini-transaction
+ @param size length of recs, in bytes, excluding terminating NUL byte */
+ log_phys_t(lsn_t start_lsn, lsn_t lsn, const byte *recs, size_t size) :
+ log_rec_t(lsn), start_lsn(start_lsn), len(static_cast<uint16_t>(size))
+ {
+ ut_ad(start_lsn);
+ ut_ad(start_lsn < lsn);
+ ut_ad(len == size);
+ reinterpret_cast<byte*>(memcpy(begin(), recs, size))[size]= 0;
+ }
+
+ /** Append a record to the log.
+ @param recs log to append
+ @param size size of the log, in bytes
+ @param lsn the commit LSN of the record */
+ void append(const byte *recs, size_t size, lsn_t lsn)
+ {
+ ut_ad(start_lsn < lsn);
+ set_lsn(lsn);
+ reinterpret_cast<byte*>(memcpy(end(), recs, size))[size]= 0;
+ len+= static_cast<uint16_t>(size);
+ }
+
+ /** The status of apply() */
+ enum apply_status {
+ /** The page was not affected */
+ APPLIED_NO= 0,
+ /** The page was modified */
+ APPLIED_YES,
+ /** The page was modified, affecting the encryption parameters */
+ APPLIED_TO_ENCRYPTION,
+ /** The page was modified, affecting the tablespace header */
+ APPLIED_TO_FSP_HEADER
+ };
+
+ /** Apply log to a page frame.
+ @param[in,out] block buffer block
+ @param[in,out] last_offset last byte offset, for same_page records
+ @return whether any log was applied to the page */
+ apply_status apply(const buf_block_t &block, uint16_t &last_offset) const
+ {
+ const byte * const recs= begin();
+ byte *const frame= block.page.zip.ssize
+ ? block.page.zip.data : block.frame;
+ const size_t size= block.physical_size();
+ apply_status applied= APPLIED_NO;
+
+ for (const byte *l= recs;;)
+ {
+ const byte b= *l++;
+ if (!b)
+ return applied;
+ ut_ad((b & 0x70) != RESERVED);
+ size_t rlen= b & 0xf;
+ if (!rlen)
+ {
+ const size_t lenlen= mlog_decode_varint_length(*l);
+ const uint32_t addlen= mlog_decode_varint(l);
+ ut_ad(addlen != MLOG_DECODE_ERROR);
+ rlen= addlen + 15 - lenlen;
+ l+= lenlen;
+ }
+ if (!(b & 0x80))
+ {
+ /* Skip the page identifier. It has already been validated. */
+ size_t idlen= mlog_decode_varint_length(*l);
+ ut_ad(idlen <= 5);
+ ut_ad(idlen < rlen);
+ ut_ad(mlog_decode_varint(l) == block.page.id.space());
+ l+= idlen;
+ rlen-= idlen;
+ idlen= mlog_decode_varint_length(*l);
+ ut_ad(idlen <= 5);
+ ut_ad(idlen <= rlen);
+ ut_ad(mlog_decode_varint(l) == block.page.id.page_no());
+ l+= idlen;
+ rlen-= idlen;
+ last_offset= 0;
+ }
+
+ switch (b & 0x70) {
+ case FREE_PAGE:
+ ut_ad(last_offset == 0);
+ goto next_not_same_page;
+ case INIT_PAGE:
+ if (UNIV_LIKELY(rlen == 0))
+ {
+ memset_aligned<UNIV_ZIP_SIZE_MIN>(frame, 0, size);
+ mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id.page_no());
+ memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8);
+ mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id.space());
+ last_offset= FIL_PAGE_TYPE;
+ next_after_applying:
+ if (applied == APPLIED_NO)
+ applied= APPLIED_YES;
+ }
+ else
+ {
+ record_corrupted:
+ if (!srv_force_recovery)
+ {
+ recv_sys.found_corrupt_log= true;
+ return applied;
+ }
+ next_not_same_page:
+ last_offset= 1; /* the next record must not be same_page */
+ }
+ next:
+ l+= rlen;
+ continue;
+ }
+
+ ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) ==
+ block.page.id.page_no());
+ ut_ad(mach_read_from_4(frame + FIL_PAGE_SPACE_ID) ==
+ block.page.id.space());
+ ut_ad(last_offset <= 1 || last_offset > 8);
+ ut_ad(last_offset <= size);
+
+ switch (b & 0x70) {
+ case OPTION:
+ goto next;
+ case INIT_INDEX_PAGE:
+ if (UNIV_UNLIKELY(block.page.id.page_no() < 3 ||
+ block.page.zip.ssize) &&
+ !srv_force_recovery)
+ goto record_corrupted;
+ if (UNIV_UNLIKELY(rlen != 1 || *l > 1))
+ goto record_corrupted;
+ page_create_low(&block, *l != 0);
+ last_offset= FIL_PAGE_TYPE;
+ goto next_after_applying;
+ case WRITE:
+ case MEMSET:
+ case MEMMOVE:
+ if (UNIV_UNLIKELY(last_offset == 1))
+ goto record_corrupted;
+ const size_t olen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3))
+ goto record_corrupted;
+ const uint32_t offset= mlog_decode_varint(l);
+ ut_ad(offset != MLOG_DECODE_ERROR);
+ static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+ if (UNIV_UNLIKELY(offset >= size))
+ goto record_corrupted;
+ if (UNIV_UNLIKELY(offset + last_offset < 8 ||
+ offset + last_offset >= size))
+ goto record_corrupted;
+ last_offset+= static_cast<uint16_t>(offset);
+ l+= olen;
+ rlen-= olen;
+ size_t llen= rlen;
+ if ((b & 0x70) == WRITE)
+ {
+ if (UNIV_UNLIKELY(rlen + last_offset > size))
+ goto record_corrupted;
+ memcpy(frame + last_offset, l, llen);
+ if (UNIV_LIKELY(block.page.id.page_no()));
+ else if (llen == 11 + MY_AES_BLOCK_SIZE &&
+ last_offset == FSP_HEADER_OFFSET + MAGIC_SZ +
+ fsp_header_get_encryption_offset(block.zip_size()))
+ applied= APPLIED_TO_ENCRYPTION;
+ else if (last_offset < FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN + 4 &&
+ last_offset + llen >= FSP_HEADER_OFFSET + FSP_SIZE)
+ applied= APPLIED_TO_FSP_HEADER;
+ next_after_applying_write:
+ ut_ad(llen + last_offset <= size);
+ last_offset+= static_cast<uint16_t>(llen);
+ goto next_after_applying;
+ }
+ llen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(llen > rlen || llen > 3))
+ goto record_corrupted;
+ const uint32_t len= mlog_decode_varint(l);
+ ut_ad(len != MLOG_DECODE_ERROR);
+ if (UNIV_UNLIKELY(len + last_offset > size))
+ goto record_corrupted;
+ l+= llen;
+ rlen-= llen;
+ llen= len;
+ if ((b & 0x70) == MEMSET)
+ {
+ ut_ad(rlen < llen);
+ if (UNIV_UNLIKELY(rlen != 1))
+ {
+ size_t s;
+ for (s= 0; s < llen; s+= rlen)
+ memcpy(frame + last_offset + s, l, rlen);
+ memcpy(frame + last_offset + s, l, llen - s);
+ }
+ else
+ memset(frame + last_offset, *l, llen);
+ goto next_after_applying_write;
+ }
+ const size_t slen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(slen != rlen || slen > 3))
+ goto record_corrupted;
+ uint32_t s= mlog_decode_varint(l);
+ ut_ad(slen != MLOG_DECODE_ERROR);
+ if (s & 1)
+ s= last_offset - (s >> 1) - 1;
+ else
+ s= last_offset + (s >> 1) + 1;
+ if (UNIV_LIKELY(s >= 8 && s + llen <= size))
+ {
+ memmove(frame + last_offset, frame + s, llen);
+ goto next_after_applying_write;
+ }
+ }
+ goto record_corrupted;
+ }
+ }
+};
+
+
#ifndef DBUG_OFF
/** Return string name of the redo log record type.
@param[in] type record log record enum
@@ -180,7 +431,7 @@ static const char* get_mlog_string(mlog_id_t type);
/** Tablespace item during recovery */
struct file_name_t {
- /** Tablespace file name (MLOG_FILE_NAME) */
+ /** Tablespace file name (MLOG_FILE_NAME or FILE_MODIFY) */
std::string name;
/** Tablespace object (NULL if not valid or not found) */
fil_space_t* space;
@@ -218,16 +469,17 @@ static recv_spaces_t recv_spaces;
/** Report an operation to create, delete, or rename a file during backup.
@param[in] space_id tablespace identifier
-@param[in] flags tablespace flags (NULL if not create)
+@param[in] create whether the file is being created
@param[in] name file name (not NUL-terminated)
@param[in] len length of name, in bytes
@param[in] new_name new file name (NULL if not rename)
@param[in] new_len length of new_name, in bytes (0 if NULL) */
-void (*log_file_op)(ulint space_id, const byte* flags,
+void (*log_file_op)(ulint space_id, bool create,
const byte* name, ulint len,
const byte* new_name, ulint new_len);
-/** Information about initializing page contents during redo log processing */
+/** Information about initializing page contents during redo log processing.
+FIXME: Rely on recv_sys.pages! */
class mlog_init_t
{
public:
@@ -358,7 +610,7 @@ inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn)
for (recv_sys_t::map::iterator p = pages.lower_bound(page_id);
p != pages.end() && p->first.space() == page_id.space();) {
recv_sys_t::map::iterator r = p++;
- if (r->second.log.trim(lsn)) {
+ if (r->second.trim(lsn)) {
pages.erase(r);
}
}
@@ -373,11 +625,12 @@ inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn)
DBUG_VOID_RETURN;
}
-/** Process a file name from a MLOG_FILE_* record.
+/** Process a file name from a MLOG_FILE_* or FILE_* record.
@param[in,out] name file name
@param[in] len length of the file name
@param[in] space_id the tablespace ID
-@param[in] deleted whether this is a MLOG_FILE_DELETE record */
+@param[in] deleted whether this is a MLOG_FILE_DELETE
+ or FILE_DELETE record */
static
void
fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
@@ -395,15 +648,15 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
scanned before applying any page records for the space_id. */
os_normalize_path(name);
- file_name_t fname(std::string(name, len - 1), deleted);
- std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.insert(
- std::make_pair(space_id, fname));
+ const file_name_t fname(std::string(name, len), deleted);
+ std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.emplace(
+ space_id, fname);
ut_ad(p.first->first == space_id);
file_name_t& f = p.first->second;
if (deleted) {
- /* Got MLOG_FILE_DELETE */
+ /* Got MLOG_FILE_DELETE or FILE_DELETE */
if (!p.second && f.status != file_name_t::DELETED) {
f.status = file_name_t::DELETED;
@@ -414,7 +667,9 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
}
ut_ad(f.space == NULL);
- } else if (p.second // the first MLOG_FILE_NAME or MLOG_FILE_RENAME2
+ } else if (p.second
+ /* the first MLOG_FILE_NAME or MLOG_FILE_RENAME2
+ or FILE_MODIFY or FILE_RENAME */
|| f.name != fname.name) {
fil_space_t* space;
@@ -451,7 +706,7 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
case FIL_LOAD_NOT_FOUND:
/* No matching tablespace was found; maybe it
was renamed, and we will find a subsequent
- MLOG_FILE_* record. */
+ MLOG_FILE_* or FILE_* record. */
ut_ad(space == NULL);
if (srv_force_recovery) {
@@ -562,7 +817,7 @@ fil_name_parse(
}
}
- byte* end_ptr = ptr + len;
+ byte* end_ptr = ptr + len--;
switch (type) {
default:
@@ -603,7 +858,7 @@ fil_name_parse(
t.pages = uint32_t(page_id.page_no());
} else if (log_file_op) {
log_file_op(page_id.space(),
- type == MLOG_FILE_CREATE2 ? ptr - 4 : NULL,
+ type == MLOG_FILE_CREATE2,
ptr, len, NULL, 0);
}
break;
@@ -630,6 +885,7 @@ fil_name_parse(
corrupt = corrupt
|| new_len < sizeof "/a.ibd\0"
|| memcmp(new_name + new_len - 5, DOT_IBD, 5) != 0;
+ new_len--;
if (!corrupt && !memchr(new_name, OS_PATH_SEPARATOR, new_len)) {
if (byte* c = static_cast<byte*>
@@ -664,7 +920,7 @@ fil_name_parse(
page_id.space(), false);
if (log_file_op) {
- log_file_op(page_id.space(), NULL,
+ log_file_op(page_id.space(), false,
ptr, len, new_name, new_len);
}
@@ -872,18 +1128,7 @@ void recv_sys_t::debug_free()
mutex_exit(&mutex);
}
-inline size_t recv_sys_t::get_free_len() const
-{
- if (const buf_block_t* block= UT_LIST_GET_FIRST(blocks))
- {
- if (const size_t used= static_cast<uint16_t>(block->page.access_time))
- return srv_page_size - used;
- ut_ad(srv_page_size == 65536);
- }
- return 0;
-}
-
-inline byte* recv_sys_t::alloc(size_t len, bool store_recv)
+inline void *recv_sys_t::alloc(size_t len, bool store_recv)
{
ut_ad(mutex_own(&mutex));
ut_ad(len);
@@ -913,9 +1158,6 @@ create_block:
ut_ad(free_offset <= srv_page_size);
free_offset+= len;
- if (store_recv && free_offset + sizeof(recv_t::data) + 1 > srv_page_size)
- goto create_block;
-
if (free_offset > srv_page_size)
goto create_block;
@@ -1292,6 +1534,8 @@ recv_find_max_checkpoint(ulint* max_field)
case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED:
case log_t::FORMAT_10_4:
case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED:
+ case log_t::FORMAT_10_5:
+ case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED:
break;
default:
ib::error() << "Unsupported redo log format."
@@ -1763,40 +2007,6 @@ parse_log:
contents can be ignored. We do not write or apply
this record yet. */
break;
- case MLOG_ZIP_WRITE_STRING:
- ut_ad(!page_zip
- || !fil_page_get_type(page_zip->data)
- || fil_page_get_type(page_zip->data) == FIL_PAGE_INDEX
- || fil_page_get_type(page_zip->data) == FIL_PAGE_RTREE);
- if (ptr + 4 > end_ptr) {
- goto truncated;
- } else {
- const ulint ofs = mach_read_from_2(ptr);
- const ulint len = mach_read_from_2(ptr + 2);
- if (ofs < FIL_PAGE_PREV || !len) {
- goto corrupted;
- }
- ptr += 4 + len;
- if (ptr > end_ptr) {
- goto truncated;
- }
- if (!page_zip) {
- break;
- }
- ut_ad(ofs + len <= block->zip_size());
- memcpy(page_zip->data + ofs, old_ptr + 4, len);
- if (ofs >= FIL_PAGE_TYPE +2
- || ofs + len < FIL_PAGE_TYPE + 2) {
- break;
- }
- /* Ensure that buf_flush_init_for_writing()
- will treat the page as an index page, and
- not overwrite the compressed page with the
- contents of the uncompressed page. */
- memcpy_aligned<2>(&page[FIL_PAGE_TYPE],
- &page_zip->data[FIL_PAGE_TYPE], 2);
- }
- break;
case MLOG_WRITE_STRING:
ut_ad(!page_zip
|| fil_page_get_type(page_zip->data)
@@ -1875,9 +2085,7 @@ parse_log:
default:
ib::error() << "Incorrect log record type "
<< ib::hex(unsigned(type));
-corrupted:
recv_sys.found_corrupt_log = true;
-truncated:
ptr = NULL;
}
@@ -1891,6 +2099,26 @@ truncated:
return(ptr);
}
+/*******************************************************//**
+Calculates the new value for lsn when more data is added to the log. */
+static
+lsn_t
+recv_calc_lsn_on_data_add(
+/*======================*/
+ lsn_t lsn, /*!< in: old lsn */
+ ib_uint64_t len) /*!< in: this many bytes of data is
+ added, log block headers not included */
+{
+ unsigned frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE;
+ unsigned payload_size = log_sys.payload_size();
+ ut_ad(frag_len < payload_size);
+ lsn_t lsn_len = len;
+ lsn_len += (lsn_len + frag_len) / payload_size
+ * (OS_FILE_LOG_BLOCK_SIZE - payload_size);
+
+ return(lsn + lsn_len);
+}
+
/** Store a redo log record for applying.
@param type record type
@param page_id page identifier
@@ -1909,22 +2137,13 @@ inline void recv_sys_t::add(mlog_id_t type, const page_id_t page_id,
ut_ad(type != MLOG_DUMMY_RECORD);
ut_ad(type != MLOG_CHECKPOINT);
ut_ad(type != MLOG_TRUNCATE);
+ ut_ad(!log_sys.is_physical());
std::pair<map::iterator, bool> p= pages.insert(map::value_type
(page_id, page_recv_t()));
page_recv_t& recs= p.first->second;
ut_ad(p.second == recs.log.empty());
- switch (type) {
- case MLOG_INIT_FILE_PAGE2:
- case MLOG_ZIP_PAGE_COMPRESS:
- case MLOG_INIT_FREE_PAGE:
- recs.will_not_read();
- mlog_init.add(page_id, lsn);
- default:
- break;
- }
-
/* Store the log record body in limited-size chunks, because the
heap grows into the buffer pool. */
size_t len= static_cast<size_t>(rec_end - body);
@@ -1935,7 +2154,10 @@ inline void recv_sys_t::add(mlog_id_t type, const page_id_t page_id,
for (recv_t::data_t *prev= nullptr;;)
{
- const size_t l= std::min(len, get_free_len() - sizeof(recv_t::data));
+ const size_t used= static_cast<uint16_t>
+ (UT_LIST_GET_FIRST(blocks)->page.access_time);
+ ut_ad(used || srv_page_size == 65536);
+ const size_t l= std::min(len, srv_page_size - used - sizeof(recv_t::data));
recv_t::data_t *d= new (alloc(sizeof(recv_t::data) + l))
recv_t::data_t(body, l);
if (prev)
@@ -1954,16 +2176,30 @@ inline void recv_sys_t::add(mlog_id_t type, const page_id_t page_id,
/** Trim old log records for a page.
@param start_lsn oldest log sequence number to preserve
@return whether all the log for the page was trimmed */
-inline bool page_recv_t::recs_t::trim(lsn_t start_lsn)
+inline bool page_recv_t::trim(lsn_t start_lsn)
{
- while (head)
+ if (log_sys.is_physical())
{
- if (head->lsn >= start_lsn) return false;
- log_rec_t *next= head->next;
- static_cast<const recv_t*>(head)->free();
- head= next;
+ while (log.head)
+ {
+ if (log.head->lsn >= start_lsn) return false;
+ last_offset= 1; /* the next record must not be same_page */
+ log_rec_t *next= log.head->next;
+ recv_sys.free(log.head);
+ log.head= next;
+ }
+ log.tail= nullptr;
+ return true;
}
- tail= nullptr;
+
+ while (log.head)
+ {
+ if (log.head->lsn >= start_lsn) return false;
+ log_rec_t *next= log.head->next;
+ static_cast<const recv_t*>(log.head)->free();
+ log.head= next;
+ }
+ log.tail= nullptr;
return true;
}
@@ -1971,6 +2207,17 @@ inline bool page_recv_t::recs_t::trim(lsn_t start_lsn)
inline void page_recv_t::recs_t::clear()
{
ut_ad(mutex_own(&recv_sys.mutex));
+ if (log_sys.is_physical())
+ {
+ for (const log_rec_t *l= head; l; )
+ {
+ const log_rec_t *next= l->next;
+ recv_sys.free(l);
+ l= next;
+ }
+ head= tail= nullptr;
+ return;
+ }
for (const log_rec_t *l= head; l; )
{
const log_rec_t *next= l->next;
@@ -1990,6 +2237,501 @@ inline void page_recv_t::will_not_read()
}
+/** Register a redo log snippet for a page.
+@param page_id page identifier
+@param start_lsn start LSN of the mini-transaction
+@param lsn @see mtr_t::commit_lsn()
+@param l redo log snippet @see log_t::FORMAT_10_5
+@param len length of l, in bytes
+inline void recv_sys_t::add(const page_id_t page_id,
+ lsn_t start_lsn, lsn_t lsn, const byte *l,
+ size_t len)
+{
+ ut_ad(mutex_own(&mutex));
+ std::pair<map::iterator, bool> p= pages.emplace(map::value_type
+ (page_id, page_recv_t()));
+ page_recv_t& recs= p.first->second;
+ ut_ad(p.second == recs.log.empty());
+
+ switch (*l & 0x70) {
+ case FREE_PAGE: case INIT_PAGE:
+ recs.will_not_read();
+ mlog_init.add(page_id, start_lsn); /* FIXME: remove this! */
+ /* fall through */
+ default:
+ log_phys_t *tail= static_cast<log_phys_t*>(recs.log.last());
+ if (!tail)
+ break;
+#if 1 // MDEV-14425 FIXME: remove this!
+ if (tail->start_lsn != start_lsn)
+ break;
+#endif
+ buf_block_t *block= UT_LIST_GET_LAST(blocks);
+ ut_ad(block);
+ const size_t used= static_cast<uint16_t>(block->page.access_time - 1) + 1;
+ ut_ad(used >= ALIGNMENT);
+ const byte *end= const_cast<const log_phys_t*>(tail)->end();
+ if (!((reinterpret_cast<size_t>(end + len) ^
+ reinterpret_cast<size_t>(end)) & ~(ALIGNMENT - 1)))
+ {
+ /* Use already allocated 'padding' bytes */
+append:
+ UNIV_MEM_ALLOC(end + 1, len);
+ /* Append to the preceding record for the page */
+ tail->append(l, len, lsn);
+ return;
+ }
+ if (end <= &block->frame[used - ALIGNMENT] || &block->frame[used] >= end)
+ break; /* Not the last allocated record in the page */
+ const size_t new_used= static_cast<size_t>(end - block->frame + len + 1);
+ ut_ad(new_used > used);
+ if (new_used > srv_page_size)
+ break;
+ block->page.access_time= (block->page.access_time & ~0U << 16) |
+ ut_calc_align<uint16_t>(static_cast<uint16_t>(new_used), ALIGNMENT);
+ goto append;
+ }
+ recs.log.append(new (alloc(log_phys_t::alloc_size(len)))
+ log_phys_t(start_lsn, lsn, l, len));
+}
+
+
+/** Parse and register one mini-transaction in log_t::FORMAT_10_5.
+@param checkpoint_lsn the log sequence number of the latest checkpoint
+@param store whether to store the records
+@param apply whether to apply file-level log records
+@return whether FILE_CHECKPOINT record was seen the first time,
+or corruption was noticed */
+inline bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t store, bool apply)
+{
+ const byte *const end= buf + len;
+loop:
+ const byte *const log= buf + recovered_offset;
+ const lsn_t start_lsn= recovered_lsn;
+
+ /* Check that the entire mini-transaction is included within the buffer */
+ const byte *l;
+ uint32_t rlen;
+ for (l= log; l < end; l+= rlen)
+ {
+ if (!*l)
+ goto eom_found;
+ if (UNIV_LIKELY((*l & 0x70) != RESERVED));
+ else if (srv_force_recovery)
+ ib::warn() << "Ignoring unknown log record at LSN " << recovered_lsn;
+ else
+ {
+malformed:
+ ib::error() << "Malformed log record;"
+ " set innodb_force_recovery=1 to ignore.";
+corrupted:
+ const size_t trailing_bytes= std::min<size_t>(100, size_t(end - l));
+ ib::info() << "Dump from the start of the mini-transaction (LSN="
+ << start_lsn << ") to "
+ << trailing_bytes << " bytes after the record:";
+ ut_print_buf(stderr, log, l - log + trailing_bytes);
+ putc('\n', stderr);
+ found_corrupt_log= true;
+ return true;
+ }
+ rlen= *l++ & 0xf;
+ if (l + (rlen ? rlen : 16) >= end)
+ break;
+ if (!rlen)
+ {
+ rlen= mlog_decode_varint_length(*l);
+ if (l + rlen >= end)
+ break;
+ const uint32_t addlen= mlog_decode_varint(l);
+ if (UNIV_UNLIKELY(addlen == MLOG_DECODE_ERROR))
+ {
+ ib::error() << "Corrupted record length";
+ goto corrupted;
+ }
+ rlen= addlen + 15;
+ }
+ }
+
+ /* Not the entire mini-transaction was present. */
+ return false;
+
+eom_found:
+ ut_ad(!*l);
+ ut_d(const byte *const el= l + 1);
+
+ const lsn_t end_lsn= recv_calc_lsn_on_data_add(start_lsn, l + 1 - log);
+ if (UNIV_UNLIKELY(end_lsn > scanned_lsn))
+ /* The log record filled a log block, and we require that also the
+ next log block should have been scanned in */
+ return false;
+
+ ut_d(std::set<page_id_t> freed);
+#if 0 && defined UNIV_DEBUG /* MDEV-21727 FIXME: enable this */
+ /* Pages that have been modified in this mini-transaction.
+ If a mini-transaction writes INIT_PAGE for a page, it should not have
+ written any log records for the page. Unfortunately, this does not
+ hold for ROW_FORMAT=COMPRESSED pages, because page_zip_compress()
+ can be invoked in a pessimistic operation, even after log has
+ been written for other pages. */
+ ut_d(std::set<page_id_t> modified);
+#endif
+
+ uint32_t space_id= 0, page_no= 0, last_offset= 0;
+#if 1 /* MDEV-14425 FIXME: remove this */
+ bool got_page_op= false;
+#endif
+ for (l= log; l < end; l+= rlen)
+ {
+ const byte *const recs= l;
+ const byte b= *l++;
+
+ if (!b)
+ break;
+ ut_ad(UNIV_LIKELY(b & 0x70) != RESERVED || srv_force_recovery);
+ rlen= b & 0xf;
+ ut_ad(l + rlen < end);
+ ut_ad(rlen || l + 16 < end);
+ if (!rlen)
+ {
+ const uint32_t lenlen= mlog_decode_varint_length(*l);
+ ut_ad(l + lenlen < end);
+ const uint32_t addlen= mlog_decode_varint(l);
+ ut_ad(addlen != MLOG_DECODE_ERROR);
+ rlen= addlen + 15 - lenlen;
+ l+= lenlen;
+ }
+ ut_ad(l + rlen < end);
+ uint32_t idlen;
+ if ((b & 0x80) && got_page_op)
+ {
+ /* This record is for the same page as the previous one. */
+ if (UNIV_UNLIKELY((b & 0x70) <= INIT_PAGE))
+ {
+record_corrupted:
+ /* FREE_PAGE,INIT_PAGE cannot be with same_page flag */
+ if (!srv_force_recovery)
+ goto malformed;
+ ib::warn() << "Ignoring malformed log record at LSN " << recovered_lsn;
+ last_offset= 1; /* the next record must not be same_page */
+ continue;
+ }
+ goto same_page;
+ }
+ last_offset= 0;
+ idlen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen))
+ {
+page_id_corrupted:
+ if (!srv_force_recovery)
+ {
+ ib::error() << "Corrupted page identifier at " << recovered_lsn
+ << "; set innodb_force_recovery=1 to ignore the record.";
+ goto corrupted;
+ }
+ ib::warn() << "Ignoring corrupted page identifier at LSN "
+ << recovered_lsn;
+ continue;
+ }
+ space_id= mlog_decode_varint(l);
+ if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR))
+ goto page_id_corrupted;
+ l+= idlen;
+ rlen-= idlen;
+ idlen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen))
+ goto page_id_corrupted;
+ page_no= mlog_decode_varint(l);
+ if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR))
+ goto page_id_corrupted;
+ l+= idlen;
+ rlen-= idlen;
+ got_page_op = !(b & 0x80);
+ if (got_page_op && apply && !is_predefined_tablespace(space_id))
+ {
+ recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
+ if (i != recv_spaces.end() && i->first == space_id);
+ else if (recovered_lsn < mlog_checkpoint_lsn)
+ /* We have not seen all records between the checkpoint and
+ FILE_CHECKPOINT. There should be a FILE_DELETE for this
+ tablespace later. */
+ recv_spaces.emplace_hint(i, space_id, file_name_t("", false));
+ else
+ {
+ const page_id_t id(space_id, page_no);
+ if (!srv_force_recovery)
+ {
+ ib::error() << "Missing FILE_DELETE or FILE_MODIFY for " << id
+ << " at " << recovered_lsn
+ << "; set innodb_force_recovery=1 to ignore the record.";
+ goto corrupted;
+ }
+ ib::warn() << "Ignoring record for " << id << " at " << recovered_lsn;
+ continue;
+ }
+ }
+same_page:
+ DBUG_PRINT("ib_log",
+ ("scan " LSN_PF ": rec %x len %zu page %u:%u",
+ recovered_lsn, b, static_cast<size_t>(l + rlen - recs),
+ space_id, page_no));
+
+ if (got_page_op)
+ {
+ ut_d(const page_id_t id(space_id, page_no));
+ ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id));
+ ut_ad(freed.find(id) == freed.end());
+ switch (b & 0x70) {
+ case FREE_PAGE:
+ ut_ad(freed.emplace(id).second);
+ last_offset= 1; /* the next record must not be same_page */
+ goto free_or_init_page;
+ case INIT_PAGE:
+ free_or_init_page:
+ last_offset= FIL_PAGE_TYPE;
+ if (UNIV_UNLIKELY(rlen != 0))
+ goto record_corrupted;
+ break;
+ case INIT_INDEX_PAGE:
+ if (UNIV_UNLIKELY(rlen != 1))
+ goto record_corrupted;
+ last_offset= FIL_PAGE_TYPE;
+ break;
+ case RESERVED:
+ case OPTION:
+ continue;
+ case WRITE:
+ case MEMMOVE:
+ case MEMSET:
+ if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1))
+ goto record_corrupted;
+ const uint32_t olen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3))
+ goto record_corrupted;
+ const uint32_t offset= mlog_decode_varint(l);
+ ut_ad(offset != MLOG_DECODE_ERROR);
+ static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+ if (UNIV_UNLIKELY(offset >= srv_page_size))
+ goto record_corrupted;
+ last_offset+= offset;
+ if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size))
+ goto record_corrupted;
+ l+= olen;
+ rlen-= olen;
+ if ((b & 0x70) == WRITE)
+ {
+ if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size))
+ goto record_corrupted;
+ if (UNIV_UNLIKELY(page_no == 0) && apply &&
+ last_offset <= FSP_HEADER_OFFSET + FSP_SIZE &&
+ last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4)
+ {
+ recv_spaces_t::iterator it= recv_spaces.find(space_id);
+ const uint32_t size= mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + l - last_offset);
+ if (it == recv_spaces.end())
+ ut_ad(!mlog_checkpoint_lsn || space_id == TRX_SYS_SPACE ||
+ srv_is_undo_tablespace(space_id));
+ else if (!it->second.space)
+ it->second.size= size;
+ fil_space_set_recv_size(space_id, size);
+ }
+ last_offset+= rlen;
+ break;
+ }
+ uint32_t llen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(llen > rlen || llen > 3))
+ goto record_corrupted;
+ const uint32_t len= mlog_decode_varint(l);
+ ut_ad(len != MLOG_DECODE_ERROR);
+ if (UNIV_UNLIKELY(last_offset + len > srv_page_size))
+ goto record_corrupted;
+ l+= llen;
+ rlen-= llen;
+ llen= len;
+ if ((b & 0x70) == MEMSET)
+ {
+ if (UNIV_UNLIKELY(rlen > llen))
+ goto record_corrupted;
+ last_offset+= llen;
+ break;
+ }
+ const uint32_t slen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(slen != rlen || slen > 3))
+ goto record_corrupted;
+ uint32_t s= mlog_decode_varint(l);
+ ut_ad(slen != MLOG_DECODE_ERROR);
+ if (s & 1)
+ s= last_offset - (s >> 1) - 1;
+ else
+ s= last_offset + (s >> 1) + 1;
+ if (UNIV_UNLIKELY(s < 8 || s + llen > srv_page_size))
+ goto record_corrupted;
+ last_offset+= llen;
+ break;
+ }
+#if 0 && defined UNIV_DEBUG
+ switch (b & 0x70) {
+ case RESERVED:
+ case OPTION:
+ ut_ad(0); /* we did "continue" earlier */
+ break;
+ case FREE_PAGE:
+ break;
+ default:
+ ut_ad(modified.emplace(id).second || (b & 0x70) != INIT_PAGE);
+ }
+#endif
+ switch (store) {
+ case STORE_NO:
+ continue;
+ case STORE_IF_EXISTS:
+ if (!fil_space_get_size(space_id))
+ continue;
+ /* fall through */
+ case STORE_YES:
+ add(page_id_t(space_id, page_no), start_lsn, end_lsn, recs,
+ static_cast<size_t>(l + rlen - recs));
+ }
+ }
+#if 1 /* MDEV-14425 FIXME: this must be in the checkpoint file only! */
+ else if (rlen)
+ {
+ switch (b & 0xf0) {
+# if 1 /* MDEV-14425 FIXME: Remove this! */
+ case FILE_CHECKPOINT:
+ if (space_id == 0 && page_no == 0 && rlen == 8)
+ {
+ const lsn_t lsn= mach_read_from_8(l);
+
+ if (UNIV_UNLIKELY(srv_print_verbose_log == 2))
+ fprintf(stderr, "FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n",
+ lsn, lsn != checkpoint_lsn
+ ? "ignored"
+ : mlog_checkpoint_lsn ? "reread" : "read",
+ recovered_lsn);
+
+ DBUG_PRINT("ib_log", ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF,
+ lsn, lsn != checkpoint_lsn
+ ? "ignored"
+ : mlog_checkpoint_lsn ? "reread" : "read",
+ recovered_lsn));
+
+ if (lsn == checkpoint_lsn)
+ {
+ ut_ad(mlog_checkpoint_lsn <= recovered_lsn);
+ if (mlog_checkpoint_lsn)
+ continue;
+ mlog_checkpoint_lsn= recovered_lsn;
+ l+= 8;
+ recovered_offset= l - buf;
+ return true;
+ }
+ continue;
+ }
+# endif
+ /* fall through */
+ default:
+ if (!srv_force_recovery)
+ goto malformed;
+ ib::warn() << "Ignoring malformed log record at LSN " << recovered_lsn;
+ continue;
+ case FILE_DELETE:
+ case FILE_MODIFY:
+ case FILE_RENAME:
+ if (UNIV_UNLIKELY(page_no != 0))
+ {
+ file_rec_error:
+ if (!srv_force_recovery)
+ {
+ ib::error() << "Corrupted file-level record;"
+ " set innodb_force_recovery=1 to ignore.";
+ goto corrupted;
+ }
+
+ ib::warn() << "Ignoring corrupted file-level record at LSN "
+ << recovered_lsn;
+ continue;
+ }
+ /* fall through */
+ case FILE_CREATE:
+ if (UNIV_UNLIKELY(space_id == 0))
+ goto file_rec_error;
+ /* There is no terminating NUL character. Names must end in .ibd.
+ For FILE_RENAME, there is a NUL between the two file names. */
+ const char * const fn= reinterpret_cast<const char*>(l);
+ const char *fn2= static_cast<const char*>(memchr(fn, 0, rlen));
+
+ if (UNIV_UNLIKELY((fn2 == nullptr) == ((b & 0xf0) == FILE_RENAME)))
+ goto file_rec_error;
+
+ const char * const fnend= fn2 ? fn2 : fn + rlen;
+ const char * const fn2end= fn2 ? fn + rlen : nullptr;
+
+ if (fn2)
+ {
+ fn2++;
+ if (memchr(fn2, 0, fn2end - fn2))
+ goto file_rec_error;
+ if (fn2end - fn2 < 4 || memcmp(fn2end - 4, DOT_IBD, 4))
+ goto file_rec_error;
+ }
+
+ if (page_no)
+ {
+ if (UNIV_UNLIKELY((b & 0xf0) != FILE_CREATE))
+ goto file_rec_error;
+ /* truncating an undo log tablespace */
+ ut_ad(fnend - fn >= 7);
+ ut_ad(!memcmp(fnend - 7, "undo", 4));
+ ut_d(char n[4]; char *end; memcpy(n, fnend - 3, 3); n[3]= 0);
+ ut_ad(strtoul(n, &end, 10) <= 127);
+ ut_ad(end == &n[3]);
+ ut_ad(page_no == SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
+ ut_ad(srv_is_undo_tablespace(space_id));
+ static_assert(UT_ARR_SIZE(truncated_undo_spaces) ==
+ TRX_SYS_MAX_UNDO_SPACES, "compatibility");
+ truncated_undo_spaces[space_id - srv_undo_space_id_start]=
+ { recovered_lsn, page_no };
+ continue;
+ }
+ if (is_predefined_tablespace(space_id))
+ goto file_rec_error;
+ if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4))
+ goto file_rec_error;
+
+ const char saved_end= fn[rlen];
+ const_cast<char&>(fn[rlen])= '\0';
+ fil_name_process(const_cast<char*>(fn), fnend - fn, space_id,
+ (b & 0xf0) == FILE_DELETE);
+ if (fn2)
+ fil_name_process(const_cast<char*>(fn2), fn2end - fn2, space_id,
+ false);
+ if ((b & 0xf0) < FILE_MODIFY && log_file_op)
+ log_file_op(space_id, (b & 0xf0) == FILE_CREATE,
+ l, static_cast<ulint>(fnend - fn),
+ reinterpret_cast<const byte*>(fn2),
+ fn2 ? static_cast<ulint>(fn2end - fn2) : 0);
+
+ if (!fn2 || !apply);
+ else if (!fil_op_replay_rename(space_id, 0, fn, fn2))
+ found_corrupt_fs= true;
+ const_cast<char&>(fn[rlen])= saved_end;
+ if (UNIV_UNLIKELY(found_corrupt_fs))
+ return true;
+ }
+ }
+#endif
+ else
+ goto malformed;
+ }
+
+ ut_ad(l == el);
+ recovered_offset= l - buf;
+ recovered_lsn= end_lsn;
+ goto loop;
+}
+
+
/*********************************************************************//**
Copies the log record body from recv to buf. */
static ATTRIBUTE_COLD
@@ -2018,13 +2760,15 @@ lsn of a log record.
@param[in,out] block buffer pool page
@param[in,out] mtr mini-transaction
@param[in,out] p recovery address
+@param[in,out] space tablespace, or NULL if not looked up yet
@param[in,out] init page initialization operation, or NULL */
static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
const recv_sys_t::map::iterator& p,
+ fil_space_t* space = NULL,
mlog_init_t::init* init = NULL)
{
page_t* page;
- page_zip_des_t* page_zip;
+ page_zip_des_t* page_zip;
ut_ad(mutex_own(&recv_sys.mutex));
ut_ad(recv_sys.apply_log_recs);
@@ -2033,12 +2777,15 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
ut_ad(!init || init->lsn);
ut_ad(block->page.id == p->first);
ut_ad(!p->second.is_being_processed());
+ ut_ad(!space || space->id == block->page.id.space());
if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
ib::info() << "Applying log to page " << block->page.id;
}
- DBUG_LOG("ib_log", "Applying log to page " << block->page.id);
+ DBUG_PRINT("ib_log", ("Applying log to page %u:%u",
+ block->page.id.space(),
+ block->page.id.page_no()));
p->second.state = page_recv_t::RECV_BEING_PROCESSED;
@@ -2047,11 +2794,17 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
page = block->frame;
page_zip = buf_block_get_page_zip(block);
- const lsn_t page_lsn = mach_read_from_8(page + FIL_PAGE_LSN);
+ byte *frame = UNIV_LIKELY_NULL(block->page.zip.data)
+ ? block->page.zip.data
+ : page;
+ const lsn_t page_lsn = init
+ ? 0
+ : mach_read_from_8(frame + FIL_PAGE_LSN);
bool free_page = false;
lsn_t start_lsn = 0, end_lsn = 0;
ut_d(lsn_t recv_start_lsn = 0);
const lsn_t init_lsn = init ? init->lsn : 0;
+ const bool is_physical = log_sys.is_physical();
for (const log_rec_t* l : p->second.log) {
ut_ad(l->lsn);
@@ -2065,23 +2818,108 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
ut_d(recv_start_lsn = recv->start_lsn);
if (recv->start_lsn < page_lsn) {
- /* Ignore this record, because there are later changes
- for this page. */
- DBUG_LOG("ib_log", "apply skip "
- << get_mlog_string(recv->type)
- << " LSN " << recv->start_lsn << " < "
- << page_lsn);
- } else if (recv->start_lsn < init_lsn) {
- DBUG_LOG("ib_log", "init skip "
- << get_mlog_string(recv->type)
- << " LSN " << recv->start_lsn << " < "
- << init_lsn);
+ /* This record has already been applied. */
+ DBUG_PRINT("ib_log", ("apply skip %u:%u LSN " LSN_PF
+ " < " LSN_PF,
+ block->page.id.space(),
+ block->page.id.page_no(),
+ recv->start_lsn, page_lsn));
+ continue;
+ }
+
+ if (recv->start_lsn < init_lsn) {
+ DBUG_PRINT("ib_log", ("init skip %s %u:%u LSN " LSN_PF
+ " < " LSN_PF,
+ is_physical
+ ? "?"
+ : get_mlog_string(recv->type),
+ block->page.id.space(),
+ block->page.id.page_no(),
+ recv->start_lsn, init_lsn));
+ continue;
+ }
+
+ if (is_physical) {
+ const log_phys_t *f= static_cast<const log_phys_t*>(l);
+
+ if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
+ ib::info() << "apply " << f->start_lsn
+ << ": " << block->page.id;
+ }
+
+ DBUG_PRINT("ib_log", ("apply " LSN_PF ": %u:%u",
+ f->start_lsn,
+ block->page.id.space(),
+ block->page.id.page_no()));
+
+ log_phys_t::apply_status a= f->apply(
+ *block, p->second.last_offset);
+
+ switch (a) {
+ case log_phys_t::APPLIED_NO:
+ ut_ad(!mtr.has_modifications());
+ free_page = true;
+ start_lsn = 0;
+ continue;
+ case log_phys_t::APPLIED_YES:
+ goto set_start_lsn;
+ case log_phys_t::APPLIED_TO_FSP_HEADER:
+ case log_phys_t::APPLIED_TO_ENCRYPTION:
+ break;
+ }
+
+ if (fil_space_t* s = space
+ ? space
+ : fil_space_acquire(block->page.id.space())) {
+ switch (a) {
+ case log_phys_t::APPLIED_TO_FSP_HEADER:
+ s->flags = mach_read_from_4(
+ FSP_HEADER_OFFSET
+ + FSP_SPACE_FLAGS + frame);
+ s->size_in_header = mach_read_from_4(
+ FSP_HEADER_OFFSET + FSP_SIZE
+ + frame);
+ s->free_limit = mach_read_from_4(
+ FSP_HEADER_OFFSET
+ + FSP_FREE_LIMIT + frame);
+ s->free_len = mach_read_from_4(
+ FSP_HEADER_OFFSET + FSP_FREE
+ + FLST_LEN + frame);
+ break;
+ default:
+ byte* b= frame
+ + fsp_header_get_encryption_offset(
+ block->zip_size())
+ + FSP_HEADER_OFFSET;
+ if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) {
+ break;
+ }
+ b += MAGIC_SZ;
+ if (*b != CRYPT_SCHEME_UNENCRYPTED
+ && *b != CRYPT_SCHEME_1) {
+ break;
+ }
+ if (b[1] != MY_AES_BLOCK_SIZE) {
+ break;
+ }
+ if (b[2 + MY_AES_BLOCK_SIZE + 4 + 4]
+ > FIL_ENCRYPTION_OFF) {
+ break;
+ }
+ fil_crypt_parse(s, b);
+ }
+
+ if (s != space) {
+ s->release();
+ }
+ }
} else {
if (recv->type == MLOG_INIT_FREE_PAGE) {
/* This does not really modify the page. */
+ ut_ad(!mtr.has_modifications());
free_page = true;
- } else if (!start_lsn) {
- start_lsn = recv->start_lsn;
+ start_lsn = 0;
+ continue;
}
if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
@@ -2130,9 +2968,24 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
FIL_PAGE_LSN + page, 8);
}
}
+
+set_start_lsn:
+ if (!start_lsn) {
+ start_lsn = recv->start_lsn;
+ }
}
if (start_lsn) {
+ ut_ad(end_lsn >= start_lsn);
+ mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn);
+ if (UNIV_LIKELY(frame == block->frame)) {
+ mach_write_to_8(srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM
+ + frame, end_lsn);
+ } else {
+ buf_zip_decompress(block, false);
+ }
+
buf_block_modify_clock_inc(block);
log_flush_order_mutex_enter();
buf_flush_note_modification(block, start_lsn, end_lsn);
@@ -2187,8 +3040,9 @@ ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id)
}
/** Apply any buffered redo log to a page that was just read from a data file.
+@param[in,out] space tablespace
@param[in,out] bpage buffer pool page */
-void recv_recover_page(buf_page_t* bpage)
+void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
{
mtr_t mtr;
mtr.start();
@@ -2211,7 +3065,7 @@ void recv_recover_page(buf_page_t* bpage)
recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id);
if (p != recv_sys.pages.end()
&& !p->second.is_being_processed()) {
- recv_recover_page(block, mtr, p);
+ recv_recover_page(block, mtr, p, space);
p->second.log.clear();
recv_sys.pages.erase(p);
goto func_exit;
@@ -2391,7 +3245,7 @@ void recv_apply_hashed_log_recs(bool last_batch)
buf_block_dbg_add_level(
block, SYNC_NO_ORDER_CHECK);
mtr.x_latch_at_savepoint(0, block);
- recv_recover_page(block, mtr, p, &i);
+ recv_recover_page(block, mtr, p, space, &i);
ut_ad(mtr.has_committed());
p->second.log.clear();
recv_sys.pages.erase(p);
@@ -2560,26 +3414,6 @@ recv_parse_log_rec(
return ulint(new_ptr - ptr);
}
-/*******************************************************//**
-Calculates the new value for lsn when more data is added to the log. */
-static
-lsn_t
-recv_calc_lsn_on_data_add(
-/*======================*/
- lsn_t lsn, /*!< in: old lsn */
- ib_uint64_t len) /*!< in: this many bytes of data is
- added, log block headers not included */
-{
- unsigned frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE;
- unsigned payload_size = log_sys.payload_size();
- ut_ad(frag_len < payload_size);
- lsn_t lsn_len = len;
- lsn_len += (lsn_len + frag_len) / payload_size
- * (OS_FILE_LOG_BLOCK_SIZE - payload_size);
-
- return(lsn + lsn_len);
-}
-
/** Prints diagnostic info of corrupt log.
@param[in] ptr pointer to corrupt log record
@param[in] type type of the log record (could be garbage)
@@ -2658,10 +3492,18 @@ hash table to wait merging to file pages.
@param[in] checkpoint_lsn the LSN of the latest checkpoint
@param[in] store whether to store page operations
@param[in] apply whether to apply the records
-@return whether MLOG_CHECKPOINT record was seen the first time,
-or corruption was noticed */
-bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t* store, bool apply)
+@return whether MLOG_CHECKPOINT or FILE_CHECKPOINT record
+was seen the first time, or corruption was noticed */
+bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t *store, bool apply)
{
+ ut_ad(log_mutex_own());
+ ut_ad(mutex_own(&recv_sys.mutex));
+ ut_ad(recv_sys.parse_start_lsn != 0);
+
+ if (log_sys.is_physical()) {
+ return recv_sys.parse(checkpoint_lsn, *store, apply);
+ }
+
bool single_rec;
ulint len;
lsn_t new_recovered_lsn;
@@ -2672,9 +3514,6 @@ bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t* store, bool apply)
const byte* body;
const bool last_phase = (*store == STORE_IF_EXISTS);
- ut_ad(log_mutex_own());
- ut_ad(mutex_own(&recv_sys.mutex));
- ut_ad(recv_sys.parse_start_lsn != 0);
loop:
const byte* ptr = recv_sys.buf + recv_sys.recovered_offset;
const byte* end_ptr = recv_sys.buf + recv_sys.len;
@@ -3087,6 +3926,10 @@ static bool recv_scan_log_recs(
const byte* const log_end = log_block
+ ulint(end_lsn - start_lsn);
+ const ulint sizeof_checkpoint= log_sys.is_physical()
+ ? SIZE_OF_FILE_CHECKPOINT
+ : SIZE_OF_MLOG_CHECKPOINT;
+
do {
ut_ad(!finished);
@@ -3132,11 +3975,17 @@ static bool recv_scan_log_recs(
scanned_lsn += data_len;
- if (data_len == LOG_BLOCK_HDR_SIZE + SIZE_OF_MLOG_CHECKPOINT
- && scanned_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT
- && log_block[LOG_BLOCK_HDR_SIZE] == MLOG_CHECKPOINT
- && checkpoint_lsn == mach_read_from_8(LOG_BLOCK_HDR_SIZE
- + 1 + log_block)) {
+ if (data_len == LOG_BLOCK_HDR_SIZE + sizeof_checkpoint
+ && scanned_lsn == checkpoint_lsn + sizeof_checkpoint
+ && log_block[LOG_BLOCK_HDR_SIZE]
+ == (log_sys.is_physical()
+ ? FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2)
+ : MLOG_CHECKPOINT)
+ && checkpoint_lsn == mach_read_from_8(
+ (log_sys.is_physical()
+ ? LOG_BLOCK_HDR_SIZE + 1 + 2
+ : LOG_BLOCK_HDR_SIZE + 1)
+ + log_block)) {
/* The redo log is logically empty. */
ut_ad(recv_sys.mlog_checkpoint_lsn == 0
|| recv_sys.mlog_checkpoint_lsn
@@ -3170,8 +4019,7 @@ static bool recv_scan_log_recs(
DBUG_EXECUTE_IF(
"reduce_recv_parsing_buf",
- recv_parsing_buf_size
- = (70 * 1024);
+ recv_parsing_buf_size = RECV_SCAN_SIZE * 2;
);
if (recv_sys.len + 4 * OS_FILE_LOG_BLOCK_SIZE
@@ -3231,7 +4079,10 @@ static bool recv_scan_log_recs(
recv_sys.is_memory_exhausted(store);
- if (recv_sys.recovered_offset > recv_parsing_buf_size / 4) {
+ if (recv_sys.recovered_offset > recv_parsing_buf_size / 4
+ || (recv_sys.recovered_offset
+ && recv_sys.len
+ >= recv_parsing_buf_size - RECV_SCAN_SIZE)) {
/* Move parsing buffer data to the buffer start */
recv_sys_justify_left_parsing_buf();
}
@@ -3469,10 +4320,15 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
are some redo log records for it. */
fil_names_dirty(rs.second.space);
} else if (rs.second.name == "") {
- ib::error() << "Missing MLOG_FILE_NAME"
- " or MLOG_FILE_DELETE"
- " before MLOG_CHECKPOINT for tablespace "
- << rs.first;
+ ib::error() << (log_sys.is_physical()
+ ? "Missing FILE_CREATE, FILE_DELETE"
+ " or FILE_MODIFY"
+ " before FILE_CHECKPOINT"
+ " for tablespace "
+ : "Missing MLOG_FILE_NAME"
+ " or MLOG_FILE_DELETE"
+ " before MLOG_CHECKPOINT"
+ " for tablespace ") << rs.first;
recv_sys.found_corrupt_log = true;
return(DB_CORRUPTION);
} else {
@@ -3576,7 +4432,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
return(DB_ERROR);
}
- /* Look for MLOG_CHECKPOINT. */
+ /* Look for MLOG_CHECKPOINT or FILE_CHECKPOINT. */
recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false);
/* The first scan should not have stored or applied any records. */
ut_ad(recv_sys.pages.empty());
@@ -3598,7 +4454,9 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) {
log_mutex_exit();
ib::error err;
- err << "Missing MLOG_CHECKPOINT";
+ err << (log_sys.is_physical()
+ ? "Missing FILE_CHECKPOINT"
+ : "Missing MLOG_CHECKPOINT");
if (end_lsn) {
err << " at " << end_lsn;
}
@@ -3624,14 +4482,17 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
/* NOTE: we always do a 'recovery' at startup, but only if
there is something wrong we will print a message to the
user about recovery: */
+ const ulint sizeof_checkpoint= log_sys.is_physical()
+ ? SIZE_OF_FILE_CHECKPOINT
+ : SIZE_OF_MLOG_CHECKPOINT;
- if (flush_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT
+ if (flush_lsn == checkpoint_lsn + sizeof_checkpoint
&& recv_sys.mlog_checkpoint_lsn == checkpoint_lsn) {
/* The redo log is logically empty. */
} else if (checkpoint_lsn != flush_lsn) {
ut_ad(!srv_log_files_created);
- if (checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT < flush_lsn) {
+ if (checkpoint_lsn + sizeof_checkpoint < flush_lsn) {
ib::warn() << "Are you sure you are using the"
" right ib_logfiles to start up the database?"
" Log sequence number in the ib_logfiles is "
@@ -3770,7 +4631,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
log_sys.last_checkpoint_lsn = checkpoint_lsn;
if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL) {
- /* Write a MLOG_CHECKPOINT marker as the first thing,
+ /* Write a MLOG_CHECKPOINT or FILE_CHECKPOINT first,
before generating any other redo log. This ensures
that subsequent crash recovery will be possible even
if the server were killed soon after this. */
@@ -3937,9 +4798,6 @@ static const char* get_mlog_string(mlog_id_t type)
case MLOG_IBUF_BITMAP_INIT:
return("MLOG_IBUF_BITMAP_INIT");
- case MLOG_ZIP_WRITE_STRING:
- return("MLOG_ZIP_WRITE_STRING");
-
case MLOG_WRITE_STRING:
return("MLOG_WRITE_STRING");
diff --git a/storage/innobase/mtr/mtr0log.cc b/storage/innobase/mtr/mtr0log.cc
index 63a313ff0b8..fb363f012ce 100644
--- a/storage/innobase/mtr/mtr0log.cc
+++ b/storage/innobase/mtr/mtr0log.cc
@@ -26,15 +26,14 @@ Created 12/7/1995 Heikki Tuuri
#include "mtr0log.h"
#include "buf0buf.h"
-#include "dict0dict.h"
+#include "dict0mem.h"
#include "log0recv.h"
#include "page0page.h"
-#include "buf0dblwr.h"
-#include "dict0boot.h"
/********************************************************//**
-Parses an initial log record written by mtr_t::write_low().
+Parses an initial log record written by mlog_write_initial_log_record_low().
@return parsed record end, NULL if not a complete record */
+ATTRIBUTE_COLD /* only used when crash-upgrading */
const byte*
mlog_parse_initial_log_record(
/*==========================*/
@@ -196,112 +195,6 @@ mlog_parse_nbytes(
return const_cast<byte*>(ptr);
}
-/**
-Write a log record for writing 1, 2, 4, or 8 bytes.
-@param[in] type number of bytes to write
-@param[in] block file page
-@param[in] ptr pointer within block.frame
-@param[in,out] l log record buffer
-@return new end of mini-transaction log */
-byte *mtr_t::log_write_low(mlog_id_t type, const buf_block_t &block,
- const byte *ptr, byte *l)
-{
- ut_ad(type == MLOG_1BYTE || type == MLOG_2BYTES || type == MLOG_4BYTES ||
- type == MLOG_8BYTES);
- ut_ad(block.page.state == BUF_BLOCK_FILE_PAGE);
- ut_ad(ptr >= block.frame + FIL_PAGE_OFFSET);
- ut_ad(ptr + unsigned(type) <=
- &block.frame[srv_page_size - FIL_PAGE_DATA_END]);
- l= log_write_low(type, block.page.id, l);
- mach_write_to_2(l, page_offset(ptr));
- return l + 2;
-}
-
-/**
-Write a log record for writing 1, 2, or 4 bytes.
-@param[in] block file page
-@param[in,out] ptr pointer in file page
-@param[in] l number of bytes to write
-@param[in,out] log_ptr log record buffer
-@param[in] val value to write */
-void mtr_t::log_write(const buf_block_t &block, byte *ptr, mlog_id_t l,
- byte *log_ptr, uint32_t val)
-{
- ut_ad(l == MLOG_1BYTE || l == MLOG_2BYTES || l == MLOG_4BYTES);
- log_ptr= log_write_low(l, block, ptr, log_ptr);
- log_ptr+= mach_write_compressed(log_ptr, val);
- m_log.close(log_ptr);
-}
-
-/**
-Write a log record for writing 8 bytes.
-@param[in] block file page
-@param[in,out] ptr pointer in file page
-@param[in] l number of bytes to write
-@param[in,out] log_ptr log record buffer
-@param[in] val value to write */
-void mtr_t::log_write(const buf_block_t &block, byte *ptr, mlog_id_t l,
- byte *log_ptr, uint64_t val)
-{
- ut_ad(l == MLOG_8BYTES);
- log_ptr= log_write_low(l, block, ptr, log_ptr);
- log_ptr+= mach_u64_write_compressed(log_ptr, val);
- m_log.close(log_ptr);
-}
-
-/** Log a write of a byte string to a page.
-@param[in] b buffer page
-@param[in] ofs byte offset from b->frame
-@param[in] len length of the data to write */
-void mtr_t::memcpy(const buf_block_t &b, ulint ofs, ulint len)
-{
- ut_ad(len);
- ut_ad(ofs <= ulint(srv_page_size));
- ut_ad(ofs + len <= ulint(srv_page_size));
-
- set_modified();
- if (m_log_mode != MTR_LOG_ALL)
- {
- ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO);
- return;
- }
-
- ut_ad(ofs + len < PAGE_DATA || !b.page.zip.data ||
- mach_read_from_2(b.frame + FIL_PAGE_TYPE) <= FIL_PAGE_TYPE_ZBLOB2);
-
- byte *l= log_write_low(MLOG_WRITE_STRING, b.page.id, m_log.open(11 + 2 + 2));
- mach_write_to_2(l, ofs);
- mach_write_to_2(l + 2, len);
- m_log.close(l + 4);
- m_log.push(b.frame + ofs, static_cast<uint32_t>(len));
-}
-
-/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
-@param[in] b ROW_FORMAT=COMPRESSED index page
-@param[in] ofs byte offset from b.zip.data
-@param[in] len length of the data to write */
-void mtr_t::zmemcpy(const buf_page_t &b, ulint offset, ulint len)
-{
- ut_ad(page_zip_simple_validate(&b.zip));
- ut_ad(len);
- ut_ad(offset + len <= page_zip_get_size(&b.zip));
- ut_ad(mach_read_from_2(b.zip.data + FIL_PAGE_TYPE) == FIL_PAGE_INDEX ||
- mach_read_from_2(b.zip.data + FIL_PAGE_TYPE) == FIL_PAGE_RTREE);
-
- set_modified();
- if (m_log_mode != MTR_LOG_ALL)
- {
- ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO);
- return;
- }
-
- byte *l= log_write_low(MLOG_ZIP_WRITE_STRING, b.id, m_log.open(11 + 2 + 2));
- mach_write_to_2(l, offset);
- mach_write_to_2(l + 2, len);
- m_log.close(l + 4);
- m_log.push(b.zip.data + offset, static_cast<uint32_t>(len));
-}
-
/********************************************************//**
Parses a log record written by mtr_t::memcpy().
@return parsed record end, NULL if not a complete record */
@@ -353,34 +246,6 @@ mlog_parse_string(
return(ptr + len);
}
-/** Initialize a string of bytes.
-@param[in,out] b buffer page
-@param[in] ofs byte offset from block->frame
-@param[in] len length of the data to write
-@param[in] val the data byte to write */
-void mtr_t::memset(const buf_block_t* b, ulint ofs, ulint len, byte val)
-{
- ut_ad(len);
- ut_ad(ofs <= ulint(srv_page_size));
- ut_ad(ofs + len <= ulint(srv_page_size));
- ut_ad(ofs + len < PAGE_DATA || !b->page.zip.data ||
- mach_read_from_2(b->frame + FIL_PAGE_TYPE) <= FIL_PAGE_TYPE_ZBLOB2);
- ::memset(ofs + b->frame, val, len);
-
- set_modified();
- if (m_log_mode != MTR_LOG_ALL)
- {
- ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO);
- return;
- }
-
- byte *l= log_write_low(MLOG_MEMSET, b->page.id, m_log.open(11 + 2 + 2 + 1));
- mach_write_to_2(l, ofs);
- mach_write_to_2(l + 2, len);
- l[4]= val;
- m_log.close(l + 5);
-}
-
/********************************************************//**
Parses a log record written by mlog_open_and_write_index.
@return parsed record end, NULL if not a complete record */
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index cacdb4878c8..2e907d6b113 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -378,13 +378,15 @@ void mtr_t::start()
ut_d(m_start= true);
ut_d(m_commit= false);
+ m_last= nullptr;
+ m_last_offset= 0;
+
new(&m_memo) mtr_buf_t();
new(&m_log) mtr_buf_t();
m_made_dirty= false;
m_inside_ibuf= false;
m_modifications= false;
- m_n_log_recs= 0;
m_log_mode= MTR_LOG_ALL;
ut_d(m_user_space_id= TRX_SYS_SPACE);
m_user_space= nullptr;
@@ -411,7 +413,7 @@ void mtr_t::commit()
ut_ad(!m_modifications || !recv_no_log_write);
ut_ad(!m_modifications || m_log_mode != MTR_LOG_NONE);
- if (m_modifications && (m_n_log_recs || m_log_mode == MTR_LOG_NO_REDO))
+ if (m_modifications && (m_log_mode == MTR_LOG_NO_REDO || !m_log.empty()))
{
ut_ad(!srv_read_only_mode || m_log_mode == MTR_LOG_NO_REDO);
@@ -445,7 +447,7 @@ void mtr_t::commit()
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
-MLOG_FILE_NAME records and an optional MLOG_CHECKPOINT marker.
+FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
The caller must invoke log_mutex_enter() and log_mutex_exit().
This is to be used at log_checkpoint().
@param[in] checkpoint_lsn log checkpoint LSN, or 0 */
@@ -458,23 +460,16 @@ void mtr_t::commit_files(lsn_t checkpoint_lsn)
ut_ad(!m_made_dirty);
ut_ad(m_memo.size() == 0);
ut_ad(!srv_read_only_mode);
- ut_ad(checkpoint_lsn || m_n_log_recs > 1);
-
- switch (m_n_log_recs) {
- case 0:
- break;
- case 1:
- *m_log.front()->begin() |= MLOG_SINGLE_REC_FLAG;
- break;
- default:
- *m_log.push<byte*>(1) = MLOG_MULTI_REC_END;
- }
if (checkpoint_lsn) {
- byte* ptr = m_log.push<byte*>(SIZE_OF_MLOG_CHECKPOINT);
- compile_time_assert(SIZE_OF_MLOG_CHECKPOINT == 1 + 8);
- *ptr = MLOG_CHECKPOINT;
- mach_write_to_8(ptr + 1, checkpoint_lsn);
+ byte* ptr = m_log.push<byte*>(SIZE_OF_FILE_CHECKPOINT);
+ compile_time_assert(SIZE_OF_FILE_CHECKPOINT == 3 + 8 + 1);
+ *ptr = FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2);
+ ::memset(ptr + 1, 0, 2);
+ mach_write_to_8(ptr + 3, checkpoint_lsn);
+ ptr[3 + 8] = 0;
+ } else {
+ *m_log.push<byte*>(1) = 0;
}
finish_write(m_log.size());
@@ -482,14 +477,14 @@ void mtr_t::commit_files(lsn_t checkpoint_lsn)
if (checkpoint_lsn) {
DBUG_PRINT("ib_log",
- ("MLOG_CHECKPOINT(" LSN_PF ") written at " LSN_PF,
+ ("FILE_CHECKPOINT(" LSN_PF ") written at " LSN_PF,
checkpoint_lsn, log_sys.lsn));
}
}
#ifdef UNIV_DEBUG
/** Check if a tablespace is associated with the mini-transaction
-(needed for generating a MLOG_FILE_NAME record)
+(needed for generating a FILE_MODIFY record)
@param[in] space tablespace
@return whether the mini-transaction is associated with the space */
bool
@@ -510,7 +505,7 @@ mtr_t::is_named_space(ulint space) const
return(false);
}
/** Check if a tablespace is associated with the mini-transaction
-(needed for generating a MLOG_FILE_NAME record)
+(needed for generating a FILE_MODIFY record)
@param[in] space tablespace
@return whether the mini-transaction is associated with the space */
bool mtr_t::is_named_space(const fil_space_t* space) const
@@ -618,53 +613,32 @@ inline ulint mtr_t::prepare_write()
}
ulint len = m_log.size();
- ulint n_recs = m_n_log_recs;
ut_ad(len > 0);
- ut_ad(n_recs > 0);
if (len > srv_log_buffer_size / 2) {
log_buffer_extend(ulong((len + 1) * 2));
}
- ut_ad(m_n_log_recs == n_recs);
-
fil_space_t* space = m_user_space;
if (space != NULL && is_predefined_tablespace(space->id)) {
- /* Omit MLOG_FILE_NAME for predefined tablespaces. */
+ /* Omit FILE_MODIFY for predefined tablespaces. */
space = NULL;
}
log_mutex_enter();
- if (fil_names_write_if_was_clean(space, this)) {
- /* This mini-transaction was the first one to modify
- this tablespace since the latest checkpoint, so
- some MLOG_FILE_NAME records were appended to m_log. */
- ut_ad(m_n_log_recs > n_recs);
- *m_log.push<byte*>(1) = MLOG_MULTI_REC_END;
+ if (fil_names_write_if_was_clean(space)) {
len = m_log.size();
} else {
/* This was not the first time of dirtying a
tablespace since the latest checkpoint. */
-
- ut_ad(n_recs == m_n_log_recs);
-
- if (n_recs <= 1) {
- ut_ad(n_recs == 1);
-
- /* Flag the single log record as the
- only record in this mini-transaction. */
- *m_log.front()->begin() |= MLOG_SINGLE_REC_FLAG;
- } else {
- /* Because this mini-transaction comprises
- multiple log records, append MLOG_MULTI_REC_END
- at the end. */
- *m_log.push<byte*>(1) = MLOG_MULTI_REC_END;
- len++;
- }
+ ut_ad(len == m_log.size());
}
+ *m_log.push<byte*>(1) = 0;
+ len++;
+
/* check and attempt a checkpoint if exceeding capacity */
log_margin_checkpoint_age(len);
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
index e3f59187650..d6d908a3163 100644
--- a/storage/innobase/page/page0cur.cc
+++ b/storage/innobase/page/page0cur.cc
@@ -791,6 +791,13 @@ static void rec_set_heap_no(rec_t *rec, ulint heap_no, bool compact)
REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
}
+static rec_t*
+page_cur_parse_insert_rec_zip(
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ offset_t* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
/***********************************************************//**
Parses a log record of a record insert on a page.
@return end of log record or NULL */
@@ -960,9 +967,9 @@ page_cur_parse_insert_rec(
/* The redo log record should only have been written
after the write was successful. */
if (block->page.zip.data) {
- if (!page_cur_insert_rec_zip(&cursor, index,
- buf + origin_offset,
- offsets, mtr)) {
+ if (!page_cur_parse_insert_rec_zip(&cursor, index,
+ buf + origin_offset,
+ offsets, mtr)) {
ut_error;
}
} else if (!page_cur_insert_rec_low(&cursor, index,
@@ -983,60 +990,6 @@ page_cur_parse_insert_rec(
return(const_cast<byte*>(ptr + end_seg_len));
}
-/** Reset PAGE_DIRECTION and PAGE_N_DIRECTION.
-@tparam compressed whether the page is in ROW_FORMAT=COMPRESSED
-@param[in,out] block index page
-@param[in,out] ptr the PAGE_DIRECTION_B field
-@param[in,out] mtr mini-transaction */
-template<bool compressed=false>
-inline void page_direction_reset(buf_block_t *block, byte *ptr, mtr_t *mtr)
-{
- ut_ad(!block->page.zip.data || page_is_comp(block->frame));
- ut_ad(!compressed || block->page.zip.data);
- ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + block->frame);
- static_assert(PAGE_DIRECTION_B + 1 == PAGE_N_DIRECTION, "adjacent fields");
-
- if (compressed)
- {
- *ptr= PAGE_NO_DIRECTION; /* no instant ALTER bits */
- memset_aligned<2>(ptr + 1, 0, 2);
- page_zip_write_header(block, ptr, 3, mtr);
- }
- else
- {
- mtr->write<1,mtr_t::OPT>(*block, ptr, (*ptr & ~((1U << 3) - 1))
- | PAGE_NO_DIRECTION);
- mtr->write<2,mtr_t::OPT>(*block, ptr + 1, 0U);
- }
-}
-
-/** Increment PAGE_N_DIRECTION.
-@tparam compressed whether the page is in ROW_FORMAT=COMPRESSED
-@param[in,out] block index page
-@param[in,out] ptr the PAGE_DIRECTION_B field
-@param[in] dir PAGE_RIGHT or PAGE_LEFT
-@param[in,out] mtr mini-transaction */
-template<bool compressed=false>
-inline void page_direction_increment(buf_block_t *block, byte *ptr, uint dir,
- mtr_t *mtr)
-{
- ut_ad(!block->page.zip.data || page_is_comp(block->frame));
- ut_ad(!compressed || block->page.zip.data);
- ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + block->frame);
- ut_ad(dir == PAGE_RIGHT || dir == PAGE_LEFT);
- if (compressed)
- {
- *ptr= static_cast<byte>(dir);
- mach_write_to_2(ptr + 1, 1 + mach_read_from_2(ptr + 1));
- page_zip_write_header(block, ptr, 3, mtr);
- }
- else
- {
- mtr->write<1,mtr_t::OPT>(*block, ptr, (*ptr & ~((1U << 3) - 1)) | dir);
- mtr->write<2>(*block, ptr + 1, 1U + mach_read_from_2(ptr + 1));
- }
-}
-
/**
Set the owned records field of the record pointed to by a directory slot.
@tparam compressed whether to update any ROW_FORMAT=COMPRESSED page as well
@@ -1082,7 +1035,8 @@ static void page_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr)
rec= page_rec_get_next_const(rec);
/* Add a directory slot immediately below this one. */
- byte *n_slots_p= PAGE_N_DIR_SLOTS + PAGE_HEADER + block->frame;
+ constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block->frame);
const uint16_t n_slots= mach_read_from_2(n_slots_p);
page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
@@ -1093,12 +1047,13 @@ static void page_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr)
const ulint half_owned= n_owned / 2;
+ mtr->write<2>(*block, n_slots_p, 1U + n_slots);
+
if (compressed)
{
/* Log changes to the compressed page header and the dense page
directory. */
- mach_write_to_2(n_slots_p, n_slots + 1);
- page_zip_write_header(block, n_slots_p, 2, mtr);
+ memcpy_aligned<2>(&block->page.zip.data[n_slots_f], n_slots_p, 2);
mach_write_to_2(slot, page_offset(rec));
page_rec_set_n_owned<true>(block, page_dir_slot_get_rec(slot), half_owned,
true, mtr);
@@ -1109,8 +1064,9 @@ static void page_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr)
}
else
{
- mtr->write<2>(*block, n_slots_p, 1U + n_slots);
- mtr->memcpy(*block, page_offset(last_slot), slot - last_slot);
+ mtr->memmove(*block, page_offset(last_slot),
+ page_offset(last_slot) + PAGE_DIR_SLOT_SIZE,
+ slot - last_slot);
mtr->write<2>(*block, slot, page_offset(rec));
const bool comp= page_is_comp(block->frame) != 0;
page_rec_set_n_owned<false>(block, page_dir_slot_get_rec(slot), half_owned,
@@ -1164,22 +1120,20 @@ static void page_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr)
block->frame, n_slots - 1);
memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
slot - last_slot);
+ constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>
+ (n_slots_f + block->frame);
+ mtr->write<2>(*block, n_slots_p, n_slots - 1);
+
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
memset_aligned<2>(last_slot, 0, 2);
- mach_write_to_2(PAGE_N_DIR_SLOTS + PAGE_HEADER
- + block->frame, n_slots - 1);
- page_zip_write_header(block,
- PAGE_N_DIR_SLOTS + PAGE_HEADER
- + block->frame, 2, mtr);
+ memcpy_aligned<2>(n_slots_f + block->page.zip.data,
+ n_slots_p, 2);
} else {
- mtr->write<2>(*block,
- PAGE_N_DIR_SLOTS + PAGE_HEADER
- + block->frame,
- n_slots - 1);
+ mtr->memmove(*block, PAGE_DIR_SLOT_SIZE
+ + page_offset(last_slot),
+ page_offset(last_slot), slot - last_slot);
mtr->write<2>(*block, last_slot, 0U);
- mtr->memcpy(*block, page_offset(last_slot)
- + PAGE_DIR_SLOT_SIZE,
- slot - last_slot);
}
return;
@@ -1245,14 +1199,14 @@ static byte* page_mem_alloc_heap(buf_block_t *block, ulint need,
mach_write_to_2(heap_top, top + need);
mach_write_to_2(n_heap, h + 1);
+ mtr->memcpy(*block, PAGE_HEAP_TOP + PAGE_HEADER, 4);
if (compressed)
{
ut_ad(h & 0x8000);
- page_zip_write_header(block, heap_top, 4, mtr);
+ memcpy_aligned<4>(&block->page.zip.data[PAGE_HEAP_TOP + PAGE_HEADER],
+ heap_top, 4);
}
- else
- mtr->memcpy(*block, PAGE_HEAP_TOP + PAGE_HEADER, 4);
compile_time_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2);
return &block->frame[top];
@@ -1272,236 +1226,332 @@ page_cur_insert_rec_low(
offset_t* offsets,/*!< in/out: rec_get_offsets(rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- byte* insert_buf;
- ulint rec_size;
- rec_t* last_insert; /*!< cursor position at previous
- insert */
- rec_t* free_rec; /*!< a free record that was reused,
- or NULL */
- rec_t* insert_rec; /*!< inserted record */
- ulint heap_no; /*!< heap number of the inserted
- record */
-
- rec_t* current_rec = cur->rec;
- buf_block_t* block = cur->block;
+ buf_block_t* block = cur->block;
- ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(dict_table_is_comp(index->table)
- == (ibool) !!page_is_comp(block->frame));
- ut_ad(fil_page_index_page_check(block->frame));
- ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame)
- == index->id
- || index->is_dummy
- || mtr->is_inside_ibuf());
+ ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
+ ut_ad(!!page_is_comp(block->frame) == !!rec_offs_comp(offsets));
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame) ==
+ index->id ||
+ index->is_dummy ||
+ mtr->is_inside_ibuf());
- ut_ad(!page_rec_is_supremum(current_rec));
+ ut_ad(!page_rec_is_supremum(cur->rec));
- /* We should not write log for ROW_FORMAT=COMPRESSED pages here. */
- ut_ad(mtr->get_log_mode() == MTR_LOG_NONE
- || mtr->get_log_mode() == MTR_LOG_NO_REDO
- || !(index->table->flags & DICT_TF_MASK_ZIP_SSIZE));
+ /* We should not write log for ROW_FORMAT=COMPRESSED pages here. */
+ ut_ad(mtr->get_log_mode() != MTR_LOG_ALL ||
+ !(index->table->flags & DICT_TF_MASK_ZIP_SSIZE));
- /* 1. Get the size of the physical record in the page */
- rec_size = rec_offs_size(offsets);
+ /* 1. Get the size of the physical record in the page */
+ const ulint rec_size= rec_offs_size(offsets);
#ifdef UNIV_DEBUG_VALGRIND
- {
- const void* rec_start
- = rec - rec_offs_extra_size(offsets);
- ulint extra_size
- = rec_offs_extra_size(offsets)
- - (rec_offs_comp(offsets)
- ? REC_N_NEW_EXTRA_BYTES
- : REC_N_OLD_EXTRA_BYTES);
-
- /* All data bytes of the record must be valid. */
- UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
- /* The variable-length header must be valid. */
- UNIV_MEM_ASSERT_RW(rec_start, extra_size);
- }
+ {
+ const void *rec_start= rec - rec_offs_extra_size(offsets);
+ ulint extra_size= rec_offs_extra_size(offsets) -
+ (page_is_comp(block->frame)
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES);
+ /* All data bytes of the record must be valid. */
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ UNIV_MEM_ASSERT_RW(rec_start, extra_size);
+ }
#endif /* UNIV_DEBUG_VALGRIND */
- /* 2. Try to find suitable space from page memory management */
+ /* 2. Try to find suitable space from page memory management */
+ ulint heap_no;
+ byte *insert_buf;
+ alignas(2) byte hdr[8];
- free_rec = page_header_get_ptr(block->frame, PAGE_FREE);
- if (UNIV_LIKELY_NULL(free_rec)) {
- /* Try to allocate from the head of the free list. */
- offset_t foffsets_[REC_OFFS_NORMAL_SIZE];
- offset_t* foffsets = foffsets_;
- mem_heap_t* heap = NULL;
-
- rec_offs_init(foffsets_);
+ if (rec_t* free_rec = page_header_get_ptr(block->frame, PAGE_FREE))
+ {
+ /* Try to reuse the head of PAGE_FREE. */
+ offset_t foffsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t *heap= nullptr;
+
+ rec_offs_init(foffsets_);
+
+ offset_t *foffsets= rec_get_offsets(free_rec, index, foffsets_,
+ page_is_leaf(block->frame),
+ ULINT_UNDEFINED, &heap);
+ insert_buf= free_rec - rec_offs_extra_size(foffsets);
+ const bool too_small= rec_offs_size(foffsets) < rec_size;
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ if (too_small)
+ goto use_heap;
+
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block->frame);
+ if (page_is_comp(block->frame))
+ {
+ heap_no= rec_get_heap_no_new(free_rec);
+ const rec_t *next= rec_get_next_ptr(free_rec, true);
+ mach_write_to_2(hdr, next ? page_offset(next) : 0);
+ }
+ else
+ {
+ heap_no= rec_get_heap_no_old(free_rec);
+ memcpy(hdr, free_rec - REC_NEXT, 2);
+ }
- foffsets = rec_get_offsets(
- free_rec, index, foffsets, page_is_leaf(block->frame),
- ULINT_UNDEFINED, &heap);
- if (rec_offs_size(foffsets) < rec_size) {
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
+ static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+ byte *page_garbage = my_assume_aligned<2>(page_free + 2);
+ ut_ad(mach_read_from_2(page_garbage) >= rec_size);
+ mach_write_to_2(my_assume_aligned<2>(hdr + 2),
+ mach_read_from_2(page_garbage) - rec_size);
+ mtr->memcpy(*block, page_free, hdr, 4);
+ }
+ else
+ {
+use_heap:
+ insert_buf= page_mem_alloc_heap(block, rec_size, &heap_no, mtr);
- goto use_heap;
- }
+ if (UNIV_UNLIKELY(!insert_buf))
+ return nullptr;
+ }
- insert_buf = free_rec - rec_offs_extra_size(foffsets);
+ const ulint extra_size= rec_offs_extra_size(offsets);
+ ut_ad(cur->rec != insert_buf + extra_size);
- byte* page_free = my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER
- + block->frame);
- byte* page_garbage = my_assume_aligned<2>(PAGE_GARBAGE
- + PAGE_HEADER
- + block->frame);
- ut_ad(mach_read_from_2(page_garbage) >= rec_size);
- mach_write_to_2(page_garbage, mach_read_from_2(page_garbage)
- - rec_size);
- if (page_is_comp(block->frame)) {
- heap_no = rec_get_heap_no_new(free_rec);
- const rec_t* next = rec_get_next_ptr(free_rec, true);
- mach_write_to_2(page_free,
- next ? page_offset(next) : 0);
- } else {
- heap_no = rec_get_heap_no_old(free_rec);
- memcpy(page_free, free_rec - REC_NEXT, 2);
- }
+ const rec_t *next_rec= page_rec_get_next_low(cur->rec,
+ page_is_comp(block->frame));
- compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
- mtr->memcpy(*block, PAGE_FREE + PAGE_HEADER, 4);
+ /* Update page header fields */
+ rec_t *last_insert= page_header_get_ptr(block->frame, PAGE_LAST_INSERT);
+ ut_ad(!last_insert || !page_is_comp(block->frame) ||
+ rec_get_node_ptr_flag(last_insert) == rec_get_node_ptr_flag(rec));
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
- } else {
-use_heap:
- free_rec = NULL;
- insert_buf = page_mem_alloc_heap(block, rec_size, &heap_no,
- mtr);
+ static_assert(PAGE_N_RECS - PAGE_LAST_INSERT + 2 == sizeof hdr,
+ "compatibility");
- if (UNIV_UNLIKELY(insert_buf == NULL)) {
- return(NULL);
- }
- }
+ /* Write PAGE_LAST_INSERT */
+ mach_write_to_2(hdr, page_offset(insert_buf + extra_size));
+ static_assert(PAGE_INSTANT - PAGE_LAST_INSERT == 2, "compatibility");
+ static_assert(PAGE_DIRECTION_B - PAGE_INSTANT == 1, "compatibility");
+ static_assert(PAGE_N_DIRECTION - PAGE_DIRECTION_B == 1, "compat.");
+ static_assert(PAGE_N_RECS - PAGE_N_DIRECTION == 2, "compatibility");
- /* 3. Create the record */
- insert_rec = rec_copy(insert_buf, rec, offsets);
- rec_offs_make_valid(insert_rec, index, page_is_leaf(block->frame),
- offsets);
+ /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
+ memcpy_aligned<2>(hdr + 2, PAGE_HEADER + PAGE_INSTANT + block->frame,
+ PAGE_N_RECS - PAGE_INSTANT + 2);
- /* 4. Insert the record in the linked list of records */
- ut_ad(current_rec != insert_rec);
+ if (!index->is_spatial())
+ {
+ byte *dir= &hdr[PAGE_DIRECTION_B - PAGE_LAST_INSERT];
+ byte *n= my_assume_aligned<2>(&hdr[PAGE_N_DIRECTION - PAGE_LAST_INSERT]);
+ if (UNIV_UNLIKELY(!last_insert))
+ {
+no_direction:
+ *dir= (*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION;
+ memset(n, 0, 2);
+ }
+ else if (last_insert == cur->rec && (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+ {
+ *dir= (*dir & ~((1U << 3) - 1)) | PAGE_RIGHT;
+inc_dir:
+ mach_write_to_2(n, mach_read_from_2(n) + 1);
+ }
+ else if (next_rec == last_insert && (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+ {
+ *dir= (*dir & ~((1U << 3) - 1)) | PAGE_LEFT;
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
- {
- /* next record after current before the insertion */
- if (page_is_comp(block->frame)) {
- const rec_t* next_rec = page_rec_get_next_low(
- current_rec, true);
+ /* Update PAGE_N_RECS. */
+ mach_write_to_2(hdr + PAGE_N_RECS - PAGE_LAST_INSERT,
+ mach_read_from_2(hdr + PAGE_N_RECS - PAGE_LAST_INSERT) + 1);
+ /* Write the header fields in one record. */
+ mtr->memcpy(*block, PAGE_LAST_INSERT + PAGE_HEADER + block->frame,
+ hdr, PAGE_N_RECS - PAGE_LAST_INSERT + 2);
+
+ /* Update the preceding record header, the 'owner' record and
+ prepare the record to insert. */
+ ulint n_owned;
+ static_assert(sizeof hdr >= REC_N_NEW_EXTRA_BYTES, "compatibility");
+ static_assert(sizeof hdr >= REC_N_OLD_EXTRA_BYTES, "compatibility");
+ ulint fixed_hdr;
+
+ if (page_is_comp(block->frame))
+ {
#ifdef UNIV_DEBUG
- switch (rec_get_status(current_rec)) {
- case REC_STATUS_ORDINARY:
- case REC_STATUS_NODE_PTR:
- case REC_STATUS_INSTANT:
- case REC_STATUS_INFIMUM:
- break;
- case REC_STATUS_SUPREMUM:
- ut_ad(!"wrong status on current_rec");
- }
- switch (rec_get_status(insert_rec)) {
- case REC_STATUS_ORDINARY:
- case REC_STATUS_NODE_PTR:
- case REC_STATUS_INSTANT:
- break;
- case REC_STATUS_INFIMUM:
- case REC_STATUS_SUPREMUM:
- ut_ad(!"wrong status on insert_rec");
- }
- ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+ switch (rec_get_status(cur->rec)) {
+ case REC_STATUS_ORDINARY:
+ case REC_STATUS_NODE_PTR:
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_INFIMUM:
+ break;
+ case REC_STATUS_SUPREMUM:
+ ut_ad(!"wrong status on cur->rec");
+ }
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ case REC_STATUS_NODE_PTR:
+ case REC_STATUS_INSTANT:
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ ut_ad(!"wrong status on rec");
+ }
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
#endif
- mach_write_to_2(insert_rec - REC_NEXT,
- static_cast<uint16_t>
- (next_rec - insert_rec));
- mtr->write<2>(*block, current_rec - REC_NEXT,
- static_cast<uint16_t>
- (insert_rec - current_rec));
- } else {
- memcpy(insert_rec - REC_NEXT, current_rec - REC_NEXT,
- 2);
- mtr->write<2>(*block, current_rec - REC_NEXT,
- page_offset(insert_rec));
- }
- }
-
- mtr->write<2>(*block, PAGE_N_RECS + PAGE_HEADER + block->frame,
- 1U + page_get_n_recs(block->frame));
+ memcpy(hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES);
+ rec_set_bit_field_1(hdr + REC_N_NEW_EXTRA_BYTES, 0, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(hdr + REC_N_NEW_EXTRA_BYTES, heap_no,
+ REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ const rec_t *insert_rec= insert_buf + extra_size;
+ mach_write_to_2(REC_N_NEW_EXTRA_BYTES - REC_NEXT + hdr,
+ static_cast<uint16_t>(next_rec - insert_rec));
+ mtr->write<2>(*block, cur->rec - REC_NEXT,
+ static_cast<uint16_t>(insert_rec - cur->rec));
+ while (!(n_owned = rec_get_n_owned_new(next_rec)))
+ next_rec= page_rec_get_next_low(next_rec, true);
+ page_rec_set_n_owned<false>(block, const_cast<rec_t*>(next_rec),
+ n_owned + 1, true, mtr);
+ fixed_hdr= REC_N_NEW_EXTRA_BYTES;
+ }
+ else
+ {
+ memcpy(hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES);
+ rec_set_bit_field_1(hdr + REC_N_OLD_EXTRA_BYTES, 0, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(hdr + REC_N_OLD_EXTRA_BYTES, heap_no,
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ memcpy(hdr + REC_N_OLD_EXTRA_BYTES - REC_NEXT, cur->rec - REC_NEXT, 2);
+ mtr->write<2>(*block, cur->rec - REC_NEXT,
+ page_offset(insert_buf + extra_size));
+ while (!(n_owned = rec_get_n_owned_old(next_rec)))
+ next_rec= page_rec_get_next_low(next_rec, false);
+ page_rec_set_n_owned<false>(block, const_cast<rec_t*>(next_rec),
+ n_owned + 1, false, mtr);
+ fixed_hdr= REC_N_OLD_EXTRA_BYTES;
+ }
- /* 5. Set the n_owned field in the inserted record to zero,
- and set the heap_no field */
- if (page_is_comp(block->frame)) {
- rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED,
- REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
- rec_set_bit_field_2(insert_rec, heap_no, REC_NEW_HEAP_NO,
- REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
- } else {
- rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED,
- REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
- rec_set_bit_field_2(insert_rec, heap_no, REC_OLD_HEAP_NO,
- REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
- }
+ ut_ad(fixed_hdr <= extra_size);
+ /* Insert the record, possibly copying from the preceding record. */
+ const ulint data_size = rec_offs_data_size(offsets);
+ ut_ad(mtr->has_modifications());
- UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
- rec_offs_size(offsets));
- mtr->memcpy(*block, page_offset(insert_buf), rec_offs_size(offsets));
+ if (mtr->get_log_mode() == MTR_LOG_ALL)
+ {
+ /* Temporarily write everything to rec, to simplify the code below. */
+ byte rec_hdr[REC_N_OLD_EXTRA_BYTES];
+ memcpy(rec_hdr, rec - fixed_hdr, fixed_hdr);
+ memcpy(const_cast<rec_t*>(rec - fixed_hdr), hdr, fixed_hdr);
- /* 6. Update the last insertion info in page header */
+ byte *b= insert_buf;
+ const byte *r= rec - extra_size;
- last_insert = page_header_get_ptr(block->frame, PAGE_LAST_INSERT);
- ut_ad(!last_insert || !page_is_comp(block->frame)
- || rec_get_node_ptr_flag(last_insert)
- == rec_get_node_ptr_flag(insert_rec));
+ /* Skip any unchanged prefix of the record header. */
+ for (;; b++, r++)
+ if (UNIV_UNLIKELY(b == insert_buf + rec_size))
+ goto rec_done;
+ else if (*b != *r)
+ break;
- if (!index->is_spatial()) {
- byte* ptr = PAGE_HEADER + PAGE_DIRECTION_B + block->frame;
- if (UNIV_UNLIKELY(last_insert == NULL)) {
-no_direction:
- page_direction_reset(block, ptr, mtr);
- } else if (last_insert == current_rec
- && page_ptr_get_direction(ptr) != PAGE_LEFT) {
- page_direction_increment(block, ptr, PAGE_RIGHT, mtr);
- } else if (page_ptr_get_direction(ptr) != PAGE_RIGHT
- && page_rec_get_next(insert_rec) == last_insert) {
- page_direction_increment(block, ptr, PAGE_LEFT, mtr);
- } else {
- goto no_direction;
- }
- }
+ {
+ const byte *c= cur->rec - (rec - r);
+ const byte *c_end= std::min(cur->rec + data_size,
+ block->frame + srv_page_size);
+ if (c <= insert_buf && c_end > insert_buf)
+ c_end= insert_buf;
+
+ /* Try to copy any bytes of the preceding record. */
+ if (UNIV_LIKELY(c >= block->frame && c < c_end))
+ {
+ const byte *cm= c;
+ const byte *rm= r;
+ while (*rm++ == *cm++)
+ if (cm == c_end)
+ break;
+ rm--, cm--;
+ ut_ad(rm - r + b <= insert_buf + rec_size);
+ size_t len= static_cast<size_t>(rm - r);
+ ut_ad(!memcmp(r, c, len));
+ if (len > 2)
+ {
+ memcpy(b, c, len);
+ mtr->memmove(*block, page_offset(b), page_offset(c), len);
+ c= cm;
+ b+= rm - r;
+ r= rm;
+ }
+ }
+
+ if (c < cur->rec)
+ {
+ if (!data_size)
+ {
+no_data:
+ mtr->memcpy<mtr_t::FORCED>(*block, b, r, cur->rec - c);
+ goto rec_done;
+ }
+ /* Some header bytes differ. Compare the data separately. */
+ byte *bd= insert_buf + extra_size;
+ const byte *rd= rec;
+ /* Skip any unchanged prefix of the record payload. */
+ for (;; bd++, rd++)
+ if (bd == insert_buf + rec_size)
+ goto no_data;
+ else if (*bd != *rd)
+ break;
+
+ /* Try to copy any data bytes of the preceding record. */
+ const byte * const cd= cur->rec + (rd - rec);
+ const byte *cdm= cd;
+ const byte *rdm= rd;
+ while (*rdm++ == *cdm++)
+ if (cdm == c_end)
+ break;
+ cdm--, rdm--;
+ ut_ad(rdm - rd + bd <= insert_buf + rec_size);
+ size_t len= static_cast<size_t>(rdm - rd);
+ ut_ad(!memcmp(rd, cd, len));
+ if (len > 2)
+ {
+ mtr->memcpy<mtr_t::FORCED>(*block, b, r, cur->rec - c);
+ memcpy(bd, cd, len);
+ mtr->memmove(*block, page_offset(bd), page_offset(cd), len);
+ c= cdm;
+ b= rdm - rd + bd;
+ r= rdm;
+ }
+ }
+ }
- mtr->write<2>(*block, PAGE_LAST_INSERT + PAGE_HEADER + block->frame,
- page_offset(insert_rec));
+ if (size_t len= static_cast<size_t>(insert_buf + rec_size - b))
+ mtr->memcpy<mtr_t::FORCED>(*block, b, r, len);
+rec_done:
+ ut_ad(!memcmp(insert_buf, rec - extra_size, rec_size));
- /* 7. It remains to update the owner record. */
- {
- rec_t* owner_rec = page_rec_find_owner_rec(insert_rec);
- ulint n_owned;
- if (page_is_comp(block->frame)) {
- n_owned = rec_get_n_owned_new(owner_rec);
- page_rec_set_n_owned<false>(block, owner_rec,
- n_owned + 1, true, mtr);
- } else {
- n_owned = rec_get_n_owned_old(owner_rec);
- page_rec_set_n_owned<false>(block, owner_rec,
- n_owned + 1, false, mtr);
- }
+ /* Restore the record header. */
+ memcpy(const_cast<rec_t*>(rec - fixed_hdr), rec_hdr, fixed_hdr);
+ }
+ else
+ {
+ memcpy(insert_buf, rec - extra_size, extra_size - fixed_hdr);
+ memcpy(insert_buf + extra_size - fixed_hdr, hdr, fixed_hdr);
+ memcpy(insert_buf + extra_size, rec, data_size);
+ }
- /* 8. Now we have incremented the n_owned field of the owner
- record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
- we have to split the corresponding directory slot in two. */
+ /* We have incremented the n_owned field of the owner record.
+ If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, we have to split the
+ corresponding directory slot in two. */
- if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
- page_dir_split_slot<false>(
- block,
- page_dir_find_owner_slot(owner_rec), mtr);
- }
- }
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ page_dir_split_slot<false>(block, page_dir_find_owner_slot(next_rec), mtr);
- return(insert_rec);
+ rec_offs_make_valid(insert_buf + extra_size, index,
+ page_is_leaf(block->frame), offsets);
+ return insert_buf + extra_size;
}
/** Add a slot to the dense page directory.
@@ -1541,8 +1591,8 @@ static inline void page_zip_dir_add_slot(buf_block_t *block,
if (const ulint len = ulint(stored - externs))
{
memmove(dst, externs, len);
- /* TODO: write MEMMOVE record */
- mtr->zmemcpy(block->page, dst - page_zip->data, len);
+ mtr->memmove(*block, dst - page_zip->data, externs - page_zip->data,
+ len);
}
}
else
@@ -1558,8 +1608,7 @@ static inline void page_zip_dir_add_slot(buf_block_t *block,
{
byte* dst = stored - PAGE_ZIP_DIR_SLOT_SIZE;
memmove(dst, stored, len);
- /* TODO: write MEMMOVE record */
- mtr->zmemcpy(block->page, dst - page_zip->data, len);
+ mtr->memmove(*block, dst - page_zip->data, stored - page_zip->data, len);
}
}
@@ -1584,16 +1633,396 @@ page_cur_insert_rec_zip(
offset_t* offsets,/*!< in/out: rec_get_offsets(rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
+ page_zip_des_t * const page_zip= page_cur_get_page_zip(cursor);
+ ut_ad(page_zip);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ ut_ad(index->table->not_redundant());
+ ut_ad(page_is_comp(cursor->block->frame));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(fil_page_get_type(cursor->block->frame) == FIL_PAGE_INDEX ||
+ fil_page_get_type(cursor->block->frame) == FIL_PAGE_RTREE);
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + cursor->block->frame) ==
+ index->id ||
+ index->is_dummy ||
+ mtr->is_inside_ibuf());
+ ut_ad(!page_get_instant(cursor->block->frame));
+ ut_ad(!page_cur_is_after_last(cursor));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, cursor->block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* 1. Get the size of the physical record in the page */
+ const ulint rec_size= rec_offs_size(offsets);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ const void *rec_start= rec - rec_offs_extra_size(offsets);
+ ulint extra_size= rec_offs_extra_size(offsets) - REC_N_NEW_EXTRA_BYTES;
+ /* All data bytes of the record must be valid. */
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ UNIV_MEM_ASSERT_RW(rec_start, extra_size);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+ const bool reorg_before_insert= page_has_garbage(cursor->block->frame) &&
+ rec_size > page_get_max_insert_size(cursor->block->frame, 1) &&
+ rec_size <= page_get_max_insert_size_after_reorganize(cursor->block->frame,
+ 1);
+ constexpr uint16_t page_free_f= PAGE_FREE + PAGE_HEADER;
+ byte* const page_free = my_assume_aligned<4>(page_free_f +
+ cursor->block->frame);
+ uint16_t free_rec= 0;
+
+ /* 2. Try to find suitable space from page memory management */
+ ulint heap_no;
+ byte *insert_buf;
+
+ if (reorg_before_insert ||
+ !page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ {
+ /* SET GLOBAL might be executed concurrently. Sample the value once. */
+ ulint level= page_zip_level;
+#ifdef UNIV_DEBUG
+ const rec_t * const cursor_rec= page_cur_get_rec(cursor);
+#endif /* UNIV_DEBUG */
+
+ if (page_is_empty(cursor->block->frame))
+ {
+ ut_ad(page_cur_is_before_first(cursor));
+
+ /* This is an empty page. Recreate to remove the modification log. */
+ page_create_zip(cursor->block, index,
+ page_header_get_field(cursor->block->frame, PAGE_LEVEL),
+ 0, mtr);
+ ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+
+ if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ goto use_heap;
+
+ /* The cursor should remain on the page infimum. */
+ return nullptr;
+ }
+
+ if (page_zip->m_nonempty || page_has_garbage(cursor->block->frame))
+ {
+ ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+ if (!page_zip_reorganize(cursor->block, index, level, mtr, true))
+ {
+ ut_ad(cursor->rec == cursor_rec);
+ return nullptr;
+ }
+
+ if (pos)
+ cursor->rec= page_rec_get_nth(cursor->block->frame, pos);
+ else
+ ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->frame));
+
+ ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+
+ if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ goto use_heap;
+ }
+
+ /* Try compressing the whole page afterwards. */
+ const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NONE);
+ rec_t *insert_rec= page_cur_insert_rec_low(cursor, index, rec, offsets,
+ mtr);
+ mtr->set_log_mode(log_mode);
+
+ if (insert_rec)
+ {
+ ulint pos= page_rec_get_n_recs_before(insert_rec);
+ ut_ad(pos > 0);
+
+ /* We are writing entire page images to the log. Reduce the redo
+ log volume by reorganizing the page at the same time. */
+ if (page_zip_reorganize(cursor->block, index, level, mtr))
+ {
+ /* The page was reorganized: Seek to pos. */
+ cursor->rec= pos > 1
+ ? page_rec_get_nth(cursor->block->frame, pos - 1)
+ : cursor->block->frame + PAGE_NEW_INFIMUM;
+ insert_rec= cursor->block->frame + rec_get_next_offs(cursor->rec, 1);
+ rec_offs_make_valid(insert_rec, index,
+ page_is_leaf(cursor->block->frame), offsets);
+ return insert_rec;
+ }
+
+    /* Theoretically, we could try one last resort of
+    page_zip_reorganize() followed by page_zip_available(), but that
+    would be very unlikely to succeed. (If the fully reorganized page
+    failed to compress, why would compressing the page together with
+    the log of this record's insert succeed?) */
+
+ /* Out of space: restore the page */
+ if (!page_zip_decompress(page_zip, cursor->block->frame, false))
+ ut_error; /* Memory corrupted? */
+ ut_ad(page_validate(cursor->block->frame, index));
+ insert_rec= nullptr;
+ }
+ return insert_rec;
+ }
+
+ free_rec= mach_read_from_2(page_free);
+ if (free_rec)
+ {
+ /* Try to allocate from the head of the free list. */
+ offset_t foffsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t *heap= nullptr;
+
+ rec_offs_init(foffsets_);
+
+ offset_t *foffsets= rec_get_offsets(cursor->block->frame + free_rec, index,
+ foffsets_,
+ page_is_leaf(cursor->block->frame),
+ ULINT_UNDEFINED, &heap);
+ insert_buf= cursor->block->frame + free_rec -
+ rec_offs_extra_size(foffsets);
+
+ if (rec_offs_size(foffsets) < rec_size)
+ {
+too_small:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ free_rec= 0;
+ goto use_heap;
+ }
+
+ /* On compressed pages, do not relocate records from
+ the free list. If extra_size would grow, use the heap. */
+ const ssize_t extra_size_diff= lint(rec_offs_extra_size(offsets) -
+ rec_offs_extra_size(foffsets));
+
+ if (UNIV_UNLIKELY(extra_size_diff < 0))
+ {
+ /* Add an offset to the extra_size. */
+ if (rec_offs_size(foffsets) < rec_size - ssize_t(extra_size_diff))
+ goto too_small;
+
+ insert_buf-= extra_size_diff;
+ }
+ else if (UNIV_UNLIKELY(extra_size_diff))
+ /* Do not allow extra_size to grow */
+ goto too_small;
+
+ byte *const free_rec_ptr= cursor->block->frame + free_rec;
+ heap_no= rec_get_heap_no_new(free_rec_ptr);
+ int16_t next_rec= mach_read_from_2(free_rec_ptr - REC_NEXT);
+    /* With innodb_page_size=64k, int16_t would be unsafe to use here,
+    but that page size cannot be used with ROW_FORMAT=COMPRESSED. */
+ static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility");
+ if (next_rec)
+ {
+ next_rec+= free_rec;
+ ut_ad(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} <= next_rec);
+ ut_ad(static_cast<uint16_t>(next_rec) < srv_page_size);
+ }
+
+ byte *hdr= my_assume_aligned<4>(&page_zip->data[page_free_f]);
+ mach_write_to_2(hdr, static_cast<uint16_t>(next_rec));
+ const byte *const garbage= my_assume_aligned<2>(page_free + 2);
+ ut_ad(mach_read_from_2(garbage) >= rec_size);
+ mach_write_to_2(my_assume_aligned<2>(hdr + 2),
+ mach_read_from_2(garbage) - rec_size);
+ static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+ mtr->memcpy(*cursor->block, page_free, hdr, 4);
+
+ if (!page_is_leaf(cursor->block->frame))
+ {
+ /* Zero out the node pointer of free_rec, in case it will not be
+ overwritten by insert_rec. */
+ ut_ad(rec_size > REC_NODE_PTR_SIZE);
+
+ if (rec_offs_size(foffsets) > rec_size)
+ memset(rec_get_end(free_rec_ptr, foffsets) -
+ REC_NODE_PTR_SIZE, 0, REC_NODE_PTR_SIZE);
+ }
+ else if (index->is_clust())
+ {
+ /* Zero out DB_TRX_ID,DB_ROLL_PTR in free_rec, in case they will
+ not be overwritten by insert_rec. */
+
+ ulint len;
+ ulint trx_id_offs= rec_get_nth_field_offs(foffsets, index->db_trx_id(),
+ &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs +
+ rec_offs_extra_size(foffsets) > rec_size)
+ memset(free_rec_ptr + trx_id_offs, 0,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ut_ad(free_rec_ptr + trx_id_offs + DATA_TRX_ID_LEN ==
+ rec_get_nth_field(free_rec_ptr, foffsets, index->db_roll_ptr(),
+ &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ }
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ }
+ else
+ {
+use_heap:
+ ut_ad(!free_rec);
+ insert_buf = page_mem_alloc_heap<true>(cursor->block, rec_size, &heap_no,
+ mtr);
+
+ if (UNIV_UNLIKELY(!insert_buf))
+ return insert_buf;
+
+ page_zip_dir_add_slot(cursor->block, index, mtr);
+ }
+
+ /* 3. Create the record */
+ byte *insert_rec= rec_copy(insert_buf, rec, offsets);
+ rec_offs_make_valid(insert_rec, index, page_is_leaf(cursor->block->frame),
+ offsets);
+
+ /* 4. Insert the record in the linked list of records */
+ ut_ad(cursor->rec != insert_rec);
+
+ /* next record after current before the insertion */
+ const rec_t* next_rec = page_rec_get_next_low(cursor->rec, TRUE);
+ ut_ad(rec_get_status(cursor->rec) <= REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+
+ mach_write_to_2(insert_rec - REC_NEXT, static_cast<uint16_t>
+ (next_rec - insert_rec));
+ mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t>
+ (insert_rec - cursor->rec));
+ byte *n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ cursor->block->frame);
+ mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs));
+ memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs, 2);
+
+ /* 5. Set the n_owned field in the inserted record to zero,
+ and set the heap_no field */
+ rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(insert_rec, heap_no, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+
+ UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
+ rec_offs_size(offsets));
+
+ /* 6. Update the last insertion info in page header */
+ byte *last_insert= my_assume_aligned<4>(PAGE_LAST_INSERT + PAGE_HEADER +
+ page_zip->data);
+ const uint16_t last_insert_rec= mach_read_from_2(last_insert);
+ ut_ad(!last_insert_rec ||
+ rec_get_node_ptr_flag(cursor->block->frame + last_insert_rec) ==
+ rec_get_node_ptr_flag(insert_rec));
+ mach_write_to_2(last_insert, page_offset(insert_rec));
+
+ if (!index->is_spatial())
+ {
+ byte *dir= &page_zip->data[PAGE_HEADER + PAGE_DIRECTION_B];
+ ut_ad(!(*dir & ~((1U << 3) - 1)));
+ byte *n= my_assume_aligned<2>
+ (&page_zip->data[PAGE_HEADER + PAGE_N_DIRECTION]);
+ if (UNIV_UNLIKELY(!last_insert_rec))
+ {
+no_direction:
+ *dir= PAGE_NO_DIRECTION;
+ memset(n, 0, 2);
+ }
+ else if (*dir != PAGE_LEFT &&
+ cursor->block->frame + last_insert_rec == cursor->rec)
+ {
+ *dir= PAGE_RIGHT;
+inc_dir:
+ mach_write_to_2(n, mach_read_from_2(n) + 1);
+ }
+ else if (*dir != PAGE_RIGHT && page_rec_get_next(insert_rec) ==
+ cursor->block->frame + last_insert_rec)
+ {
+ *dir= PAGE_LEFT;
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+ /* Write the header fields in one record. */
+ mtr->memcpy(*cursor->block,
+ my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
+ cursor->block->frame),
+ my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
+ page_zip->data),
+ PAGE_N_RECS - PAGE_LAST_INSERT + 2);
+
+ /* 7. It remains to update the owner record. */
+ ulint n_owned;
+
+ while (!(n_owned = rec_get_n_owned_new(next_rec)))
+ next_rec = page_rec_get_next_low(next_rec, true);
+
+ rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1,
+ REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ page_zip_dir_insert(cursor, free_rec, insert_rec, mtr);
+
+ /* 8. Now we have incremented the n_owned field of the owner
+ record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+ we have to split the corresponding directory slot in two. */
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ page_dir_split_slot<true>(cursor->block,
+ page_dir_find_owner_slot(next_rec), mtr);
+
+ page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr);
+ return insert_rec;
+}
+
+/** Increment PAGE_N_DIRECTION.
+@param[in,out] block ROW_FORMAT=COMPRESSED index page
+@param[in,out] ptr the PAGE_DIRECTION_B field
+@param[in] dir PAGE_RIGHT or PAGE_LEFT */
+static inline void page_direction_increment(buf_block_t *block, byte *ptr,
+ uint dir)
+{
+ ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + block->frame);
+ ut_ad(dir == PAGE_RIGHT || dir == PAGE_LEFT);
+ block->page.zip.data[PAGE_HEADER + PAGE_DIRECTION_B]= *ptr= dir;
+ mach_write_to_2(PAGE_HEADER + PAGE_N_DIRECTION + block->frame,
+ 1U + page_header_get_field(block->frame, PAGE_N_DIRECTION));
+ memcpy_aligned<2>(PAGE_HEADER + PAGE_N_DIRECTION + block->frame,
+ PAGE_HEADER + PAGE_N_DIRECTION + block->page.zip.data, 2);
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed and uncompressed
+page. Returns pointer to inserted record if succeed, i.e.,
+enough space available, NULL otherwise.
+The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if succeed, NULL otherwise */
+static rec_t*
+page_cur_parse_insert_rec_zip(
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ offset_t* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
byte* insert_buf;
ulint rec_size;
page_t* page; /*!< the relevant page */
- rec_t* free_rec; /*!< a free record that was reused,
- or NULL */
rec_t* insert_rec; /*!< inserted record */
ulint heap_no; /*!< heap number of the inserted
record */
page_zip_des_t* page_zip;
+ ut_ad(!log_sys.is_physical());
+
page_zip = page_cur_get_page_zip(cursor);
ut_ad(page_zip);
ut_ad(rec_offs_validate(rec, index, offsets));
@@ -1635,6 +2064,9 @@ page_cur_insert_rec_zip(
&& rec_size > page_get_max_insert_size(page, 1)
&& rec_size <= page_get_max_insert_size_after_reorganize(
page, 1);
+ constexpr uint16_t page_free_f = PAGE_FREE + PAGE_HEADER;
+ byte* const page_free = my_assume_aligned<4>(page_free_f + page);
+ uint16_t free_rec;
/* 2. Try to find suitable space from page memory management */
if (!page_zip_available(page_zip, dict_index_is_clust(index),
@@ -1646,16 +2078,14 @@ page_cur_insert_rec_zip(
rec_t* cursor_rec = page_cur_get_rec(cursor);
#endif /* UNIV_DEBUG */
-#if 1 /* MDEV-12353 FIXME: skip this for the physical log format! */
/* If we are not writing compressed page images, we
must reorganize the page before attempting the
insert. */
- if (recv_recovery_is_on()) {
+ if (recv_recovery_is_on() && !log_sys.is_physical()) {
/* Insert into the uncompressed page only.
The page reorganization or creation that we
would attempt outside crash recovery would
have been covered by a previous redo log record. */
-#endif
} else if (page_is_empty(page)) {
ut_ad(page_cur_is_before_first(cursor));
@@ -1669,6 +2099,7 @@ page_cur_insert_rec_zip(
if (page_zip_available(
page_zip, dict_index_is_clust(index),
rec_size, 1)) {
+ free_rec = 0;
goto use_heap;
}
@@ -1700,6 +2131,7 @@ page_cur_insert_rec_zip(
rec_size, 1)) {
/* After reorganizing, there is space
available. */
+ free_rec = 0;
goto use_heap;
}
}
@@ -1734,14 +2166,12 @@ page_cur_insert_rec_zip(
be logged after a successful operation. */
ut_ad(!recv_recovery_is_on());
ut_ad(!index->is_dummy);
-#if 1 /* MDEV-12353 FIXME: skip this for the physical log format! */
- } else if (recv_recovery_is_on()) {
+ } else if (recv_recovery_is_on() && !log_sys.is_physical()) {
/* This should be followed by
MLOG_ZIP_PAGE_COMPRESS_NO_DATA,
which should succeed. */
rec_offs_make_valid(insert_rec, index,
page_is_leaf(page), offsets);
-#endif
} else {
ulint pos = page_rec_get_n_recs_before(insert_rec);
ut_ad(pos > 0);
@@ -1786,8 +2216,8 @@ page_cur_insert_rec_zip(
return(insert_rec);
}
- free_rec = page_header_get_ptr(page, PAGE_FREE);
- if (UNIV_LIKELY_NULL(free_rec)) {
+ free_rec = mach_read_from_2(page_free);
+ if (free_rec) {
/* Try to allocate from the head of the free list. */
lint extra_size_diff;
offset_t foffsets_[REC_OFFS_NORMAL_SIZE];
@@ -1796,8 +2226,8 @@ page_cur_insert_rec_zip(
rec_offs_init(foffsets_);
- foffsets = rec_get_offsets(free_rec, index, foffsets,
- page_rec_is_leaf(free_rec),
+ foffsets = rec_get_offsets(page + free_rec, index, foffsets,
+ page_is_leaf(page),
ULINT_UNDEFINED, &heap);
if (rec_offs_size(foffsets) < rec_size) {
too_small:
@@ -1805,10 +2235,11 @@ too_small:
mem_heap_free(heap);
}
+ free_rec = 0;
goto use_heap;
}
- insert_buf = free_rec - rec_offs_extra_size(foffsets);
+ insert_buf = page + free_rec - rec_offs_extra_size(foffsets);
/* On compressed pages, do not relocate records from
the free list. If extra_size would grow, use the heap. */
@@ -1830,16 +2261,27 @@ too_small:
goto too_small;
}
- heap_no = rec_get_heap_no_new(free_rec);
- const rec_t* next = rec_get_next_ptr_const(free_rec, true);
- mach_write_to_2(PAGE_FREE + PAGE_HEADER + page,
- next ? page_offset(next) : 0);
- byte* garbage = PAGE_GARBAGE + PAGE_HEADER + page;
+ heap_no = rec_get_heap_no_new(page + free_rec);
+ int16_t next_rec = mach_read_from_2(page + free_rec - REC_NEXT);
+ /* We assume that int16_t is safe to use here.
+ With innodb_page_size=64k it would be unsafe,
+ but that cannot be used with ROW_FORMAT=COMPRESSED. */
+ static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility");
+ if (next_rec) {
+ next_rec += free_rec;
+ ut_ad(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES}
+ <= next_rec);
+ ut_ad(static_cast<uint16_t>(next_rec) < srv_page_size);
+ }
+ mtr->write<2>(*cursor->block, page_free,
+ static_cast<uint16_t>(next_rec));
+ byte* garbage = my_assume_aligned<2>(page_free + 2);
ut_ad(mach_read_from_2(garbage) >= rec_size);
- mach_write_to_2(garbage, mach_read_from_2(garbage) - rec_size);
+ mtr->write<2>(*cursor->block, garbage,
+ mach_read_from_2(garbage) - rec_size);
compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
- page_zip_write_header(cursor->block,
- PAGE_HEADER + PAGE_FREE + page, 4, mtr);
+ compile_time_assert(!((PAGE_HEADER + PAGE_FREE) % 4));
+ memcpy_aligned<4>(&page_zip->data[page_free_f], page_free, 4);
/* TODO: group with PAGE_LAST_INSERT */
if (!page_is_leaf(page)) {
@@ -1852,7 +2294,7 @@ too_small:
if (rec_offs_extra_size(foffsets)
+ rec_offs_data_size(foffsets) > rec_size) {
- memset(rec_get_end(free_rec, foffsets)
+ memset(rec_get_end(page + free_rec, foffsets)
- REC_NODE_PTR_SIZE, 0,
REC_NODE_PTR_SIZE);
}
@@ -1875,7 +2317,7 @@ too_small:
they will not be fully overwritten by
insert_rec. */
- memset(free_rec + trx_id_offs, 0,
+ memset(page + free_rec + trx_id_offs, 0,
DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
}
@@ -1890,7 +2332,7 @@ too_small:
}
} else {
use_heap:
- free_rec = NULL;
+ ut_ad(!free_rec);
insert_buf = page_mem_alloc_heap<true>(cursor->block, rec_size,
&heap_no, mtr);
@@ -1918,9 +2360,10 @@ use_heap:
(next_rec - insert_rec));
mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t>
(insert_rec - cursor->rec));
- byte* n_recs = PAGE_N_RECS + PAGE_HEADER + page;
- mach_write_to_2(n_recs, mach_read_from_2(n_recs) + 1);
- page_zip_write_header(cursor->block, n_recs, 2, mtr);
+ byte* n_recs = my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page);
+ mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs));
+ memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs,
+ 2);
/* 5. Set the n_owned field in the inserted record to zero,
and set the heap_no field */
@@ -1935,52 +2378,59 @@ use_heap:
page_zip_dir_insert(cursor, free_rec, insert_rec, mtr);
/* 6. Update the last insertion info in page header */
- byte* last_insert = PAGE_LAST_INSERT + PAGE_HEADER + page;
+ byte* last_insert = my_assume_aligned<4>(PAGE_LAST_INSERT + PAGE_HEADER
+ + page);
const uint16_t last_insert_rec = mach_read_from_2(last_insert);
ut_ad(!last_insert_rec
|| rec_get_node_ptr_flag(page + last_insert_rec)
== rec_get_node_ptr_flag(insert_rec));
- /* TODO: combine with PAGE_DIRECTION changes */
- mach_write_to_2(last_insert, page_offset(insert_rec));
- page_zip_write_header(cursor->block, last_insert, 2, mtr);
+ /* FIXME: combine with PAGE_DIRECTION changes */
+ mtr->write<2>(*cursor->block, last_insert, page_offset(insert_rec));
+ memcpy_aligned<4>(&page_zip->data[PAGE_LAST_INSERT + PAGE_HEADER],
+ last_insert, 2);
if (!index->is_spatial()) {
byte* ptr = PAGE_HEADER + PAGE_DIRECTION_B + page;
if (UNIV_UNLIKELY(!last_insert_rec)) {
no_direction:
- page_direction_reset<true>(cursor->block, ptr, mtr);
+ page_zip->data[PAGE_HEADER + PAGE_DIRECTION_B] = *ptr
+ = PAGE_NO_DIRECTION;
+ memset_aligned<2>(PAGE_HEADER + PAGE_N_DIRECTION + page,
+ 0, 2);
+ memset_aligned<2>(PAGE_HEADER + PAGE_N_DIRECTION
+ + page_zip->data, 0, 2);
} else if (page + last_insert_rec == cursor->rec
&& page_ptr_get_direction(ptr) != PAGE_LEFT) {
- page_direction_increment<true>(cursor->block, ptr,
- PAGE_RIGHT, mtr);
+ page_direction_increment(cursor->block, ptr,
+ PAGE_RIGHT);
} else if (page_ptr_get_direction(ptr) != PAGE_RIGHT
&& page_rec_get_next(insert_rec)
== page + last_insert_rec) {
- page_direction_increment<true>(cursor->block, ptr,
- PAGE_LEFT, mtr);
+ page_direction_increment(cursor->block, ptr,
+ PAGE_LEFT);
} else {
goto no_direction;
}
}
/* 7. It remains to update the owner record. */
- {
- rec_t* owner_rec = page_rec_find_owner_rec(insert_rec);
- ulint n_owned;
+ ulint n_owned;
- n_owned = rec_get_n_owned_new(owner_rec);
- rec_set_bit_field_1(owner_rec, n_owned + 1, REC_NEW_N_OWNED,
- REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ while (!(n_owned = rec_get_n_owned_new(next_rec))) {
+ next_rec = page_rec_get_next_low(next_rec, true);
+ }
- /* 8. Now we have incremented the n_owned field of the owner
- record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
- we have to split the corresponding directory slot in two. */
+ rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1,
+ REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
- if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
- page_dir_split_slot<true>(
- page_cur_get_block(cursor),
- page_dir_find_owner_slot(owner_rec), mtr);
- }
+ /* 8. Now we have incremented the n_owned field of the owner
+ record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+ we have to split the corresponding directory slot in two. */
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
+ page_dir_split_slot<true>(page_cur_get_block(cursor),
+ page_dir_find_owner_slot(next_rec),
+ mtr);
}
page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr);
@@ -2045,10 +2495,15 @@ page_parse_copy_rec_list_to_created_page(
+ block->page.zip.data, 0, 2);
}
- if (!index->is_spatial()) {
- page_direction_reset<true>(block,
- PAGE_HEADER + PAGE_DIRECTION_B
- + block->frame, mtr);
+ if (index->is_spatial()) {
+ return rec_end;
+ }
+
+ block->frame[PAGE_HEADER + PAGE_DIRECTION_B] &= ~((1U << 3) - 1);
+ block->frame[PAGE_HEADER + PAGE_DIRECTION_B] |= PAGE_NO_DIRECTION;
+ if (block->page.zip.data) {
+ block->page.zip.data[PAGE_HEADER + PAGE_DIRECTION_B]
+ = PAGE_NO_DIRECTION;
}
return(rec_end);
@@ -2364,13 +2819,11 @@ page_cur_delete_rec(
ut_ad(page_rec_is_user_rec(current_rec));
if (page_get_n_recs(block->frame) == 1
-#if 1 /* MDEV-12353 TODO: skip this for the physical log format */
/* Empty the page, unless we are applying the redo log
during crash recovery. During normal operation, the
page_create_empty() gets logged as one of MLOG_PAGE_CREATE,
MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */
- && !recv_recovery_is_on()
-#endif
+ && !recv_recovery_is_on() && !log_sys.is_physical()
&& !rec_is_alter_metadata(current_rec, *index)) {
/* Empty the page. */
ut_ad(page_is_leaf(block->frame));
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
index a1711885bcf..7b7479906cf 100644
--- a/storage/innobase/page/page0page.cc
+++ b/storage/innobase/page/page0page.cc
@@ -198,17 +198,15 @@ page_set_max_trx_id(
mtr_t* mtr) /*!< in/out: mini-transaction, or NULL */
{
ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_zip || page_zip == &block->page.zip);
static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
byte *max_trx_id= my_assume_aligned<8>(PAGE_MAX_TRX_ID +
PAGE_HEADER + block->frame);
+ mtr->write<8>(*block, max_trx_id, trx_id);
if (UNIV_LIKELY_NULL(page_zip))
- {
- mach_write_to_8(max_trx_id, trx_id);
- page_zip_write_header(block, max_trx_id, 8, mtr);
- }
- else
- mtr->write<8>(*block, max_trx_id, trx_id);
+ memcpy_aligned<8>(&page_zip->data[PAGE_MAX_TRX_ID + PAGE_HEADER],
+ max_trx_id, 8);
}
/** Persist the AUTO_INCREMENT value on a clustered index root page.
@@ -229,17 +227,16 @@ page_set_autoinc(
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- byte *field= PAGE_HEADER + PAGE_ROOT_AUTO_INC + block->frame;
+ byte *field= my_assume_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC +
+ block->frame);
ib_uint64_t old= mach_read_from_8(field);
if (old == autoinc || (old > autoinc && !reset))
- /* nothing to update */;
- else if (UNIV_LIKELY_NULL(block->page.zip.data))
- {
- mach_write_to_8(field, autoinc);
- page_zip_write_header(block, field, 8, mtr);
- }
- else
- mtr->write<8>(*block, field, autoinc);
+ return; /* nothing to update */
+
+ mtr->write<8>(*block, field, autoinc);
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + block->page.zip.data,
+ field, 8);
}
/** The page infimum and supremum of an empty page in ROW_FORMAT=REDUNDANT */
@@ -327,11 +324,11 @@ void page_create_low(const buf_block_t* block, bool comp)
@param[in,out] block buffer block
@param[in,out] mtr mini-transaction
@param[in] comp set unless ROW_FORMAT=REDUNDANT */
-void page_create(buf_block_t* block, mtr_t* mtr, bool comp)
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp)
{
- mtr->page_create(block->page.id, comp);
- buf_block_modify_clock_inc(block);
- page_create_low(block, comp);
+ mtr->page_create(*block, comp);
+ buf_block_modify_clock_inc(block);
+ page_create_low(block, comp);
}
/**********************************************************//**
@@ -961,14 +958,15 @@ delete_all:
buf_block_modify_clock_inc(block);
const bool is_leaf = page_is_leaf(block->frame);
- byte* last_insert = my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER
- + block->frame);
+ mtr->write<2,mtr_t::OPT>(*block, my_assume_aligned<2>
+ (PAGE_LAST_INSERT + PAGE_HEADER
+ + block->frame), 0U);
if (UNIV_LIKELY_NULL(page_zip)) {
ut_ad(page_is_comp(block->frame));
- memset(last_insert, 0, 2);
- page_zip_write_header(block, last_insert, 2, mtr);
+ memset_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER
+ + page_zip->data, 0, 2);
do {
page_cur_t cur;
@@ -990,8 +988,6 @@ delete_all:
return;
}
- mtr->write<2,mtr_t::OPT>(*block, last_insert, 0U);
-
prev_rec = page_rec_get_prev(rec);
last_rec = page_rec_get_prev(page_get_supremum_rec(block->frame));
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
index f304616ad9d..c6739f067f4 100644
--- a/storage/innobase/page/page0zip.cc
+++ b/storage/innobase/page/page0zip.cc
@@ -361,6 +361,54 @@ page_zip_dir_get(
- PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
}
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in] b ROW_FORMAT=COMPRESSED index page
+@param[in] offset byte offset from b.zip.data
+@param[in] len length of the data to write */
+inline void mtr_t::zmemcpy(const buf_page_t &b, ulint offset, ulint len)
+{
+ ut_ad(mach_read_from_2(b.zip.data + FIL_PAGE_TYPE) == FIL_PAGE_INDEX ||
+ mach_read_from_2(b.zip.data + FIL_PAGE_TYPE) == FIL_PAGE_RTREE);
+ ut_ad(page_zip_simple_validate(&b.zip));
+ ut_ad(offset + len <= page_zip_get_size(&b.zip));
+
+ memcpy_low(b, static_cast<uint16_t>(offset), &b.zip.data[offset], len);
+ m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in,out] b ROW_FORMAT=COMPRESSED index page
+@param[in] dest destination within b.zip.data
+@param[in] str the data to write
+@param[in] len length of the data to write
+@tparam w write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::zmemcpy(const buf_page_t &b, void *dest, const void *str,
+ ulint len)
+{
+ byte *d= static_cast<byte*>(dest);
+ const byte *s= static_cast<const byte*>(str);
+ ut_ad(d >= b.zip.data + FIL_PAGE_OFFSET);
+ if (w != FORCED)
+ {
+ ut_ad(len);
+ const byte *const end= d + len;
+ while (*d++ == *s++)
+ {
+ if (d == end)
+ {
+ ut_ad(w == OPT);
+ return;
+ }
+ }
+ s--;
+ d--;
+ len= static_cast<ulint>(end - d);
+ }
+ ::memcpy(d, s, len);
+ zmemcpy(b, d - b.zip.data, len);
+}
+
/** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page.
@param[in,out] block ROW_FORMAT=COMPRESSED index page
@param[in] index the index that the block belongs to
@@ -3545,9 +3593,9 @@ page_zip_write_rec_ext(
byte* ext_start = ext_end
- n_ext * FIELD_REF_SIZE;
memmove(ext_start, ext_end, len);
- /* TODO: write MEMMOVE record */
- mtr->zmemcpy(block->page, ext_start
- - page_zip->data, len);
+ mtr->memmove(*block,
+ ext_start - page_zip->data,
+ ext_end - page_zip->data, len);
}
}
@@ -3783,8 +3831,8 @@ void page_zip_write_rec(buf_block_t *block, const byte *rec,
/* Copy the node pointer to the uncompressed area. */
byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1);
- mtr->zmemcpy(&block->page, node_ptr - page_zip->data,
- rec + len, REC_NODE_PTR_SIZE);
+ mtr->zmemcpy<mtr_t::OPT>(block->page, node_ptr,
+ rec + len, REC_NODE_PTR_SIZE);
}
ut_a(!*data);
@@ -3917,8 +3965,8 @@ page_zip_write_blob_ptr(
externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
field += len - BTR_EXTERN_FIELD_REF_SIZE;
- mtr->zmemcpy(&block->page, ulint(externs - page_zip->data),
- field, BTR_EXTERN_FIELD_REF_SIZE);
+ mtr->zmemcpy<mtr_t::OPT>(block->page, externs, field,
+ BTR_EXTERN_FIELD_REF_SIZE);
#ifdef UNIV_ZIP_DEBUG
ut_a(page_zip_validate(page_zip, page, index));
@@ -4040,8 +4088,7 @@ page_zip_write_node_ptr(
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
compile_time_assert(REC_NODE_PTR_SIZE == 4);
mach_write_to_4(field, ptr);
- mtr->zmemcpy(&block->page, ulint(storage - page_zip->data),
- field, REC_NODE_PTR_SIZE);
+ mtr->zmemcpy(block->page, storage, field, REC_NODE_PTR_SIZE);
}
/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
@@ -4062,9 +4109,6 @@ page_zip_write_trx_id_and_roll_ptr(
roll_ptr_t roll_ptr,
mtr_t* mtr)
{
- byte* field;
- byte* storage;
- ulint len;
page_zip_des_t* const page_zip = &block->page.zip;
ut_d(const page_t* const page = block->frame);
@@ -4084,12 +4128,13 @@ page_zip_write_trx_id_and_roll_ptr(
UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
- storage = page_zip_dir_start(page_zip)
- - (rec_get_heap_no_new(rec) - 1)
- * sys_len;
+ const ulint heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len;
compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
- field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+ ulint len;
+ byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
ut_ad(len == DATA_TRX_ID_LEN);
ut_ad(field + DATA_TRX_ID_LEN
== rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
@@ -4101,8 +4146,47 @@ page_zip_write_trx_id_and_roll_ptr(
mach_write_to_6(field, trx_id);
compile_time_assert(DATA_ROLL_PTR_LEN == 7);
mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
- mtr->zmemcpy(&block->page, ulint(storage - page_zip->data),
- field, sys_len);
+ len = 0;
+ if (heap_no > PAGE_HEAP_NO_USER_LOW) {
+ byte* prev = storage + sys_len;
+ for (; len < sys_len && prev[len] == field[len]; len++);
+ if (len > 4) {
+ /* We save space by replacing a single record
+
+ WRITE,offset(storage),byte[13]
+
+ with up to two records:
+
+ MEMMOVE,offset(storage),len(1 byte),+13(1 byte),
+ WRITE|0x80,0,byte[13-len]
+
+ The single WRITE record would be x+13 bytes long (x>2).
+ The MEMMOVE record would be x+1+1 = x+2 bytes, and
+ the second WRITE would be 1+1+13-len = 15-len bytes.
+
+ The total size is: x+13 versus x+2+15-len = x+17-len.
+ To save space, we must have len>4. */
+ memcpy(storage, prev, len);
+ mtr->memmove(*block, ulint(storage - page_zip->data),
+ ulint(storage - page_zip->data) + sys_len,
+ len);
+ storage += len;
+ field += len;
+ if (UNIV_LIKELY(len < sys_len)) {
+ goto write;
+ }
+ } else {
+ len = 0;
+ goto write;
+ }
+ } else {
+write:
+ mtr->zmemcpy<mtr_t::OPT>(block->page, storage, field,
+ sys_len - len);
+ }
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage - len, field - len, sys_len));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
@@ -4222,9 +4306,8 @@ page_zip_clear_rec(
memset(field, 0, REC_NODE_PTR_SIZE);
storage -= (heap_no - 1) * REC_NODE_PTR_SIZE;
clear_page_zip:
- /* TODO: write MEMSET record */
memset(storage, 0, len);
- mtr->zmemcpy(block->page, storage - page_zip->data, len);
+ mtr->memset(*block, storage - page_zip->data, len, 0);
} else if (index->is_clust()) {
/* Clear trx_id and roll_ptr. On the compressed page,
there is an array of these fields immediately before the
@@ -4265,33 +4348,24 @@ clear_page_zip:
}
}
-/**********************************************************************//**
-Write the "deleted" flag of a record on a compressed page. The flag must
-already have been written on the uncompressed page. */
-void
-page_zip_rec_set_deleted(
-/*=====================*/
- buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
- const byte* rec, /*!< in: record on the uncompressed page */
- ulint flag, /*!< in: the deleted flag (nonzero=TRUE) */
- mtr_t* mtr) /*!< in,out: mini-transaction */
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in] flag the value of the delete-mark flag
+@param[in,out] mtr mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+ mtr_t *mtr)
{
- ut_ad(page_align(rec) == block->frame);
- page_zip_des_t* const page_zip = &block->page.zip;
- byte* slot = page_zip_dir_find(&block->page.zip, page_offset(rec));
- ut_a(slot);
- UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
- byte b = *slot;
- if (flag) {
- b |= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
- } else {
- b &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
- }
- if (b != *slot) {
- mtr->zmemcpy(&block->page, slot - page_zip->data, &b, 1);
- }
+ ut_ad(page_align(rec) == block->frame);
+ byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec));
+ byte b= *slot;
+ if (flag)
+ b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ else
+ b&= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ mtr->zmemcpy<mtr_t::OPT>(block->page, slot, &b, 1);
#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(page_zip, page_align(rec), NULL));
+ ut_a(page_zip_validate(&block->page.zip, block->frame, nullptr));
#endif /* UNIV_ZIP_DEBUG */
}
@@ -4306,20 +4380,16 @@ page_zip_rec_set_owned(
ulint flag, /*!< in: the owned flag (nonzero=TRUE) */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- ut_ad(page_align(rec) == block->frame);
- page_zip_des_t* const page_zip = &block->page.zip;
- byte* slot = page_zip_dir_find(page_zip, page_offset(rec));
- ut_a(slot);
- UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
- byte b = *slot;
- if (flag) {
- b |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
- } else {
- b &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8);
- }
- if (b != *slot) {
- mtr->zmemcpy(&block->page, slot - page_zip->data, &b, 1);
- }
+ ut_ad(page_align(rec) == block->frame);
+ page_zip_des_t *const page_zip= &block->page.zip;
+ byte *slot= page_zip_dir_find(page_zip, page_offset(rec));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ byte b= *slot;
+ if (flag)
+ b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ else
+ b&= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ mtr->zmemcpy<mtr_t::OPT>(block->page, slot, &b, 1);
}
/**********************************************************************//**
@@ -4328,8 +4398,8 @@ void
page_zip_dir_insert(
/*================*/
page_cur_t* cursor, /*!< in/out: page cursor */
- const byte* free_rec,/*!< in: record from which rec was
- allocated, or NULL */
+ uint16_t free_rec,/*!< in: record from which rec was
+ allocated, or 0 */
byte* rec, /*!< in: record to insert */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
@@ -4371,7 +4441,7 @@ page_zip_dir_insert(
n_dense = page_dir_get_n_heap(page_zip->data)
- (PAGE_HEAP_NO_USER_LOW + 1U);
- if (UNIV_LIKELY_NULL(free_rec)) {
+ if (UNIV_UNLIKELY(free_rec)) {
/* The record was allocated from the free list.
Shift the dense directory only up to that slot.
Note that in this case, n_dense is actually
@@ -4379,8 +4449,8 @@ page_zip_dir_insert(
did not increment n_heap. */
ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
+ PAGE_HEAP_NO_USER_LOW);
- ut_ad(rec >= free_rec);
- slot_free = page_zip_dir_find(page_zip, page_offset(free_rec));
+ ut_ad(page_offset(rec) >= free_rec);
+ slot_free = page_zip_dir_find(page_zip, free_rec);
ut_ad(slot_free);
slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
} else {
@@ -4394,17 +4464,20 @@ page_zip_dir_insert(
- PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
}
- const ulint slot_len = ulint(slot_rec - slot_free);
- /* Shift the dense directory to allocate place for rec. */
- memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
- slot_len);
+ if (const ulint slot_len = ulint(slot_rec - slot_free)) {
+ /* Shift the dense directory to allocate place for rec. */
+ memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free, slot_len);
+ mtr->memmove(*cursor->block, (slot_free - page_zip->data)
+ - PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free - page_zip->data, slot_len);
+ }
/* Write the entry for the inserted record.
The "owned" and "deleted" flags must be zero. */
mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec));
- /* TODO: issue MEMMOVE record to reduce log volume */
- mtr->zmemcpy(cursor->block->page, slot_free - PAGE_ZIP_DIR_SLOT_SIZE
- - page_zip->data, PAGE_ZIP_DIR_SLOT_SIZE + slot_len);
+ mtr->zmemcpy(cursor->block->page, slot_rec - page_zip->data
+ - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE);
}
/** Shift the dense page directory and the array of BLOB pointers
@@ -4434,12 +4507,13 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec,
free ? static_cast<uint16_t>(free - rec) : 0);
byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
block->frame);
- mach_write_to_2(page_free, page_offset(rec));
+ mtr->write<2>(*block, page_free, page_offset(rec));
byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
block->frame);
- mach_write_to_2(garbage, rec_offs_size(offsets) + mach_read_from_2(garbage));
+ mtr->write<2>(*block, garbage, rec_offs_size(offsets) +
+ mach_read_from_2(garbage));
compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
- page_zip_write_header(block, page_free, 4, mtr);
+ memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4);
byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec));
ut_a(slot_rec);
uint16_t n_recs= page_get_n_recs(block->frame);
@@ -4448,8 +4522,9 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec,
/* This could not be done before page_zip_dir_find(). */
byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
block->frame);
- mach_write_to_2(page_n_recs, n_recs - 1);
- page_zip_write_header(block, page_n_recs, 2, mtr);
+ mtr->write<2>(*block, page_n_recs, n_recs - 1U);
+ memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs,
+ 2);
byte *slot_free;
@@ -4468,16 +4543,17 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec,
const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0;
if (slot_len)
- /* MDEV-12353 TODO: issue MEMMOVE record */
+ {
memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
slot_len);
+ mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free - page_zip->data, slot_len);
+ }
/* Write the entry for the deleted record.
The "owned" and "deleted" flags will be cleared. */
mach_write_to_2(slot_free, page_offset(rec));
-
- mtr->zmemcpy(block->page, slot_free - page_zip->data,
- slot_len + PAGE_ZIP_DIR_SLOT_SIZE);
+ mtr->zmemcpy(block->page, slot_free - page_zip->data, 2);
if (const ulint n_ext= rec_offs_n_extern(offsets))
{
@@ -4491,18 +4567,18 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec,
byte *externs= page_zip->data + page_zip_get_size(page_zip) -
(page_dir_get_n_heap(block->frame) - PAGE_HEAP_NO_USER_LOW) *
PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
-
byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE;
/* Shift and zero fill the array. */
- memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end,
- ulint(page_zip->n_blobs - n_ext - blob_no) *
- BTR_EXTERN_FIELD_REF_SIZE);
+ if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) *
+ BTR_EXTERN_FIELD_REF_SIZE)
+ {
+ memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len);
+ mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE,
+ ext_end - page_zip->data, ext_len);
+ }
memset(ext_end, 0, n_ext * FIELD_REF_SIZE);
- /* TODO: use MEMMOVE and MEMSET records to reduce volume */
- const ulint ext_len= ulint(page_zip->n_blobs - blob_no) * FIELD_REF_SIZE;
-
- mtr->zmemcpy(block->page, ext_end - page_zip->data, ext_len);
+ mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0);
page_zip->n_blobs -= static_cast<unsigned>(n_ext);
}
diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc
index 74c0b51fbbf..701f11992e1 100644
--- a/storage/innobase/row/row0uins.cc
+++ b/storage/innobase/row/row0uins.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -206,32 +206,7 @@ func_exit:
if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) {
/* When rolling back the very first instant ADD COLUMN
operation, reset the root page to the basic state. */
- ut_ad(!index->table->is_temporary());
- if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
- &mtr)) {
- byte* page_type = root->frame + FIL_PAGE_TYPE;
- ut_ad(mach_read_from_2(page_type)
- == FIL_PAGE_TYPE_INSTANT
- || mach_read_from_2(page_type)
- == FIL_PAGE_INDEX);
- mtr.write<2,mtr_t::OPT>(*root, page_type,
- FIL_PAGE_INDEX);
- byte* instant = PAGE_INSTANT + PAGE_HEADER
- + root->frame;
- mtr.write<2,mtr_t::OPT>(
- *root, instant,
- page_ptr_get_direction(instant + 1));
- rec_t* infimum = page_get_infimum_rec(root->frame);
- rec_t* supremum = page_get_supremum_rec(root->frame);
- static const byte str[8 + 8] = "supremuminfimum";
- if (memcmp(infimum, str + 8, 8)
- || memcmp(supremum, str, 8)) {
- mtr.memcpy(root, page_offset(infimum),
- str + 8, 8);
- mtr.memcpy(root, page_offset(supremum),
- str, 8);
- }
- }
+ btr_reset_instant(*index, true, &mtr);
}
btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
index 5e5da78503a..f2e2e4e70d9 100644
--- a/storage/innobase/row/row0umod.cc
+++ b/storage/innobase/row/row0umod.cc
@@ -148,37 +148,12 @@ row_undo_mod_clust_low(
ut_a(!dummy_big_rec);
- static const byte
- INFIMUM[8] = {'i','n','f','i','m','u','m',0},
- SUPREMUM[8] = {'s','u','p','r','e','m','u','m'};
-
if (err == DB_SUCCESS
&& node->ref == &trx_undo_metadata
&& btr_cur_get_index(btr_cur)->table->instant
&& node->update->info_bits == REC_INFO_METADATA_ADD) {
- if (buf_block_t* root = btr_root_block_get(
- btr_cur_get_index(btr_cur), RW_SX_LATCH,
- mtr)) {
- uint16_t infimum, supremum;
- if (page_is_comp(root->frame)) {
- infimum = PAGE_NEW_INFIMUM;
- supremum = PAGE_NEW_SUPREMUM;
- } else {
- infimum = PAGE_OLD_INFIMUM;
- supremum = PAGE_OLD_SUPREMUM;
- }
-
- ut_ad(!memcmp(root->frame + infimum,
- INFIMUM, 8)
- == !memcmp(root->frame + supremum,
- SUPREMUM, 8));
-
- if (memcmp(root->frame + infimum, INFIMUM, 8)) {
- mtr->memcpy(root, infimum, INFIMUM, 8);
- mtr->memcpy(root, supremum, SUPREMUM,
- 8);
- }
- }
+ btr_reset_instant(*btr_cur_get_index(btr_cur), false,
+ mtr);
}
}
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index 6f2abf96b69..d8378d271ec 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -1083,7 +1083,7 @@ srv_prepare_to_delete_redo_log_files(
ib::info info;
if (srv_log_file_size == 0
|| (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED)
- != log_t::FORMAT_10_4) {
+ != log_t::FORMAT_10_5) {
info << "Upgrading redo log: ";
} else if (n_files != srv_n_log_files
|| srv_log_file_size
@@ -1829,8 +1829,8 @@ files_checked:
&& srv_n_log_files_found == srv_n_log_files
&& log_sys.log.format
== (srv_encrypt_log
- ? log_t::FORMAT_ENC_10_4
- : log_t::FORMAT_10_4)
+ ? log_t::FORMAT_ENC_10_5
+ : log_t::FORMAT_10_5)
&& log_sys.log.subformat == 2) {
/* No need to add or remove encryption,
upgrade, downgrade, or resize. */
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
index 546a4c51e03..0a8de8b7fa9 100644
--- a/storage/innobase/trx/trx0rseg.cc
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -72,8 +72,8 @@ trx_rseg_write_wsrep_checkpoint(
const ulint xid_length = static_cast<ulint>(xid->gtrid_length
+ xid->bqual_length);
- mtr->memcpy(rseg_header, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA,
- xid->data, xid_length);
+ mtr->memcpy(*rseg_header, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame, xid->data, xid_length);
if (UNIV_LIKELY(xid_length < XIDDATASIZE)) {
mtr->memset(rseg_header,
TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length,
@@ -738,9 +738,9 @@ void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
+ rseg_header->frame,
trx->mysql_log_offset);
- if (memcmp(trx->mysql_log_file_name, TRX_RSEG + TRX_RSEG_BINLOG_NAME
- + rseg_header->frame, len)) {
- mtr->memcpy(rseg_header, TRX_RSEG + TRX_RSEG_BINLOG_NAME,
- trx->mysql_log_file_name, len);
+ void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->frame;
+
+ if (memcmp(trx->mysql_log_file_name, name, len)) {
+ mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len);
}
}
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index d6575ba4d49..985cbeba2f9 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -390,12 +390,11 @@ static void trx_undo_page_init(const buf_block_t *undo_block, mtr_t *mtr)
compile_time_assert(TRX_UNDO_PAGE_START == 2);
compile_time_assert(TRX_UNDO_PAGE_NODE == TRX_UNDO_PAGE_FREE + 2);
- /* MDEV-12353 FIXME: write minimal number of bytes in the new encoding */
- mtr->write<4>(*undo_block, TRX_UNDO_PAGE_HDR + undo_block->frame,
- TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
- mtr->write<2>(*undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
- undo_block->frame,
- TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ alignas(4) byte hdr[6];
+ mach_write_to_4(hdr, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ memcpy_aligned<2>(hdr + 4, hdr + 2, 2);
+ static_assert(TRX_UNDO_PAGE_FREE == 4, "compatibility");
+ mtr->memcpy(*undo_block, undo_block->frame + TRX_UNDO_PAGE_HDR, hdr, 6);
}
/** Look for a free slot for an undo log segment.
@@ -501,41 +500,63 @@ trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
mtr_t* mtr)
{
- const uint16_t free= mach_read_from_2(TRX_UNDO_PAGE_HDR +
- TRX_UNDO_PAGE_FREE + undo_page->frame);
- const uint16_t new_free= free + TRX_UNDO_LOG_OLD_HDR_SIZE;
-
+ /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
+ repurposed after upgrading to MariaDB 10.3. */
+ byte *undo_type= my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->frame);
+ ut_ad(mach_read_from_2(undo_type) <= TRX_UNDO_UPDATE);
+ mtr->write<2,mtr_t::OPT>(*undo_page, undo_type, 0U);
+ byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
+ undo_page->frame);
+ const uint16_t free= mach_read_from_2(start + 2);
+ static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
+ "compatibility");
ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
- mtr->write<2>(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
- undo_page->frame, new_free);
- /* MDEV-12353 TODO: use MEMMOVE record */
- mtr->write<2>(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
- undo_page->frame, new_free);
- mtr->write<2>(*undo_page, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE +
- undo_page->frame, TRX_UNDO_ACTIVE);
-
- mtr->write<2,mtr_t::OPT>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
- undo_page->frame, 1U);
- mtr->write<8>(*undo_page, free + TRX_UNDO_TRX_ID + undo_page->frame, trx_id);
- mtr->write<2,mtr_t::OPT>(*undo_page, free + TRX_UNDO_LOG_START +
- undo_page->frame, new_free);
- mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
- TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0);
-
- if (uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR +
- TRX_UNDO_LAST_LOG +
- undo_page->frame))
- {
+ mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE);
+ /* A WRITE of 2 bytes is never longer than a MEMMOVE.
+ So, WRITE 2+2 bytes is better than WRITE+MEMMOVE.
+ But, a MEMSET will only be 1+2 bytes, that is, 1 byte shorter! */
+ memcpy_aligned<2>(start + 2, start, 2);
+ mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4,
+ start, 2);
+ uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ undo_page->frame);
+ alignas(4) byte buf[4];
+ mach_write_to_2(buf, TRX_UNDO_ACTIVE);
+ mach_write_to_2(buf + 2, free);
+ static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility");
+ static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment");
+ mtr->memcpy(*undo_page, my_assume_aligned<4>
+ (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->frame),
+ buf, 4);
+ if (prev_log)
mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG + undo_page->frame,
free);
- mtr->write<2>(*undo_page, free + TRX_UNDO_PREV_LOG + undo_page->frame,
- prev_log);
+ mtr->write<8>(*undo_page, free + TRX_UNDO_TRX_ID + undo_page->frame, trx_id);
+ /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
+ mach_write_to_2(buf, 1);
+ memcpy_aligned<2>(buf + 2, start, 2);
+ static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
+ "compatibility");
+ mtr->memcpy(*undo_page, free + TRX_UNDO_NEEDS_PURGE + undo_page->frame,
+ buf, 4);
+ /* Initialize all fields TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. */
+ if (prev_log)
+ {
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0);
+ mtr->write<2,mtr_t::OPT>(*undo_page, free + TRX_UNDO_PREV_LOG +
+ undo_page->frame, prev_log);
+ static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE,
+ "compatibility");
+ mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0);
+ static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE +
+ FLST_NODE_SIZE, "compatibility");
}
-
- mtr->write<2>(*undo_page, TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
- undo_page->frame, free);
-
+ else
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0);
return free;
}
@@ -563,7 +584,8 @@ static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
static_cast<uint32_t>(xid.bqual_length));
const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+ xid.bqual_length);
- mtr->memcpy(block, offset + TRX_UNDO_XA_XID, xid.data, xid_length);
+ mtr->memcpy(*block, &block->frame[offset + TRX_UNDO_XA_XID],
+ xid.data, xid_length);
if (UNIV_LIKELY(xid_length < XIDDATASIZE))
mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
XIDDATASIZE - xid_length, 0);
@@ -587,29 +609,6 @@ trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid)
memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
}
-/** Add space for the XA XID after an undo log old-style header.
-@param[in,out] block undo page
-@param[in] offset offset of the undo log header
-@param[in,out] mtr mini-transaction */
-static void trx_undo_header_add_space_for_xid(buf_block_t *block, ulint offset,
- mtr_t *mtr)
-{
- uint16_t free= mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
- block->frame);
- /* free is now the end offset of the old style undo log header */
- ut_a(free == offset + TRX_UNDO_LOG_OLD_HDR_SIZE);
- free += TRX_UNDO_LOG_XA_HDR_SIZE - TRX_UNDO_LOG_OLD_HDR_SIZE;
- /* Add space for a XID after the header, update the free offset
- fields on the undo log page and in the undo log header */
-
- mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block->frame,
- free);
- /* MDEV-12353 TODO: use MEMMOVE record */
- mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block->frame,
- free);
- mtr->write<2>(*block, offset + TRX_UNDO_LOG_START + block->frame, free);
-}
-
/** Parse the redo log entry of an undo log page header create.
@param[in] ptr redo log record
@param[in] end_ptr end of log buffer
@@ -1133,8 +1132,6 @@ trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
- trx_undo_header_add_space_for_xid(block, offset, mtr);
-
*undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid,
block->page.id.page_no(), offset);
if (*undo == NULL) {
@@ -1204,17 +1201,6 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
*pundo = undo;
uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
- /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
- repurposed after upgrading to MariaDB 10.3. */
- if (ut_d(ulint type =) UNIV_UNLIKELY(
- mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
- + block->frame))) {
- ut_ad(type == TRX_UNDO_INSERT || type == TRX_UNDO_UPDATE);
- mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
- + block->frame, 0U);
- }
-
- trx_undo_header_add_space_for_xid(block, offset, mtr);
trx_undo_mem_init_for_reuse(undo, trx->id, trx->xid, offset);