diff options
author | Marko Mäkelä <marko.makela@mariadb.com> | 2022-05-27 15:45:40 +0300 |
---|---|---|
committer | Marko Mäkelä <marko.makela@mariadb.com> | 2022-05-27 15:45:40 +0300 |
commit | 444a56458f3fb393368e87d31133a31b1a27f9e1 (patch) | |
tree | 3a1a4e44e2749c4708ec3ee450276e1cfa9bc964 | |
parent | 41bae618ddb195d2b7c84ee90c1410a3f580ce9a (diff) | |
download | mariadb-git-444a56458f3fb393368e87d31133a31b1a27f9e1.tar.gz |
MDEV-18976 Implement OPT_PAGE_CHECKSUM log record for improved validation
We will introduce an optional log record OPT_PAGE_CHECKSUM for recording
page checksums, so that more inconsistencies on crash recovery may be
caught.
buf_block_t::page_checksum(): Calculate a checksum for OPT_PAGE_CHECKSUM,
skipping the stored page checksum, FIL_PAGE_LSN, and the 8 bytes at
FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION.
mtr_t::page_checksum(const buf_block_t&): Write an OPT_PAGE_CHECKSUM record
(currently not for ROW_FORMAT=COMPRESSED pages).
mtr_t::do_write(): Write OPT_PAGE_CHECKSUM records for all pages that were
modified in the mini-transaction (currently, in debug builds only).
log_phys_t::apply(): Validate OPT_PAGE_CHECKSUM records.
recv_sys_t::parse(): Store OPT_PAGE_CHECKSUM records instead of skipping them;
other OPTION subtypes are still ignored.
-rw-r--r-- | storage/innobase/include/buf0buf.h | 11 | ||||
-rw-r--r-- | storage/innobase/include/fil0fil.h | 7 | ||||
-rw-r--r-- | storage/innobase/include/mtr0log.h | 2 | ||||
-rw-r--r-- | storage/innobase/include/mtr0mtr.h | 3 | ||||
-rw-r--r-- | storage/innobase/include/mtr0types.h | 21 | ||||
-rw-r--r-- | storage/innobase/log/log0recv.cc | 38 | ||||
-rw-r--r-- | storage/innobase/mtr/mtr0mtr.cc | 59 |
7 files changed, 107 insertions, 34 deletions
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 17d6d10f8b7..7032d2ee68a 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -24,8 +24,7 @@ The database buffer pool high-level routines Created 11/5/1995 Heikki Tuuri *******************************************************/ -#ifndef buf0buf_h -#define buf0buf_h +#pragma once /** Magic value to use instead of checksums when they are disabled */ #define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL @@ -43,9 +42,6 @@ Created 11/5/1995 Heikki Tuuri #include "srv0srv.h" #include <ostream> -// Forward declaration -struct fil_addr_t; - /** @name Modes for buf_page_get_gen */ /* @{ */ #define BUF_GET 10 /*!< get always */ @@ -1219,6 +1215,9 @@ struct buf_block_t{ @param zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param fix initial buf_fix_count() */ void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0); + + /** @return checksum for an OPT_PAGE_CHECKSUM record */ + uint32_t page_checksum() const; }; /**********************************************************************//** @@ -2429,5 +2428,3 @@ struct CheckUnzipLRUAndLRUList { #include "buf0buf.inl" #endif /* !UNIV_INNOCHECKSUM */ - -#endif diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 9d5bbcadc65..399cb6f4344 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1179,8 +1179,9 @@ struct fil_addr_t { /** For the first page in a system tablespace data file(ibdata*, not *.ibd): the file has been flushed to disk at least up to this lsn -For other pages: 32-bit key version used to encrypt the page + 32-bit checksum -or 64 bites of zero if no encryption */ +For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32 +format: 32-bit key version used to encrypt the page + 32-bit checksum +or 64 bits of zero if no encryption */ #define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U /** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */ diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index 285672be898..dd77e37ce6b 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -387,7 +387,7 @@ template<byte type> inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage, size_t len, bool alloc, size_t offset) { - static_assert(!(type & 15) && type != RESERVED && type != OPTION && + static_assert(!(type & 15) && type != RESERVED && type <= FILE_CHECKPOINT, "invalid type"); ut_ad(type >= FILE_CREATE || is_named_space(id.space())); ut_ad(!bpage || bpage->id() == id); diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 903b3f4699f..dda89dac427 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -376,6 +376,9 @@ public: /** @return whether the log and memo are empty */ bool is_empty() const { return m_memo.size() == 0 && m_log.size() == 0; } + /** Write a OPT_PAGE_CHECKSUM record. 
*/ + inline void page_checksum(const buf_block_t &block); + /** Write request types */ enum write_type { diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index 9e59dc814d3..95af96f113a 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -80,12 +80,8 @@ type. The following record types refer to data pages: RESERVED (6): reserved for future use; a subtype code (encoded immediately after the length) would be written to reserve code space for further extensions - OPTION (7): optional record that may be ignored; a subtype code - (encoded immediately after the length) would distinguish actual - usage, such as: - * MDEV-18976 page checksum record - * binlog record - * SQL statement (at the start of statement) + OPTION (7): optional record that may be ignored; a subtype @see mrec_opt + (encoded immediately after the length) would distinguish actual usage Bits 3..0 indicate the redo log record length, excluding the first byte, but including additional length bytes and any other bytes, @@ -232,9 +228,7 @@ enum mrec_type_t /** Reserved for future use. */ RESERVED= 0x60, /** Optional record that may be ignored in crash recovery. - A subtype code will be encoded immediately after the length. - Possible subtypes would include a MDEV-18976 page checksum record, - a binlog record, or an SQL statement. */ + A subtype (@see mrec_opt) will be encoded after the page identifier. */ OPTION= 0x70 }; @@ -286,6 +280,15 @@ enum mrec_ext_t }; +/** Recognized OPTION record subtypes. */ +enum mrec_opt +{ + /** page checksum at the end of the mini-transaction */ + OPT_PAGE_CHECKSUM= 0 + /* Other possible subtypes: a binlog record, or an SQL statement. */ +}; + + /** Redo log record types for file-level operations. These bit patterns will be written to redo log files, so the existing codes or their interpretation on crash recovery must not be changed. 
*/ diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index d9761fe9d85..44301d1eb31 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -262,9 +262,26 @@ public: next_not_same_page: last_offset= 1; /* the next record must not be same_page */ } - next: l+= rlen; continue; + case OPTION: + ut_ad(rlen == 5); + ut_ad(*l == OPT_PAGE_CHECKSUM); + ut_ad(!block.page.zip.data); + if (UNIV_UNLIKELY(block.page_checksum() != mach_read_from_4(l + 1))) + { + ib::error() << "InnoDB: OPT_PAGE_CHECKSUM mismatch on " + << block.page.id(); + if (!srv_force_recovery) + { + applied= APPLIED_YES; +page_corrupted: + ib::error() << "Set innodb_force_recovery=1 to ignore corruption."; + recv_sys.found_corrupt_log= true; + return applied; + } + } + goto next_after_applying; } ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) == @@ -275,8 +292,6 @@ public: ut_ad(last_offset <= size); switch (b & 0x70) { - case OPTION: - goto next; case EXTENDED: if (UNIV_UNLIKELY(block.page.id().page_no() < 3 || block.page.zip.ssize)) @@ -305,12 +320,7 @@ public: if (UNIV_UNLIKELY(rlen <= 3)) goto record_corrupted; if (undo_append(block, ++l, --rlen) && !srv_force_recovery) - { -page_corrupted: - ib::error() << "Set innodb_force_recovery=1 to ignore corruption."; - recv_sys.found_corrupt_log= true; - return applied; - } + goto page_corrupted; break; case INSERT_HEAP_REDUNDANT: case INSERT_REUSE_REDUNDANT: @@ -1931,7 +1941,8 @@ same_page: if (got_page_op) { const page_id_t id(space_id, page_no); - ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id)); + ut_d(if ((b & 0x70) == INIT_PAGE || (b & 0x70) == OPTION) + freed.erase(id)); ut_ad(freed.find(id) == freed.end()); switch (b & 0x70) { case FREE_PAGE: @@ -1967,8 +1978,11 @@ same_page: } last_offset= FIL_PAGE_TYPE; break; - case RESERVED: case OPTION: + if (rlen == 5 && *l == OPT_PAGE_CHECKSUM) + break; + /* fall through */ + case RESERVED: continue; case WRITE: case MEMMOVE: @@ -2060,9 +2074,9 @@ 
same_page: #if 0 && defined UNIV_DEBUG switch (b & 0x70) { case RESERVED: - case OPTION: ut_ad(0); /* we did "continue" earlier */ break; + case OPTION: case FREE_PAGE: break; default: diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 2feb5a0583f..3b1356160e9 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -376,8 +376,8 @@ struct ReleaseBlocks return true; } - buf_flush_note_modification(static_cast<buf_block_t*>(slot->object), - start, end); + buf_block_t *block= static_cast<buf_block_t*>(slot->object); + buf_flush_note_modification(block, start, end); return true; } }; @@ -573,6 +573,7 @@ void mtr_t::commit_shrink(fil_space_t &space) log_write_and_flush_prepare(); const lsn_t start_lsn= do_write().first; + ut_d(m_log.erase()); mysql_mutex_lock(&log_sys.flush_order_mutex); /* Durably write the reduced FSP_SIZE before truncating the data file. */ @@ -966,6 +967,55 @@ static mtr_t::page_flush_ahead log_close(lsn_t lsn) return mtr_t::PAGE_FLUSH_SYNC; } +/** @return checksum for an OPT_PAGE_CHECKSUM record */ +uint32_t buf_block_t::page_checksum() const +{ + /* We have to exclude from the checksum the normal + page checksum that is written by buf_flush_init_for_writing() + and FIL_PAGE_LSN which would be updated once we have actually + allocated the LSN. + + Unfortunately, we cannot access fil_space_t easily here. In order to + be compatible with encrypted tablespaces in the pre-full_crc32 + format we will unconditionally exclude the 8 bytes at + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + a.k.a. FIL_RTREE_SPLIT_SEQ_NUM. 
*/ + return my_crc32c(my_crc32c(my_crc32c(0, frame + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET), + frame + FIL_PAGE_TYPE, 2), + frame + FIL_PAGE_SPACE_ID, + srv_page_size - (FIL_PAGE_SPACE_ID + 8)); +} + +inline void mtr_t::page_checksum(const buf_block_t &block) +{ + if (UNIV_LIKELY_NULL(block.page.zip.data)) + return; /* FIXME: support ROW_FORMAT=COMPRESSED */ + byte *l= log_write<OPTION>(block.page.id(), nullptr, 5, true, 0); + *l++= OPT_PAGE_CHECKSUM; + mach_write_to_4(l, block.page_checksum()); + m_log.close(l + 4); +} + +/** Write OPT_PAGE_CHECKSUM records for modified pages */ +struct Write_OPT_PAGE_CHECKSUM +{ + mtr_t &mtr; + Write_OPT_PAGE_CHECKSUM(mtr_t &mtr) : mtr(mtr) {} + + /** @return true always */ + bool operator()(const mtr_memo_slot_t *slot) const + { + if (slot->type & MTR_MEMO_MODIFY) + { + const buf_block_t &block= *static_cast<const buf_block_t*>(slot->object); + if (block.page.status < buf_page_t::FREED) + mtr.page_checksum(block); + } + return true; + } +}; + /** Write the block contents to the REDO log */ struct mtr_write_log { @@ -986,6 +1036,11 @@ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write() ulint len = m_log.size(); ut_ad(len > 0); +#ifdef UNIV_DEBUG + m_memo.for_each_block(CIterate<Write_OPT_PAGE_CHECKSUM>(*this)); + len = m_log.size(); +#endif + if (len > srv_log_buffer_size / 2) { log_buffer_extend(ulong((len + 1) * 2)); } |