summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Marko Mäkelä <marko.makela@mariadb.com> 2022-05-27 15:45:40 +0300
committer Marko Mäkelä <marko.makela@mariadb.com> 2022-05-27 15:45:40 +0300
commit 444a56458f3fb393368e87d31133a31b1a27f9e1 (patch)
tree 3a1a4e44e2749c4708ec3ee450276e1cfa9bc964
parent 41bae618ddb195d2b7c84ee90c1410a3f580ce9a (diff)
download mariadb-git-444a56458f3fb393368e87d31133a31b1a27f9e1.tar.gz
MDEV-18976 Implement OPT_PAGE_CHECKSUM log record for improved validation
We will introduce an optional log record OPT_PAGE_CHECKSUM for recording page checksums, so that more inconsistencies on crash recovery may be caught.

buf_block_t::page_checksum(): Calculate a checksum for OPT_PAGE_CHECKSUM, skipping page checksums and the LSN.

mtr_t::page_checksum(const buf_block_t&): Write OPT_PAGE_CHECKSUM (currently not for ROW_FORMAT=COMPRESSED pages).

mtr_t::do_write(): Write OPT_PAGE_CHECKSUM records for all pages (currently, in debug builds only).

log_phys_t::apply(): Validate OPT_PAGE_CHECKSUM records.

recv_sys_t::parse(): Store OPT_PAGE_CHECKSUM records.
-rw-r--r-- storage/innobase/include/buf0buf.h 11
-rw-r--r-- storage/innobase/include/fil0fil.h 7
-rw-r--r-- storage/innobase/include/mtr0log.h 2
-rw-r--r-- storage/innobase/include/mtr0mtr.h 3
-rw-r--r-- storage/innobase/include/mtr0types.h 21
-rw-r--r-- storage/innobase/log/log0recv.cc 38
-rw-r--r-- storage/innobase/mtr/mtr0mtr.cc 59
7 files changed, 107 insertions, 34 deletions
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 17d6d10f8b7..7032d2ee68a 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -24,8 +24,7 @@ The database buffer pool high-level routines
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0buf_h
-#define buf0buf_h
+#pragma once
/** Magic value to use instead of checksums when they are disabled */
#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
@@ -43,9 +42,6 @@ Created 11/5/1995 Heikki Tuuri
#include "srv0srv.h"
#include <ostream>
-// Forward declaration
-struct fil_addr_t;
-
/** @name Modes for buf_page_get_gen */
/* @{ */
#define BUF_GET 10 /*!< get always */
@@ -1219,6 +1215,9 @@ struct buf_block_t{
@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param fix initial buf_fix_count() */
void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0);
+
+ /** @return checksum for an OPT_PAGE_CHECKSUM record */
+ uint32_t page_checksum() const;
};
/**********************************************************************//**
@@ -2429,5 +2428,3 @@ struct CheckUnzipLRUAndLRUList {
#include "buf0buf.inl"
#endif /* !UNIV_INNOCHECKSUM */
-
-#endif
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 9d5bbcadc65..399cb6f4344 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1179,8 +1179,9 @@ struct fil_addr_t {
/** For the first page in a system tablespace data file(ibdata*, not *.ibd):
the file has been flushed to disk at least up to this lsn
-For other pages: 32-bit key version used to encrypt the page + 32-bit checksum
-or 64 bites of zero if no encryption */
+For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32
+format: 32-bit key version used to encrypt the page + 32-bit checksum
+or 64 bits of zero if no encryption */
#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U
/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
index 285672be898..dd77e37ce6b 100644
--- a/storage/innobase/include/mtr0log.h
+++ b/storage/innobase/include/mtr0log.h
@@ -387,7 +387,7 @@ template<byte type>
inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
size_t len, bool alloc, size_t offset)
{
- static_assert(!(type & 15) && type != RESERVED && type != OPTION &&
+ static_assert(!(type & 15) && type != RESERVED &&
type <= FILE_CHECKPOINT, "invalid type");
ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
ut_ad(!bpage || bpage->id() == id);
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index 903b3f4699f..dda89dac427 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -376,6 +376,9 @@ public:
/** @return whether the log and memo are empty */
bool is_empty() const { return m_memo.size() == 0 && m_log.size() == 0; }
+ /** Write an OPT_PAGE_CHECKSUM record. */
+ inline void page_checksum(const buf_block_t &block);
+
/** Write request types */
enum write_type
{
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
index 9e59dc814d3..95af96f113a 100644
--- a/storage/innobase/include/mtr0types.h
+++ b/storage/innobase/include/mtr0types.h
@@ -80,12 +80,8 @@ type. The following record types refer to data pages:
RESERVED (6): reserved for future use; a subtype code
(encoded immediately after the length) would be written
to reserve code space for further extensions
- OPTION (7): optional record that may be ignored; a subtype code
- (encoded immediately after the length) would distinguish actual
- usage, such as:
- * MDEV-18976 page checksum record
- * binlog record
- * SQL statement (at the start of statement)
+ OPTION (7): optional record that may be ignored; a subtype (@see mrec_opt)
+ encoded after the page identifier would distinguish actual usage
Bits 3..0 indicate the redo log record length, excluding the first
byte, but including additional length bytes and any other bytes,
@@ -232,9 +228,7 @@ enum mrec_type_t
/** Reserved for future use. */
RESERVED= 0x60,
/** Optional record that may be ignored in crash recovery.
- A subtype code will be encoded immediately after the length.
- Possible subtypes would include a MDEV-18976 page checksum record,
- a binlog record, or an SQL statement. */
+ A subtype (@see mrec_opt) will be encoded after the page identifier. */
OPTION= 0x70
};
@@ -286,6 +280,15 @@ enum mrec_ext_t
};
+/** Recognized OPTION record subtypes; the subtype is the first payload byte of an OPTION record. */
+enum mrec_opt
+{
+ /** page checksum at the end of the mini-transaction; followed by a 4-byte checksum */
+ OPT_PAGE_CHECKSUM= 0
+ /* Other possible subtypes: a binlog record, or an SQL statement. */
+};
+
+
/** Redo log record types for file-level operations. These bit
patterns will be written to redo log files, so the existing codes or
their interpretation on crash recovery must not be changed. */
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index d9761fe9d85..44301d1eb31 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -262,9 +262,26 @@ public:
next_not_same_page:
last_offset= 1; /* the next record must not be same_page */
}
- next:
l+= rlen;
continue;
+ case OPTION:
+ ut_ad(rlen == 5);
+ ut_ad(*l == OPT_PAGE_CHECKSUM);
+ ut_ad(!block.page.zip.data);
+ if (UNIV_UNLIKELY(block.page_checksum() != mach_read_from_4(l + 1)))
+ {
+ ib::error() << "InnoDB: OPT_PAGE_CHECKSUM mismatch on "
+ << block.page.id();
+ if (!srv_force_recovery)
+ {
+ applied= APPLIED_YES;
+page_corrupted:
+ ib::error() << "Set innodb_force_recovery=1 to ignore corruption.";
+ recv_sys.found_corrupt_log= true;
+ return applied;
+ }
+ }
+ goto next_after_applying;
}
ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) ==
@@ -275,8 +292,6 @@ public:
ut_ad(last_offset <= size);
switch (b & 0x70) {
- case OPTION:
- goto next;
case EXTENDED:
if (UNIV_UNLIKELY(block.page.id().page_no() < 3 ||
block.page.zip.ssize))
@@ -305,12 +320,7 @@ public:
if (UNIV_UNLIKELY(rlen <= 3))
goto record_corrupted;
if (undo_append(block, ++l, --rlen) && !srv_force_recovery)
- {
-page_corrupted:
- ib::error() << "Set innodb_force_recovery=1 to ignore corruption.";
- recv_sys.found_corrupt_log= true;
- return applied;
- }
+ goto page_corrupted;
break;
case INSERT_HEAP_REDUNDANT:
case INSERT_REUSE_REDUNDANT:
@@ -1931,7 +1941,8 @@ same_page:
if (got_page_op)
{
const page_id_t id(space_id, page_no);
- ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id));
+ ut_d(if ((b & 0x70) == INIT_PAGE || (b & 0x70) == OPTION)
+ freed.erase(id));
ut_ad(freed.find(id) == freed.end());
switch (b & 0x70) {
case FREE_PAGE:
@@ -1967,8 +1978,11 @@ same_page:
}
last_offset= FIL_PAGE_TYPE;
break;
- case RESERVED:
case OPTION:
+ if (rlen == 5 && *l == OPT_PAGE_CHECKSUM)
+ break;
+ /* fall through */
+ case RESERVED:
continue;
case WRITE:
case MEMMOVE:
@@ -2060,9 +2074,9 @@ same_page:
#if 0 && defined UNIV_DEBUG
switch (b & 0x70) {
case RESERVED:
- case OPTION:
ut_ad(0); /* we did "continue" earlier */
break;
+ case OPTION:
case FREE_PAGE:
break;
default:
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index 2feb5a0583f..3b1356160e9 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -376,8 +376,8 @@ struct ReleaseBlocks
return true;
}
- buf_flush_note_modification(static_cast<buf_block_t*>(slot->object),
- start, end);
+ buf_block_t *block= static_cast<buf_block_t*>(slot->object);
+ buf_flush_note_modification(block, start, end);
return true;
}
};
@@ -573,6 +573,7 @@ void mtr_t::commit_shrink(fil_space_t &space)
log_write_and_flush_prepare();
const lsn_t start_lsn= do_write().first;
+ ut_d(m_log.erase());
mysql_mutex_lock(&log_sys.flush_order_mutex);
/* Durably write the reduced FSP_SIZE before truncating the data file. */
@@ -966,6 +967,55 @@ static mtr_t::page_flush_ahead log_close(lsn_t lsn)
return mtr_t::PAGE_FLUSH_SYNC;
}
+/** Fold selected ranges of the page frame through my_crc32c(). @return checksum for an OPT_PAGE_CHECKSUM record */
+uint32_t buf_block_t::page_checksum() const
+{
+ /* We have to exclude from the checksum the normal
+ page checksum that is written by buf_flush_init_for_writing()
+ and FIL_PAGE_LSN which would be updated once we have actually
+ allocated the LSN.
+
+ Unfortunately, we cannot access fil_space_t easily here. In order to
+ be compatible with encrypted tablespaces in the pre-full_crc32
+ format we will unconditionally exclude the 8 bytes at
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ a.k.a. FIL_RTREE_SPLIT_SEQ_NUM. The last 8 bytes of the srv_page_size-byte frame are left out as well. */
+ return my_crc32c(my_crc32c(my_crc32c(0, frame + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET), /* range 1: FIL_PAGE_OFFSET up to, not including, FIL_PAGE_LSN */
+ frame + FIL_PAGE_TYPE, 2), /* range 2: the 2 bytes at FIL_PAGE_TYPE */
+ frame + FIL_PAGE_SPACE_ID,
+ srv_page_size - (FIL_PAGE_SPACE_ID + 8)); /* range 3: FIL_PAGE_SPACE_ID up to 8 bytes before the end */
+}
+
+inline void mtr_t::page_checksum(const buf_block_t &block)
+{
+ if (UNIV_LIKELY_NULL(block.page.zip.data))
+ return; /* FIXME: support ROW_FORMAT=COMPRESSED; until then no record is written when zip.data is set */
+ byte *l= log_write<OPTION>(block.page.id(), nullptr, 5, true, 0); /* 5 = 1 subtype byte + 4 checksum bytes */
+ *l++= OPT_PAGE_CHECKSUM; /* subtype byte comes first in the payload */
+ mach_write_to_4(l, block.page_checksum());
+ m_log.close(l + 4); /* l + 4 = first byte past the 4-byte checksum */
+}
+
+/** Functor that writes OPT_PAGE_CHECKSUM records for modified pages */
+struct Write_OPT_PAGE_CHECKSUM
+{
+ mtr_t &mtr; /* mini-transaction whose page_checksum() is invoked */
+ Write_OPT_PAGE_CHECKSUM(mtr_t &mtr) : mtr(mtr) {}
+
+ /** Write a record when the slot has MTR_MEMO_MODIFY set and the page status is below FREED. @return true always */
+ bool operator()(const mtr_memo_slot_t *slot) const
+ {
+ if (slot->type & MTR_MEMO_MODIFY)
+ {
+ const buf_block_t &block= *static_cast<const buf_block_t*>(slot->object);
+ if (block.page.status < buf_page_t::FREED) /* skip pages whose status is FREED or above */
+ mtr.page_checksum(block);
+ }
+ return true;
+ }
+};
+
/** Write the block contents to the REDO log */
struct mtr_write_log
{
@@ -986,6 +1036,11 @@ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
ulint len = m_log.size();
ut_ad(len > 0);
+#ifdef UNIV_DEBUG
+ m_memo.for_each_block(CIterate<Write_OPT_PAGE_CHECKSUM>(*this));
+ len = m_log.size();
+#endif
+
if (len > srv_log_buffer_size / 2) {
log_buffer_extend(ulong((len + 1) * 2));
}