diff options
Diffstat (limited to 'storage/innobase/include')
29 files changed, 616 insertions, 1442 deletions
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 0c61888e5cc..24a092ed5b5 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -808,7 +808,7 @@ public: { ut_ad(fsp_is_system_temporary(id().space())); ut_ad(in_file()); - ut_ad(!oldest_modification()); + ut_ad(!oldest_modification() || oldest_modification() == 2); oldest_modification_= 2; } @@ -1743,6 +1743,12 @@ public: FlushHp flush_hp; /** modified blocks (a subset of LRU) */ UT_LIST_BASE_NODE_T(buf_page_t) flush_list; + /** number of blocks ever added to flush_list; + sometimes protected by flush_list_mutex */ + size_t flush_list_requests; + + TPOOL_SUPPRESS_TSAN void add_flush_list_requests(size_t size) + { ut_ad(size); flush_list_requests+= size; } private: /** whether the page cleaner needs wakeup from indefinite sleep */ bool page_cleaner_is_idle; @@ -1753,7 +1759,7 @@ public: pthread_cond_t do_flush_list; /** @return whether the page cleaner must sleep due to being idle */ - bool page_cleaner_idle() const + bool page_cleaner_idle() const noexcept { mysql_mutex_assert_owner(&flush_list_mutex); return page_cleaner_is_idle; @@ -1878,24 +1884,31 @@ public: private: /** Remove a block from the flush list. */ - inline void delete_from_flush_list_low(buf_page_t *bpage); + inline void delete_from_flush_list_low(buf_page_t *bpage) noexcept; /** Remove a block from flush_list. @param bpage buffer pool page @param clear whether to invoke buf_page_t::clear_oldest_modification() */ - void delete_from_flush_list(buf_page_t *bpage, bool clear); + void delete_from_flush_list(buf_page_t *bpage, bool clear) noexcept; public: /** Remove a block from flush_list. @param bpage buffer pool page */ - void delete_from_flush_list(buf_page_t *bpage) + void delete_from_flush_list(buf_page_t *bpage) noexcept { delete_from_flush_list(bpage, true); } + /** Prepare to insert a modified block into flush_list. 
+ @param lsn start LSN of the mini-transaction + @return insert position for insert_into_flush_list() */ + inline buf_page_t *prepare_insert_into_flush_list(lsn_t lsn) noexcept; + /** Insert a modified block into the flush list. + @param prev insert position (from prepare_insert_into_flush_list()) @param block modified block @param lsn start LSN of the mini-transaction that modified the block */ - void insert_into_flush_list(buf_block_t *block, lsn_t lsn); + inline void insert_into_flush_list(buf_page_t *prev, buf_block_t *block, + lsn_t lsn) noexcept; /** Free a page whose underlying file page has been freed. */ - inline void release_freed_page(buf_page_t *bpage); + inline void release_freed_page(buf_page_t *bpage) noexcept; private: /** Temporary memory for page_compressed and encrypted I/O */ diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl index 3c4da98f83b..4516a24803c 100644 --- a/storage/innobase/include/buf0buf.inl +++ b/storage/innobase/include/buf0buf.inl @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2014, 2021, MariaDB Corporation. +Copyright (c) 2014, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -141,4 +141,3 @@ buf_block_get_modify_clock( ut_ad(block->page.lock.have_any()); return(block->modify_clock); } - diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index 665fd1115e7..af38f61b13b 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2021, MariaDB Corporation. +Copyright (c) 2014, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -103,33 +103,6 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn); @param furious true=furious flushing, false=limit to innodb_io_capacity */ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious); -/********************************************************************//** -This function should be called at a mini-transaction commit, if a page was -modified in it. Puts the block to the list of modified blocks, if it not -already in it. */ -inline void buf_flush_note_modification(buf_block_t *b, lsn_t start, lsn_t end) -{ - ut_ad(!srv_read_only_mode); - ut_d(const auto s= b->page.state()); - ut_ad(s > buf_page_t::FREED); - ut_ad(s < buf_page_t::READ_FIX); - ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= end); - mach_write_to_8(b->page.frame + FIL_PAGE_LSN, end); - if (UNIV_LIKELY_NULL(b->page.zip.data)) - memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data, - FIL_PAGE_LSN + b->page.frame, 8); - - const lsn_t oldest_modification= b->page.oldest_modification(); - - if (oldest_modification > 1) - ut_ad(oldest_modification <= start); - else if (fsp_is_system_temporary(b->page.id().space())) - b->page.set_temp_modified(); - else - buf_pool.insert_into_flush_list(b, start); - srv_stats.buf_pool_write_requests.inc(); -} - /** Initialize page_cleaner. */ ATTRIBUTE_COLD void buf_flush_page_cleaner_init(); diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h index c2b8c3e00b6..a9b4aeb1ea3 100644 --- a/storage/innobase/include/data0data.h +++ b/storage/innobase/include/data0data.h @@ -316,16 +316,6 @@ dtuple_get_n_ext( /*=============*/ const dtuple_t* tuple) /*!< in: tuple */ MY_ATTRIBUTE((nonnull)); -/** Compare two data tuples. 
-@param[in] tuple1 first data tuple -@param[in] tuple2 second data tuple -@return positive, 0, negative if tuple1 is greater, equal, less, than tuple2, -respectively */ -int -dtuple_coll_cmp( - const dtuple_t* tuple1, - const dtuple_t* tuple2) - MY_ATTRIBUTE((warn_unused_result)); /** Fold a prefix given as the number of fields of a tuple. @param[in] tuple index record @param[in] n_fields number of complete fields to fold diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 57af86bcf85..f580a0f49b5 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -1092,16 +1092,16 @@ dict_table_get_nth_col_pos( ulint n, /*!< in: column number */ ulint* prefix_col_pos) /*!< out: col num if prefix */ MY_ATTRIBUTE((nonnull(1), warn_unused_result)); -/*******************************************************************//** -Adds a column to index. */ -void -dict_index_add_col( -/*===============*/ - dict_index_t* index, /*!< in/out: index */ - const dict_table_t* table, /*!< in: table */ - dict_col_t* col, /*!< in: column */ - ulint prefix_len) /*!< in: column prefix length */ - MY_ATTRIBUTE((nonnull)); +/** Add a column to an index. +@param index index +@param table table +@param col column +@param prefix_len column prefix length +@param descending whether to use descending order */ +void dict_index_add_col(dict_index_t *index, const dict_table_t *table, + dict_col_t *col, ulint prefix_len, + bool descending= false) + MY_ATTRIBUTE((nonnull)); /*******************************************************************//** Copies types of fields contained in index to tuple. */ diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index aaf232366e3..5a316f8c734 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -396,18 +396,7 @@ dict_mem_index_create( ulint type, /*!< in: DICT_UNIQUE, DICT_CLUSTERED, ... 
ORed */ ulint n_fields); /*!< in: number of fields */ -/**********************************************************************//** -Adds a field definition to an index. NOTE: does not take a copy -of the column name if the field is a column. The memory occupied -by the column name may be released only after publishing the index. */ -void -dict_mem_index_add_field( -/*=====================*/ - dict_index_t* index, /*!< in: index */ - const char* name, /*!< in: column name */ - ulint prefix_len); /*!< in: 0 or the column prefix length - in a MySQL index like - INDEX (textcol(25)) */ + /**********************************************************************//** Frees an index memory object. */ void @@ -886,9 +875,11 @@ struct dict_field_t{ unsigned fixed_len:10; /*!< 0 or the fixed length of the column if smaller than DICT_ANTELOPE_MAX_INDEX_COL_LEN */ + /** 1=DESC, 0=ASC */ + unsigned descending:1; /** Zero-initialize all fields */ - dict_field_t() : col(NULL), name(NULL), prefix_len(0), fixed_len(0) {} + dict_field_t() { memset((void*) this, 0, sizeof *this); } /** Check whether two index fields are equivalent. @param[in] old the other index field @@ -1434,6 +1425,21 @@ inline void dict_col_t::detach(const dict_index_t &index) reinterpret_cast<dict_v_col_t*>(this)->detach(index); } +/** Add a field definition to an index. 
+@param index index +@param name pointer to column name +@param prefix_len column prefix length, or 0 +@param descending whether to use descending order */ +inline void dict_mem_index_add_field(dict_index_t *index, const char *name, + ulint prefix_len, bool descending= false) +{ + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + dict_field_t &field= index->fields[index->n_def++]; + field.name= name; + field.prefix_len= prefix_len & ((1U << 12) - 1); + field.descending= descending; +} + /** The status of online index creation */ enum online_index_status { /** the index is complete and ready for access */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 79583000173..d6c7e07eaf5 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -353,13 +353,10 @@ struct fil_space_t final /** fil_system.spaces chain node */ fil_space_t *hash; - lsn_t max_lsn; - /*!< LSN of the most recent - fil_names_write_if_was_clean(). - Reset to 0 by fil_names_clear(). - Protected by log_sys.mutex. - If and only if this is nonzero, the - tablespace will be in named_spaces. */ + /** LSN of the most recent fil_names_write_if_was_clean(). + Reset to 0 by fil_names_clear(). Protected by exclusive log_sys.latch. + If and only if max_lsn is nonzero, this is in fil_system.named_spaces. 
*/ + lsn_t max_lsn; /** tablespace identifier */ uint32_t id; /** whether undo tablespace truncation is in progress */ @@ -427,9 +424,10 @@ private: /** Whether any corrupton of this tablespace has been reported */ mutable std::atomic_flag is_corrupted; +public: /** mutex to protect freed_ranges and last_freed_lsn */ std::mutex freed_range_mutex; - +private: /** Ranges of freed page numbers; protected by freed_range_mutex */ range_set freed_ranges; @@ -649,11 +647,7 @@ public: /** @return last_freed_lsn */ lsn_t get_last_freed_lsn() { return last_freed_lsn; } /** Update last_freed_lsn */ - void update_last_freed_lsn(lsn_t lsn) - { - std::lock_guard<std::mutex> freed_lock(freed_range_mutex); - last_freed_lsn= lsn; - } + void update_last_freed_lsn(lsn_t lsn) { last_freed_lsn= lsn; } /** Note that the file will need fsync(). @return whether this needs to be added to fil_system.unflushed_spaces */ @@ -674,11 +668,7 @@ public: /** Clear all freed ranges for undo tablespace when InnoDB encounters TRIM redo log record */ - void clear_freed_ranges() - { - std::lock_guard<std::mutex> freed_lock(freed_range_mutex); - freed_ranges.clear(); - } + void clear_freed_ranges() { freed_ranges.clear(); } #endif /* !UNIV_INNOCHECKSUM */ /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags; check fsp0types.h to more info about flags. 
*/ @@ -951,7 +941,6 @@ public: /** Add the set of freed page ranges */ void add_free_range(const range_t range) { - std::lock_guard<std::mutex> freed_lock(freed_range_mutex); freed_ranges.add_range(range); } @@ -1052,7 +1041,7 @@ struct fil_node_t final { /** tablespace containing this file */ fil_space_t *space; - /** file name; protected by fil_system.mutex and log_sys.mutex */ + /** file name; protected by fil_system.mutex and exclusive log_sys.latch */ char *name; /** file handle */ pfs_os_file_t handle; @@ -1444,14 +1433,12 @@ public: /** nonzero if fil_node_open_file_low() should avoid moving the tablespace to the end of space_list, for FIFO policy of try_to_close() */ ulint freeze_space_list; + /** list of all tablespaces */ ilist<fil_space_t, space_list_tag_t> space_list; - /*!< list of all file spaces */ + /** list of all tablespaces for which a FILE_MODIFY record has been written + since the latest redo log checkpoint. + Protected only by exclusive log_sys.latch. */ ilist<fil_space_t, named_spaces_tag_t> named_spaces; - /*!< list of all file spaces - for which a FILE_MODIFY - record has been written since - the latest redo log checkpoint. - Protected only by log_sys.mutex. */ /** list of all ENCRYPTED=DEFAULT tablespaces that need to be converted to the current value of innodb_encrypt_tables */ @@ -1603,15 +1590,6 @@ Sets the max tablespace id counter if the given number is bigger than the previous value. */ void fil_set_max_space_id_if_bigger(uint32_t max_id); -/** Write the flushed LSN to the page header of the first page in the -system tablespace. -@param[in] lsn flushed LSN -@return DB_SUCCESS or error number */ -dberr_t -fil_write_flushed_lsn( - lsn_t lsn) -MY_ATTRIBUTE((warn_unused_result)); - MY_ATTRIBUTE((warn_unused_result)) /** Delete a tablespace and associated .ibd file. 
@param id tablespace identifier @@ -1780,50 +1758,14 @@ void fil_names_dirty( fil_space_t* space); -/** Write FILE_MODIFY records when a non-predefined persistent -tablespace was modified for the first time since the latest -fil_names_clear(). -@param[in,out] space tablespace */ -void fil_names_dirty_and_write(fil_space_t* space); - -/** Write FILE_MODIFY records if a persistent tablespace was modified -for the first time since the latest fil_names_clear(). -@param[in,out] space tablespace -@param[in,out] mtr mini-transaction -@return whether any FILE_MODIFY record was written */ -inline bool fil_names_write_if_was_clean(fil_space_t* space) -{ - mysql_mutex_assert_owner(&log_sys.mutex); - - if (space == NULL) { - return(false); - } - - const bool was_clean = space->max_lsn == 0; - ut_ad(space->max_lsn <= log_sys.get_lsn()); - space->max_lsn = log_sys.get_lsn(); - - if (was_clean) { - fil_names_dirty_and_write(space); - } - - return(was_clean); -} - bool fil_comp_algo_loaded(ulint comp_algo); /** On a log checkpoint, reset fil_names_dirty_and_write() flags -and write out FILE_MODIFY and FILE_CHECKPOINT if needed. -@param[in] lsn checkpoint LSN -@param[in] do_write whether to always write FILE_CHECKPOINT -@return whether anything was written to the redo log -@retval false if no flags were set and nothing written -@retval true if anything was written to the redo log */ -bool -fil_names_clear( - lsn_t lsn, - bool do_write); +and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT. 
+@param lsn checkpoint LSN +@return current LSN */ +lsn_t fil_names_clear(lsn_t lsn); #ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH void test_make_filepath(); diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h index 9dfb3cc7561..ce11b868bd1 100644 --- a/storage/innobase/include/fsp0file.h +++ b/storage/innobase/include/fsp0file.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2021, MariaDB Corporation. +Copyright (c) 2018, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -216,11 +216,10 @@ public: tablespace is opened. This occurs before the fil_space_t is created so the Space ID found here must not already be open. m_is_valid is set true on success, else false. - @param[out] flush_lsn contents of FIL_PAGE_FILE_FLUSH_LSN @retval DB_SUCCESS on if the datafile is valid @retval DB_CORRUPTION if the datafile is not readable @retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */ - dberr_t validate_first_page(lsn_t* flush_lsn) + dberr_t validate_first_page() MY_ATTRIBUTE((warn_unused_result)); /** Get Datafile::m_filepath. diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h index b6bdadd3501..514f3fdbf25 100644 --- a/storage/innobase/include/fsp0sysspace.h +++ b/storage/innobase/include/fsp0sysspace.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2021, MariaDB Corporation. +Copyright (c) 2016, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -151,20 +151,17 @@ public: @param[in] is_temp whether this is a temporary tablespace @param[in] create_new_db whether we are creating a new database @param[out] sum_new_sizes sum of sizes of the new files added - @param[out] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first file @return DB_SUCCESS or error code */ dberr_t open_or_create( bool is_temp, bool create_new_db, - ulint* sum_new_sizes, - lsn_t* flush_lsn) + ulint* sum_new_sizes) MY_ATTRIBUTE((warn_unused_result)); private: /** Check the tablespace header for this tablespace. - @param[out] flushed_lsn the value of FIL_PAGE_FILE_FLUSH_LSN @return DB_SUCCESS or error code */ - dberr_t read_lsn_and_check_flags(lsn_t* flushed_lsn); + inline dberr_t read_lsn_and_check_flags(); /** @return true if the last file size is valid. */ diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl index 2c2620511c7..2d8265d2206 100644 --- a/storage/innobase/include/ibuf0ibuf.inl +++ b/storage/innobase/include/ibuf0ibuf.inl @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -130,12 +131,17 @@ ibuf_should_try( a secondary index when we decide */ { - return(innodb_change_buffering - && ibuf.max_size != 0 - && !dict_index_is_clust(index) - && !dict_index_is_spatial(index) - && index->table->quiesce == QUIESCE_NONE - && (ignore_sec_unique || !dict_index_is_unique(index))); + if (!innodb_change_buffering || !ibuf.max_size || index->is_clust() || + index->is_spatial()) + return false; + if (!ignore_sec_unique && index->is_unique()) + return false; + if (index->table->quiesce != QUIESCE_NONE) + return false; + for (unsigned i= 0; i < index->n_fields; i++) + if (index->fields[i].descending) + return false; + return true; } /******************************************************************//** diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h index b9390927ece..22c0c9636bf 100644 --- a/storage/innobase/include/log0crypt.h +++ b/storage/innobase/include/log0crypt.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. -Copyright (C) 2014, 2021, MariaDB Corporation. +Copyright (C) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,27 +24,26 @@ Created 11/25/2013 Minli Zhu Modified Jan Lindström jan.lindstrom@mariadb.com MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation. 
*******************************************************/ -#ifndef log0crypt_h -#define log0crypt_h +#pragma once #include "log0log.h" -/** innodb_encrypt_log: whether to encrypt the redo log */ -extern my_bool srv_encrypt_log; - /** Initialize the redo log encryption key and random parameters when creating a new redo log. -The random parameters will be persisted in the log checkpoint pages. -@see log_crypt_write_checkpoint_buf() -@see log_crypt_read_checkpoint_buf() +The random parameters will be persisted in the log header. +@see log_crypt_write_header() +@see log_crypt_read_header() @return whether the operation succeeded */ bool log_crypt_init(); -/*********************************************************************//** -Writes the crypto (version, msg and iv) info, which has been used for -log blocks with lsn <= this checkpoint's lsn, to a log header's -checkpoint buf. */ -void log_crypt_write_checkpoint_buf(byte *buf); +/** Add the encryption information to the log header buffer. +@param buf part of log header buffer */ +void log_crypt_write_header(byte *buf); + +/** Read the encryption information from a redo log checkpoint buffer. +@param buf part of checkpoint buffer +@return whether the operation was successful */ +bool log_crypt_read_header(const byte *buf); /** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info. @param[in] buf checkpoint buffer @@ -60,25 +59,28 @@ ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn); /** Read the checkpoint crypto (version, msg and iv) info. @param[in] buf checkpoint buffer @return whether the operation was successful */ -bool log_crypt_read_checkpoint_buf(const byte* buf); - -/** log_crypt() operation code */ -enum log_crypt_t { - /** encrypt a log block without rotating key */ - LOG_ENCRYPT, - /** decrypt a log block */ - LOG_DECRYPT, - /** attempt to rotate the key, and encrypt a log block */ - LOG_ENCRYPT_ROTATE_KEY -}; - -/** Encrypt or decrypt log blocks. 
-@param[in,out] buf log blocks to encrypt or decrypt +ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf); + +/** Decrypt log blocks. +@param[in,out] buf log blocks to decrypt @param[in] lsn log sequence number of the start of the buffer @param[in] size size of the buffer, in bytes -@param[in] op whether to decrypt, encrypt, or rotate key and encrypt -@return whether the operation succeeded (encrypt always does) */ -bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op = LOG_ENCRYPT); +@return whether the operation succeeded */ +ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size); + +/** Decrypt part of a log record. +@param iv initialization vector +@param buf buffer for the decrypted data +@param data the encrypted data +@param len length of the data, in bytes +@return buf */ +byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len); + +/** Decrypt a log snippet. +@param iv initialization vector +@param buf buffer to be replaced with decrypted contents +@param end pointer past the end of buf */ +void log_decrypt_buf(const byte *iv, byte *buf, const byte *const end); /** Encrypt or decrypt a temporary file block. @param[in] src block to encrypt or decrypt @@ -111,7 +113,3 @@ log_tmp_block_decrypt( { return(log_tmp_block_encrypt(src, size, dst, offs, false)); } - -/** @return whether temporary files are encrypted */ -inline bool log_tmp_is_encrypted() { return srv_encrypt_log; } -#endif // log0crypt.h diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 629ddacdf1b..d1c6e40d946 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -31,14 +31,13 @@ Database log Created 12/9/1995 Heikki Tuuri *******************************************************/ -#ifndef log0log_h -#define log0log_h +#pragma once #include "log0types.h" #include "os0file.h" #include "span.h" #include "my_atomic_wrapper.h" -#include <vector> +#include "srw_lock.h" #include <string> using st_::span; @@ -51,77 +50,34 @@ static const char LOG_FILE_NAME[] = "ib_logfile0"; @return path with log file name*/ std::string get_log_file_path(const char *filename= LOG_FILE_NAME); -/** Returns paths for all existing log files */ -std::vector<std::string> get_existing_log_files_paths(); - /** Delete log file. @param[in] suffix suffix of the file name */ static inline void delete_log_file(const char* suffix) { auto path = get_log_file_path(LOG_FILE_NAME_PREFIX).append(suffix); - os_file_delete_if_exists(innodb_log_file_key, path.c_str(), nullptr); + os_file_delete_if_exists_func(path.c_str(), nullptr); } -/** Append a string to the log. -@param[in] str string -@param[in] len string length -@param[out] start_lsn start LSN of the log record -@return end lsn of the log record, zero if did not succeed */ -UNIV_INLINE -lsn_t -log_reserve_and_write_fast( - const void* str, - ulint len, - lsn_t* start_lsn); -/***********************************************************************//** -Checks if there is need for a log buffer flush or a new checkpoint, and does -this if yes. Any database operation should call this when it has modified -more than about 4 pages. NOTE that this function may only be called when the -OS thread owns no synchronization objects except dict_sys.latch. */ -UNIV_INLINE -void -log_free_check(void); -/*================*/ - -/** Extends the log buffer. -@param[in] len requested minimum size in bytes */ -void log_buffer_extend(ulong len); - -/** Calculate the recommended highest values for lsn - last_checkpoint_lsn -and lsn - buf_pool.get_oldest_modification(). 
-@param[in] file_size requested innodb_log_file_size -@retval true on success -@retval false if the smallest log is too small to -accommodate the number of OS threads in the database server */ -bool -log_set_capacity(ulonglong file_size) - MY_ATTRIBUTE((warn_unused_result)); +struct completion_callback; -/** -Ensure that the log has been written to the log file up to a given +/** Ensure that the log has been written to the log file up to a given log entry (such as that of a transaction commit). Start a new write, or wait and check if an already running write is covering the request. -@param[in] lsn log sequence number that should be -included in the redo log file write -@param[in] flush_to_disk whether the written log should also -be flushed to the file system -@param[in] rotate_key whether to rotate the encryption key -@param[in] cb completion callback. If not NULL, the callback will be called - whenever lsn is written or flushed. -*/ -struct completion_callback; -void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false, - const completion_callback* cb=nullptr); +@param lsn log sequence number that should be included in the file write +@param durable whether the write needs to be durable +@param callback log write completion callback */ +void log_write_up_to(lsn_t lsn, bool durable, + const completion_callback *callback= nullptr); /** Write to the log file up to the last log entry. -@param sync whether to wait for a durable write to complete */ -void log_buffer_flush_to_disk(bool sync= true); +@param durable whether to wait for a durable write to complete */ +void log_buffer_flush_to_disk(bool durable= true); -/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */ +/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */ ATTRIBUTE_COLD void log_write_and_flush_prepare(); -/** Durably write the log up to log_sys.lsn() and release log_sys.mutex. */ +/** Durably write the log up to log_sys.get_lsn(). 
*/ ATTRIBUTE_COLD void log_write_and_flush(); /** Make a checkpoint */ @@ -130,10 +86,6 @@ ATTRIBUTE_COLD void log_make_checkpoint(); /** Make a checkpoint at the latest lsn on shutdown. */ ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown(); -/** Write checkpoint info to the log header and release log_sys.mutex. -@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ -ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn); - /** Checks that there is enough free space in the log to start a new query step. Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this @@ -141,175 +93,12 @@ function may only be called if the calling thread owns no synchronization objects! */ ATTRIBUTE_COLD void log_check_margins(); -/************************************************************//** -Gets a log block flush bit. -@return TRUE if this block was the first to be written in a log flush */ -UNIV_INLINE -ibool -log_block_get_flush_bit( -/*====================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Gets a log block number stored in the header. -@return log block number stored in the block header */ -UNIV_INLINE -ulint -log_block_get_hdr_no( -/*=================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Gets a log block data length. -@return log block data length measured as a byte offset from the block start */ -UNIV_INLINE -ulint -log_block_get_data_len( -/*===================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Sets the log block data length. */ -UNIV_INLINE -void -log_block_set_data_len( -/*===================*/ - byte* log_block, /*!< in/out: log block */ - ulint len); /*!< in: data length */ -/** Calculate the CRC-32C checksum of a log block. 
-@param[in] block log block -@return checksum */ -inline ulint log_block_calc_checksum_crc32(const byte* block); - -/************************************************************//** -Gets a log block checksum field value. -@return checksum */ -UNIV_INLINE -ulint -log_block_get_checksum( -/*===================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Sets a log block checksum field value. */ -UNIV_INLINE -void -log_block_set_checksum( -/*===================*/ - byte* log_block, /*!< in/out: log block */ - ulint checksum); /*!< in: checksum */ -/************************************************************//** -Gets a log block first mtr log record group offset. -@return first mtr log record group byte offset from the block start, 0 -if none */ -UNIV_INLINE -ulint -log_block_get_first_rec_group( -/*==========================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Sets the log block first mtr log record group offset. */ -UNIV_INLINE -void -log_block_set_first_rec_group( -/*==========================*/ - byte* log_block, /*!< in/out: log block */ - ulint offset); /*!< in: offset, 0 if none */ -/************************************************************//** -Gets a log block checkpoint number field (4 lowest bytes). -@return checkpoint no (4 lowest bytes) */ -UNIV_INLINE -ulint -log_block_get_checkpoint_no( -/*========================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Initializes a log block in the log buffer. */ -UNIV_INLINE -void -log_block_init( -/*===========*/ - byte* log_block, /*!< in: pointer to the log buffer */ - lsn_t lsn); /*!< in: lsn within the log block */ -/************************************************************//** -Converts a lsn to a log block number. 
-@return log block number, it is > 0 and <= 1G */ -UNIV_INLINE -ulint -log_block_convert_lsn_to_no( -/*========================*/ - lsn_t lsn); /*!< in: lsn of a byte within the block */ /******************************************************//** Prints info of the log. */ void log_print( /*======*/ FILE* file); /*!< in: file where to print */ -/**********************************************************************//** -Refreshes the statistics used to print per-second averages. */ -void -log_refresh_stats(void); -/*===================*/ - -/* The counting of lsn's starts from this value: this must be non-zero */ -#define LOG_START_LSN ((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE)) - -/* Offsets of a log block header */ -#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and - is allowed to wrap around at 2G; the - highest bit is set to 1 if this is the - first log block in a log flush write - segment */ -#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL - /* mask used to get the highest bit in - the preceding field */ -#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to - this block */ -#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an - mtr log record group in this log block, - 0 if none; if the value is the same - as LOG_BLOCK_HDR_DATA_LEN, it means - that the first rec group has not yet - been catenated to this log block, but - if it will, it will start at this - offset; an archive recovery can - start parsing the log records starting - from this offset in this log block, - if value not 0 */ -#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of - log_sys.next_checkpoint_no when the - log block was last written to: if the - block has not yet been written full, - this value is only updated before a - log buffer flush */ -#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in - bytes */ - -#define LOG_BLOCK_KEY 4 /* encryption key version - before LOG_BLOCK_CHECKSUM; - after log_t::FORMAT_ENC_10_4 only 
*/ -#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block - contents; in InnoDB versions - < 3.23.52 this did not contain the - checksum but the same value as - LOG_BLOCK_HDR_NO */ - -/** Offsets inside the checkpoint pages (redo log format version 1) @{ */ -/** Checkpoint number */ -#define LOG_CHECKPOINT_NO 0 -/** Log sequence number up to which all changes have been flushed */ -#define LOG_CHECKPOINT_LSN 8 -/** Byte offset of the log record corresponding to LOG_CHECKPOINT_LSN */ -#define LOG_CHECKPOINT_OFFSET 16 -/** srv_log_buffer_size at the time of the checkpoint (not used) */ -#define LOG_CHECKPOINT_LOG_BUF_SIZE 24 -/** MariaDB 10.2.5 encrypted redo log encryption key version (32 bits)*/ -#define LOG_CHECKPOINT_CRYPT_KEY 32 -/** MariaDB 10.2.5 encrypted redo log random nonce (32 bits) */ -#define LOG_CHECKPOINT_CRYPT_NONCE 36 -/** MariaDB 10.2.5 encrypted redo log random message (MY_AES_BLOCK_SIZE) */ -#define LOG_CHECKPOINT_CRYPT_MESSAGE 40 -/** start LSN of the MLOG_CHECKPOINT mini-transaction corresponding -to this checkpoint, or 0 if the information has not been written */ -#define LOG_CHECKPOINT_END_LSN OS_FILE_LOG_BLOCK_SIZE - 16 - -/* @} */ /** Offsets of a log file header */ /* @{ */ @@ -317,12 +106,6 @@ to this checkpoint, or 0 if the information has not been written */ This used to be called LOG_GROUP_ID and always written as 0, because InnoDB never supported more than one copy of the redo log. */ #define LOG_HEADER_FORMAT 0 -/** Redo log subformat (originally 0). In format version 0, the -LOG_FILE_START_LSN started here, 4 bytes earlier than LOG_HEADER_START_LSN, -which the LOG_FILE_START_LSN was renamed to. -Subformat 1 is for the fully redo-logged TRUNCATE -(no MLOG_TRUNCATE records or extra log checkpoints or log file) */ -#define LOG_HEADER_SUBFORMAT 4 /** LSN of the start of data in this log file (with format version 1; in format version 0, it was called LOG_FILE_START_LSN and at offset 4). 
*/ #define LOG_HEADER_START_LSN 8 @@ -331,123 +114,69 @@ and the creation time if the log file was created by mysqlbackup --restore, or the MySQL version that created the redo log file. */ #define LOG_HEADER_CREATOR 16 /** End of the log file creator field. */ -#define LOG_HEADER_CREATOR_END (LOG_HEADER_CREATOR + 32) -/** Contents of the LOG_HEADER_CREATOR field */ -#define LOG_HEADER_CREATOR_CURRENT \ - "MariaDB " \ - IB_TO_STR(MYSQL_VERSION_MAJOR) "." \ - IB_TO_STR(MYSQL_VERSION_MINOR) "." \ - IB_TO_STR(MYSQL_VERSION_PATCH) - +#define LOG_HEADER_CREATOR_END 48 /* @} */ -#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE - /* first checkpoint field in the log - header; we write alternately to the - checkpoint fields when we make new - checkpoints; this field is only defined - in the first log file of a log */ -#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE) - /* second checkpoint field in the log - header */ -#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE) - -/** Abstraction for reading, writing and flushing file cache to disk */ -class file_io -{ -public: - file_io(bool durable_writes= false) : m_durable_writes(durable_writes) {} - virtual ~file_io() noexcept {}; - virtual dberr_t open(const char *path, bool read_only) noexcept= 0; - virtual dberr_t rename(const char *old_path, - const char *new_path) noexcept= 0; - virtual dberr_t close() noexcept= 0; - virtual dberr_t read(os_offset_t offset, span<byte> buf) noexcept= 0; - virtual dberr_t write(const char *path, os_offset_t offset, - span<const byte> buf) noexcept= 0; - virtual dberr_t flush() noexcept= 0; - - /** Durable writes doesn't require calling flush() */ - bool writes_are_durable() const noexcept { return m_durable_writes; } - -protected: - bool m_durable_writes; -}; +struct log_t; -class file_os_io final: public file_io -{ -public: - file_os_io()= default; - file_os_io(const file_os_io &)= delete; - file_os_io &operator=(const file_os_io &)= delete; - file_os_io(file_os_io &&rhs); - 
file_os_io &operator=(file_os_io &&rhs); - ~file_os_io() noexcept; - - dberr_t open(const char *path, bool read_only) noexcept final; - bool is_opened() const noexcept { return m_fd != OS_FILE_CLOSED; } - dberr_t rename(const char *old_path, const char *new_path) noexcept final; - dberr_t close() noexcept final; - dberr_t read(os_offset_t offset, span<byte> buf) noexcept final; - dberr_t write(const char *path, os_offset_t offset, - span<const byte> buf) noexcept final; - dberr_t flush() noexcept final; - -private: - pfs_os_file_t m_fd{OS_FILE_CLOSED}; -}; - -/** File abstraction + path */ +/** File abstraction */ class log_file_t { + friend log_t; + os_file_t m_file{OS_FILE_CLOSED}; public: - log_file_t(std::string path= "") noexcept : m_path{std::move(path)} {} - - dberr_t open(bool read_only) noexcept; - bool is_opened() const noexcept; + log_file_t()= default; + log_file_t(os_file_t file) noexcept : m_file(file) {} - const std::string &get_path() const noexcept { return m_path; } + /** Open a file + @return file size in bytes + @retval 0 if not readable */ + os_offset_t open(bool read_only) noexcept; + bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; } - dberr_t rename(std::string new_path) noexcept; dberr_t close() noexcept; dberr_t read(os_offset_t offset, span<byte> buf) noexcept; - bool writes_are_durable() const noexcept; - dberr_t write(os_offset_t offset, span<const byte> buf) noexcept; - dberr_t flush() noexcept; - void free() - { - m_path.clear(); - m_path.shrink_to_fit(); - } - -private: - std::unique_ptr<file_io> m_file; - std::string m_path; + void write(os_offset_t offset, span<const byte> buf) noexcept; + bool flush() const noexcept { return os_file_flush(m_file); } +#ifdef HAVE_PMEM + byte *mmap(bool read_only, const struct stat &st) noexcept; +#endif }; /** Redo log buffer */ -struct log_t{ +struct log_t +{ /** The original (not version-tagged) InnoDB redo log format */ - static constexpr uint32_t FORMAT_3_23 = 0; + static 
constexpr uint32_t FORMAT_3_23= 0; /** The MySQL 5.7.9/MariaDB 10.2.2 log format */ - static constexpr uint32_t FORMAT_10_2 = 1; - /** The MariaDB 10.3.2 log format. - To prevent crash-downgrade to earlier 10.2 due to the inability to - roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record, - MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT - 1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2 - (MDEV-13564 backup-friendly TRUNCATE). */ - static constexpr uint32_t FORMAT_10_3 = 103; + static constexpr uint32_t FORMAT_10_2= 1; + /** The MariaDB 10.3.2 log format. */ + static constexpr uint32_t FORMAT_10_3= 103; /** The MariaDB 10.4.0 log format. */ - static constexpr uint32_t FORMAT_10_4 = 104; + static constexpr uint32_t FORMAT_10_4= 104; /** Encrypted MariaDB redo log */ - static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31; + static constexpr uint32_t FORMAT_ENCRYPTED= 1U << 31; /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */ - static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED; - /** The MariaDB 10.5 physical redo log format */ - static constexpr uint32_t FORMAT_10_5 = 0x50485953; - /** The MariaDB 10.5 physical format (only with innodb_encrypt_log=ON) */ - static constexpr uint32_t FORMAT_ENC_10_5 = FORMAT_10_5 | FORMAT_ENCRYPTED; + static constexpr uint32_t FORMAT_ENC_10_4= FORMAT_10_4 | FORMAT_ENCRYPTED; + /** The MariaDB 10.5.1 physical redo log format */ + static constexpr uint32_t FORMAT_10_5= 0x50485953; + /** The MariaDB 10.5.1 physical format (only with innodb_encrypt_log=ON) */ + static constexpr uint32_t FORMAT_ENC_10_5= FORMAT_10_5 | FORMAT_ENCRYPTED; + /** The MariaDB 10.8.0 variable-block-size redo log format */ + static constexpr uint32_t FORMAT_10_8= 0x50687973; + /** The MariaDB 10.8.0 format with innodb_encrypt_log=ON */ + static constexpr uint32_t FORMAT_ENC_10_8= FORMAT_10_8 | FORMAT_ENCRYPTED; + + /** Location of the 
first checkpoint block */ + static constexpr size_t CHECKPOINT_1= 4096; + /** Location of the second checkpoint block */ + static constexpr size_t CHECKPOINT_2= 8192; + /** Start of record payload */ + static constexpr lsn_t START_OFFSET= 12288; + + /** smallest possible log sequence number in the current format + (used to be 2048 before FORMAT_10_8). */ + static constexpr lsn_t FIRST_LSN= START_OFFSET; private: /** The log sequence number of the last change of durable InnoDB files */ @@ -459,127 +188,77 @@ private: preflush buffer pool pages, or initiate a log checkpoint. This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */ std::atomic<bool> check_flush_or_checkpoint_; + + +#if defined(__aarch64__) +/* On ARM, we do more spinning */ +typedef srw_spin_lock log_rwlock_t; +#define LSN_LOCK_ATTR MY_MUTEX_INIT_FAST +#else +typedef srw_lock log_rwlock_t; +#define LSN_LOCK_ATTR nullptr +#endif + public: - /** mutex protecting the log */ - alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; - /** first free offset within the log buffer in use */ - size_t buf_free; - /** recommended maximum size of buf, after which the buffer is flushed */ - size_t max_buf_free; - /** mutex to serialize access to the flush list when we are putting - dirty blocks in the list. The idea behind this mutex is to be able - to release log_sys.mutex during mtr_commit and still ensure that - insertions in the flush_list happen in the LSN order. */ - alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex; - /** log_buffer, append data here */ + /** rw-lock protecting buf */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock_t latch; +private: + /** Last written LSN */ + lsn_t write_lsn; +public: + /** log record buffer, written to by mtr_t::commit() */ byte *buf; - /** log_buffer, writing data to file from this buffer. 
- Before flushing write_buf is swapped with flush_buf */ + /** buffer for writing data to ib_logfile0, or nullptr if is_pmem() + In write_buf(), buf and flush_buf are swapped */ byte *flush_buf; - /** Log file stuff. Protected by mutex. */ - struct file { - /** format of the redo log: e.g., FORMAT_10_5 */ - uint32_t format; - /** redo log subformat: 0 with separately logged TRUNCATE, - 2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */ - uint32_t subformat; - /** individual log file size in bytes, including the header */ - lsn_t file_size; - private: - /** lsn used to fix coordinates within the log group */ - lsn_t lsn; - /** the byte offset of the above lsn */ - lsn_t lsn_offset; - /** log file */ - log_file_t fd; - - public: - /** used only in recovery: recovery scan succeeded up to this - lsn in this log group */ - lsn_t scanned_lsn; - - /** opens log file which must be closed prior this call */ - void open_file(std::string path); - /** writes header */ - void write_header_durable(lsn_t lsn); - /** opens log file which must be closed prior this call */ - dberr_t rename(std::string path) { return fd.rename(path); } - /** reads buffer from log file - @param[in] offset offset in log file - @param[in] buf buffer where to read */ - void read(os_offset_t offset, span<byte> buf); - /** Tells whether writes require calling flush() */ - bool writes_are_durable() const noexcept; - /** writes buffer to log file - @param[in] offset offset in log file - @param[in] buf buffer from which to write */ - void write(os_offset_t offset, span<byte> buf); - /** flushes OS page cache (excluding metadata!) 
for log file */ - void flush(); - /** closes log file */ - void close_file(); - - /** @return whether the redo log is encrypted */ - bool is_encrypted() const { return format & FORMAT_ENCRYPTED; } - /** @return whether the redo log is in the physical format */ - bool is_physical() const - { return (format & ~FORMAT_ENCRYPTED) == FORMAT_10_5; } - /** @return capacity in bytes */ - lsn_t capacity() const{ return file_size - LOG_FILE_HDR_SIZE; } - /** Calculate the offset of a log sequence number. - @param[in] lsn log sequence number - @return offset within the log */ - inline lsn_t calc_lsn_offset(lsn_t lsn) const; - inline lsn_t calc_lsn_offset_old(lsn_t lsn) const; - - /** Set the field values to correspond to a given lsn. */ - void set_fields(lsn_t lsn) - { - lsn_t c_lsn_offset = calc_lsn_offset(lsn); - set_lsn(lsn); - set_lsn_offset(c_lsn_offset); - } - - /** Read a log segment to log_sys.buf. - @param[in,out] start_lsn in: read area start, - out: the last read valid lsn - @param[in] end_lsn read area end - @return whether no invalid blocks (e.g checksum mismatch) were found */ - bool read_log_seg(lsn_t* start_lsn, lsn_t end_lsn); - - /** Initialize the redo log buffer. */ - void create(); - - /** Close the redo log buffer. 
*/ - void close() { close_file(); } - void set_lsn(lsn_t a_lsn); - lsn_t get_lsn() const { return lsn; } - void set_lsn_offset(lsn_t a_lsn); - lsn_t get_lsn_offset() const { return lsn_offset; } - } log; - - /** The fields involved in the log buffer flush @{ */ - - size_t buf_next_to_write;/*!< first offset in the log buffer - where the byte content may not exist - written to file, e.g., the start - offset of a log record catenated - later; this is advanced when a flush - operation is completed to all the log - groups */ - lsn_t write_lsn; /*!< last written lsn */ - lsn_t current_flush_lsn;/*!< end lsn for the current running - write + flush operation */ - std::atomic<size_t> pending_flushes; /*!< system calls in progress */ - std::atomic<size_t> flushes; /*!< system calls counter */ - - ulint n_log_ios; /*!< number of log i/os initiated thus - far */ - ulint n_log_ios_old; /*!< number of log i/o's at the - previous printout */ - time_t last_printout_time;/*!< when log_print was last time - called */ - /* @} */ + /** number of std::swap(buf, flush_buf) and writes from buf to log; + protected by latch.wr_lock() */ + ulint write_to_log; + /** innodb_log_buffer_size (size of buf and flush_buf, in bytes) */ + size_t buf_size; + +private: + /** spin lock protecting lsn, buf_free in append_prepare() */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) pthread_mutex_t lsn_lock; + void init_lsn_lock() { pthread_mutex_init(&lsn_lock, LSN_LOCK_ATTR); } + void lock_lsn() { pthread_mutex_lock(&lsn_lock); } + void unlock_lsn() { pthread_mutex_unlock(&lsn_lock); } + void destroy_lsn_lock() { pthread_mutex_destroy(&lsn_lock); } + +public: + /** first free offset within buf; protected by lsn_lock */ + Atomic_relaxed<size_t> buf_free; + /** number of write requests (to buf); protected by exclusive lsn_lock */ + ulint write_to_buf; + /** number of waits in append_prepare(); protected by lsn_lock */ + ulint waits; + /** recommended maximum size of buf, after which the buffer is flushed */ + 
size_t max_buf_free; + + /** log file size in bytes, including the header */ + lsn_t file_size; +private: + /** the log sequence number at the start of the log file */ + lsn_t first_lsn; +#if defined __linux__ || defined _WIN32 + /** The physical block size of the storage */ + uint32_t block_size; +#endif +public: + /** format of the redo log: e.g., FORMAT_10_8 */ + uint32_t format; + /** Log file */ + log_file_t log; +#if defined __linux__ || defined _WIN32 + /** whether file system caching is enabled for the log */ + my_bool log_buffered; +# ifdef _WIN32 + static constexpr bool log_maybe_unbuffered= true; +# else + /** whether file system caching may be disabled */ + bool log_maybe_unbuffered; +# endif +#endif /** Fields involved in checkpoints @{ */ lsn_t log_capacity; /*!< capacity of the log; if @@ -597,12 +276,12 @@ public: /*!< this is the maximum allowed value for lsn - last_checkpoint_lsn when a new query step is started */ - ib_uint64_t next_checkpoint_no; - /*!< next checkpoint number */ - /** latest completed checkpoint (protected by log_sys.mutex) */ + /** latest completed checkpoint (protected by latch.wr_lock()) */ Atomic_relaxed<lsn_t> last_checkpoint_lsn; /** next checkpoint LSN (protected by log_sys.mutex) */ lsn_t next_checkpoint_lsn; + /** next checkpoint number (protected by latch.wr_lock()) */ + ulint next_checkpoint_no; /** whether a checkpoint is pending */ Atomic_relaxed<bool> checkpoint_pending; @@ -610,32 +289,68 @@ public: byte *checkpoint_buf; /* @} */ -private: - bool m_initialised; -public: - /** - Constructor. + bool is_initialised() const noexcept { return max_buf_free != 0; } - Some members may require late initialisation, thus we just mark object as - uninitialised. Real initialisation happens in create(). 
- */ - log_t(): m_initialised(false) {} +#ifdef HAVE_PMEM + bool is_pmem() const noexcept { return !flush_buf; } +#else + static constexpr bool is_pmem() { return false; } +#endif - /** @return whether the redo log is encrypted */ - bool is_encrypted() const { return(log.is_encrypted()); } - /** @return whether the redo log is in the physical format */ - bool is_physical() const { return log.is_physical(); } + bool is_opened() const noexcept { return log.is_opened(); } + + static constexpr bool resize_in_progress() { return false; } + + /** Rename a log file after resizing. + @return whether an error occurred */ + static bool rename_resized() noexcept; + +#if defined __linux__ || defined _WIN32 + /** Try to enable or disable file system caching (update log_buffered) */ + void set_buffered(bool buffered); +#endif + + void attach(log_file_t file, os_offset_t size); + + void close_file(); - bool is_initialised() const { return m_initialised; } + /** Calculate the checkpoint safety margins. */ + static void set_capacity(); lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const { return lsn.load(order); } void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_release); } - lsn_t get_flushed_lsn() const - { return flushed_to_disk_lsn.load(std::memory_order_acquire); } - void set_flushed_lsn(lsn_t lsn) - { flushed_to_disk_lsn.store(lsn, std::memory_order_release); } + lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire) + const noexcept + { return flushed_to_disk_lsn.load(order); } + + /** Initialize the LSN on initial log file creation. 
*/ + lsn_t init_lsn() noexcept + { + latch.wr_lock(SRW_LOCK_CALL); + const lsn_t lsn{get_lsn()}; + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn= lsn; + latch.wr_unlock(); + return lsn; + } + + void set_recovered_lsn(lsn_t lsn) noexcept + { +#ifndef SUX_LOCK_GENERIC + ut_ad(latch.is_write_locked()); +#endif /* SUX_LOCK_GENERIC */ + write_lsn= lsn; + this->lsn.store(lsn, std::memory_order_relaxed); + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + } + +#ifdef HAVE_PMEM + /** Persist the log. + @param lsn desired new value of flushed_to_disk_lsn */ + inline void persist(lsn_t lsn) noexcept; +#endif bool check_flush_or_checkpoint() const { @@ -645,96 +360,105 @@ public: void set_check_flush_or_checkpoint(bool flag= true) { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); } - bool has_encryption_key_rotation() const { - return log.format == FORMAT_ENC_10_4 || log.format == FORMAT_ENC_10_5; - } + /** Make previous write_buf() durable and update flushed_to_disk_lsn. */ + inline bool flush(lsn_t lsn) noexcept; - /** @return the log block header + trailer size */ - unsigned framing_size() const - { - return has_encryption_key_rotation() - ? LOG_BLOCK_HDR_SIZE + LOG_BLOCK_KEY + LOG_BLOCK_CHECKSUM - : LOG_BLOCK_HDR_SIZE + LOG_BLOCK_CHECKSUM; - } - /** @return the log block payload size */ - unsigned payload_size() const - { - return has_encryption_key_rotation() - ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM - - LOG_BLOCK_KEY - : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM; - } - /** @return the log block trailer offset */ - unsigned trailer_offset() const + /** Initialise the redo log subsystem. */ + void create(); + + /** Shut down the redo log subsystem. 
*/ + void close(); + +#if defined __linux__ || defined _WIN32 + /** @return the physical block size of the storage */ + size_t get_block_size() const noexcept + { ut_ad(block_size); return block_size; } + /** Set the log block size for file I/O. */ + void set_block_size(uint32_t size) noexcept { block_size= size; } +#else + /** @return the physical block size of the storage */ + static size_t get_block_size() { return 512; } +#endif + +private: + /** Wait in append_prepare() for buffer to become available + @param ex whether log_sys.latch is exclusively locked */ + ATTRIBUTE_COLD static void append_prepare_wait(bool ex) noexcept; +public: + /** Reserve space in the log buffer for appending data. + @tparam pmem log_sys.is_pmem() + @param size total length of the data to append(), in bytes + @param ex whether log_sys.latch is exclusively locked + @return the start LSN and the buffer position for append() */ + template<bool pmem> + inline std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept; + + /** Append a string of bytes to the redo log. + @param d destination + @param s string of bytes + @param size length of s, in bytes */ + void append(byte *&d, const void *s, size_t size) noexcept { - return has_encryption_key_rotation() - ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY - : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM; +#ifndef SUX_LOCK_GENERIC + ut_ad(latch.is_locked()); +#endif + ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size)); + memcpy(d, s, size); + d+= size; } - size_t get_pending_flushes() const + /** Set the log file format. */ + void set_latest_format(bool encrypted) noexcept + { format= encrypted ? 
FORMAT_ENC_10_8 : FORMAT_10_8; } + /** @return whether the redo log is encrypted */ + bool is_encrypted() const noexcept { return format & FORMAT_ENCRYPTED; } + /** @return whether the redo log is in the latest format */ + bool is_latest() const noexcept + { return (~FORMAT_ENCRYPTED & format) == FORMAT_10_8; } + + /** @return capacity in bytes */ + lsn_t capacity() const noexcept { return file_size - START_OFFSET; } + + /** Set the LSN of the log file at file creation. */ + void set_first_lsn(lsn_t lsn) noexcept { write_lsn= first_lsn= lsn; } + /** @return the first LSN of the log file */ + lsn_t get_first_lsn() const noexcept { return first_lsn; } + + /** Determine the sequence bit at a log sequence number */ + byte get_sequence_bit(lsn_t lsn) const noexcept { - return pending_flushes.load(std::memory_order_relaxed); + ut_ad(lsn >= first_lsn); + return !(((lsn - first_lsn) / capacity()) & 1); } - size_t get_flushes() const + /** Calculate the offset of a log sequence number. + @param lsn log sequence number + @return byte offset within ib_logfile0 */ + lsn_t calc_lsn_offset(lsn_t lsn) const noexcept { - return flushes.load(std::memory_order_relaxed); + ut_ad(lsn >= first_lsn); + return START_OFFSET + (lsn - first_lsn) % capacity(); } - /** Initialise the redo log subsystem. */ - void create(); + /** Write checkpoint information and invoke latch.wr_unlock(). + @param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ + inline void write_checkpoint(lsn_t end_lsn) noexcept; - /** Shut down the redo log subsystem. */ - void close(); + /** Write buf to ib_logfile0. + @tparam release_latch whether to invoke latch.wr_unlock() + @return new write target + @retval 0 if everything was written */ + template<bool release_latch> inline lsn_t write_buf() noexcept; + + /** Create the log. 
*/ + void create(lsn_t lsn) noexcept; }; /** Redo log system */ extern log_t log_sys; -#ifdef UNIV_DEBUG -extern bool log_write_lock_own(); -#endif - -/** Calculate the offset of a log sequence number. -@param[in] lsn log sequence number -@return offset within the log */ -inline lsn_t log_t::file::calc_lsn_offset(lsn_t lsn) const -{ - ut_ad(this == &log_sys.log); - /* The lsn parameters are updated while holding both the mutexes - and it is ok to have either of them while reading */ -#ifdef SAFE_MUTEX - ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); -#endif /* SAFE_MUTEX */ - const lsn_t size = capacity(); - lsn_t l= lsn - this->lsn; - if (longlong(l) < 0) { - l = lsn_t(-longlong(l)) % size; - l = size - l; - } - - l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size); - l %= size; - return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE)); -} -inline void log_t::file::set_lsn(lsn_t a_lsn) +inline void log_free_check() { -#ifdef SAFE_MUTEX - ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); -#endif /* SAFE_MUTEX */ - lsn= a_lsn; + if (log_sys.check_flush_or_checkpoint()) + log_check_margins(); } - -inline void log_t::file::set_lsn_offset(lsn_t a_lsn) -{ -#ifdef SAFE_MUTEX - ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); -#endif /* SAFE_MUTEX */ - ut_ad((lsn % OS_FILE_LOG_BLOCK_SIZE) == (a_lsn % OS_FILE_LOG_BLOCK_SIZE)); - lsn_offset= a_lsn; -} - -#include "log0log.inl" - -#endif diff --git a/storage/innobase/include/log0log.inl b/storage/innobase/include/log0log.inl deleted file mode 100644 index 73434737925..00000000000 --- a/storage/innobase/include/log0log.inl +++ /dev/null @@ -1,311 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. 
- -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/log0log.ic -Database log - -Created 12/9/1995 Heikki Tuuri -*******************************************************/ - -#include "mach0data.h" -#include "assume_aligned.h" -#include "ut0crc32.h" - -extern ulong srv_log_buffer_size; - -/************************************************************//** -Gets a log block flush bit. -@return TRUE if this block was the first to be written in a log flush */ -UNIV_INLINE -ibool -log_block_get_flush_bit( -/*====================*/ - const byte* log_block) /*!< in: log block */ -{ - static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); - static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility"); - - return *log_block & 0x80; -} - -/************************************************************//** -Sets the log block flush bit. 
*/ -UNIV_INLINE -void -log_block_set_flush_bit( -/*====================*/ - byte* log_block, /*!< in/out: log block */ - ibool val) /*!< in: value to set */ -{ - static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); - static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility"); - - if (val) - *log_block|= 0x80; - else - *log_block&= 0x7f; -} - -/************************************************************//** -Gets a log block number stored in the header. -@return log block number stored in the block header */ -UNIV_INLINE -ulint -log_block_get_hdr_no( -/*=================*/ - const byte* log_block) /*!< in: log block */ -{ - static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); - return mach_read_from_4(my_assume_aligned<4>(log_block)) & - ~LOG_BLOCK_FLUSH_BIT_MASK; -} - -/************************************************************//** -Sets the log block number stored in the header; NOTE that this must be set -before the flush bit! */ -UNIV_INLINE -void -log_block_set_hdr_no( -/*=================*/ - byte* log_block, /*!< in/out: log block */ - ulint n) /*!< in: log block number: must be > 0 and - < LOG_BLOCK_FLUSH_BIT_MASK */ -{ - static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); - ut_ad(n > 0); - ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK); - - mach_write_to_4(my_assume_aligned<4>(log_block), n); -} - -/************************************************************//** -Gets a log block data length. -@return log block data length measured as a byte offset from the block start */ -UNIV_INLINE -ulint -log_block_get_data_len( -/*===================*/ - const byte* log_block) /*!< in: log block */ -{ - return mach_read_from_2(my_assume_aligned<2> - (log_block + LOG_BLOCK_HDR_DATA_LEN)); -} - -/************************************************************//** -Sets the log block data length. 
*/ -UNIV_INLINE -void -log_block_set_data_len( -/*===================*/ - byte* log_block, /*!< in/out: log block */ - ulint len) /*!< in: data length */ -{ - mach_write_to_2(my_assume_aligned<2>(log_block + LOG_BLOCK_HDR_DATA_LEN), - len); -} - -/************************************************************//** -Gets a log block first mtr log record group offset. -@return first mtr log record group byte offset from the block start, 0 -if none */ -UNIV_INLINE -ulint -log_block_get_first_rec_group( -/*==========================*/ - const byte* log_block) /*!< in: log block */ -{ - return mach_read_from_2(my_assume_aligned<2> - (log_block + LOG_BLOCK_FIRST_REC_GROUP)); -} - -/************************************************************//** -Sets the log block first mtr log record group offset. */ -UNIV_INLINE -void -log_block_set_first_rec_group( -/*==========================*/ - byte* log_block, /*!< in/out: log block */ - ulint offset) /*!< in: offset, 0 if none */ -{ - mach_write_to_2(my_assume_aligned<2> - (log_block + LOG_BLOCK_FIRST_REC_GROUP), offset); -} - -/************************************************************//** -Gets a log block checkpoint number field (4 lowest bytes). -@return checkpoint no (4 lowest bytes) */ -UNIV_INLINE -ulint -log_block_get_checkpoint_no( -/*========================*/ - const byte* log_block) /*!< in: log block */ -{ - return mach_read_from_4(my_assume_aligned<4> - (log_block + LOG_BLOCK_CHECKPOINT_NO)); -} - -/************************************************************//** -Sets a log block checkpoint number field (4 lowest bytes). */ -UNIV_INLINE -void -log_block_set_checkpoint_no( -/*========================*/ - byte* log_block, /*!< in/out: log block */ - ib_uint64_t no) /*!< in: checkpoint no */ -{ - mach_write_to_4(my_assume_aligned<4>(log_block + LOG_BLOCK_CHECKPOINT_NO), - static_cast<uint32_t>(no)); -} - -/************************************************************//** -Converts a lsn to a log block number. 
-@return log block number, it is > 0 and <= 1G */ -UNIV_INLINE -ulint -log_block_convert_lsn_to_no( -/*========================*/ - lsn_t lsn) /*!< in: lsn of a byte within the block */ -{ - return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & - (DBUG_IF("innodb_small_log_block_no_limit") - ? 0xFUL : 0x3FFFFFFFUL)) + 1); -} - -/** Calculate the CRC-32C checksum of a log block. -@param[in] block log block -@return checksum */ -inline ulint log_block_calc_checksum_crc32(const byte* block) -{ - return ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM); -} - -/************************************************************//** -Gets a log block checksum field value. -@return checksum */ -UNIV_INLINE -ulint -log_block_get_checksum( -/*===================*/ - const byte* log_block) /*!< in: log block */ -{ - return mach_read_from_4(my_assume_aligned<4> - (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM + - log_block)); -} - -/************************************************************//** -Sets a log block checksum field value. */ -UNIV_INLINE -void -log_block_set_checksum( -/*===================*/ - byte* log_block, /*!< in/out: log block */ - ulint checksum) /*!< in: checksum */ -{ - mach_write_to_4(my_assume_aligned<4> - (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM + - log_block), checksum); -} - -/************************************************************//** -Initializes a log block in the log buffer. */ -UNIV_INLINE -void -log_block_init( -/*===========*/ - byte* log_block, /*!< in: pointer to the log buffer */ - lsn_t lsn) /*!< in: lsn within the log block */ -{ - ulint no; - - no = log_block_convert_lsn_to_no(lsn); - - log_block_set_hdr_no(log_block, no); - - log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); - log_block_set_first_rec_group(log_block, 0); -} - -/** Append a string to the log. 
-@param[in] str string -@param[in] len string length -@param[out] start_lsn start LSN of the log record -@return end lsn of the log record, zero if did not succeed */ -UNIV_INLINE -lsn_t -log_reserve_and_write_fast( - const void* str, - ulint len, - lsn_t* start_lsn) -{ - mysql_mutex_assert_owner(&log_sys.mutex); - ut_ad(len > 0); - - const ulint data_len = len - + log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; - - if (data_len >= log_sys.trailer_offset()) { - - /* The string does not fit within the current log block - or the log block would become full */ - - return(0); - } - - lsn_t lsn = log_sys.get_lsn(); - *start_lsn = lsn; - - memcpy(log_sys.buf + log_sys.buf_free, str, len); - - log_block_set_data_len( - reinterpret_cast<byte*>(ut_align_down( - log_sys.buf + log_sys.buf_free, - OS_FILE_LOG_BLOCK_SIZE)), - data_len); - - log_sys.buf_free += len; - - ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size}); - - lsn += len; - log_sys.set_lsn(lsn); - - return lsn; -} - -/***********************************************************************//** -Checks if there is need for a log buffer flush or a new checkpoint, and does -this if yes. Any database operation should call this when it has modified -more than about 4 pages. NOTE that this function may only be called when the -OS thread owns no synchronization objects except dict_sys.latch. */ -UNIV_INLINE -void -log_free_check(void) -/*================*/ -{ - /* During row_log_table_apply(), this function will be called while we - are holding some latches. This is OK, as long as we are not holding - any latches on buffer blocks. */ - - if (log_sys.check_flush_or_checkpoint()) { - - log_check_margins(); - } -} diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 6c47c135526..c661c52905b 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -37,13 +37,6 @@ Created 9/20/1997 Heikki Tuuri /** @return whether recovery is currently running. 
*/ #define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on) -/** Find the latest checkpoint in the log header. -@param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 -@return error code or DB_SUCCESS */ -dberr_t -recv_find_max_checkpoint(ulint* max_field) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) /** Apply any buffered redo log to a page that was just read from a data file. @param[in,out] space tablespace @@ -52,12 +45,9 @@ ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) bool recv_recover_page(fil_space_t* space, buf_page_t* bpage); /** Start recovering from a redo log checkpoint. -@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first system tablespace page @return error code or DB_SUCCESS */ -dberr_t -recv_recovery_from_checkpoint_start( - lsn_t flush_lsn); +dberr_t recv_recovery_from_checkpoint_start(); /** Whether to store redo log records in recv_sys.pages */ enum store_t { @@ -70,17 +60,6 @@ enum store_t { }; -/** Adds data from a new log block to the parsing buffer of recv_sys if -recv_sys.parse_start_lsn is non-zero. -@param[in] log_block log block to add -@param[in] scanned_lsn lsn of how far we were able to find - data in this log block -@return true if more data added */ -bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn); - -/** Moves the parsing buffer data left to the buffer start */ -void recv_sys_justify_left_parsing_buf(); - /** Report an operation to create, delete, or rename a file during backup. 
@param[in] space_id tablespace identifier @param[in] type file operation redo log type @@ -228,35 +207,24 @@ private: during log scan or apply */ bool found_corrupt_fs; public: + /** @return maximum guaranteed size of a mini-transaction on recovery */ + static constexpr size_t MTR_SIZE_MAX{1U << 20}; + /** whether we are applying redo log records during crash recovery */ bool recovery_on; /** whether recv_recover_page(), invoked from buf_page_t::read_complete(), should apply log records*/ bool apply_log_recs; - byte* buf; /*!< buffer for parsing log records */ - ulint len; /*!< amount of data in buf */ - lsn_t parse_start_lsn; - /*!< this is the lsn from which we were able to - start parsing log records and adding them to - pages; zero if a suitable - start point not found yet */ - lsn_t scanned_lsn; - /*!< the log data has been scanned up to this - lsn */ - ulint scanned_checkpoint_no; - /*!< the log data has been scanned up to this - checkpoint number (lowest 4 bytes) */ - ulint recovered_offset; - /*!< start offset of non-parsed log records in - buf */ - lsn_t recovered_lsn; - /*!< the log records have been parsed up to - this lsn */ - lsn_t mlog_checkpoint_lsn; - /*!< the LSN of a FILE_CHECKPOINT - record, or 0 if none was parsed */ - /** the time when progress was last reported */ - time_t progress_time; + /** number of bytes in log_sys.buf */ + size_t len; + /** start offset of non-parsed log records in log_sys.buf */ + size_t offset; + /** log sequence number of the first non-parsed record */ + lsn_t lsn; + /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */ + lsn_t file_checkpoint; + /** the time when progress was last reported */ + time_t progress_time; using map = std::map<const page_id_t, page_recv_t, std::less<const page_id_t>, @@ -284,10 +252,10 @@ public: /** The contents of the doublewrite buffer */ recv_dblwr_t dblwr; - /** Last added LSN to pages. 
*/ + /** Last added LSN to pages, before switching to STORE_NO */ lsn_t last_stored_lsn= 0; - void read(os_offset_t offset, span<byte> buf); + inline void read(os_offset_t offset, span<byte> buf); inline size_t files_size(); void close_files() { files.clear(); files.shrink_to_fit(); } @@ -313,17 +281,13 @@ private: from before MariaDB Server 10.5.1) */ std::vector<log_file_t> files; - void open_log_files_if_needed(); - /** Base node of the redo block list. List elements are linked via buf_block_t::unzip_LRU. */ UT_LIST_BASE_NODE_T(buf_block_t) blocks; public: /** Check whether the number of read redo log blocks exceeds the maximum. - Store last_stored_lsn if the recovery is not in the last phase. - @param[in,out] store whether to store page operations @return whether the memory is exhausted */ - inline bool is_memory_exhausted(store_t *store); + inline bool is_memory_exhausted(); /** Apply buffered log to persistent data pages. @param last_batch whether it is possible to write more redo log */ void apply(bool last_batch); @@ -343,22 +307,42 @@ public: bool is_initialised() const { return last_stored_lsn != 0; } + /** Find the latest checkpoint. + @return error code or DB_SUCCESS */ + dberr_t find_checkpoint(); + /** Register a redo log snippet for a page. @param it page iterator @param start_lsn start LSN of the mini-transaction @param lsn @see mtr_t::commit_lsn() - @param l redo log snippet @see log_t::FORMAT_10_5 + @param l redo log snippet @param len length of l, in bytes */ inline void add(map::iterator it, lsn_t start_lsn, lsn_t lsn, const byte *l, size_t len); - /** Parse and register one mini-transaction in log_t::FORMAT_10_5. 
- @param checkpoint_lsn the log sequence number of the latest checkpoint - @param store whether to store the records - @param apply whether to apply file-level log records - @return whether FILE_CHECKPOINT record was seen the first time, - or corruption was noticed */ - bool parse(lsn_t checkpoint_lsn, store_t *store, bool apply); + enum parse_mtr_result { OK, PREMATURE_EOF, GOT_EOF }; + +private: + /** Parse and register one log_t::FORMAT_10_8 mini-transaction. + @param store whether to store the records + @param l log data source */ + template<typename source> + inline parse_mtr_result parse(store_t store, source& l) noexcept; +public: + /** Parse and register one log_t::FORMAT_10_8 mini-transaction, + handling log_sys.is_pmem() buffer wrap-around. + @param store whether to store the records */ + static parse_mtr_result parse_mtr(store_t store) noexcept; + + /** Parse and register one log_t::FORMAT_10_8 mini-transaction, + handling log_sys.is_pmem() buffer wrap-around. + @param store whether to store the records */ + static parse_mtr_result parse_pmem(store_t store) noexcept +#ifdef HAVE_PMEM + ; +#else + { return parse_mtr(store); } +#endif /** Clear a fully processed set of stored redo log records. */ inline void clear(); @@ -441,20 +425,12 @@ extern bool recv_no_ibuf_operations; /** TRUE when recv_init_crash_recovery() has been called. */ extern bool recv_needed_recovery; #ifdef UNIV_DEBUG -/** TRUE if writing to the redo log (mtr_commit) is forbidden. -Protected by log_sys.mutex. */ -extern bool recv_no_log_write; +/** whether writing to the redo log is forbidden; +protected by exclusive log_sys.latch. */ +extern bool recv_no_log_write; #endif /* UNIV_DEBUG */ /** TRUE if buf_page_is_corrupted() should check if the log sequence number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by recv_recovery_from_checkpoint_start(). */ extern bool recv_lsn_checks_on; - -/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many -times! 
*/ -#define RECV_PARSING_BUF_SIZE (2U << 20) - -/** Size of block reads when the log groups are scanned forward to do a -roll-forward */ -#define RECV_SCAN_SIZE (4U << srv_page_size_shift) diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index 093b706c1de..ca194f905b5 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -53,7 +53,8 @@ inline uint8_t mlog_decode_varint_length(byte first) @param log redo log record buffer @return the decoded integer @retval MLOG_DECODE_ERROR on error */ -inline uint32_t mlog_decode_varint(const byte* log) +template<typename byte_pointer> +inline uint32_t mlog_decode_varint(const byte_pointer log) { uint32_t i= *log; if (i < MIN_2BYTE) diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 84f62334c5c..f068467f70c 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -105,10 +105,11 @@ struct mtr_t { /** Commit a mini-transaction that did not modify any pages, but generated some redo log on a higher level, such as FILE_MODIFY records and an optional FILE_CHECKPOINT marker. - The caller must hold log_sys.mutex. + The caller must hold exclusive log_sys.latch. This is to be used at log_checkpoint(). 
- @param checkpoint_lsn the log sequence number of a checkpoint, or 0 */ - void commit_files(lsn_t checkpoint_lsn= 0); + @param checkpoint_lsn the log sequence number of a checkpoint, or 0 + @return current LSN */ + lsn_t commit_files(lsn_t checkpoint_lsn= 0); /** @return mini-transaction savepoint (current size of m_memo) */ ulint get_savepoint() const { ut_ad(is_active()); return m_memo.size(); } @@ -319,12 +320,9 @@ public: /** @return true if we are inside the change buffer code */ bool is_inside_ibuf() const { return m_inside_ibuf; } - /** Note that pages has been trimed */ + /** Note that some pages have been freed */ void set_trim_pages() { m_trim_pages= true; } - /** @return true if pages has been trimed */ - bool is_trim_pages() { return m_trim_pages; } - /** Latch a buffer pool block. @param block block to be latched @param rw_latch RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH */ @@ -592,6 +590,9 @@ public: @return number of buffer count added by this mtr */ uint32_t get_fix_count(const buf_block_t *block) const; + /** Note that log_sys.latch is no longer being held exclusively. */ + void flag_wr_unlock() noexcept { ut_ad(m_latch_ex); m_latch_ex= false; } + /** type of page flushing is needed during commit() */ enum page_flush_ahead { @@ -610,6 +611,11 @@ public: #endif private: + /** Handle any pages that were freed during the mini-transaction. */ + void process_freed_pages(); + /** Release modified pages when no log was written. */ + void release_unlogged(); + /** Log a write of a byte string to a page. @param block buffer page @param offset byte offset within page @@ -635,6 +641,13 @@ private: @param type extended record subtype; @see mrec_ext_t */ inline void log_write_extended(const buf_block_t &block, byte type); + /** Write a FILE_MODIFY record when a non-predefined persistent + tablespace was modified for the first time since fil_names_clear(). 
*/ + ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void name_write(); + + /** Encrypt the log */ + ATTRIBUTE_NOINLINE void encrypt(); + /** Append the redo log records to the redo log buffer. @return {start_lsn,flush_ahead} */ std::pair<lsn_t,page_flush_ahead> do_write(); @@ -642,7 +655,7 @@ private: /** Append the redo log records to the redo log buffer. @param len number of bytes to write @return {start_lsn,flush_ahead} */ - inline std::pair<lsn_t,page_flush_ahead> finish_write(ulint len); + std::pair<lsn_t,page_flush_ahead> finish_write(size_t len); /** Release the resources */ inline void release_resources(); @@ -666,7 +679,7 @@ private: /** whether freeing_tree() has been called */ bool m_freeing_tree= false; #endif - +private: /** The page of the most recent m_log record written, or NULL */ const buf_page_t* m_last; /** The current byte offset in m_last, or 0 */ @@ -681,6 +694,9 @@ private: /** whether at least one previously clean buffer pool page was written to */ uint16_t m_made_dirty:1; + /** whether log_sys.latch is locked exclusively */ + uint16_t m_latch_ex:1; + /** whether change buffer is latched; only needed in non-debug builds to suppress some read-ahead operations, @see ibuf_inside() */ uint16_t m_inside_ibuf:1; @@ -688,6 +704,9 @@ private: /** whether the pages has been trimmed */ uint16_t m_trim_pages:1; + /** CRC-32C of m_log */ + uint32_t m_crc; + #ifdef UNIV_DEBUG /** Persistent user tablespace associated with the mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */ diff --git a/storage/innobase/include/mtr0mtr.inl b/storage/innobase/include/mtr0mtr.inl index 71b476a2f5d..0b45bc1d695 100644 --- a/storage/innobase/include/mtr0mtr.inl +++ b/storage/innobase/include/mtr0mtr.inl @@ -49,9 +49,8 @@ mtr_t::memo_push(void* object, mtr_memo_type_t type) ut_ad(type == MTR_MEMO_PAGE_X_MODIFY || ut_is_2pow(type)); /* If this mtr has x-fixed a clean page then we set - the made_dirty flag. 
This tells us if we need to - grab log_sys.flush_order_mutex at mtr_t::commit() so that we - can insert the dirtied page into the flush list. */ + the made_dirty flag. This tells mtr_t::commit() + to hold log_sys.latch longer. */ if (!m_made_dirty && (type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX)) { @@ -107,9 +106,8 @@ mtr_t::sx_latch_at_savepoint( mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint); ut_ad(slot->object == block); - - /* == RW_NO_LATCH */ - ut_a(slot->type == MTR_MEMO_BUF_FIX); + ut_ad(slot->type == MTR_MEMO_BUF_FIX); /* == RW_NO_LATCH */ + slot->type = MTR_MEMO_PAGE_SX_FIX; block->page.lock.u_lock(); ut_ad(!block->page.is_io_fixed()); @@ -117,8 +115,6 @@ mtr_t::sx_latch_at_savepoint( if (!m_made_dirty) { m_made_dirty = is_block_dirtied(block); } - - slot->type = MTR_MEMO_PAGE_SX_FIX; } /** @@ -141,9 +137,8 @@ mtr_t::x_latch_at_savepoint( mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint); ut_ad(slot->object == block); - - /* == RW_NO_LATCH */ - ut_a(slot->type == MTR_MEMO_BUF_FIX); + ut_ad(slot->type == MTR_MEMO_BUF_FIX); /* == RW_NO_LATCH */ + slot->type = MTR_MEMO_PAGE_X_FIX; block->page.lock.x_lock(); ut_ad(!block->page.is_io_fixed()); @@ -151,8 +146,6 @@ mtr_t::x_latch_at_savepoint( if (!m_made_dirty) { m_made_dirty = is_block_dirtied(block); } - - slot->type = MTR_MEMO_PAGE_X_FIX; } /** diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index 7acc255da36..1de31126a88 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -52,17 +52,17 @@ enum mtr_log_t { /* A mini-transaction is a stream of records that is always terminated by -a NUL byte. The first byte of a mini-transaction record is never NUL, -but NUL bytes can occur within mini-transaction records. The first -bytes of each record will explicitly encode the length of the record. 
-NUL bytes also acts as padding in log blocks, that is, there can be -multiple sucessive NUL bytes between mini-transactions in a redo log -block. +a byte 0x00 or 0x01. The first byte of a mini-transaction record is +never one of these bytes, but these bytes can occur within mini-transaction +records. The first byte of the record would contain a record type, flags, and a part of length. The optional second byte of the record will contain more length. (Not needed for short records.) +For example, because the length of an INIT_PAGE record is 3 to 11 bytes, +the first byte will be 0x02 to 0x0a, indicating the number of subsequent bytes. + Bit 7 of the first byte of a redo log record is the same_page flag. If same_page=1, the record is referring to the same page as the previous record. Records that do not refer to data pages but to file @@ -187,8 +187,11 @@ A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23 0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78. -The end of the mini-transaction would be indicated by a NUL byte. -*/ +The end of the mini-transaction would be indicated by the end byte +0x00 or 0x01; @see log_sys.get_sequence_bit(). +If log_sys.is_encrypted(), that is followed by 8 bytes of nonce +(part of initialization vector). That will be followed by 4 bytes +of CRC-32C of the entire mini-transaction, excluding the end byte. */ /** Redo log record types. These bit patterns (3 bits) will be written to the redo log file, so the existing codes or their interpretation on @@ -305,14 +308,16 @@ enum mfile_type_t FILE_RENAME = 0xa0, /** Modify a file. Followed by tablespace ID and the file name. */ FILE_MODIFY = 0xb0, - /** End-of-checkpoint marker. Followed by 2 dummy bytes of page identifier, - 8 bytes of LSN, and padded with a NUL; @see SIZE_OF_FILE_CHECKPOINT. */ + /** End-of-checkpoint marker, at the end of a mini-transaction. 
+ Followed by 2 NUL bytes of page identifier and 8 bytes of LSN; + @see SIZE_OF_FILE_CHECKPOINT. + When all bytes are NUL, this is a dummy padding record. */ FILE_CHECKPOINT = 0xf0 }; /** Size of a FILE_CHECKPOINT record, including the trailing byte to -terminate the mini-transaction. */ -constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1; +terminate the mini-transaction and the CRC-32C. */ +constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1 + 4; #ifndef UNIV_INNOCHECKSUM /** Types for the mlock objects to store in the mtr_t::m_memo */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index fe977c10633..727e9e49ead 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -107,15 +107,6 @@ struct pfs_os_file_t #endif }; -/** The next value should be smaller or equal to the smallest sector size used -on any disk. A log block is required to be a portion of disk which is written -so that if the start and the end of a block get written to disk, then the -whole block gets written. This should be true even in most cases of a crash: -if this fails for a log block, then it is equivalent to a media failure in the -log. 
*/ - -#define OS_FILE_LOG_BLOCK_SIZE 512U - /** Options for os_file_create_func @{ */ enum os_file_create_t { OS_FILE_OPEN = 51, /*!< to open an existing file (if @@ -456,7 +447,6 @@ bool os_file_close_func(os_file_t file); /* Keys to register InnoDB I/O with performance schema */ extern mysql_pfs_key_t innodb_data_file_key; -extern mysql_pfs_key_t innodb_log_file_key; extern mysql_pfs_key_t innodb_temp_file_key; /* Following four macros are instumentations to register diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl index 6f7c633561f..48ac428f09c 100644 --- a/storage/innobase/include/page0cur.inl +++ b/storage/innobase/include/page0cur.inl @@ -272,6 +272,7 @@ page_cur_tuple_insert( index, rec, *offsets, mtr); } - ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets)); + ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, index, *offsets)); return(rec); } + diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index 4787ce36c7a..ae06d06ad5e 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -1130,7 +1130,7 @@ page_find_rec_with_heap_no( @return the last record, not delete-marked @retval infimum record if all records are delete-marked */ const rec_t* -page_find_rec_max_not_deleted( +page_find_rec_last_not_deleted( const page_t* page); #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h index 6f2201971d1..3a30f5a92f3 100644 --- a/storage/innobase/include/rem0cmp.h +++ b/storage/innobase/include/rem0cmp.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,8 +24,7 @@ Comparison services for records Created 7/1/1994 Heikki Tuuri ************************************************************************/ -#ifndef rem0cmp_h -#define rem0cmp_h +#pragma once #include "data0data.h" #include "data0type.h" @@ -43,39 +42,40 @@ cmp_cols_are_equal( ibool check_charsets); /*!< in: whether to check charsets */ /** Compare two data fields. -@param[in] mtype main type -@param[in] prtype precise type -@param[in] data1 data field -@param[in] len1 length of data1 in bytes, or UNIV_SQL_NULL -@param[in] data2 data field -@param[in] len2 length of data2 in bytes, or UNIV_SQL_NULL +@param mtype main type +@param prtype precise type +@param descending whether to use descending order +@param data1 data field +@param len1 length of data1 in bytes, or UNIV_SQL_NULL +@param data2 data field +@param len2 length of data2 in bytes, or UNIV_SQL_NULL @return the comparison result of data1 and data2 @retval 0 if data1 is equal to data2 @retval negative if data1 is less than data2 @retval positive if data1 is greater than data2 */ -int -cmp_data_data( - ulint mtype, - ulint prtype, - const byte* data1, - ulint len1, - const byte* data2, - ulint len2) - MY_ATTRIBUTE((warn_unused_result)); +int cmp_data(ulint mtype, ulint prtype, bool descending, + const byte *data1, size_t len1, const byte *data2, size_t len2) + MY_ATTRIBUTE((warn_unused_result)); /** Compare two data fields. 
-@param[in] dfield1 data field; must have type field set -@param[in] dfield2 data field +@param dfield1 data field; must have type field set +@param dfield2 data field +@param descending whether to use descending order @return the comparison result of dfield1 and dfield2 @retval 0 if dfield1 is equal to dfield2 @retval negative if dfield1 is less than dfield2 @retval positive if dfield1 is greater than dfield2 */ -UNIV_INLINE -int -cmp_dfield_dfield( -/*==============*/ - const dfield_t* dfield1,/*!< in: data field; must have type field set */ - const dfield_t* dfield2);/*!< in: data field */ +inline int cmp_dfield_dfield(const dfield_t *dfield1, const dfield_t *dfield2, + bool descending= false) +{ + ut_ad(dfield_check_typed(dfield1)); + const dtype_t *type= dfield_get_type(dfield1); + return cmp_data(type->mtype, type->prtype, descending, + static_cast<const byte*>(dfield_get_data(dfield1)), + dfield_get_len(dfield1), + static_cast<const byte*>(dfield_get_data(dfield2)), + dfield_get_len(dfield2)); +} #ifdef UNIV_DEBUG /** Compare a GIS data tuple to a physical record. 
@@ -103,15 +103,15 @@ inline int cmp_geometry_field(const void *a, const void *b) double x2= mach_double_read(mbr2); if (x1 > x2) return 1; - if (x2 > x1) + if (x1 < x2) return -1; - double y1= mach_double_read(mbr1 + sizeof(double) * SPDIMS); - double y2= mach_double_read(mbr2 + sizeof(double) * SPDIMS); + x1= mach_double_read(mbr1 + sizeof(double) * SPDIMS); + x2= mach_double_read(mbr2 + sizeof(double) * SPDIMS); - if (y1 > y2) + if (x1 > x2) return 1; - if (y2 > y1) + if (x1 < x2) return -1; /* left lower corner (xmin, ymin) overlaps, now right upper corner */ @@ -120,41 +120,39 @@ inline int cmp_geometry_field(const void *a, const void *b) if (x1 > x2) return 1; - if (x2 > x1) + if (x1 < x2) return -1; - y1= mach_double_read(mbr1 + sizeof(double) * 2 + sizeof(double)); - y2= mach_double_read(mbr2 + sizeof(double) * 2 + sizeof(double)); + x1= mach_double_read(mbr1 + sizeof(double) * 2 + sizeof(double)); + x2= mach_double_read(mbr2 + sizeof(double) * 2 + sizeof(double)); - if (y1 > y2) + if (x1 > x2) return 1; - if (y2 > y1) + if (x1 < x2) return -1; return 0; } /** Compare a data tuple to a physical record. 
-@param[in] dtuple data tuple -@param[in] rec B-tree record -@param[in] offsets rec_get_offsets(rec) -@param[in] n_cmp number of fields to compare -@param[in,out] matched_fields number of completely matched fields +@param dtuple data tuple +@param rec B-tree index record +@param index B-tree index +@param offsets rec_get_offsets(rec,index) +@param n_cmp number of fields to compare +@param matched_fields number of completely matched fields @return the comparison result of dtuple and rec @retval 0 if dtuple is equal to rec @retval negative if dtuple is less than rec @retval positive if dtuple is greater than rec */ -int -cmp_dtuple_rec_with_match_low( - const dtuple_t* dtuple, - const rec_t* rec, - const rec_offs* offsets, - ulint n_cmp, - ulint* matched_fields) - MY_ATTRIBUTE((nonnull)); -#define cmp_dtuple_rec_with_match(tuple,rec,offsets,fields) \ +int cmp_dtuple_rec_with_match_low(const dtuple_t *dtuple, const rec_t *rec, + const dict_index_t *index, + const rec_offs *offsets, + ulint n_cmp, ulint *matched_fields) + MY_ATTRIBUTE((nonnull)); +#define cmp_dtuple_rec_with_match(tuple,rec,index,offsets,fields) \ cmp_dtuple_rec_with_match_low( \ - tuple,rec,offsets,dtuple_get_n_fields_cmp(tuple),fields) + tuple,rec,index,offsets,dtuple_get_n_fields_cmp(tuple),fields) /** Compare a data tuple to a physical record. @param[in] dtuple data tuple @param[in] rec B-tree or R-tree index record @@ -178,28 +176,32 @@ cmp_dtuple_rec_with_match_bytes( MY_ATTRIBUTE((warn_unused_result)); /** Compare a data tuple to a physical record. 
@see cmp_dtuple_rec_with_match -@param[in] dtuple data tuple -@param[in] rec B-tree record -@param[in] offsets rec_get_offsets(rec) +@param dtuple data tuple +@param rec index record +@param index index +@param offsets rec_get_offsets(rec, index) @return the comparison result of dtuple and rec @retval 0 if dtuple is equal to rec @retval negative if dtuple is less than rec @retval positive if dtuple is greater than rec */ -int -cmp_dtuple_rec( - const dtuple_t* dtuple, - const rec_t* rec, - const rec_offs* offsets); -/**************************************************************//** -Checks if a dtuple is a prefix of a record. The last field in dtuple -is allowed to be a prefix of the corresponding field in the record. -@return TRUE if prefix */ -ibool -cmp_dtuple_is_prefix_of_rec( -/*========================*/ - const dtuple_t* dtuple, /*!< in: data tuple */ - const rec_t* rec, /*!< in: physical record */ - const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ +inline int cmp_dtuple_rec(const dtuple_t *dtuple, const rec_t *rec, + const dict_index_t *index, const rec_offs *offsets) +{ + ulint matched= 0; + return cmp_dtuple_rec_with_match(dtuple, rec, index, offsets, &matched); +} + +/** Check if a dtuple is a prefix of a record. +@param dtuple data tuple +@param rec index record +@param index index +@param offsets rec_get_offsets(rec) +@return whether dtuple is a prefix of rec */ +bool cmp_dtuple_is_prefix_of_rec(const dtuple_t *dtuple, const rec_t *rec, + const dict_index_t *index, + const rec_offs *offsets) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + /** Compare two physical records that contain the same number of columns, none of which are stored externally. @retval positive if rec1 (including non-ordering columns) is greater than rec2 @@ -246,18 +248,39 @@ cmp_rec_rec( MY_ATTRIBUTE((nonnull(1,2,3,4,5))); /** Compare two data fields. 
-@param[in] dfield1 data field -@param[in] dfield2 data field +@param dfield1 data field +@param dfield2 data field @return the comparison result of dfield1 and dfield2 -@retval 0 if dfield1 is equal to dfield2, or a prefix of dfield1 -@retval negative if dfield1 is less than dfield2 -@retval positive if dfield1 is greater than dfield2 */ -UNIV_INLINE -int -cmp_dfield_dfield_like_prefix( - const dfield_t* dfield1, - const dfield_t* dfield2); +@retval true if dfield1 is equal to dfield2, or a prefix of dfield1 +@retval false otherwise */ +inline bool cmp_dfield_dfield_eq_prefix(const dfield_t *dfield1, + const dfield_t *dfield2) +{ + ut_ad(dfield_check_typed(dfield1)); + ut_ad(dfield_check_typed(dfield2)); + const dtype_t *type= dfield_get_type(dfield1); -#include "rem0cmp.inl" +#ifdef UNIV_DEBUG + switch (type->prtype & DATA_MYSQL_TYPE_MASK) { + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ -#endif + uint cs_num= dtype_get_charset_coll(type->prtype); + CHARSET_INFO *cs= get_charset(cs_num, MYF(MY_WME)); + ut_a(cs); + return !cs->strnncoll(static_cast<const uchar*>(dfield_get_data(dfield1)), + dfield_get_len(dfield1), + static_cast<const uchar*>(dfield_get_data(dfield2)), + dfield_get_len(dfield2), 1); +} diff --git a/storage/innobase/include/rem0cmp.inl b/storage/innobase/include/rem0cmp.inl deleted file mode 100644 index 6e21382d187..00000000000 --- a/storage/innobase/include/rem0cmp.inl +++ /dev/null @@ -1,107 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2020, MariaDB Corporation. 
- -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/*******************************************************************//** -@file include/rem0cmp.ic -Comparison services for records - -Created 7/1/1994 Heikki Tuuri -************************************************************************/ - -#include <mysql_com.h> -#include <my_sys.h> - -/** Compare two data fields. -@param[in] dfield1 data field; must have type field set -@param[in] dfield2 data field -@return the comparison result of dfield1 and dfield2 -@retval 0 if dfield1 is equal to dfield2 -@retval negative if dfield1 is less than dfield2 -@retval positive if dfield1 is greater than dfield2 */ -UNIV_INLINE -int -cmp_dfield_dfield( - const dfield_t* dfield1, - const dfield_t* dfield2) -{ - const dtype_t* type; - - ut_ad(dfield_check_typed(dfield1)); - - type = dfield_get_type(dfield1); - - return(cmp_data_data(type->mtype, type->prtype, - (const byte*) dfield_get_data(dfield1), - dfield_get_len(dfield1), - (const byte*) dfield_get_data(dfield2), - dfield_get_len(dfield2))); -} - -/** Compare two data fields. 
-@param[in] dfield1 data field -@param[in] dfield2 data field -@return the comparison result of dfield1 and dfield2 -@retval 0 if dfield1 is equal to dfield2, or a prefix of dfield1 -@retval negative if dfield1 is less than dfield2 -@retval positive if dfield1 is greater than dfield2 */ -UNIV_INLINE -int -cmp_dfield_dfield_like_prefix( - const dfield_t* dfield1, - const dfield_t* dfield2) -{ - const dtype_t* type; - - ut_ad(dfield_check_typed(dfield1)); - ut_ad(dfield_check_typed(dfield2)); - - type = dfield_get_type(dfield1); - -#ifdef UNIV_DEBUG - switch (type->prtype & DATA_MYSQL_TYPE_MASK) { - case MYSQL_TYPE_BIT: - case MYSQL_TYPE_STRING: - case MYSQL_TYPE_VAR_STRING: - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - case MYSQL_TYPE_VARCHAR: - break; - default: - ut_error; - } -#endif /* UNIV_DEBUG */ - - uint cs_num = (uint) dtype_get_charset_coll(type->prtype); - - if (CHARSET_INFO* cs = get_charset(cs_num, MYF(MY_WME))) { - return(cs->strnncoll( - static_cast<const uchar*>( - dfield_get_data(dfield1)), - dfield_get_len(dfield1), - static_cast<const uchar*>( - dfield_get_data(dfield2)), - dfield_get_len(dfield2), - 1)); - } - - ib::fatal() << "Unable to find charset-collation " << cs_num; - return(0); -} diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index f29fd98ad30..ec435df17d8 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -109,6 +109,7 @@ struct index_field_t { ulint prefix_len; /*!< column prefix length, or 0 if indexing the whole column */ bool is_v_col; /*!< whether this is a virtual column */ + bool descending; /*!< whether to use DESC order */ }; /** Definition of an index being created */ diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 971f6363bdb..1ca2d7a429f 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ 
-269,9 +269,6 @@ enum monitor_id_t { MONITOR_OS_PENDING_READS, MONITOR_OS_PENDING_WRITES, MONITOR_OVLD_OS_LOG_WRITTEN, - MONITOR_OVLD_OS_LOG_FSYNC, - MONITOR_OVLD_OS_LOG_PENDING_FSYNC, - MONITOR_OVLD_OS_LOG_PENDING_WRITES, /* Transaction related counters */ MONITOR_MODULE_TRX, @@ -298,20 +295,16 @@ enum monitor_id_t { /* Recovery related counters */ MONITOR_MODULE_RECOVERY, - MONITOR_NUM_CHECKPOINT, + MONITOR_OVLD_CHECKPOINTS, MONITOR_OVLD_LSN_FLUSHDISK, MONITOR_OVLD_LSN_CHECKPOINT, MONITOR_OVLD_LSN_CURRENT, MONITOR_LSN_CHECKPOINT_AGE, MONITOR_OVLD_BUF_OLDEST_LSN, MONITOR_OVLD_MAX_AGE_ASYNC, - MONITOR_PENDING_LOG_FLUSH, - MONITOR_PENDING_CHECKPOINT_WRITE, - MONITOR_LOG_IO, MONITOR_OVLD_LOG_WAITS, MONITOR_OVLD_LOG_WRITE_REQUEST, MONITOR_OVLD_LOG_WRITES, - MONITOR_OVLD_LOG_PADDED, /* Page Manager related counters */ MONITOR_MODULE_PAGE, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 4cd77b08a60..41f61567b53 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -86,28 +86,6 @@ struct srv_stats_t /** Count the amount of data written in total (in bytes) */ ulint_ctr_1_t data_written; - /** Number of the log write requests done */ - ulint_ctr_1_t log_write_requests; - - /** Number of physical writes to the log performed */ - ulint_ctr_1_t log_writes; - - /** Amount of data padded for log write ahead */ - ulint_ctr_1_t log_padded; - - /** Amount of data written to the log files in bytes */ - lsn_ctr_1_t os_log_written; - - /** Number of writes being done to the log files */ - ulint_ctr_1_t os_log_pending_writes; - - /** We increase this counter, when we don't have enough - space in the log buffer and have to flush it */ - ulint_ctr_1_t log_waits; - - /** Store the number of write requests issued */ - ulint_ctr_1_t buf_pool_write_requests; - /** Number of buffer pool reads that led to the reading of a disk page */ ulint_ctr_1_t buf_pool_reads; @@ -286,18 +264,15 @@ extern char* 
srv_log_group_home_dir; /** The InnoDB redo log file size, or 0 when changing the redo log format at startup (while disallowing writes to the redo log). */ extern ulonglong srv_log_file_size; -extern ulong srv_log_buffer_size; extern ulong srv_flush_log_at_trx_commit; extern uint srv_flush_log_at_timeout; -extern ulong srv_log_write_ahead_size; extern my_bool srv_adaptive_flushing; extern my_bool srv_flush_sync; /** Requested size in bytes */ extern ulint srv_buf_pool_size; -/** Requested buffer pool chunk size. Each buffer pool instance consists -of one or more chunks. */ -extern ulong srv_buf_pool_chunk_unit; +/** Requested buffer pool chunk size */ +extern size_t srv_buf_pool_chunk_unit; /** Scan depth for LRU flush batch i.e.: number of blocks scanned*/ extern ulong srv_LRU_scan_depth; /** Whether or not to flush neighbors of a block */ @@ -459,9 +434,13 @@ extern my_bool srv_print_all_deadlocks; extern my_bool srv_cmp_per_index_enabled; +/** innodb_encrypt_log */ +extern my_bool srv_encrypt_log; + /* is encryption enabled */ extern ulong srv_encrypt_tables; + /** Status variables to be passed to MySQL */ extern struct export_var_t export_vars; @@ -688,7 +667,6 @@ struct export_var_t{ ulint innodb_buffer_pool_pages_old; ulint innodb_buffer_pool_read_requests; /*!< buf_pool.stat.n_page_gets */ ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */ - ulint innodb_buffer_pool_write_requests;/*!< srv_stats.buf_pool_write_requests */ ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */ ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */ ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/ @@ -696,8 +674,6 @@ struct export_var_t{ ulint innodb_checkpoint_max_age; ulint innodb_data_pending_reads; /*!< Pending reads */ ulint innodb_data_pending_writes; /*!< Pending writes */ - ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */ - ulint innodb_data_fsyncs; /*!< Number of fsyncs so far */ ulint innodb_data_read; /*!< 
Data bytes read */ ulint innodb_data_writes; /*!< I/O write requests */ ulint innodb_data_written; /*!< Data bytes written */ @@ -706,9 +682,6 @@ struct export_var_t{ ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */ ulint innodb_deadlocks; ulint innodb_history_list_length; - ulint innodb_log_waits; /*!< srv_log_waits */ - ulint innodb_log_write_requests; /*!< srv_log_write_requests */ - ulint innodb_log_writes; /*!< srv_log_writes */ lsn_t innodb_lsn_current; lsn_t innodb_lsn_flushed; lsn_t innodb_lsn_last_checkpoint; @@ -717,10 +690,8 @@ struct export_var_t{ ulint innodb_mem_adaptive_hash; #endif ulint innodb_mem_dictionary; - lsn_t innodb_os_log_written; /*!< srv_os_log_written */ - ulint innodb_os_log_fsyncs; /*!< n_log_flushes */ - ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */ - ulint innodb_os_log_pending_fsyncs; /*!< n_pending_log_flushes */ + /** log_sys.get_lsn() - recv_sys.lsn */ + lsn_t innodb_os_log_written; ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */ int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 567554f34c3..6dc1d8b7341 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -510,8 +510,6 @@ extern mysql_pfs_key_t fts_doc_id_mutex_key; extern mysql_pfs_key_t ibuf_bitmap_mutex_key; extern mysql_pfs_key_t ibuf_mutex_key; extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; -extern mysql_pfs_key_t log_sys_mutex_key; -extern mysql_pfs_key_t log_flush_order_mutex_key; extern mysql_pfs_key_t recalc_pool_mutex_key; extern mysql_pfs_key_t purge_sys_pq_mutex_key; extern mysql_pfs_key_t recv_sys_mutex_key; @@ -538,6 +536,7 @@ extern mysql_pfs_key_t index_tree_rw_lock_key; extern mysql_pfs_key_t index_online_log_key; extern mysql_pfs_key_t trx_sys_rw_lock_key; extern mysql_pfs_key_t lock_latch_key; +extern mysql_pfs_key_t 
log_latch_key; extern mysql_pfs_key_t trx_rseg_latch_key; # endif /* UNIV_PFS_RWLOCK */ #endif /* HAVE_PSI_INTERFACE */ diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h deleted file mode 100644 index 0cbccb976e2..00000000000 --- a/storage/innobase/include/ut0crc32.h +++ /dev/null @@ -1,37 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2020, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/ut0crc32.h -CRC32 implementation - -Created Aug 10, 2011 Vasil Dimov -*******************************************************/ - -#ifndef ut0crc32_h -#define ut0crc32_h - -#include "univ.i" -#include <my_sys.h> -static inline uint32_t ut_crc32(const byte *s, size_t size) -{ - return my_crc32c(0, s, size); -} - -#endif /* ut0crc32_h */ diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index 89ff0ca709f..95541ea574e 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -305,6 +305,16 @@ operator<<( return(lhs); } +/** This is a wrapper class, used to print any number in IEC style */ 
+struct bytes_iec { + explicit bytes_iec(unsigned long long t): m_val(t) {} + double get_double() const { return static_cast<double>(m_val); } + const unsigned long long m_val; +}; + +/** Like hex operator above, except for bytes_iec */ +std::ostream &operator<<(std::ostream &lhs, const bytes_iec &rhs); + /** The class logger is the base class of all the error log related classes. It contains a std::ostringstream object. The main purpose of this class is to forward operator<< to the underlying std::ostringstream object. Do not |