diff options
Diffstat (limited to 'storage/innobase/include')
114 files changed, 2752 insertions, 4366 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index bcf5904cd09..29382bb033f 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -173,24 +173,19 @@ record is in spatial index */ | BTR_LATCH_FOR_DELETE \ | BTR_MODIFY_EXTERNAL))) -/**************************************************************//** -Report that an index page is corrupted. */ -void -btr_corruption_report( -/*==================*/ - const buf_block_t* block, /*!< in: corrupted block */ - const dict_index_t* index) /*!< in: index tree */ - ATTRIBUTE_COLD __attribute__((nonnull)); +/** Report that an index page is corrupted. +@param[in] buffer block +@param[in] index tree */ +ATTRIBUTE_COLD ATTRIBUTE_NORETURN __attribute__((nonnull)) +void btr_corruption_report(const buf_block_t* block,const dict_index_t* index); /** Assert that a B-tree page is not corrupted. @param block buffer block containing a B-tree page @param index the B-tree index */ -#define btr_assert_not_corrupted(block, index) \ - if ((ibool) !!page_is_comp(buf_block_get_frame(block)) \ - != dict_table_is_comp((index)->table)) { \ - btr_corruption_report(block, index); \ - ut_error; \ - } +#define btr_assert_not_corrupted(block, index) \ + if (!!page_is_comp(buf_block_get_frame(block)) \ + != index->table->not_redundant()) \ + btr_corruption_report(block, index) /**************************************************************//** Gets the root node of a tree and sx-latches it for segment access. @@ -225,6 +220,7 @@ btr_height_get( /** Gets a buffer page and declares its latching order level. 
@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] mode latch mode @param[in] file file name @param[in] line line where called @@ -236,7 +232,7 @@ UNIV_INLINE buf_block_t* btr_block_get_func( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint mode, const char* file, unsigned line, @@ -245,13 +241,13 @@ btr_block_get_func( /** Gets a buffer page and declares its latching order level. @param page_id tablespace/page identifier -@param page_size page size +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param mode latch mode @param index index tree, may be NULL if not the insert buffer tree @param mtr mini-transaction handle @return the block descriptor */ -# define btr_block_get(page_id, page_size, mode, index, mtr) \ - btr_block_get_func(page_id, page_size, mode, \ +# define btr_block_get(page_id, zip_size, mode, index, mtr) \ + btr_block_get_func(page_id, zip_size, mode, \ __FILE__, __LINE__, (dict_index_t*)index, mtr) /**************************************************************//** Gets the index id field of a page. @@ -327,40 +323,33 @@ btr_node_ptr_get_child_page_no( @param[in] type type of the index @param[in,out] space tablespace where created @param[in] index_id index id -@param[in] index index, or NULL when applying TRUNCATE -log record during recovery -@param[in] btr_redo_create_info used for applying TRUNCATE log -@param[in] mtr mini-transaction handle -record during recovery -@return page number of the created root, FIL_NULL if did not succeed */ +@param[in] index index +@param[in,out] mtr mini-transaction +@return page number of the created root +@retval FIL_NULL if did not succeed */ ulint btr_create( ulint type, fil_space_t* space, index_id_t index_id, dict_index_t* index, - const btr_create_t* btr_redo_create_info, mtr_t* mtr); /** Free a persistent index tree if it exists. 
@param[in] page_id root page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] index_id PAGE_INDEX_ID contents @param[in,out] mtr mini-transaction */ void btr_free_if_exists( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, index_id_t index_id, mtr_t* mtr); -/** Free an index tree in a temporary tablespace or during TRUNCATE TABLE. -@param[in] page_id root page id -@param[in] page_size page size */ -void -btr_free( - const page_id_t page_id, - const page_size_t& page_size); +/** Free an index tree in a temporary tablespace. +@param[in] page_id root page id */ +void btr_free(const page_id_t page_id); /** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC. @param[in,out] index clustered index @@ -390,6 +379,12 @@ void btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false) MY_ATTRIBUTE((nonnull)); +/** Write instant ALTER TABLE metadata to a root page. +@param[in,out] root clustered index root page +@param[in] index clustered index with instant ALTER TABLE +@param[in,out] mtr mini-transaction */ +void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr); + /*************************************************************//** Makes tree one level higher by splitting the root, and inserts the tuple. It is assumed that mtr contains an x-latch on the tree. @@ -750,21 +745,23 @@ dberr_t btr_validate_index( /*===============*/ dict_index_t* index, /*!< in: index */ - const trx_t* trx, /*!< in: transaction or 0 */ - bool lockout)/*!< in: true if X-latch index is intended */ + const trx_t* trx) /*!< in: transaction or 0 */ MY_ATTRIBUTE((warn_unused_result)); -/*************************************************************//** -Removes a page from the level list of pages. */ -UNIV_INTERN +/** Remove a page from the level list of pages. 
+@param[in] space space where removed +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] page page to remove +@param[in] index index tree +@param[in,out] mtr mini-transaction */ void btr_level_list_remove_func( -/*=======================*/ - ulint space, /*!< in: space where removed */ - const page_size_t& page_size,/*!< in: page size */ - page_t* page, /*!< in/out: page to remove */ - dict_index_t* index, /*!< in: index tree */ - mtr_t* mtr); /*!< in/out: mini-transaction */ + ulint space, + ulint zip_size, + page_t* page, + dict_index_t* index, + mtr_t* mtr); + /*************************************************************//** Removes a page from the level list of pages. @param space in: space where removed @@ -799,5 +796,6 @@ btr_lift_page_up( /**************************************************************** Global variable controlling if scrubbing should be performed */ extern my_bool srv_immediate_scrub_data_uncompressed; +extern Atomic_counter<uint32_t> btr_validate_index_running; #endif diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index 49567979c98..d3827b7dc6f 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2017, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -31,6 +31,7 @@ Created 6/2/1994 Heikki Tuuri /** Gets a buffer page and declares its latching order level. 
@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] mode latch mode @param[in] file file name @param[in] line line where called @@ -42,7 +43,7 @@ UNIV_INLINE buf_block_t* btr_block_get_func( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint mode, const char* file, unsigned line, @@ -53,7 +54,7 @@ btr_block_get_func( dberr_t err=DB_SUCCESS; block = buf_page_get_gen( - page_id, page_size, mode, NULL, BUF_GET, file, line, mtr, &err); + page_id, zip_size, mode, NULL, BUF_GET, file, line, mtr, &err); if (err == DB_DECRYPTION_FAILED) { if (index && index->table) { diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h index be4b55c1a11..46db1a73f70 100644 --- a/storage/innobase/include/btr0bulk.h +++ b/storage/innobase/include/btr0bulk.h @@ -289,8 +289,7 @@ public: ut_ad(!dict_index_is_spatial(index)); #ifdef UNIV_DEBUG if (m_flush_observer) - my_atomic_addlint(&m_index->table->space->redo_skipped_count, - 1); + m_index->table->space->redo_skipped_count++; #endif /* UNIV_DEBUG */ } @@ -299,8 +298,7 @@ public: { #ifdef UNIV_DEBUG if (m_flush_observer) - my_atomic_addlint(&m_index->table->space->redo_skipped_count, - ulint(-1)); + m_index->table->space->redo_skipped_count--; #endif /* UNIV_DEBUG */ } diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index c6f7c846c22..12aaa73ae30 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -729,11 +729,12 @@ btr_free_externally_stored_field( ignored if rec == NULL */ bool rollback, /*!< in: performing rollback? */ mtr_t* local_mtr); /*!< in: mtr containing the latch */ + /** Copies the prefix of an externally stored field of a record. The clustered index record must be protected by a lock or a page latch. 
@param[out] buf the field, or a prefix of it @param[in] len length of buf, in bytes -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] data 'internally' stored part of the field containing also the reference to the external part; must be protected by a lock or a page latch @@ -744,7 +745,7 @@ ulint btr_copy_externally_stored_field_prefix( byte* buf, ulint len, - const page_size_t& page_size, + ulint zip_size, const byte* data, ulint local_len); @@ -754,7 +755,7 @@ The clustered index record must be protected by a lock or a page latch. @param[in] data 'internally' stored part of the field containing also the reference to the external part; must be protected by a lock or a page latch -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] local_len length of data @param[in,out] heap mem heap @return the whole field copied to heap */ @@ -762,7 +763,7 @@ byte* btr_copy_externally_stored_field( ulint* len, const byte* data, - const page_size_t& page_size, + ulint zip_size, ulint local_len, mem_heap_t* heap); @@ -770,7 +771,7 @@ btr_copy_externally_stored_field( @param[in] rec record in a clustered index; must be protected by a lock or a page latch @param[in] offset array returned by rec_get_offsets() -@param[in] page_size BLOB page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] no field number @param[out] len length of the field @param[in,out] heap mem heap @@ -779,7 +780,7 @@ byte* btr_rec_copy_externally_stored_field( const rec_t* rec, const rec_offs* offsets, - const page_size_t& page_size, + ulint zip_size, ulint no, ulint* len, mem_heap_t* heap); @@ -810,6 +811,7 @@ btr_rec_set_deleted_flag( /** Latches the leaf page or pages requested. @param[in] block leaf page where the search converged @param[in] page_id page id of the leaf +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] latch_mode BTR_SEARCH_LEAF, ... 
@param[in] cursor cursor @param[in] mtr mini-transaction @@ -818,7 +820,7 @@ btr_latch_leaves_t btr_cur_latch_leaves( buf_block_t* block, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint latch_mode, btr_cur_t* cursor, mtr_t* mtr); @@ -1021,7 +1023,7 @@ inherited external field. */ #define BTR_EXTERN_INHERITED_FLAG 64U /** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */ -extern ulint btr_cur_n_non_sea; +extern Atomic_counter<ulint> btr_cur_n_non_sea; /** Old value of btr_cur_n_non_sea. Copied by srv_refresh_innodb_monitor_stats(). Referenced by srv_printf_innodb_monitor(). */ diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h index 57f8c2f3811..22f29eae3a6 100644 --- a/storage/innobase/include/btr0defragment.h +++ b/storage/innobase/include/btr0defragment.h @@ -26,9 +26,9 @@ this program; if not, write to the Free Software Foundation, Inc., #define BTR_DEFRAGMENT_MAX_N_PAGES 32 /** stats in btr_defragment */ -extern ulint btr_defragment_compression_failures; -extern ulint btr_defragment_failures; -extern ulint btr_defragment_count; +extern Atomic_counter<ulint> btr_defragment_compression_failures; +extern Atomic_counter<ulint> btr_defragment_failures; +extern Atomic_counter<ulint> btr_defragment_count; /** Item in the work queue for btr_degrament_thread. */ struct btr_defragment_item_t diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h index 22e1ef11a68..83c374e2561 100644 --- a/storage/innobase/include/btr0types.h +++ b/storage/innobase/include/btr0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,7 +28,6 @@ Created 2/17/1996 Heikki Tuuri #define btr0types_h #include "page0types.h" -#include "page0size.h" #include "rem0types.h" /** Persistent cursor */ @@ -49,41 +49,11 @@ extern ulong btr_ahi_parts; /** The size of a reference to data stored on a different page. The reference is stored at the end of the prefix of the field in the index record. */ +#define FIELD_REF_SIZE 20U #define BTR_EXTERN_FIELD_REF_SIZE FIELD_REF_SIZE /** If the data don't exceed the size, the data are stored locally. */ #define BTR_EXTERN_LOCAL_STORED_MAX_SIZE \ (BTR_EXTERN_FIELD_REF_SIZE * 2) -/** The information is used for creating a new index tree when -applying TRUNCATE log record during recovery */ -struct btr_create_t { - - explicit btr_create_t(const byte* const ptr) - : - format_flags(), - n_fields(), - field_len(), - fields(ptr), - trx_id_pos(ULINT_UNDEFINED) - { - /* Do nothing */ - } - - /** Page format */ - ulint format_flags; - - /** Numbr of index fields */ - ulint n_fields; - - /** The length of the encoded meta-data */ - ulint field_len; - - /** Field meta-data, encoded. */ - const byte* const fields; - - /** Position of trx-id column. 
*/ - ulint trx_id_pos; -}; - #endif diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h index 5b1aefb4d69..5119a1c58c4 100644 --- a/storage/innobase/include/buf0buddy.h +++ b/storage/innobase/include/buf0buddy.h @@ -26,11 +26,6 @@ Created December 2006 by Marko Makela #ifndef buf0buddy_h #define buf0buddy_h -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE -#endif - #include "buf0types.h" /**********************************************************************//** diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic index dad9cb668dd..39ab46d80dd 100644 --- a/storage/innobase/include/buf0buddy.ic +++ b/storage/innobase/include/buf0buddy.ic @@ -23,11 +23,6 @@ Binary buddy allocator for compressed pages Created December 2006 by Marko Makela *******************************************************/ -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE -#endif - #include "buf0buf.h" #include "buf0buddy.h" @@ -132,8 +127,3 @@ buf_buddy_free( buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size)); } - -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 689427913cd..89d7c71b734 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -42,7 +42,6 @@ Created 11/5/1995 Heikki Tuuri #include "os0proc.h" #include "log0log.h" #include "srv0srv.h" -#include "my_atomic.h" #include <ostream> // Forward declaration @@ -424,16 +423,14 @@ be implemented at a higher level. In other words, all possible accesses to a given page through this function must be protected by the same set of mutexes or latches. 
@param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size @return pointer to the block */ -buf_page_t* -buf_page_get_zip( - const page_id_t page_id, - const page_size_t& page_size); +buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size); /** This is the general function used to get access to a database page. It does page initialization and applies the buffered redo logs. @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, @@ -446,7 +443,7 @@ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH buf_block_t* buf_page_get_gen( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint rw_latch, buf_block_t* guess, ulint mode, @@ -455,8 +452,9 @@ buf_page_get_gen( mtr_t* mtr, dberr_t* err); -/** This is the low level function used to get access to a database page. +/** Low level function used to get access to a database page. @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, @@ -469,7 +467,7 @@ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH buf_block_t* buf_page_get_low( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint rw_latch, buf_block_t* guess, ulint mode, @@ -478,18 +476,18 @@ buf_page_get_low( mtr_t* mtr, dberr_t* err); -/** Initializes a page to the buffer buf_pool. The page is usually not read +/** Initialize a page in the buffer pool. The page is usually not read from a file even if it cannot be found in the buffer buf_pool. This is one of the functions which perform to a block a state transition NOT_USED => FILE_PAGE (the other is buf_page_get_gen). 
@param[in] page_id page id -@param[in] page_size page size -@param[in] mtr mini-transaction +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction @return pointer to the block, page bufferfixed */ buf_block_t* buf_page_create( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, mtr_t* mtr); /********************************************************************//** @@ -625,33 +623,6 @@ buf_block_buf_fix_inc_func( buf_block_t* block) /*!< in/out: block to bufferfix */ MY_ATTRIBUTE((nonnull)); -/** Increments the bufferfix count. -@param[in,out] bpage block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_fix(buf_page_t* bpage); - -/** Increments the bufferfix count. -@param[in,out] block block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_fix(buf_block_t* block); - -/** Decrements the bufferfix count. -@param[in,out] bpage block to bufferunfix -@return the remaining buffer-fix count */ -UNIV_INLINE -ulint -buf_block_unfix(buf_page_t* bpage); -/** Decrements the bufferfix count. -@param[in,out] block block to bufferunfix -@return the remaining buffer-fix count */ -UNIV_INLINE -ulint -buf_block_unfix(buf_block_t* block); - # ifdef UNIV_DEBUG /** Increments the bufferfix count. @param[in,out] b block to bufferfix @@ -711,19 +682,13 @@ buf_page_is_checksum_valid_none( /** Check if a page is corrupt. 
@param[in] check_lsn whether the LSN should be checked @param[in] read_buf database page -@param[in] page_size page size -@param[in] space tablespace +@param[in] fsp_flags tablespace flags @return whether the page is corrupted */ bool buf_page_is_corrupted( bool check_lsn, const byte* read_buf, - const page_size_t& page_size, -#ifndef UNIV_INNOCHECKSUM - const fil_space_t* space = NULL) -#else - const void* space = NULL) -#endif + ulint fsp_flags) MY_ATTRIBUTE((warn_unused_result)); inline void *aligned_malloc(size_t size, size_t align) @@ -747,6 +712,63 @@ inline void aligned_free(void *ptr) #endif } +/** Read the key version from the page. In full crc32 format, +key version is stored at {0-3th} bytes. In other format, it is +stored in 26th position. +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return key version of the page. */ +inline uint32_t buf_page_get_key_version(const byte* read_buf, ulint fsp_flags) +{ + return fil_space_t::full_crc32(fsp_flags) + ? mach_read_from_4(read_buf + FIL_PAGE_FCRC32_KEY_VERSION) + : mach_read_from_4(read_buf + + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); +} + +/** Read the compression info from the page. In full crc32 format, +compression info is at MSB of page type. In other format, it is +stored in page type. +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return true if page is compressed. */ +inline bool buf_page_is_compressed(const byte* read_buf, ulint fsp_flags) +{ + ulint page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE); + return fil_space_t::full_crc32(fsp_flags) + ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) + : page_type == FIL_PAGE_PAGE_COMPRESSED; +} + +/** Get the compressed or uncompressed size of a full_crc32 page. 
+@param[in] buf page_compressed or uncompressed page +@param[out] comp whether the page could be compressed +@param[out] cr whether the page could be corrupted +@return the payload size in the file page */ +inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr) +{ + uint t = mach_read_from_2(buf + FIL_PAGE_TYPE); + uint page_size = uint(srv_page_size); + + if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) { + return page_size; + } + + t &= ~(1U << FIL_PAGE_COMPRESS_FCRC32_MARKER); + t <<= 8; + + if (t < page_size) { + page_size = t; + if (comp) { + *comp = true; + } + } else if (cr) { + *cr = true; + } + + return page_size; +} + #ifndef UNIV_INNOCHECKSUM /**********************************************************************//** Gets the space id, page offset, and byte offset within page of a @@ -808,10 +830,8 @@ buf_print(void); /** Dump a page to stderr. @param[in] read_buf database page -@param[in] page_size page size */ -UNIV_INTERN -void -buf_page_print(const byte* read_buf, const page_size_t& page_size) +@param[in] zip_size compressed page size, or 0 */ +void buf_page_print(const byte* read_buf, ulint zip_size = 0) ATTRIBUTE_COLD __attribute__((nonnull)); /********************************************************************//** Decompress a block. @@ -1170,6 +1190,7 @@ and the lock released later. @param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED @param[in] mode BUF_READ_IBUF_PAGES_ONLY, ... @param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] unzip whether the uncompressed page is requested (for ROW_FORMAT=COMPRESSED) @return pointer to the block @@ -1179,7 +1200,7 @@ buf_page_init_for_read( dberr_t* err, ulint mode, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, bool unzip); /** Complete a read or write request of a file page to or from the buffer pool. 
@@ -1390,6 +1411,15 @@ ulint buf_pool_size_align( ulint size); +/** Verify that post encryption checksum match with the calculated checksum. +This function should be called only if tablespace contains crypt data metadata. +@param[in] page page frame +@param[in] fsp_flags tablespace flags +@return true if page is encrypted and OK, false otherwise */ +bool buf_page_verify_crypt_checksum( + const byte* page, + ulint fsp_flags); + /** Calculate the checksum of a page from compressed table and update the page. @param[in,out] page page to update @@ -1410,7 +1440,7 @@ a page is written to disk. (may be src_frame or an encrypted/compressed copy of it) */ UNIV_INTERN byte* -buf_page_encrypt_before_write( +buf_page_encrypt( fil_space_t* space, buf_page_t* bpage, byte* src_frame); @@ -1420,10 +1450,9 @@ buf_page_encrypt_before_write( NOTE! The definition appears here only for other modules of this directory (buf) to see it. Do not use from outside! */ -struct buf_tmp_buffer_t { -private: - int32 reserved; /*!< true if this slot is reserved - */ +class buf_tmp_buffer_t { + /** whether this slot is reserved */ + std::atomic<bool> reserved; public: byte* crypt_buf; /*!< for encryption the data needs to be copied to a separate buffer before it's @@ -1439,16 +1468,14 @@ public: /** Release the slot */ void release() { - my_atomic_store32_explicit(&reserved, false, - MY_MEMORY_ORDER_RELAXED); + reserved.store(false, std::memory_order_relaxed); } /** Acquire the slot @return whether the slot was acquired */ bool acquire() { - return !my_atomic_fas32_explicit(&reserved, true, - MY_MEMORY_ORDER_RELAXED); + return !reserved.exchange(true, std::memory_order_relaxed); } }; @@ -1474,11 +1501,8 @@ public: buf_pool->page_hash or buf_pool->zip_hash */ - /** Page size. Protected by buf_pool mutex. */ - page_size_t size; - /** Count of how manyfold this block is currently bufferfixed. 
*/ - int32 buf_fix_count; + Atomic_counter<uint32_t> buf_fix_count; /** type of pending I/O operation; also protected by buf_pool->mutex for writes only */ @@ -1623,6 +1647,27 @@ public: protected by buf_pool->zip_mutex or buf_block_t::mutex. */ # endif /* UNIV_DEBUG */ + + void fix() { buf_fix_count++; } + uint32_t unfix() + { + uint32_t count= buf_fix_count--; + ut_ad(count != 0); + return count - 1; + } + + /** @return the physical size, in bytes */ + ulint physical_size() const + { + return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size; + } + + /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes + @retval 0 if not compressed */ + ulint zip_size() const + { + return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0; + } }; /** The buffer control block structure */ @@ -1729,20 +1774,20 @@ struct buf_block_t{ /* @{ */ # if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - ulint n_pointers; /*!< used in debugging: the number of + Atomic_counter<ulint> + n_pointers; /*!< used in debugging: the number of pointers in the adaptive hash index pointing to this frame; protected by atomic memory access or btr_search_own_all(). 
*/ # define assert_block_ahi_empty(block) \ - ut_a(my_atomic_addlint(&(block)->n_pointers, 0) == 0) + ut_a((block)->n_pointers == 0) # define assert_block_ahi_empty_on_init(block) do { \ MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \ assert_block_ahi_empty(block); \ } while (0) # define assert_block_ahi_valid(block) \ - ut_a((block)->index \ - || my_atomic_loadlint(&(block)->n_pointers) == 0) + ut_a((block)->index || (block)->n_pointers == 0) # else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ # define assert_block_ahi_empty(block) /* nothing */ # define assert_block_ahi_empty_on_init(block) /* nothing */ @@ -1774,7 +1819,7 @@ struct buf_block_t{ # ifdef UNIV_DEBUG /** @name Debug fields */ /* @{ */ - rw_lock_t debug_latch; /*!< in the debug version, each thread + rw_lock_t* debug_latch; /*!< in the debug version, each thread which bufferfixes the block acquires an s-latch here; so we can use the debug utilities in sync0rw */ @@ -1786,6 +1831,16 @@ struct buf_block_t{ and accessed; we introduce this new mutex in InnoDB-5.1 to relieve contention on the buffer pool mutex */ + + void fix() { page.fix(); } + uint32_t unfix() { return page.unfix(); } + + /** @return the physical size, in bytes */ + ulint physical_size() const { return page.physical_size(); } + + /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes + @retval 0 if not compressed */ + ulint zip_size() const { return page.zip_size(); } }; /** Check if a buf_block_t object is in a valid state @@ -1877,13 +1932,13 @@ public: HazardPointer(buf_pool, mutex) {} /** Destructor */ - virtual ~FlushHp() {} + ~FlushHp() override {} /** Adjust the value of hp. This happens when some other thread working on the same list attempts to remove the hp from the list. 
@param bpage buffer block to be compared */ - void adjust(const buf_page_t* bpage); + void adjust(const buf_page_t* bpage) override; }; /** Class implementing buf_pool->LRU hazard pointer */ @@ -1898,13 +1953,13 @@ public: HazardPointer(buf_pool, mutex) {} /** Destructor */ - virtual ~LRUHp() {} + ~LRUHp() override {} /** Adjust the value of hp. This happens when some other thread working on the same list attempts to remove the hp from the list. @param bpage buffer block to be compared */ - void adjust(const buf_page_t* bpage); + void adjust(const buf_page_t* bpage) override; }; /** Special purpose iterators to be used when scanning the LRU list. @@ -1922,7 +1977,7 @@ public: LRUHp(buf_pool, mutex) {} /** Destructor */ - virtual ~LRUItr() {} + ~LRUItr() override {} /** Selects from where to start a scan. If we have scanned too deep into the LRU list it resets the value to the tail @@ -1990,17 +2045,6 @@ struct buf_buddy_stat_t { ib_uint64_t relocated_usec; }; -/** @brief The temporary memory array structure. - -NOTE! The definition appears here only for other modules of this -directory (buf) to see it. Do not use from outside! */ - -typedef struct { - ulint n_slots; /*!< Total number of slots */ - buf_tmp_buffer_t *slots; /*!< Pointer to the slots in the - array */ -} buf_tmp_array_t; - /** @brief The buffer pool structure. NOTE! 
The definition appears here only for other modules of this @@ -2060,7 +2104,8 @@ struct buf_pool_t{ indexed by block->frame */ ulint n_pend_reads; /*!< number of pending read operations */ - ulint n_pend_unzip; /*!< number of pending decompressions */ + Atomic_counter<ulint> + n_pend_unzip; /*!< number of pending decompressions */ time_t last_printout_time; /*!< when buf_print_io was last time @@ -2201,20 +2246,47 @@ struct buf_pool_t{ #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX]; /*!< buddy free lists */ +#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN +# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN" +#endif + /* @} */ buf_page_t* watch; /*!< Sentinel records for buffer pool watches. Protected by buf_pool->mutex. */ - buf_tmp_array_t* tmp_arr; - /*!< Array for temporal memory - used in compression and encryption */ - -#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN -# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN" -#endif - /* @} */ + /** Temporary memory for page_compressed and encrypted I/O */ + struct io_buf_t { + /** number of elements in slots[] */ + const ulint n_slots; + /** array of slots */ + buf_tmp_buffer_t* const slots; + + io_buf_t() = delete; + + /** Constructor */ + explicit io_buf_t(ulint n_slots) : + n_slots(n_slots), + slots(static_cast<buf_tmp_buffer_t*>( + ut_malloc_nokey(n_slots + * sizeof *slots))) + { + memset((void*) slots, 0, n_slots * sizeof *slots); + } + + ~io_buf_t(); + + /** Reserve a buffer */ + buf_tmp_buffer_t* reserve() + { + for (buf_tmp_buffer_t* s = slots, *e = slots + n_slots; + s != e; s++) { + if (s->acquire()) return s; + } + return NULL; + } + } io_buf; }; /** Print the given buf_pool_t object. 
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index f331091a1d7..7d11e2b4cc0 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -955,49 +955,6 @@ buf_block_get_modify_clock( return(block->modify_clock); } -/** Increments the bufferfix count. -@param[in,out] bpage block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_fix(buf_page_t* bpage) -{ - return uint32(my_atomic_add32_explicit( - &bpage->buf_fix_count, 1, - MY_MEMORY_ORDER_RELAXED)) + 1; -} - -/** Increments the bufferfix count. -@param[in,out] block block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_fix(buf_block_t* block) -{ - return buf_block_fix(&block->page); -} - -/** Get the bufferfix count. -@param[in] bpage block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_get_fix(buf_page_t* bpage) -{ - return my_atomic_load32_explicit(&bpage->buf_fix_count, - MY_MEMORY_ORDER_RELAXED); -} - -/** Get the bufferfix count. -@param[in] bpage block to bufferfix -@return the count */ -UNIV_INLINE -ulint -buf_block_get_fix(buf_block_t* block) -{ - return buf_block_get_fix(&block->page); -} - /*******************************************************************//** Increments the bufferfix count. */ UNIV_INLINE @@ -1016,36 +973,12 @@ buf_block_buf_fix_inc_func( threaded. */ if (!fsp_is_system_temporary(block->page.id.space())) { ibool ret; - ret = rw_lock_s_lock_nowait(&block->debug_latch, file, line); + ret = rw_lock_s_lock_nowait(block->debug_latch, file, line); ut_a(ret); } #endif /* UNIV_DEBUG */ - buf_block_fix(block); -} - -/** Decrements the bufferfix count. 
-@param[in,out] bpage block to bufferunfix -@return the remaining buffer-fix count */ -UNIV_INLINE -ulint -buf_block_unfix(buf_page_t* bpage) -{ - uint32 count = uint32(my_atomic_add32_explicit( - &bpage->buf_fix_count, - -1, MY_MEMORY_ORDER_RELAXED)); - ut_ad(count != 0); - return count - 1; -} - -/** Decrements the bufferfix count. -@param[in,out] block block to bufferunfix -@return the remaining buffer-fix count */ -UNIV_INLINE -ulint -buf_block_unfix(buf_block_t* block) -{ - return buf_block_unfix(&block->page); + block->fix(); } /*******************************************************************//** @@ -1056,14 +989,14 @@ buf_block_buf_fix_dec( /*==================*/ buf_block_t* block) /*!< in/out: block to bufferunfix */ { - buf_block_unfix(block); + block->unfix(); #ifdef UNIV_DEBUG /* No debug latch is acquired if block belongs to system temporary. Debug latch is not of much help if access to block is single threaded. */ if (!fsp_is_system_temporary(block->page.id.space())) { - rw_lock_s_unlock(&block->debug_latch); + rw_lock_s_unlock(block->debug_latch); } #endif /* UNIV_DEBUG */ } @@ -1306,14 +1239,14 @@ buf_page_release_zip( is single threaded. */ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); if (!fsp_is_system_temporary(block->page.id.space())) { - rw_lock_s_unlock(&block->debug_latch); + rw_lock_s_unlock(block->debug_latch); } } #endif /* UNIV_DEBUG */ /* Fall through */ case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: - buf_block_unfix(reinterpret_cast<buf_block_t*>(bpage)); + reinterpret_cast<buf_block_t*>(bpage)->unfix(); return; case BUF_BLOCK_POOL_WATCH: @@ -1342,7 +1275,7 @@ buf_page_release_latch( temporary. Debug latch is not of much help if access to block is single threaded. 
*/ if (!fsp_is_system_temporary(block->page.id.space())) { - rw_lock_s_unlock(&block->debug_latch); + rw_lock_s_unlock(block->debug_latch); } #endif /* UNIV_DEBUG */ diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h index ce39e290ac7..8dc25f91d59 100644 --- a/storage/innobase/include/buf0checksum.h +++ b/storage/innobase/include/buf0checksum.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,18 +29,6 @@ Created Aug 11, 2011 Vasil Dimov #include "buf0types.h" -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Calculate the CRC32 checksum of a page. The value is stored to the page -when it is written to a file and also checked for a match when reading from -the file. Note that we must be careful to calculate the same value on all -architectures. -@param[in] page buffer page (srv_page_size bytes) -@param[in] bug_endian whether to use big endian byteorder -when converting byte strings to integers, for bug-compatibility with -big-endian architecture running MySQL 5.6, MariaDB 10.0 or MariaDB 10.1 -@return CRC-32C */ -uint32_t buf_calc_page_crc32(const byte* page, bool bug_endian = false); -#else /** Calculate the CRC32 checksum of a page. The value is stored to the page when it is written to a file and also checked for a match when reading from the file. Note that we must be careful to calculate the same value on all @@ -48,7 +36,6 @@ architectures. @param[in] page buffer page (srv_page_size bytes) @return CRC-32C */ uint32_t buf_calc_page_crc32(const byte* page); -#endif /** Calculate a checksum which is stored to the page when it is written to a file. 
Note that we must be careful to calculate the same value on diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index a0122d1c3f8..e022dd55215 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -73,17 +73,24 @@ buf_flush_relocate_on_flush_list( @param[in,out] bpage flushed page @param[in] dblwr whether the doublewrite buffer was used */ void buf_flush_write_complete(buf_page_t* bpage, bool dblwr); + +/** Assign the full crc32 checksum for non-compressed page. +@param[in,out] page page to be updated */ +void buf_flush_assign_full_crc32_checksum(byte* page); + /** Initialize a page for writing to the tablespace. -@param[in] block buffer block; NULL if bypassing the buffer pool -@param[in,out] page page frame -@param[in,out] page_zip_ compressed page, or NULL if uncompressed -@param[in] newest_lsn newest modification LSN to the page */ +@param[in] block buffer block; NULL if bypassing the buffer pool +@param[in,out] page page frame +@param[in,out] page_zip_ compressed page, or NULL if uncompressed +@param[in] newest_lsn newest modification LSN to the page +@param[in] use_full_checksum whether tablespace uses full checksum */ void buf_flush_init_for_writing( const buf_block_t* block, byte* page, void* page_zip_, - lsn_t newest_lsn); + lsn_t newest_lsn, + bool use_full_checksum); # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG /********************************************************************//** @@ -181,18 +188,6 @@ buf_flush_note_modification( lsn_t end_lsn, /*!< in: end lsn of the last mtr in the set of mtr's */ FlushObserver* observer); /*!< in: flush observer */ - -/********************************************************************//** -This function should be called when recovery has modified a buffer page. 
*/ -UNIV_INLINE -void -buf_flush_recv_note_modification( -/*=============================*/ - buf_block_t* block, /*!< in: block which is modified */ - lsn_t start_lsn, /*!< in: start lsn of the first mtr in a - set of mtr's */ - lsn_t end_lsn); /*!< in: end lsn of the last mtr in the - set of mtr's */ /********************************************************************//** Returns TRUE if the file page block is immediately suitable for replacement, i.e., transition FILE_PAGE => NOT_USED allowed. diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic index 8d06a53c547..02f3d8ced57 100644 --- a/storage/innobase/include/buf0flu.ic +++ b/storage/innobase/include/buf0flu.ic @@ -38,17 +38,6 @@ buf_flush_insert_into_flush_list( lsn_t lsn); /*!< in: oldest modification */ /********************************************************************//** -Inserts a modified block into the flush list in the right sorted position. -This function is used by recovery, because there the modifications do not -necessarily come in the order of lsn's. */ -void -buf_flush_insert_sorted_into_flush_list( -/*====================================*/ - buf_pool_t* buf_pool, /*!< buffer pool instance */ - buf_block_t* block, /*!< in/out: block which is modified */ - lsn_t lsn); /*!< in: oldest modification */ - -/********************************************************************//** This function should be called at a mini-transaction commit, if a page was modified in it. Puts the block to the list of modified blocks, if it is not already in it. */ @@ -63,24 +52,11 @@ buf_flush_note_modification( modified this block */ FlushObserver* observer) /*!< in: flush observer */ { -#ifdef UNIV_DEBUG - { - /* Allow write to proceed to shared temporary tablespace - in read-only mode. 
*/ - ut_ad(!srv_read_only_mode - || fsp_is_system_temporary(block->page.id.space())); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(block->page.buf_fix_count > 0); - - buf_pool_t* buf_pool = buf_pool_from_block(block); - - ut_ad(!buf_pool_mutex_own(buf_pool)); - ut_ad(!buf_flush_list_mutex_own(buf_pool)); - } -#endif /* UNIV_DEBUG */ - mutex_enter(&block->mutex); - + ut_ad(!srv_read_only_mode + || fsp_is_system_temporary(block->page.id.space())); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); ut_ad(block->page.newest_modification <= end_lsn); block->page.newest_modification = end_lsn; @@ -98,52 +74,7 @@ buf_flush_note_modification( ut_ad(block->page.oldest_modification <= start_lsn); } - buf_page_mutex_exit(block); + mutex_exit(&block->mutex); srv_stats.buf_pool_write_requests.inc(); } - -/********************************************************************//** -This function should be called when recovery has modified a buffer page. 
*/ -UNIV_INLINE -void -buf_flush_recv_note_modification( -/*=============================*/ - buf_block_t* block, /*!< in: block which is modified */ - lsn_t start_lsn, /*!< in: start lsn of the first mtr in a - set of mtr's */ - lsn_t end_lsn) /*!< in: end lsn of the last mtr in the - set of mtr's */ -{ -#ifdef UNIV_DEBUG - { - ut_ad(!srv_read_only_mode); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(block->page.buf_fix_count > 0); - - buf_pool_t* buf_pool = buf_pool_from_block(block); - - ut_ad(!buf_pool_mutex_own(buf_pool)); - ut_ad(!buf_flush_list_mutex_own(buf_pool)); - - ut_ad(start_lsn != 0); - ut_ad(block->page.newest_modification <= end_lsn); - } -#endif /* UNIV_DEBUG */ - - buf_page_mutex_enter(block); - - block->page.newest_modification = end_lsn; - - if (!block->page.oldest_modification) { - buf_pool_t* buf_pool = buf_pool_from_block(block); - - buf_flush_insert_sorted_into_flush_list( - buf_pool, block, start_lsn); - } else { - ut_ad(block->page.oldest_modification <= start_lsn); - } - - buf_page_mutex_exit(block); - -} diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h index e590d818334..ff0ba474bb3 100644 --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2017, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -34,30 +34,23 @@ buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. 
@param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @retval DB_SUCCESS if the page was read and is not corrupted, @retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ -dberr_t -buf_read_page( - const page_id_t page_id, - const page_size_t& page_size); +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size); -/********************************************************************//** -High-level function which reads a page asynchronously from a file to the +/** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] sync true if synchronous aio is desired */ void -buf_read_page_background( - const page_id_t page_id, - const page_size_t& page_size, - bool sync); +buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync); /** Applies a random read-ahead in buf_pool if there are at least a threshold value of accessed pages from the random read-ahead area. Does not read any @@ -70,16 +63,13 @@ performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous i/o. 
@param[in] page_id page id of a page which the current thread wants to access -@param[in] page_size page size -@param[in] inside_ibuf TRUE if we are inside ibuf routine +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine @return number of page read requests issued; NOTE that if we read ibuf pages, it may happen that the page at the given page number does not get read even if we return a positive value! */ ulint -buf_read_ahead_random( - const page_id_t page_id, - const page_size_t& page_size, - ibool inside_ibuf); +buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf); /** Applies linear read-ahead if in the buf_pool the page is a border page of a linear read-ahead area and all the pages in the area have been accessed. @@ -104,14 +94,11 @@ NOTE 3: the calling thread must want access to the page given: this rule is set to prevent unintended read-aheads performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous io. 
@param[in] page_id page id; see NOTE 3 above -@param[in] page_size page size -@param[in] inside_ibuf TRUE if we are inside ibuf routine +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine @return number of page read requests issued */ ulint -buf_read_ahead_linear( - const page_id_t page_id, - const page_size_t& page_size, - ibool inside_ibuf); +buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf); /********************************************************************//** Issues read requests for pages which the ibuf module wants to read in, in diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index bd5e26df47b..5532a524782 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -82,8 +83,16 @@ enum srv_checksum_algorithm_t { innodb when reading */ SRV_CHECKSUM_ALGORITHM_NONE, /*!< Write none, allow crc32, innodb or none when reading */ - SRV_CHECKSUM_ALGORITHM_STRICT_NONE /*!< Write none, allow none + SRV_CHECKSUM_ALGORITHM_STRICT_NONE, /*!< Write none, allow none when reading */ + + /** For new files, always compute CRC-32C for the whole page. + For old files, allow crc32, innodb or none when reading. */ + SRV_CHECKSUM_ALGORITHM_FULL_CRC32, + + /** For new files, always compute CRC-32C for the whole page. + For old files, allow crc32 when reading.
*/ SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32 }; inline @@ -202,6 +211,12 @@ private: const page_id_t page_id); }; +/** A field reference full of zero, for use in assertions and checks, +and dummy default values of instantly dropped columns. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). */ +extern const byte field_ref_zero[UNIV_PAGE_SIZE_MAX]; + #ifndef UNIV_INNOCHECKSUM #include "ut0mutex.h" diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h index 11a7f2e516f..04ddf5b0a42 100644 --- a/storage/innobase/include/data0data.h +++ b/storage/innobase/include/data0data.h @@ -543,6 +543,33 @@ struct dtuple_t { inserted or updated. @param[in] index index possibly with instantly added columns */ void trim(const dict_index_t& index); + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + static bool is_alter_metadata(ulint info_bits) + { + return UNIV_UNLIKELY(info_bits == REC_INFO_METADATA_ALTER); + } + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + static bool is_metadata(ulint info_bits) + { + return UNIV_UNLIKELY((info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_METADATA_ADD); + } + + /** @return whether this is a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const { return is_alter_metadata(info_bits); } + + /** @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + bool is_metadata() const { return is_metadata(info_bits); } }; inline ulint dtuple_get_n_fields(const dtuple_t* tuple) diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h index 740a1b83aca..0e496085113 100644 --- a/storage/innobase/include/data0type.h +++ b/storage/innobase/include/data0type.h @@ -262,35
+262,31 @@ dtype_get_at_most_n_mbchars( ulint data_len, /*!< in: length of str (in bytes) */ const char* str); /*!< in: the string whose prefix length is being determined */ -/*********************************************************************//** -Checks if a data main type is a string type. Also a BLOB is considered a -string type. -@return TRUE if string type */ -ibool -dtype_is_string_type( -/*=================*/ - ulint mtype); /*!< in: InnoDB main data type code: DATA_CHAR, ... */ -/*********************************************************************//** -Checks if a type is a binary string type. Note that for tables created with -< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For -those DATA_BLOB columns this function currently returns FALSE. -@return TRUE if binary string type */ -ibool -dtype_is_binary_string_type( -/*========================*/ - ulint mtype, /*!< in: main data type */ - ulint prtype);/*!< in: precise type */ -/*********************************************************************//** -Checks if a type is a non-binary string type. That is, dtype_is_string_type is -TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created -with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. -For those DATA_BLOB columns this function currently returns TRUE. -@return TRUE if non-binary string type */ -ibool -dtype_is_non_binary_string_type( -/*============================*/ - ulint mtype, /*!< in: main data type */ - ulint prtype);/*!< in: precise type */ +/** @return whether main type is a string type */ +inline bool dtype_is_string_type(ulint mtype) +{ + return mtype <= DATA_BLOB + || mtype == DATA_MYSQL || mtype == DATA_VARMYSQL; +} + +/** @return whether a type is a binary string type */ +inline bool dtype_is_binary_string_type(ulint mtype, ulint prtype) +{ + /* Note that for tables created before MySQL 4.0.14, + we do not know if a DATA_BLOB column is a BLOB or a TEXT column. 
+ For those DATA_BLOB columns we return false. */ + + return mtype == DATA_FIXBINARY || mtype == DATA_BINARY + || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE)); +} + +/** @return whether a type is a non-binary string type */ +inline bool dtype_is_non_binary_string_type(ulint mtype, ulint prtype) +{ + return dtype_is_string_type(mtype) + && !dtype_is_binary_string_type(mtype, prtype); +} + /*********************************************************************//** Sets a data type structure. */ UNIV_INLINE @@ -338,14 +334,15 @@ dtype_get_mblen( multi-byte character */ ulint* mbmaxlen); /*!< out: maximum length of a multi-byte character */ -/*********************************************************************//** -Gets the MySQL charset-collation code for MySQL string types. -@return MySQL charset-collation code */ -UNIV_INLINE -ulint -dtype_get_charset_coll( -/*===================*/ - ulint prtype);/*!< in: precise data type */ +/** +Get the charset-collation code for string types. +@param prtype InnoDB precise type +@return charset-collation code */ +inline uint16_t dtype_get_charset_coll(ulint prtype) +{ + return static_cast<uint16_t>(prtype >> 16) & CHAR_COLL_MASK; +} + /** Form a precise type from the < 4.1.2 format precise type plus the charset-collation code. @param[in] old_prtype MySQL type code and the flags @@ -554,11 +551,55 @@ struct dtype_t{ { return (prtype & DATA_VERSIONED) == DATA_VERS_END; } + + /** Set the type of the BLOB in the hidden metadata record. 
*/ + void metadata_blob_init() + { + prtype = DATA_NOT_NULL; + mtype = DATA_BLOB; + len = 0; + mbminlen = 0; + mbmaxlen = 0; + } }; /** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; +/** Info bit denoting the predefined minimum record: this bit is set +if and only if the record is the first user record on a non-leaf +B-tree page that is the leftmost page on its level +(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */ +#define REC_INFO_MIN_REC_FLAG 0x10UL +/** The delete-mark flag in info bits */ +#define REC_INFO_DELETED_FLAG 0x20UL + +/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */ +enum rec_comp_status_t { + /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_ORDINARY = 0, + /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_NODE_PTR = 1, + /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */ + REC_STATUS_INFIMUM = 2, + /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */ + REC_STATUS_SUPREMUM = 3, + /** Clustered index record that has been inserted or updated + after instant ADD COLUMN (more than dict_index_t::n_core_fields) */ + REC_STATUS_INSTANT = 4 +}; + +/** The dtuple_t::info_bits of the hidden metadata of instant ADD COLUMN. +@see rec_is_metadata() +@see rec_is_alter_metadata() */ +static const byte REC_INFO_METADATA_ADD + = REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT; + +/** The dtuple_t::info_bits of the hidden metadata of instant ALTER TABLE. 
+@see rec_is_metadata() */ +static const byte REC_INFO_METADATA_ALTER + = REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG; + #include "data0type.ic" #endif diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic index f2c499716ce..037a71a9345 100644 --- a/storage/innobase/include/data0type.ic +++ b/storage/innobase/include/data0type.ic @@ -28,18 +28,6 @@ Created 1/16/1996 Heikki Tuuri #include "ha_prototypes.h" /*********************************************************************//** -Gets the MySQL charset-collation code for MySQL string types. -@return MySQL charset-collation code */ -UNIV_INLINE -ulint -dtype_get_charset_coll( -/*===================*/ - ulint prtype) /*!< in: precise data type */ -{ - return((prtype >> 16) & CHAR_COLL_MASK); -} - -/*********************************************************************//** Determines if a MySQL string type is a subset of UTF-8. This function may return false negatives, in case further character-set collation codes are introduced in MySQL later. diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h index f70a65890c9..6cfc63f4a9e 100644 --- a/storage/innobase/include/db0err.h +++ b/storage/innobase/include/db0err.h @@ -136,8 +136,6 @@ enum dberr_t { DB_FTS_TOO_MANY_WORDS_IN_PHRASE, /*< Too many words in a phrase */ - DB_TABLESPACE_TRUNCATED, /*!< tablespace was truncated */ - DB_DECRYPTION_FAILED, /* Tablespace encrypted and decrypt operation failed because of missing key management plugin, diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h index 4853d5ad73f..778471b77ae 100644 --- a/storage/innobase/include/dict0boot.h +++ b/storage/innobase/include/dict0boot.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -50,12 +51,8 @@ dict_hdr_get_new_id( (not assigned if NULL) */ index_id_t* index_id, /*!< out: index id (not assigned if NULL) */ - ulint* space_id, /*!< out: space id + ulint* space_id); /*!< out: space id (not assigned if NULL) */ - const dict_table_t* table, /*!< in: table */ - bool disable_redo); /*!< in: if true and table - object is NULL - then disable-redo */ /**********************************************************************//** Writes the current value of the row id counter to the dictionary header file page. */ @@ -124,13 +121,6 @@ dict_is_sys_table( /* The following is a secondary index on SYS_TABLES */ #define DICT_TABLE_IDS_ID 5 -#define DICT_HDR_FIRST_ID 10 /* the ids for tables etc. start - from this number, except for basic - system tables and their above defined - indexes; ibuf tables and indexes are - assigned as the id the number - DICT_IBUF_ID_MIN plus the space id */ - /* The offset of the dictionary header on the page */ #define DICT_HDR FSEG_PAGE_DATA diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic index dacfcd58b53..7b0a2fd0b86 100644 --- a/storage/innobase/include/dict0boot.ic +++ b/storage/innobase/include/dict0boot.ic @@ -33,18 +33,18 @@ dict_sys_get_new_row_id(void) { row_id_t id; - mutex_enter(&dict_sys->mutex); + mutex_enter(&dict_sys.mutex); - id = dict_sys->row_id; + id = dict_sys.row_id; if (0 == (id % DICT_HDR_ROW_ID_WRITE_MARGIN)) { dict_hdr_flush_row_id(); } - dict_sys->row_id++; + dict_sys.row_id++; - mutex_exit(&dict_sys->mutex); + mutex_exit(&dict_sys.mutex); return(id); } diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h index 8ab987cd39a..92f55ce4a14 100644 --- a/storage/innobase/include/dict0crea.h +++ b/storage/innobase/include/dict0crea.h @@ -67,14 +67,6 @@ dict_create_table_step( 
/*===================*/ que_thr_t* thr); /*!< in: query thread */ -/** Assign a new table ID and put it into the table cache and the transaction. -@param[in,out] table Table that needs an ID -@param[in,out] trx Transaction */ -void -dict_table_assign_new_id( - dict_table_t* table, - trx_t* trx); - /***********************************************************//** Creates an index. This is a high-level function used in SQL execution graphs. @@ -104,29 +96,12 @@ dict_create_index_tree( dict_index_t* index, /*!< in/out: index */ const trx_t* trx); /*!< in: InnoDB transaction handle */ -/*******************************************************************//** -Recreate the index tree associated with a row in SYS_INDEXES table. -@return new root page number, or FIL_NULL on failure */ -ulint -dict_recreate_index_tree( -/*======================*/ - const dict_table_t* table, /*!< in: the table the index - belongs to */ - btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing - to record in the clustered index of - SYS_INDEXES table. The cursor may be - repositioned in this call. */ - mtr_t* mtr); /*!< in: mtr having the latch - on the record page. The mtr may be - committed and restarted in this call. */ - /** Drop the index tree associated with a row in SYS_INDEXES table. 
@param[in,out] rec SYS_INDEXES record @param[in,out] pcur persistent cursor on rec @param[in,out] trx dictionary transaction -@param[in,out] mtr mini-transaction -@return whether freeing the B-tree was attempted */ -bool dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) +@param[in,out] mtr mini-transaction */ +void dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) MY_ATTRIBUTE((nonnull)); /***************************************************************//** diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 9f5485bb15c..35309fc1b54 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -32,12 +32,13 @@ Created 1/8/1996 Heikki Tuuri #include "dict0mem.h" #include "fsp0fsp.h" #include <deque> -#include "dict0pagecompress.h" extern bool innodb_table_stats_not_found; extern bool innodb_index_stats_not_found; -#include "sync0rw.h" +/** the first table or index ID for other than hard-coded system tables */ +constexpr uint8_t DICT_HDR_FIRST_ID= 10; + /********************************************************************//** Get the database name length in a table name. @return database name length */ @@ -130,19 +131,14 @@ dict_table_close( MY_ATTRIBUTE((nonnull)); /*********************************************************************//** Closes the only open handle to a table and drops a table while assuring -that dict_sys->mutex is held the whole time. This assures that the table +that dict_sys.mutex is held the whole time. This assures that the table is not evicted after the close when the count of open handles goes to zero. -Because dict_sys->mutex is held, we do not need to call -dict_table_prevent_eviction(). */ +Because dict_sys.mutex is held, we do not need to call prevent_eviction(). 
*/ void dict_table_close_and_drop( /*======================*/ trx_t* trx, /*!< in: data dictionary transaction */ dict_table_t* table); /*!< in/out: table */ -/**********************************************************************//** -Inits the data dictionary module. */ -void -dict_init(void); /*********************************************************************//** Gets the minimum number of bytes per character. @@ -287,13 +283,6 @@ dict_col_name_is_reserved( /*======================*/ const char* name) /*!< in: column name */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/********************************************************************//** -Acquire the autoinc lock. */ -void -dict_table_autoinc_lock( -/*====================*/ - dict_table_t* table) /*!< in/out: table */ - MY_ATTRIBUTE((nonnull)); /** Unconditionally set the AUTO_INCREMENT counter. @param[in,out] table table or partition @param[in] value next available AUTO_INCREMENT value */ @@ -302,7 +291,7 @@ UNIV_INLINE void dict_table_autoinc_initialize(dict_table_t* table, ib_uint64_t value) { - ut_ad(dict_table_autoinc_own(table)); + ut_ad(mutex_own(&table->autoinc_mutex)); table->autoinc = value; } @@ -315,7 +304,7 @@ UNIV_INLINE ib_uint64_t dict_table_autoinc_read(const dict_table_t* table) { - ut_ad(dict_table_autoinc_own(table)); + ut_ad(mutex_own(&table->autoinc_mutex)); return(table->autoinc); } @@ -329,7 +318,7 @@ UNIV_INLINE bool dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value) { - ut_ad(dict_table_autoinc_own(table)); + ut_ad(mutex_own(&table->autoinc_mutex)); if (value > table->autoinc) { @@ -340,13 +329,6 @@ dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value) return(false); } -/********************************************************************//** -Release the autoinc lock. 
*/ -void -dict_table_autoinc_unlock( -/*======================*/ - dict_table_t* table) /*!< in/out: table */ - MY_ATTRIBUTE((nonnull)); /**********************************************************************//** Adds system columns to a table object. */ void @@ -356,22 +338,6 @@ dict_table_add_system_columns( mem_heap_t* heap) /*!< in: temporary heap */ MY_ATTRIBUTE((nonnull)); /**********************************************************************//** -Removes a table object from the dictionary cache. */ -void -dict_table_remove_from_cache( -/*=========================*/ - dict_table_t* table) /*!< in, own: table */ - MY_ATTRIBUTE((nonnull)); -/**********************************************************************//** -Removes a table object from the dictionary cache. */ -void -dict_table_remove_from_cache_low( -/*=============================*/ - dict_table_t* table, /*!< in, own: table */ - ibool lru_evict) /*!< in: TRUE if table being evicted - to make room in the table LRU list */ - MY_ATTRIBUTE((nonnull)); -/**********************************************************************//** Renames a table object. @return TRUE if success */ dberr_t @@ -694,65 +660,14 @@ do { \ dict_table_skip_corrupt_index(index); \ } while (0) -/********************************************************************//** -Check whether the index is the clustered index. -@return nonzero for clustered index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_clust( -/*================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); - -/** Check if index is auto-generated clustered index. -@param[in] index index - -@return true if index is auto-generated clustered index. */ -UNIV_INLINE -bool -dict_index_is_auto_gen_clust( - const dict_index_t* index); - -/********************************************************************//** -Check whether the index is unique. 
-@return nonzero for unique index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_unique( -/*=================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); -/********************************************************************//** -Check whether the index is a Spatial Index. -@return nonzero for Spatial Index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_spatial( -/*==================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); - +#define dict_index_is_clust(index) (index)->is_clust() +#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust() +#define dict_index_is_unique(index) (index)->is_unique() +#define dict_index_is_spatial(index) (index)->is_spatial() +#define dict_index_is_ibuf(index) (index)->is_ibuf() +#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary() #define dict_index_has_virtual(index) (index)->has_virtual() -/********************************************************************//** -Check whether the index is the insert buffer tree. -@return nonzero for insert buffer, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_ibuf( -/*===============*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); -/********************************************************************//** -Check whether the index is a secondary index or the insert buffer tree. -@return nonzero for insert buffer, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_sec_or_ibuf( -/*======================*/ - const dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); - /** Get all the FTS indexes on a table. 
@param[in] table table @param[out] indexes all FTS indexes on this table @@ -908,15 +823,8 @@ dict_index_get_min_size( /*====================*/ const dict_index_t* index) /*!< in: index */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/********************************************************************//** -Check whether the table uses the compact page format. -@return TRUE if table uses the compact page format */ -UNIV_INLINE -bool -dict_table_is_comp( -/*===============*/ - const dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define dict_table_is_comp(table) (table)->not_redundant() /** Determine if a table uses atomic BLOBs (no locally stored prefix). @param[in] table InnoDB table @@ -928,6 +836,18 @@ dict_table_has_atomic_blobs(const dict_table_t* table) return(DICT_TF_HAS_ATOMIC_BLOBS(table->flags)); } +/** @return potential max length stored inline for externally stored fields */ +inline size_t dict_table_t::get_overflow_field_local_len() const +{ + if (dict_table_has_atomic_blobs(this)) { + /* ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED: do not + store any BLOB prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE; + } + /* up to MySQL 5.1: store a 768-byte prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN; +} + /** Set the various values in a dict_table_t::flags pointer. @param[in,out] flags, Pointer to a 4 byte Table Flags @param[in] format, File Format @@ -962,44 +882,34 @@ ulint dict_tf_to_fsp_flags(ulint table_flags) MY_ATTRIBUTE((const)); -/** Extract the page size from table flags. + +/** Extract the ROW_FORMAT=COMPRESSED page size from table flags. 
@param[in] flags flags -@return compressed page size, or 0 if not compressed */ -UNIV_INLINE -const page_size_t -dict_tf_get_page_size( - ulint flags) -MY_ATTRIBUTE((const)); +@return ROW_FORMAT=COMPRESSED page size +@retval 0 if not compressed */ +inline ulint dict_tf_get_zip_size(ulint flags) +{ + flags &= DICT_TF_MASK_ZIP_SSIZE; + return flags + ? (UNIV_ZIP_SIZE_MIN >> 1) + << (FSP_FLAGS_GET_ZIP_SSIZE(flags >> DICT_TF_POS_ZIP_SSIZE + << FSP_FLAGS_POS_ZIP_SSIZE)) + : 0; +} /** Determine the extent size (in pages) for the given table @param[in] table the table whose extent size is being calculated. @return extent size in pages (256, 128 or 64) */ -ulint -dict_table_extent_size( - const dict_table_t* table); +inline ulint dict_table_extent_size(const dict_table_t* table) +{ + if (ulint zip_size = table->space->zip_size()) { + return (1ULL << 20) / zip_size; + } -/** Get the table page size. */ -#define dict_table_page_size(table) page_size_t(table->space->flags) + return FSP_EXTENT_SIZE; +} -/*********************************************************************//** -Obtain exclusive locks on all index trees of the table. This is to prevent -accessing index trees while InnoDB is updating internal metadata for -operations such as truncate tables. */ -UNIV_INLINE -void -dict_table_x_lock_indexes( -/*======================*/ - dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull)); -/*********************************************************************//** -Release the exclusive locks on all index tree. */ -UNIV_INLINE -void -dict_table_x_unlock_indexes( -/*========================*/ - dict_table_t* table) /*!< in: table */ - MY_ATTRIBUTE((nonnull)); /********************************************************************//** Checks if a column is in the ordering columns of the clustered index of a table. Column prefixes are treated like whole columns. 
@@ -1061,10 +971,6 @@ dict_make_room_in_cache( ulint max_tables, /*!< in: max tables allowed in cache */ ulint pct_check); /*!< in: max percent to check */ -/** Clears the virtual column's index list before index is being freed. -@param[in] index Index being freed */ -void dict_index_remove_from_v_col_list(dict_index_t* index); - /** Adds an index to the dictionary cache, with possible indexing newly added column. @param[in,out] index index; NOTE! The index memory @@ -1210,21 +1116,6 @@ dict_index_get_nth_col_or_prefix_pos( ulint* prefix_col_pos) /*!< out: col num if prefix */ __attribute__((warn_unused_result)); - -/********************************************************************//** -Returns TRUE if the index contains a column or a prefix of that column. -@param[in] index index -@param[in] n column number -@param[in] is_virtual whether it is a virtual col -@return TRUE if contains the column or its prefix */ -bool -dict_index_contains_col_or_prefix( -/*==============================*/ - const dict_index_t* index, /*!< in: index */ - ulint n, /*!< in: column number */ - bool is_virtual) - /*!< in: whether it is a virtual col */ - MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** Looks for a matching field in an index. The column has to be the same. The column in index must be complete, or must contain a prefix longer than the @@ -1249,16 +1140,6 @@ dict_table_get_nth_col_pos( ulint n, /*!< in: column number */ ulint* prefix_col_pos) /*!< out: col num if prefix */ MY_ATTRIBUTE((nonnull(1), warn_unused_result)); -/********************************************************************//** -Returns the position of a system column in an index. -@return position, ULINT_UNDEFINED if not contained */ -UNIV_INLINE -ulint -dict_index_get_sys_col_pos( -/*=======================*/ - const dict_index_t* index, /*!< in: index */ - ulint type) /*!< in: DATA_ROW_ID, ... 
*/ - MY_ATTRIBUTE((nonnull, warn_unused_result)); /*******************************************************************//** Adds a column to index. */ void @@ -1292,7 +1173,7 @@ dict_field_get_col( /**********************************************************************//** Returns an index object if it is found in the dictionary cache. -Assumes that dict_sys->mutex is already being held. +Assumes that dict_sys.mutex is already being held. @return index, NULL if not found */ dict_index_t* dict_index_get_if_in_cache_low( @@ -1356,21 +1237,6 @@ dict_index_build_node_ptr( ulint level) /*!< in: level of rec in tree: 0 means leaf level */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/**********************************************************************//** -Copies an initial segment of a physical record, long enough to specify an -index entry uniquely. -@return pointer to the prefix record */ -rec_t* -dict_index_copy_rec_order_prefix( -/*=============================*/ - const dict_index_t* index, /*!< in: index */ - const rec_t* rec, /*!< in: record for which to - copy prefix */ - ulint* n_fields,/*!< out: number of fields copied */ - byte** buf, /*!< in/out: memory buffer for the - copied prefix, or NULL */ - ulint* buf_size)/*!< in/out: buffer size */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Convert a physical record into a search tuple. @param[in] rec index record (not necessarily in an index page) @param[in] index index @@ -1455,53 +1321,9 @@ dict_index_calc_min_rec_len( /*========================*/ const dict_index_t* index) /*!< in: index */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/** Reserve the dictionary system mutex. */ -void -dict_mutex_enter_for_mysql_func(const char *file, unsigned line); -#define dict_mutex_enter_for_mysql() \ - dict_mutex_enter_for_mysql_func(__FILE__, __LINE__) - -/********************************************************************//** -Releases the dictionary system mutex for MySQL. 
*/ -void -dict_mutex_exit_for_mysql(void); -/*===========================*/ - -/** Create a dict_table_t's stats latch or delay for lazy creation. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose stats latch to create -@param[in] enabled if false then the latch is disabled -and dict_table_stats_lock()/unlock() become noop on this table. */ -void -dict_table_stats_latch_create( - dict_table_t* table, - bool enabled); - -/** Destroy a dict_table_t's stats latch. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose stats latch to destroy */ -void -dict_table_stats_latch_destroy( - dict_table_t* table); - -/** Lock the appropriate latch to protect a given table's statistics. -@param[in] table table whose stats to lock -@param[in] latch_mode RW_S_LATCH or RW_X_LATCH */ -void -dict_table_stats_lock( - dict_table_t* table, - ulint latch_mode); - -/** Unlock the latch that has been locked by dict_table_stats_lock(). -@param[in] table table whose stats to unlock -@param[in] latch_mode RW_S_LATCH or RW_X_LATCH */ -void -dict_table_stats_unlock( - dict_table_t* table, - ulint latch_mode); +#define dict_mutex_enter_for_mysql() mutex_enter(&dict_sys.mutex) +#define dict_mutex_exit_for_mysql() mutex_exit(&dict_sys.mutex) /********************************************************************//** Checks if the database name in two table names is the same. @@ -1547,23 +1369,6 @@ dict_table_is_fts_column( ulint col_no, /* in: col number to search for */ bool is_virtual)/*!< in: whether it is a virtual column */ MY_ATTRIBUTE((warn_unused_result)); -/**********************************************************************//** -Prevent table eviction by moving a table to the non-LRU list from the -LRU list if it is not already there. 
*/ -UNIV_INLINE -void -dict_table_prevent_eviction( -/*========================*/ - dict_table_t* table) /*!< in: table to prevent eviction */ - MY_ATTRIBUTE((nonnull)); - -/**********************************************************************//** -Move a table to the non LRU end of the LRU list. */ -void -dict_table_move_from_lru_to_non_lru( -/*================================*/ - dict_table_t* table) /*!< in: table to move from LRU to non-LRU */ - MY_ATTRIBUTE((nonnull)); /** Looks for an index with the given id given a table instance. @param[in] table table instance @@ -1575,14 +1380,6 @@ dict_table_find_index_on_id( index_id_t id) MY_ATTRIBUTE((nonnull(1))); -/**********************************************************************//** -Move to the most recently used segment of the LRU list. */ -void -dict_move_to_mru( -/*=============*/ - dict_table_t* table) /*!< in: table to move to MRU */ - MY_ATTRIBUTE((nonnull)); - /** Maximum number of columns in a foreign key constraint. Please Note MySQL has a much lower limit on the number of columns allowed in a foreign key constraint */ @@ -1594,13 +1391,10 @@ extern FILE* dict_foreign_err_file; extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the foreign key error messages */ -/** the dictionary system */ -extern dict_sys_t* dict_sys; -/** the data dictionary rw-latch protecting dict_sys */ -extern rw_lock_t dict_operation_lock; - -/* Dictionary system struct */ -struct dict_sys_t{ +/** InnoDB data dictionary cache */ +class dict_sys_t +{ +public: DictSysMutex mutex; /*!< mutex protecting the data dictionary; protects also the disk-based dictionary system tables; @@ -1608,6 +1402,15 @@ struct dict_sys_t{ and DROP TABLE, as well as reading the dictionary data for a table from system tables */ + /** @brief the data dictionary rw-latch protecting dict_sys + + Table create, drop, etc. 
reserve this in X-mode; implicit or + backround operations purge, rollback, foreign key checks reserve this + in S-mode; not all internal InnoDB operations are covered by MDL. + + This latch also prevents lock waits when accessing the InnoDB + data dictionary tables. @see trx_t::dict_operation_lock_mode */ + rw_lock_t latch; row_id_t row_id; /*!< the next row id to assign; NOTE that at a checkpoint this must be written to the dict system @@ -1616,8 +1419,8 @@ struct dict_sys_t{ the log records */ hash_table_t* table_hash; /*!< hash table of the tables, based on name */ - hash_table_t* table_id_hash; /*!< hash table of the tables, based - on id */ + /** hash table of persistent table IDs */ + hash_table_t* table_id_hash; dict_table_t* sys_tables; /*!< SYS_TABLES table */ dict_table_t* sys_columns; /*!< SYS_COLUMNS table */ dict_table_t* sys_indexes; /*!< SYS_INDEXES table */ @@ -1631,8 +1434,145 @@ struct dict_sys_t{ UT_LIST_BASE_NODE_T(dict_table_t) table_non_LRU; /*!< List of tables that can't be evicted from the cache */ +private: + bool m_initialised; + /** the sequence of temporary table IDs */ + std::atomic<table_id_t> temp_table_id; + /** hash table of temporary table IDs */ + hash_table_t* temp_id_hash; +public: + /** @return a new temporary table ID */ + table_id_t get_temporary_table_id() { + return temp_table_id.fetch_add(1, std::memory_order_relaxed); + } + + /** Look up a temporary table. 
+ @param id temporary table ID + @return temporary table + @retval NULL if the table does not exist + (should only happen during the rollback of CREATE...SELECT) */ + dict_table_t* get_temporary_table(table_id_t id) + { + ut_ad(mutex_own(&mutex)); + dict_table_t* table; + ulint fold = ut_fold_ull(id); + HASH_SEARCH(id_hash, temp_id_hash, fold, dict_table_t*, table, + ut_ad(table->cached), table->id == id); + if (UNIV_LIKELY(table != NULL)) { + DBUG_ASSERT(table->is_temporary()); + DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID); + table->acquire(); + } + return table; + } + + /** Look up a persistent table. + @param id table ID + @return table + @retval NULL if not cached */ + dict_table_t* get_table(table_id_t id) + { + ut_ad(mutex_own(&mutex)); + dict_table_t* table; + ulint fold = ut_fold_ull(id); + HASH_SEARCH(id_hash, table_id_hash, fold, dict_table_t*, table, + ut_ad(table->cached), table->id == id); + DBUG_ASSERT(!table || !table->is_temporary()); + return table; + } + + /** + Constructor. Further initialisation happens in create(). + */ + + dict_sys_t() : m_initialised(false), temp_table_id(DICT_HDR_FIRST_ID) {} + + bool is_initialised() const { return m_initialised; } + + /** Initialise the data dictionary cache. */ + void create(); + + /** Close the data dictionary cache on shutdown. */ + void close(); + + /** Resize the hash tables based on the current buffer pool size. */ + void resize(); + + /** Add a table definition to the data dictionary cache */ + inline void add(dict_table_t* table); + /** Remove a table definition from the data dictionary cache. 
+ @param[in,out] table cached table definition to be evicted + @param[in] lru whether this is part of least-recently-used evictiono + @param[in] keep whether to keep (not free) the object */ + void remove(dict_table_t* table, bool lru = false, bool keep = false); + +#ifdef UNIV_DEBUG + /** Find a table */ + template <bool in_lru> bool find(dict_table_t* table) + { + ut_ad(table); + ut_ad(table->can_be_evicted == in_lru); + ut_ad(mutex_own(&mutex)); + for (const dict_table_t* t = UT_LIST_GET_FIRST(in_lru + ? table_LRU : table_non_LRU); + t; t = UT_LIST_GET_NEXT(table_LRU, t)) + { + if (t == table) return true; + ut_ad(t->can_be_evicted == in_lru); + } + return false; + } + /** Find a table */ + bool find(dict_table_t* table) + { + return table->can_be_evicted ? find<true>(table) : find<false>(table); + } +#endif + + /** Move a table to the non-LRU list from the LRU list. */ + void prevent_eviction(dict_table_t* table) + { + ut_ad(find(table)); + if (table->can_be_evicted) + { + table->can_be_evicted = FALSE; + UT_LIST_REMOVE(table_LRU, table); + UT_LIST_ADD_LAST(table_non_LRU, table); + } + } + /** Acquire a reference to a cached table. */ + inline void acquire(dict_table_t* table); + +#ifdef UNIV_DEBUG + /** Assert that the data dictionary is locked */ + void assert_locked() + { + ut_ad(mutex_own(&mutex)); + ut_ad(rw_lock_own(&latch, RW_LOCK_X)); + } +#endif + /** Lock the data dictionary cache. */ + void lock(const char* file, unsigned line) + { + rw_lock_x_lock_func(&latch, 0, file, line); + mutex_enter_loc(&mutex, file, line); + } + + /** Unlock the data dictionary cache. 
*/ + void unlock() + { + mutex_exit(&mutex); + rw_lock_x_unlock(&latch); + } }; +/** the data dictionary cache */ +extern dict_sys_t dict_sys; + +#define dict_table_prevent_eviction(table) dict_sys.prevent_eviction(table) +#define dict_sys_lock() dict_sys.lock(__FILE__, __LINE__) +#define dict_sys_unlock() dict_sys.unlock() + /** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */ extern dict_index_t* dict_ind_redundant; @@ -1716,16 +1656,6 @@ dict_fs2utf8( size_t table_utf8_size)/*!< in: table_utf8 size */ MY_ATTRIBUTE((nonnull)); -/** Resize the hash tables besed on the current buffer pool size. */ -void -dict_resize(); - -/**********************************************************************//** -Closes the data dictionary module. */ -void -dict_close(void); -/*============*/ - /**********************************************************************//** Check whether the table is corrupted. @return nonzero for corrupted table, zero for valid tables */ diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 4174580c918..b6d15f28a69 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -25,6 +25,7 @@ Created 1/8/1996 Heikki Tuuri ***********************************************************************/ #include "fsp0sysspace.h" +#include "dict0pagecompress.h" /*********************************************************************//** Gets the minimum number of bytes per character. @@ -241,83 +242,6 @@ dict_table_get_next_index( #endif /* UNIV_DEBUG */ /********************************************************************//** -Check whether the index is the clustered index. 
-@return nonzero for clustered index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_clust( -/*================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return(index->type & DICT_CLUSTERED); -} - -/** Check if index is auto-generated clustered index. -@param[in] index index - -@return true if index is auto-generated clustered index. */ -UNIV_INLINE -bool -dict_index_is_auto_gen_clust( - const dict_index_t* index) -{ - return(index->type == DICT_CLUSTERED); -} - -/********************************************************************//** -Check whether the index is unique. -@return nonzero for unique index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_unique( -/*=================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return(index->type & DICT_UNIQUE); -} - -/********************************************************************//** -Check whether the index is a Spatial Index. -@return nonzero for Spatial Index, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_spatial( -/*==================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return ulint(UNIV_EXPECT(index->type & DICT_SPATIAL, 0)); -} - -/********************************************************************//** -Check whether the index is the insert buffer tree. -@return nonzero for insert buffer, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_ibuf( -/*===============*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return(index->type & DICT_IBUF); -} - -/********************************************************************//** -Check whether the index is a secondary index or the insert buffer tree. 
-@return nonzero for insert buffer, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_sec_or_ibuf( -/*======================*/ - const dict_index_t* index) /*!< in: index */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return((index->type & (DICT_CLUSTERED | DICT_IBUF)) != DICT_CLUSTERED); -} - -/********************************************************************//** Gets the number of user-defined non-virtual columns in a table in the dictionary cache. @return number of user-defined (e.g., not ROW_ID) non-virtual @@ -462,7 +386,8 @@ dict_table_get_nth_v_col( ut_ad(table); ut_ad(pos < table->n_v_def); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); - ut_ad(!table->v_cols[pos].m_col.is_instant()); + ut_ad(!table->v_cols[pos].m_col.is_added()); + ut_ad(!table->v_cols[pos].m_col.is_dropped()); return &table->v_cols[pos]; } @@ -501,19 +426,6 @@ dict_table_get_sys_col_no( return unsigned(table->n_cols) + (sys - DATA_N_SYS_COLS); } -/********************************************************************//** -Check whether the table uses the compact page format. -@return TRUE if table uses the compact page format */ -UNIV_INLINE -bool -dict_table_is_comp( -/*===============*/ - const dict_table_t* table) /*!< in: table */ -{ - ut_ad(table); - return (table->flags & DICT_TF_COMPACT) != 0; -} - /************************************************************************ Check if the table has an FTS index. */ UNIV_INLINE @@ -720,20 +632,34 @@ dict_tf_to_fsp_flags(ulint table_flags) DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); - /* Adjust bit zero. */ - fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 
1 : 0; + /* No ROW_FORMAT=COMPRESSED for innodb_checksum_algorithm=full_crc32 */ + if ((srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32 + || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_FULL_CRC32) + && !(table_flags & DICT_TF_MASK_ZIP_SSIZE)) { + + fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE(); + + if (page_compression_level) { + fsp_flags |= innodb_compression_algorithm + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + } + } else { + /* Adjust bit zero. */ + fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0; - /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */ - fsp_flags |= table_flags - & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS); + /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */ + fsp_flags |= table_flags + & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS); - fsp_flags |= FSP_FLAGS_PAGE_SSIZE(); + fsp_flags |= FSP_FLAGS_PAGE_SSIZE(); - if (page_compression_level) { - fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION; + if (page_compression_level) { + fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION; + } } - ut_a(fsp_flags_is_valid(fsp_flags, false)); + ut_a(fil_space_t::is_valid_flags(fsp_flags, false)); if (DICT_TF_HAS_DATA_DIR(table_flags)) { fsp_flags |= 1U << FSP_FLAGS_MEM_DATA_DIR; @@ -779,50 +705,6 @@ dict_tf_to_sys_tables_type( return(type); } -/** Extract the page size info from table flags. -@param[in] flags flags -@return a structure containing the compressed and uncompressed -page sizes and a boolean indicating if the page is compressed. 
*/ -UNIV_INLINE -const page_size_t -dict_tf_get_page_size( - ulint flags) -{ - const ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); - - if (zip_ssize == 0) { - return(univ_page_size); - } - - const ulint zip_size = (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize; - - ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); - - return(page_size_t(zip_size, srv_page_size, true)); -} - -/*********************************************************************//** -Obtain exclusive locks on all index trees of the table. This is to prevent -accessing index trees while InnoDB is updating internal metadata for -operations such as truncate tables. */ -UNIV_INLINE -void -dict_table_x_lock_indexes( -/*======================*/ - dict_table_t* table) /*!< in: table */ -{ - dict_index_t* index; - - ut_ad(mutex_own(&dict_sys->mutex)); - - /* Loop through each index of the table and lock them */ - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - rw_lock_x_lock(dict_index_get_lock(index)); - } -} - /*********************************************************************//** Returns true if the particular FTS index in the table is still syncing in the background, false otherwise. @@ -844,24 +726,6 @@ dict_fts_index_syncing( } return(false); } -/*********************************************************************//** -Release the exclusive locks on all index tree. 
*/ -UNIV_INLINE -void -dict_table_x_unlock_indexes( -/*========================*/ - dict_table_t* table) /*!< in: table */ -{ - dict_index_t* index; - - ut_ad(mutex_own(&dict_sys->mutex)); - - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - rw_lock_x_unlock(dict_index_get_lock(index)); - } -} /********************************************************************//** Gets the number of fields in the internal representation of an index, @@ -979,30 +843,6 @@ dict_index_get_nth_field( } #endif /* UNIV_DEBUG */ -/********************************************************************//** -Returns the position of a system column in an index. -@return position, ULINT_UNDEFINED if not contained */ -UNIV_INLINE -ulint -dict_index_get_sys_col_pos( -/*=======================*/ - const dict_index_t* index, /*!< in: index */ - ulint type) /*!< in: DATA_ROW_ID, ... */ -{ - ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - ut_ad(!dict_index_is_ibuf(index)); - - if (dict_index_is_clust(index)) { - - return(dict_col_get_clust_pos( - dict_table_get_sys_col(index->table, type), - index)); - } - - return(dict_index_get_nth_col_pos( - index, dict_table_get_sys_col_no(index->table, type), NULL)); -} - /*********************************************************************//** Gets the field column. @return field->col, pointer to the table column */ @@ -1233,9 +1073,7 @@ dict_table_is_fts_column( index = (dict_index_t*) ib_vector_getp(indexes, i); - if (dict_index_contains_col_or_prefix( - index, col_no, is_virtual)) { - + if (index->contains_col_or_prefix(col_no, is_virtual)) { return(i); } } @@ -1300,21 +1138,6 @@ dict_max_v_field_len_store_undo( return(max_log_len); } -/**********************************************************************//** -Prevent table eviction by moving a table to the non-LRU list from the -LRU list if it is not already there. 
*/ -UNIV_INLINE -void -dict_table_prevent_eviction( -/*========================*/ - dict_table_t* table) /*!< in: table to prevent eviction */ -{ - ut_ad(mutex_own(&dict_sys->mutex)); - if (table->can_be_evicted) { - dict_table_move_from_lru_to_non_lru(table); - } -} - /********************************************************************//** Check whether the table is corrupted. @return nonzero for corrupted table, zero for valid tables */ @@ -1358,8 +1181,8 @@ inline void dict_table_t::acquire() { - ut_ad(mutex_own(&dict_sys->mutex)); - my_atomic_add32_explicit(&n_ref_count, 1, MY_MEMORY_ORDER_RELAXED); + ut_ad(mutex_own(&dict_sys.mutex)); + n_ref_count++; } /** Release the table handle. @@ -1368,8 +1191,7 @@ inline bool dict_table_t::release() { - int32 n = my_atomic_add32_explicit( - &n_ref_count, -1, MY_MEMORY_ORDER_RELAXED); + auto n = n_ref_count--; ut_ad(n > 0); return n == 1; } diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h index aa3de6d0b17..afc017fd9d1 100644 --- a/storage/innobase/include/dict0load.h +++ b/storage/innobase/include/dict0load.h @@ -82,7 +82,7 @@ dict_get_first_table_name_in_db( /** Make sure the data_file_name is saved in dict_table_t if needed. Try to read it from the fil_system first, then from SYS_DATAFILES. 
@param[in] table Table object -@param[in] dict_mutex_own true if dict_sys->mutex is owned already */ +@param[in] dict_mutex_own true if dict_sys.mutex is owned already */ void dict_get_and_save_data_dir_path( dict_table_t* table, diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 2504b2ef61d..259da23fcd9 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -44,9 +44,9 @@ Created 1/8/1996 Heikki Tuuri #include "fts0fts.h" #include "buf0buf.h" #include "gis0type.h" -#include "os0once.h" #include "fil0fil.h" #include "fil0crypt.h" +#include <sql_const.h> #include <set> #include <algorithm> #include <iterator> @@ -573,6 +573,10 @@ struct dict_col_t{ this column. Our current max limit is 3072 (REC_VERSION_56_MAX_INDEX_COL_LEN) bytes. */ +private: + /** Special value of ind for a dropped column */ + static const unsigned DROPPED = 1023; +public: /** Detach a virtual column from an index. @param index being-freed index */ @@ -588,7 +592,7 @@ struct dict_col_t{ } def_val; /** Retrieve the column name. 
- @param[in] table the table of this column */ + @param table the table of this column */ const char *name(const dict_table_t &table) const; /** @return whether this is a virtual column */ @@ -603,7 +607,8 @@ struct dict_col_t{ ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY); return mtype == DATA_INT; } - /** @return whether this is system versioned */ + /** @return whether this user column (not row_start, row_end) + has System Versioning property */ bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } /** @return whether this is the system version start */ bool vers_sys_start() const @@ -617,29 +622,119 @@ struct dict_col_t{ } /** @return whether this is an instantly-added column */ - bool is_instant() const + bool is_added() const { DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data); return def_val.len != UNIV_SQL_DEFAULT; } + /** Flag the column instantly dropped */ + void set_dropped() { ind = DROPPED; } + /** Flag the column instantly dropped. + @param not_null whether the column was NOT NULL + @param len2 whether the length exceeds 255 bytes + @param fixed_len the fixed length in bytes, or 0 */ + void set_dropped(bool not_null, bool len2, unsigned fixed) + { + DBUG_ASSERT(!len2 || !fixed); + prtype= not_null ? DATA_NOT_NULL | DATA_BINARY_TYPE : DATA_BINARY_TYPE; + if (fixed) + { + mtype= DATA_FIXBINARY; + len= fixed; + } + else + { + mtype= DATA_BINARY; + len= len2 ? 65535 : 255; + } + mbminlen= mbmaxlen= 0; + ind= DROPPED; + ord_part= 0; + max_prefix= 0; + } + /** @return whether the column was instantly dropped */ + bool is_dropped() const { return ind == DROPPED; } + /** @return whether the column was instantly dropped + @param index the clustered index */ + inline bool is_dropped(const dict_index_t &index) const; + /** Get the default value of an instantly-added column. 
@param[out] len value length (in bytes), or UNIV_SQL_NULL @return default value @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */ const byte *instant_value(ulint *len) const { - DBUG_ASSERT(is_instant()); + DBUG_ASSERT(is_added()); *len= def_val.len; return static_cast<const byte*>(def_val.data); } /** Remove the 'instant ADD' status of the column */ - void remove_instant() + void clear_instant() { - DBUG_ASSERT(is_instant()); def_val.len= UNIV_SQL_DEFAULT; def_val.data= NULL; } + + /** @return whether two columns have compatible data type encoding */ + bool same_type(const dict_col_t &other) const + { + if (mtype != other.mtype) + { + /* For latin1_swedish_ci, DATA_CHAR and DATA_VARCHAR + will be used instead of DATA_MYSQL and DATA_VARMYSQL. + As long as mtype,prtype are being written to InnoDB + data dictionary tables, we cannot simplify this. */ + switch (mtype) { + default: + return false; + case DATA_VARCHAR: + if (other.mtype != DATA_VARMYSQL) + return false; + goto check_encoding; + case DATA_VARMYSQL: + if (other.mtype != DATA_VARCHAR) + return false; + goto check_encoding; + case DATA_CHAR: + if (other.mtype != DATA_MYSQL) + return false; + goto check_encoding; + case DATA_MYSQL: + if (other.mtype != DATA_CHAR) + return false; + goto check_encoding; + } + } + else if (dtype_is_string_type(mtype)) + { + check_encoding: + const uint16_t cset= dtype_get_charset_coll(prtype); + const uint16_t ocset= dtype_get_charset_coll(other.prtype); + return cset == ocset || dict_col_t::same_encoding(cset, ocset); + } + + return true; + } + + /** @return whether two collations codes have the same character encoding */ + static bool same_encoding(uint16_t a, uint16_t b); + + /** Determine if the columns have the same format + except for is_nullable() and is_versioned(). 
+ @param other column to compare to + @return whether the columns have the same format */ + bool same_format(const dict_col_t &other) const + { + return same_type(other) && len >= other.len && + mbminlen == other.mbminlen && mbmaxlen == other.mbmaxlen && + !((prtype ^ other.prtype) & ~(DATA_NOT_NULL | DATA_VERSIONED | + CHAR_COLL_MASK << 16 | + DATA_LONG_TRUE_VARCHAR)); + } + + /** @return whether the column values are comparable by memcmp() */ + bool is_binary() const { return prtype & DATA_BINARY_TYPE; } }; /** Index information put in a list of virtual column structure. Index @@ -656,9 +751,6 @@ struct dict_v_idx_t { : index(index), nth_field(nth_field) {} }; -/** Index list to put in dict_v_col_t */ -typedef std::list<dict_v_idx_t, ut_allocator<dict_v_idx_t> > dict_v_idx_list; - /** Data structure for a virtual column in a table */ struct dict_v_col_t{ /** column structure */ @@ -668,15 +760,42 @@ struct dict_v_col_t{ dict_col_t** base_col; /** number of base column */ - ulint num_base; + unsigned num_base:10; /** column pos in table */ - ulint v_pos; + unsigned v_pos:10; - /** Virtual index list, and column position in the index, - the allocated memory is not from table->heap */ - dict_v_idx_list* v_indexes; + /** number of indexes */ + unsigned n_v_indexes:12; + /** Virtual index list, and column position in the index */ + std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> > + v_indexes; + + /** Detach the column from an index. 
+ @param index index to be detached from */ + void detach(const dict_index_t &index) + { + if (!n_v_indexes) return; + auto i= v_indexes.before_begin(); + ut_d(unsigned n= 0); + do { + auto prev = i++; + if (i == v_indexes.end()) + { + ut_ad(n == n_v_indexes); + return; + } + ut_ad(++n <= n_v_indexes); + if (i->index == &index) + { + v_indexes.erase_after(prev); + n_v_indexes--; + return; + } + } + while (i != v_indexes.end()); + } }; /** Data structure for newly added virtual column in a table */ @@ -704,7 +823,8 @@ struct dict_s_col_t { }; /** list to put stored column for create_table_info_t */ -typedef std::list<dict_s_col_t, ut_allocator<dict_s_col_t> > dict_s_col_list; +typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> > +dict_s_col_list; /** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum indexed column length (or indexed prefix length) in @@ -812,17 +932,15 @@ extern ulong zip_pad_max; an uncompressed page should be left as padding to avoid compression failures. This estimate is based on a self-adapting heuristic. */ struct zip_pad_info_t { - SysMutex* mutex; /*!< mutex protecting the info */ - ulint pad; /*!< number of bytes used as pad */ + SysMutex mutex; /*!< mutex protecting the info */ + Atomic_relaxed<ulint> + pad; /*!< number of bytes used as pad */ ulint success;/*!< successful compression ops during current round */ ulint failure;/*!< failed compression ops during current round */ ulint n_rounds;/*!< number of currently successful rounds */ - volatile os_once::state_t - mutex_created; - /*!< Creation state of mutex member */ }; /** Number of samples of data size kept when page compression fails for @@ -835,7 +953,7 @@ const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX"; /** Data structure for an index. Most fields will be initialized to 0, NULL or FALSE in dict_mem_index_create(). 
*/ -struct dict_index_t{ +struct dict_index_t { index_id_t id; /*!< id of the index */ mem_heap_t* heap; /*!< memory heap */ id_name_t name; /*!< index name */ @@ -897,13 +1015,13 @@ struct dict_index_t{ dictionary cache */ unsigned to_be_dropped:1; /*!< TRUE if the index is to be dropped; - protected by dict_operation_lock */ + protected by dict_sys.latch */ unsigned online_status:2; /*!< enum online_index_status. Transitions from ONLINE_INDEX_COMPLETE (to ONLINE_INDEX_CREATION) are protected - by dict_operation_lock and - dict_sys->mutex. Other changes are + by dict_sys.latch and + dict_sys.mutex. Other changes are protected by index->lock. */ unsigned uncommitted:1; /*!< a flag that is set for secondary indexes @@ -913,6 +1031,8 @@ struct dict_index_t{ #ifdef UNIV_DEBUG /** whether this is a dummy index object */ bool is_dummy; + /** whether btr_cur_instant_init() is in progress */ + bool in_instant_init; uint32_t magic_n;/*!< magic number */ /** Value of dict_index_t::magic_n */ # define DICT_INDEX_MAGIC_N 76789786 @@ -987,8 +1107,14 @@ struct dict_index_t{ /* in which slot the next sample should be saved. 
*/ /* @} */ - /** R-tree split sequence number */ - volatile int32 rtr_ssn; +private: + /** R-tree split sequence number */ + Atomic_relaxed<node_seq_t> rtr_ssn; +public: + void set_ssn(node_seq_t ssn) { rtr_ssn= ssn; } + node_seq_t assign_ssn() { return rtr_ssn.fetch_add(1) + 1; } + node_seq_t ssn() const { return rtr_ssn; } + rtr_info_track_t* rtr_track;/*!< tracking all R-Tree search cursors */ trx_id_t trx_id; /*!< id of the transaction that created this @@ -1028,7 +1154,7 @@ struct dict_index_t{ page cannot be read or decrypted */ inline bool is_readable() const; - /** @return whether instant ADD COLUMN is in effect */ + /** @return whether instant ALTER TABLE is in effect */ inline bool is_instant() const; /** @return whether the index is the primary key index @@ -1038,12 +1164,38 @@ struct dict_index_t{ return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF)); } + /** @return whether this is a generated clustered index */ + bool is_gen_clust() const { return type == DICT_CLUSTERED; } + + /** @return whether this is a clustered index */ + bool is_clust() const { return type & DICT_CLUSTERED; } + + /** @return whether this is a unique index */ + bool is_unique() const { return type & DICT_UNIQUE; } + /** @return whether this is a spatial index */ bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); } + /** @return whether this is the change buffer */ + bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); } + /** @return whether the index includes virtual columns */ bool has_virtual() const { return type & DICT_VIRTUAL; } + /** @return the position of DB_TRX_ID */ + unsigned db_trx_id() const { + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(n_uniq); + DBUG_ASSERT(n_uniq <= MAX_REF_PARTS); + return n_uniq; + } + /** @return the position of DB_ROLL_PTR */ + unsigned db_roll_ptr() const { return db_trx_id() + 1; } + + /** @return the offset of the metadata BLOB field, + or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR 
*/ + unsigned first_user_field() const { return db_trx_id() + 2; } + /** @return whether the index is corrupted */ inline bool is_corrupted() const; @@ -1051,7 +1203,7 @@ struct dict_index_t{ @param whether to reset fields[].col */ void detach_columns(bool clear= false) { - if (!has_virtual()) + if (!has_virtual() || !cached) return; for (unsigned i= 0; i < n_fields; i++) { @@ -1060,7 +1212,7 @@ struct dict_index_t{ continue; col->detach(*this); if (clear) - fields[i].col= NULL; + fields[i].col= nullptr; } } @@ -1094,24 +1246,20 @@ struct dict_index_t{ return fields[n].col->instant_value(len); } - /** Adjust clustered index metadata for instant ADD COLUMN. - @param[in] clustered index definition after instant ADD COLUMN */ - void instant_add_field(const dict_index_t& instant); - - /** Remove the 'instant ADD' status of a clustered index. - Protected by index root page x-latch or table X-lock. */ - void remove_instant() - { - DBUG_ASSERT(is_primary()); - if (!is_instant()) { - return; - } - for (unsigned i = n_core_fields; i < n_fields; i++) { - fields[i].col->remove_instant(); - } - n_core_fields = n_fields; - n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(n_nullable)); - } + /** Adjust index metadata for instant ADD/DROP/reorder COLUMN. + @param[in] clustered index definition after instant ALTER TABLE */ + inline void instant_add_field(const dict_index_t& instant); + /** Remove instant ADD COLUMN metadata. */ + inline void clear_instant_add(); + /** Remove instant ALTER TABLE metadata. */ + inline void clear_instant_alter(); + + /** Construct the metadata record for instant ALTER TABLE. + @param[in] row dummy or default values for existing columns + @param[in,out] heap memory heap for allocations + @return metadata record */ + inline dtuple_t* + instant_metadata(const dtuple_t& row, mem_heap_t* heap) const; /** Check if record in clustered index is historical row. 
@param[in] rec clustered row @@ -1127,6 +1275,16 @@ struct dict_index_t{ bool vers_history_row(const rec_t* rec, bool &history_row); + /** Reconstruct the clustered index fields. */ + inline void reconstruct_fields(); + + /** Check if the index contains a column or a prefix of that column. + @param[in] n column number + @param[in] is_virtual whether it is a virtual col + @return whether the index contains the column or its prefix */ + bool contains_col_or_prefix(ulint n, bool is_virtual) const + MY_ATTRIBUTE((warn_unused_result)); + #ifdef BTR_CUR_HASH_ADAPT /** @return a clone of this */ dict_index_t* clone() const; @@ -1206,20 +1364,8 @@ struct dict_index_t{ @param index being-freed index */ inline void dict_col_t::detach(const dict_index_t &index) { - ut_ad(is_virtual()); - - if (dict_v_idx_list *v_indexes= reinterpret_cast<const dict_v_col_t*>(this) - ->v_indexes) - { - for (dict_v_idx_list::iterator i= v_indexes->begin(); - i != v_indexes->end(); i++) - { - if (i->index == &index) { - v_indexes->erase(i); - return; - } - } - } + if (is_virtual()) + reinterpret_cast<dict_v_col_t*>(this)->detach(index); } /** The status of online index creation */ @@ -1534,6 +1680,64 @@ struct dict_vcol_templ_t { dict_vcol_templ_t() : vtempl(0), mysql_table_query_id(~0ULL) {} }; +/** Metadata on clustered index fields starting from first_user_field() */ +class field_map_element_t +{ + /** Number of bits for representing a column number */ + static constexpr uint16_t IND_BITS = 10; + + /** Set if the column of the field has been instantly dropped */ + static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5); + + /** Set if the column was dropped and originally declared NOT NULL */ + static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4); + + /** Column index (if !(data & DROPPED)): table->cols[data & IND], + or field length (if (data & DROPPED)): + (data & IND) = 0 if variable-length with max_len < 256 bytes; + (data & IND) = 1 if variable-length with max_len > 255 bytes; + 
(data & IND) = 1 + L otherwise, with L=fixed length of the column */ + static constexpr uint16_t IND = (1U << IND_BITS) - 1; + + /** Field metadata */ + uint16_t data; + + void clear_not_null() { data &= ~NOT_NULL; } +public: + bool is_dropped() const { return data & DROPPED; } + void set_dropped() { data |= DROPPED; } + bool is_not_null() const { return data & NOT_NULL; } + void set_not_null() { ut_ad(is_dropped()); data |= NOT_NULL; } + uint16_t ind() const { return data & IND; } + void set_ind(uint16_t i) + { + DBUG_ASSERT(i <= IND); + DBUG_ASSERT(!ind()); + data |= i; + } + field_map_element_t& operator= (uint16_t value) + { + data = value; + return *this; + } + operator uint16_t() { return data; } +}; + +static_assert(sizeof(field_map_element_t) == 2, + "Size mismatch for a persistent data item!"); + +/** Instantly dropped or reordered columns */ +struct dict_instant_t +{ + /** Number of dropped columns */ + unsigned n_dropped; + /** Dropped columns */ + dict_col_t* dropped; + /** Map of clustered index non-PK fields[i - first_user_field()] + to table columns */ + field_map_element_t* field_map; +}; + /** These are used when MySQL FRM and InnoDB data dictionary are in inconsistent state. */ typedef enum { @@ -1552,11 +1756,7 @@ struct dict_table_t { /** Get reference count. @return current value of n_ref_count */ - inline int32 get_ref_count() - { - return my_atomic_load32_explicit(&n_ref_count, - MY_MEMORY_ORDER_RELAXED); - } + inline uint32_t get_ref_count() const { return n_ref_count; } /** Acquire the table handle. 
*/ inline void acquire(); @@ -1576,6 +1776,9 @@ struct dict_table_t { return flags2 & DICT_TF2_TEMPORARY; } + /** @return whether the table is not in ROW_FORMAT=REDUNDANT */ + bool not_redundant() const { return flags & DICT_TF_COMPACT; } + /** @return whether this table is readable @retval true normally @retval false if this is a single-table tablespace @@ -1594,35 +1797,92 @@ struct dict_table_t { return strstr(name, "/" TEMP_FILE_PREFIX) != NULL; } - /** @return whether instant ADD COLUMN is in effect */ + /** @return whether instant ALTER TABLE is in effect */ bool is_instant() const { return(UT_LIST_GET_FIRST(indexes)->is_instant()); } - /** @return whether the table supports instant ADD COLUMN */ + /** @return whether the table supports instant ALTER TABLE */ bool supports_instant() const { return(!(flags & DICT_TF_MASK_ZIP_SSIZE)); } - /** Adjust metadata for instant ADD COLUMN. - @param[in] table table definition after instant ADD COLUMN */ - void instant_add_column(const dict_table_t& table); + /** @return the number of instantly dropped columns */ + unsigned n_dropped() const { return instant ? instant->n_dropped : 0; } + + /** Look up an old column. + @param[in] cols the old columns of the table + @param[in] col_map map from old table columns to altered ones + @param[in] n_cols number of old columns + @param[in] i the number of the new column + @return old column + @retval NULL if column i was added to the table */ + static const dict_col_t* find(const dict_col_t* cols, + const ulint* col_map, ulint n_cols, + ulint i) + { + for (ulint o = n_cols; o--; ) { + if (col_map[o] == i) { + return &cols[o]; + } + } + return NULL; + } - /** Roll back instant_add_column(). - @param[in] old_n_cols original n_cols - @param[in] old_cols original cols - @param[in] old_col_names original col_names */ - void rollback_instant( + /** Serialise metadata of dropped or reordered columns. 
+ @param[in,out] heap memory heap for allocation + @param[out] field data field with the metadata */ + inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const; + + /** Reconstruct dropped or reordered columns. + @param[in] metadata data from serialise_columns() + @param[in] len length of the metadata, in bytes + @return whether parsing the metadata failed */ + bool deserialise_columns(const byte* metadata, ulint len); + + /** Set is_instant() before instant_column(). + @param[in] old previous table definition + @param[in] col_map map from old.cols[] + and old.v_cols[] to this + @param[out] first_alter_pos 0, or + 1 + first changed column position */ + inline void prepare_instant(const dict_table_t& old, + const ulint* col_map, + unsigned& first_alter_pos); + + /** Adjust table metadata for instant ADD/DROP/reorder COLUMN. + @param[in] table table on which prepare_instant() was invoked + @param[in] col_map mapping from cols[] and v_cols[] to table + @return whether the metadata record must be updated */ + inline bool instant_column(const dict_table_t& table, + const ulint* col_map); + + /** Roll back instant_column(). + @param[in] old_n_cols original n_cols + @param[in] old_cols original cols + @param[in] old_col_names original col_names + @param[in] old_instant original instant structure + @param[in] old_fields original fields + @param[in] old_n_fields original number of fields + @param[in] old_n_core_fields original number of core fields + @param[in] old_n_v_cols original n_v_cols + @param[in] old_v_cols original v_cols + @param[in] old_v_col_names original v_col_names + @param[in] col_map column map */ + inline void rollback_instant( unsigned old_n_cols, dict_col_t* old_cols, - const char* old_col_names); - - /** Trim the instantly added columns when an insert into SYS_COLUMNS - is rolled back during ALTER TABLE or recovery. 
- @param[in] n number of surviving non-system columns */ - void rollback_instant(unsigned n); + const char* old_col_names, + dict_instant_t* old_instant, + dict_field_t* old_fields, + unsigned old_n_fields, + unsigned old_n_core_fields, + unsigned old_n_v_cols, + dict_v_col_t* old_v_cols, + const char* old_v_col_names, + const ulint* col_map); /** Add the table definition to the data dictionary cache */ void add_to_cache(); @@ -1640,23 +1900,28 @@ struct dict_table_t { void inc_fk_checks() { #ifdef UNIV_DEBUG - lint fk_checks= (lint) + int32_t fk_checks= #endif - my_atomic_addlint(&n_foreign_key_checks_running, 1); + n_foreign_key_checks_running++; ut_ad(fk_checks >= 0); } void dec_fk_checks() { #ifdef UNIV_DEBUG - lint fk_checks= (lint) + int32_t fk_checks= #endif - my_atomic_addlint(&n_foreign_key_checks_running, ulint(-1)); + n_foreign_key_checks_running--; ut_ad(fk_checks > 0); } /** For overflow fields returns potential max length stored inline */ - size_t get_overflow_field_local_len() const; + inline size_t get_overflow_field_local_len() const; +private: + /** Initialize instant->field_map. + @param[in] table table definition to copy from */ + inline void init_instant(const dict_table_t& table); +public: /** Id of the table. */ table_id_t id; /** Hash chain node. */ @@ -1715,8 +1980,7 @@ struct dict_table_t { /** TRUE if the table is to be dropped, but not yet actually dropped (could in the background drop list). It is turned on at the beginning of row_drop_table_for_mysql() and turned off just before we start to - update system tables for the drop. It is protected by - dict_operation_lock. */ + update system tables for the drop. It is protected by dict_sys.latch. */ unsigned to_be_dropped:1; /** Number of non-virtual columns defined so far. 
*/ @@ -1766,6 +2030,9 @@ struct dict_table_t { reason s_cols is a part of dict_table_t */ dict_s_col_list* s_cols; + /** Instantly dropped or reordered columns, or NULL if none */ + dict_instant_t* instant; + /** Column names packed in a character string "name1\0name2\0...nameN\0". Until the string contains n_cols, it will be allocated from a temporary heap. The final string will be allocated @@ -1815,7 +2082,7 @@ struct dict_table_t { /** Count of how many foreign key check operations are currently being performed on the table. We cannot drop the table while there are foreign key checks running on it. */ - ulint n_foreign_key_checks_running; + Atomic_counter<int32_t> n_foreign_key_checks_running; /** Transactions whose view low limit is greater than this number are not allowed to store to the MySQL query cache or retrieve from it. @@ -1837,9 +2104,6 @@ struct dict_table_t { /** Statistics for query optimization. @{ */ - /** Creation state of 'stats_latch'. */ - volatile os_once::state_t stats_latch_created; - /** This latch protects: dict_table_t::stat_initialized, dict_table_t::stat_n_rows (*), @@ -1851,7 +2115,7 @@ struct dict_table_t { dict_table_t::indexes*::stat_n_leaf_pages. (*) Those are not always protected for performance reasons. */ - rw_lock_t* stats_latch; + rw_lock_t stats_latch; /** TRUE if statistics have been calculated the first time after database startup or table creation. */ @@ -1933,7 +2197,7 @@ struct dict_table_t { /** The state of the background stats thread wrt this table. See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT. - Writes are covered by dict_sys->mutex. Dirty reads are possible. */ + Writes are covered by dict_sys.mutex. Dirty reads are possible. */ #define BG_SCRUB_IN_PROGRESS ((byte)(1 << 2)) /*!< BG_SCRUB_IN_PROGRESS is set in @@ -1949,7 +2213,7 @@ struct dict_table_t { /** The state of the background stats thread wrt this table. See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT. 
- Writes are covered by dict_sys->mutex. Dirty reads are possible. */ + Writes are covered by dict_sys.mutex. Dirty reads are possible. */ byte stats_bg_flag; bool stats_error_printed; @@ -1975,11 +2239,8 @@ struct dict_table_t { from a select. */ lock_t* autoinc_lock; - /** Creation state of autoinc_mutex member */ - volatile os_once::state_t autoinc_mutex_created; - /** Mutex protecting the autoincrement counter. */ - ib_mutex_t* autoinc_mutex; + ib_mutex_t autoinc_mutex; /** Autoinc counter value to give to the next inserted row. */ ib_uint64_t autoinc; @@ -2015,7 +2276,7 @@ private: /** Count of how many handles are opened to this table. Dropping of the table is NOT allowed until this count gets to zero. MySQL does NOT itself check the number of open handles at DROP. */ - int32 n_ref_count; + Atomic_counter<uint32_t> n_ref_count; public: /** List of locks on the table. Protected by lock_sys.mutex. */ @@ -2051,12 +2312,15 @@ inline bool dict_index_t::is_readable() const { return table->is_readable(); } inline bool dict_index_t::is_instant() const { ut_ad(n_core_fields > 0); - ut_ad(n_core_fields <= n_fields); + ut_ad(n_core_fields <= n_fields || table->n_dropped()); ut_ad(n_core_fields == n_fields || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED); ut_ad(n_core_fields == n_fields || table->supports_instant()); ut_ad(n_core_fields == n_fields || !table->is_temporary()); - return(n_core_fields != n_fields); + ut_ad(!table->instant || !table->is_temporary()); + + return n_core_fields != n_fields + || (is_primary() && table->instant); } inline bool dict_index_t::is_corrupted() const @@ -2066,6 +2330,81 @@ inline bool dict_index_t::is_corrupted() const || (table && table->corrupted)); } +inline void dict_index_t::clear_instant_add() +{ + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(is_instant()); + DBUG_ASSERT(!table->instant); + for (unsigned i = n_core_fields; i < n_fields; i++) { + fields[i].col->clear_instant(); + } + n_core_fields = n_fields; + 
n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(n_nullable)); +} + +inline void dict_index_t::clear_instant_alter() +{ + DBUG_ASSERT(is_primary()); + DBUG_ASSERT(n_fields == n_def); + + if (!table->instant) { + if (is_instant()) { + clear_instant_add(); + } + return; + } + +#ifndef DBUG_OFF + for (unsigned i = first_user_field(); i--; ) { + DBUG_ASSERT(!fields[i].col->is_dropped()); + DBUG_ASSERT(!fields[i].col->is_nullable()); + } +#endif + const dict_col_t* ai_col = table->persistent_autoinc + ? fields[table->persistent_autoinc - 1].col + : NULL; + dict_field_t* const begin = &fields[first_user_field()]; + dict_field_t* end = &fields[n_fields]; + + for (dict_field_t* d = begin; d < end; ) { + /* Move fields for dropped columns to the end. */ + if (!d->col->is_dropped()) { + d++; + } else { + if (d->col->is_nullable()) { + n_nullable--; + } + + std::swap(*d, *--end); + } + } + + DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end); + n_core_fields = n_fields = n_def = end - fields; + n_core_null_bytes = UT_BITS_IN_BYTES(n_nullable); + std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b) + { return a.col->ind < b.col->ind; }); + table->instant = NULL; + if (ai_col) { + auto a = std::find_if(begin, end, + [ai_col](const dict_field_t& f) + { return f.col == ai_col; }); + table->persistent_autoinc = (a == end) ? 0 : 1 + (a - fields); + } +} + +/** @return whether the column was instantly dropped +@param[in] index the clustered index */ +inline bool dict_col_t::is_dropped(const dict_index_t& index) const +{ + DBUG_ASSERT(index.is_primary()); + DBUG_ASSERT(!is_dropped() == !index.table->instant); + DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped + && this < index.table->instant->dropped + + index.table->instant->n_dropped)); + return is_dropped(); +} + /*******************************************************************//** Initialise the table lock list. 
*/ void @@ -2086,87 +2425,6 @@ struct dict_foreign_add_to_referenced_table { } }; -/** Destroy the autoinc latch of the given table. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose stats latch to destroy */ -inline -void -dict_table_autoinc_destroy( - dict_table_t* table) -{ - if (table->autoinc_mutex_created == os_once::DONE - && table->autoinc_mutex != NULL) { - mutex_free(table->autoinc_mutex); - UT_DELETE(table->autoinc_mutex); - } -} - -/** Request for lazy creation of the autoinc latch of a given table. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] table table whose autoinc latch is to be created. */ -inline -void -dict_table_autoinc_create_lazy( - dict_table_t* table) -{ - table->autoinc_mutex = NULL; - table->autoinc_mutex_created = os_once::NEVER_DONE; -} - -/** Request a lazy creation of dict_index_t::zip_pad::mutex. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. -@param[in,out] index index whose zip_pad mutex is to be created */ -inline -void -dict_index_zip_pad_mutex_create_lazy( - dict_index_t* index) -{ - index->zip_pad.mutex = NULL; - index->zip_pad.mutex_created = os_once::NEVER_DONE; -} - -/** Destroy the zip_pad_mutex of the given index. -This function is only called from either single threaded environment -or from a thread that has not shared the table object with other threads. 
-@param[in,out] table table whose stats latch to destroy */ -inline -void -dict_index_zip_pad_mutex_destroy( - dict_index_t* index) -{ - if (index->zip_pad.mutex_created == os_once::DONE - && index->zip_pad.mutex != NULL) { - mutex_free(index->zip_pad.mutex); - UT_DELETE(index->zip_pad.mutex); - } -} - -/** Release the zip_pad_mutex of a given index. -@param[in,out] index index whose zip_pad_mutex is to be released */ -inline -void -dict_index_zip_pad_unlock( - dict_index_t* index) -{ - mutex_exit(index->zip_pad.mutex); -} - -#ifdef UNIV_DEBUG -/** Check if the current thread owns the autoinc_mutex of a given table. -@param[in] table the autoinc_mutex belongs to this table -@return true, if the current thread owns the autoinc_mutex, false otherwise.*/ -inline -bool -dict_table_autoinc_own( - const dict_table_t* table) -{ - return(mutex_own(table->autoinc_mutex)); -} -#endif /* UNIV_DEBUG */ - /** Check whether the col is used in spatial index or regular index. @param[in] col column to check @return spatial status */ diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h index e56848d1954..3f2792054e0 100644 --- a/storage/innobase/include/dict0priv.h +++ b/storage/innobase/include/dict0priv.h @@ -45,18 +45,6 @@ dict_table_check_if_in_cache_low( /*=============================*/ const char* table_name); /*!< in: table name */ -/**********************************************************************//** -Returns a table object based on table id. 
-@return table, NULL if does not exist */ -UNIV_INLINE -dict_table_t* -dict_table_open_on_id_low( -/*=====================*/ - table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err, /*!< in: errors to ignore - when loading the table */ - ibool open_only_if_in_cache); - #include "dict0priv.ic" #endif /* dict0priv.h */ diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic index 7b584c7e1cb..ff645378175 100644 --- a/storage/innobase/include/dict0priv.ic +++ b/storage/innobase/include/dict0priv.ic @@ -25,7 +25,6 @@ Created Wed 13 Oct 2010 16:10:14 EST Sunny Bains #include "dict0dict.h" #include "dict0load.h" -#include "dict0priv.h" /**********************************************************************//** Gets a table; loads it to the dictionary cache if necessary. A low-level @@ -40,7 +39,7 @@ dict_table_get_low( dict_table_t* table; ut_ad(table_name); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); table = dict_table_check_if_in_cache_low(table_name); @@ -64,40 +63,6 @@ dict_table_get_low( } /**********************************************************************//** -Returns a table object based on table id. 
-@return table, NULL if does not exist */ -UNIV_INLINE -dict_table_t* -dict_table_open_on_id_low( -/*======================*/ - table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err, /*!< in: errors to ignore - when loading the table */ - ibool open_only_if_in_cache) -{ - dict_table_t* table; - ulint fold; - - ut_ad(mutex_own(&dict_sys->mutex)); - - /* Look for the table name in the hash table */ - fold = ut_fold_ull(table_id); - - HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, - dict_table_t*, table, ut_ad(table->cached), - table->id == table_id); - if (table == NULL && !open_only_if_in_cache) { - table = dict_load_table_on_id(table_id, ignore_err); - } - - ut_ad(!table || table->cached); - - /* TODO: should get the type information from MySQL */ - - return(table); -} - -/**********************************************************************//** Checks if a table is in the dictionary cache. @return table, NULL if not found */ UNIV_INLINE @@ -114,12 +79,12 @@ dict_table_check_if_in_cache_low( ("table: '%s'", table_name)); ut_ad(table_name); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); /* Look for the table name in the hash table */ table_fold = ut_fold_string(table_name); - HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, + HASH_SEARCH(name_hash, dict_sys.table_hash, table_fold, dict_table_t*, table, ut_ad(table->cached), !strcmp(table->name.m_name, table_name)); DBUG_RETURN(table); diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 00ac6eb4745..2e001cb56e9 100644 --- a/storage/innobase/include/dict0stats.h +++ b/storage/innobase/include/dict0stats.h @@ -187,6 +187,19 @@ dict_stats_rename_table( char* errstr, /*!< out: error string if != DB_SUCCESS is returned */ size_t errstr_sz); /*!< in: errstr size */ +/*********************************************************************//** +Renames an index in InnoDB persistent stats storage. 
+This function creates its own transaction and commits it. +@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned +if the persistent stats do not exist. */ +dberr_t +dict_stats_rename_index( +/*====================*/ + const dict_table_t* table, /*!< in: table whose index + is renamed */ + const char* old_index_name, /*!< in: old index name */ + const char* new_index_name) /*!< in: new index name */ + __attribute__((warn_unused_result)); /** Save an individual index's statistic into the persistent statistics storage. diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic index 98024935e16..b30dede54f1 100644 --- a/storage/innobase/include/dict0stats.ic +++ b/storage/innobase/include/dict0stats.ic @@ -75,7 +75,7 @@ dict_stats_is_persistent_enabled(const dict_table_t* table) + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has just been PS-enabled. This is acceptable. Avoiding this would mean that we would have to - protect the ::stat_persistent with dict_table_stats_lock() like the + protect the ::stat_persistent with dict_table_t::stats_latch like the other ::stat_ members which would be too big performance penalty, especially when this function is called from dict_stats_update_if_needed(). 
*/ @@ -148,7 +148,7 @@ dict_stats_init( /*============*/ dict_table_t* table) /*!< in/out: table */ { - ut_ad(!mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys.mutex)); if (table->stat_initialized) { return; @@ -174,14 +174,14 @@ dict_stats_deinit( /*==============*/ dict_table_t* table) /*!< in/out: table */ { - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); ut_a(table->get_ref_count() == 0); - dict_table_stats_lock(table, RW_X_LATCH); + rw_lock_x_lock(&table->stats_latch); if (!table->stat_initialized) { - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); return; } @@ -222,5 +222,5 @@ dict_stats_deinit( } #endif /* HAVE_valgrind_or_MSAN */ - dict_table_stats_unlock(table, RW_X_LATCH); + rw_lock_x_unlock(&table->stats_latch); } diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h index 66b98629033..526139643d1 100644 --- a/storage/innobase/include/dict0stats_bg.h +++ b/storage/innobase/include/dict0stats_bg.h @@ -72,7 +72,7 @@ dict_stats_stop_bg( dict_table_t* table) /*!< in/out: table */ { ut_ad(!srv_read_only_mode); - ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(mutex_own(&dict_sys.mutex)); if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) { return(true); @@ -90,7 +90,7 @@ and restore the lock before it exits. The background stats thread is guaranteed not to start using the specified table after this function returns and before the caller unlocks the data dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag -under dict_sys->mutex. */ +under dict_sys.mutex. 
*/ void dict_stats_wait_bg_to_stop_using_table( /*===================================*/ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index 1e16e501a48..d0da45ab218 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -30,7 +30,6 @@ Created 1/8/1996 Heikki Tuuri #include <ut0mutex.h> #include <rem0types.h> -struct dict_sys_t; struct dict_col_t; struct dict_field_t; struct dict_index_t; diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h index 870858b4ccd..fd0d3e12601 100644 --- a/storage/innobase/include/fil0crypt.h +++ b/storage/innobase/include/fil0crypt.h @@ -1,6 +1,6 @@ /***************************************************************************** Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. -Copyright (c) 2015, 2018, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,11 +26,9 @@ Created 04/01/2015 Jan Lindström #ifndef fil0crypt_h #define fil0crypt_h -#ifndef UNIV_INNOCHECKSUM #include "os0event.h" #include "my_crypt.h" #include "fil0fil.h" -#endif /*! UNIV_INNOCHECKSUM */ /** * Magic pattern in start of crypt data on page 0 @@ -281,13 +279,11 @@ fil_space_merge_crypt_data( const fil_space_crypt_t* src); /** Initialize encryption parameters from a tablespace header page. 
-@param[in] page_size page size of the tablespace +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] page first page of the tablespace @return crypt data from page 0 @retval NULL if not present or not valid */ -UNIV_INTERN -fil_space_crypt_t* -fil_space_read_crypt_data(const page_size_t& page_size, const byte* page) +fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** @@ -313,14 +309,16 @@ fil_parse_write_crypt_data( MY_ATTRIBUTE((warn_unused_result)); /** Encrypt a buffer. -@param[in,out] crypt_data Crypt data -@param[in] space space_id -@param[in] offset Page offset -@param[in] lsn Log sequence number -@param[in] src_frame Page to encrypt -@param[in] page_size Page size -@param[in,out] dst_frame Output buffer +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] dst_frame Output buffer +@param[in] use_full_checksum full crc32 algo is used @return encrypted buffer or NULL */ +UNIV_INTERN byte* fil_encrypt_buf( fil_space_crypt_t* crypt_data, @@ -328,8 +326,9 @@ fil_encrypt_buf( ulint offset, lsn_t lsn, const byte* src_frame, - const page_size_t& page_size, - byte* dst_frame) + ulint zip_size, + byte* dst_frame, + bool use_full_checksum) MY_ATTRIBUTE((warn_unused_result)); /** @@ -351,20 +350,24 @@ fil_space_encrypt( byte* dst_frame) MY_ATTRIBUTE((warn_unused_result)); -/** -Decrypt a page. -@param[in,out] crypt_data crypt_data + +/** Decrypt a page. 
+@param[in] space_id space id +@param[in] crypt_data crypt_data @param[in] tmp_frame Temporary buffer -@param[in] page_size Page size +@param[in] physical_size page size +@param[in] fsp_flags Tablespace flags @param[in,out] src_frame Page to decrypt -@param[out] err DB_SUCCESS or error +@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED @return true if page decrypted, false if not.*/ UNIV_INTERN bool fil_space_decrypt( + ulint space_id, fil_space_crypt_t* crypt_data, byte* tmp_frame, - const page_size_t& page_size, + ulint physical_size, + ulint fsp_flags, byte* src_frame, dberr_t* err); @@ -383,17 +386,14 @@ fil_space_decrypt( byte* src_frame) MY_ATTRIBUTE((warn_unused_result)); -/****************************************************************** +/** Calculate post encryption checksum -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] dst_frame Block where checksum is calculated -@return page checksum or BUF_NO_CHECKSUM_MAGIC +@return page checksum not needed. */ -UNIV_INTERN uint32_t -fil_crypt_calculate_checksum( - const page_size_t& page_size, - const byte* dst_frame) +fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame) MY_ATTRIBUTE((warn_unused_result)); /********************************************************************* @@ -491,10 +491,9 @@ calculated checksum as if it does page could be valid unencrypted, encrypted, or corrupted. 
@param[in,out] page page frame (checksum is temporarily modified) -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @return true if page is encrypted AND OK, false otherwise */ -bool -fil_space_verify_crypt_checksum(const byte* page, const page_size_t& page_size) +bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size) MY_ATTRIBUTE((warn_unused_result)); #endif /* fil0crypt_h */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 8682474824f..e78c9587325 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -27,12 +27,16 @@ Created 10/25/1995 Heikki Tuuri #ifndef fil0fil_h #define fil0fil_h +#include "fsp0types.h" + #ifndef UNIV_INNOCHECKSUM #include "log0recv.h" #include "dict0types.h" -#include "page0size.h" #include "ilist.h" +#ifdef UNIV_LINUX +# include <set> +#endif struct unflushed_spaces_tag_t; struct rotation_list_tag_t; @@ -41,8 +45,6 @@ struct rotation_list_tag_t; extern my_bool srv_use_doublewrite_buf; extern struct buf_dblwr_t* buf_dblwr; class page_id_t; -struct trx_t; -class truncate_t; /** Structure containing encryption specification */ struct fil_space_crypt_t; @@ -76,10 +78,17 @@ fil_type_is_data( struct fil_node_t; +#endif + /** Tablespace or log data space */ +#ifndef UNIV_INNOCHECKSUM struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, ilist_node<rotation_list_tag_t> +#else +struct fil_space_t +#endif { +#ifndef UNIV_INNOCHECKSUM ulint id; /*!< space id */ hash_node_t hash; /*!< hash chain node */ char* name; /*!< Tablespace name */ @@ -93,26 +102,21 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, /** Log sequence number of the latest MLOG_INDEX_LOAD record that was found while parsing the redo log */ lsn_t enable_lsn; + /** set when an .ibd file is about to be deleted, + or an undo tablespace is about to be truncated. 
+ When this is set following new ops are not allowed: + * read IO request + * ibuf merge + * file flush + Note that we can still possibly have new write operations + because we don't check this flag when doing flush batches. */ bool stop_new_ops; - /*!< we set this true when we start - deleting a single-table tablespace. - When this is set following new ops - are not allowed: - * read IO request - * ibuf merge - * file flush - Note that we can still possibly have - new write operations because we don't - check this flag when doing flush - batches. */ /** whether undo tablespace truncation is in progress */ bool is_being_truncated; #ifdef UNIV_DEBUG - ulint redo_skipped_count; - /*!< reference count for operations who want - to skip redo log in the file space in order - to make modify_check() pass. - Uses my_atomic_loadlint() and friends. */ + /** reference count for operations who want to skip redo log in the + file space in order to make modify_check() pass. */ + Atomic_counter<ulint> redo_skipped_count; #endif fil_type_t purpose;/*!< purpose */ UT_LIST_BASE_NODE_T(fil_node_t) chain; @@ -130,10 +134,6 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, /*!< recovered tablespace size in pages; 0 if no size change was read from the redo log, or if the size change was implemented */ - ulint flags; /*!< FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags; - see fsp0types.h, - fsp_flags_is_valid(), - page_size_t(ulint) (constructor) */ ulint n_reserved_extents; /*!< number of reserved free extents for ongoing operations like B-tree page split */ @@ -141,20 +141,20 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, the tablespace to disk; dropping of the tablespace is forbidden if this is positive */ /** Number of pending buffer pool operations accessing the tablespace - without holding a table lock or dict_operation_lock S-latch + without holding a table lock or dict_sys.latch S-latch that would prevent the table (and tablespace) from being dropped. 
An example is change buffer merge. The tablespace cannot be dropped while this is nonzero, or while fil_node_t::n_pending is nonzero. - Protected by fil_system.mutex and my_atomic_loadlint() and friends. */ - ulint n_pending_ops; + Protected by fil_system.mutex and std::atomic. */ + std::atomic<ulint> n_pending_ops; /** Number of pending block read or write operations (when a write is imminent or a read has recently completed). The tablespace object cannot be freed while this is nonzero, but it can be detached from fil_system. Note that fil_node_t::n_pending tracks actual pending I/O requests. - Protected by fil_system.mutex and my_atomic_loadlint() and friends. */ - ulint n_pending_ios; + Protected by fil_system.mutex and std::atomic. */ + std::atomic<ulint> n_pending_ios; rw_lock_t latch; /*!< latch protecting the file space storage allocation */ UT_LIST_NODE_T(fil_space_t) named_spaces; @@ -248,7 +248,10 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, /** Note that the tablespace has been imported. Initially, purpose=FIL_TYPE_IMPORT so that no redo log is written while the space ID is being updated in each page. */ - void set_imported(); + inline void set_imported(); + + /** @return whether the storage device is rotational (HDD, not SSD) */ + inline bool is_rotational() const; /** Open each file. Only invoked on fil_system.temp_space. @return whether all files were opened */ @@ -257,38 +260,290 @@ struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, void close(); /** Acquire a tablespace reference. */ - void acquire() { my_atomic_addlint(&n_pending_ops, 1); } + void acquire() { n_pending_ops++; } /** Release a tablespace reference. */ - void release() + void release() { ut_ad(referenced()); n_pending_ops--; } + /** @return whether references are being held */ + bool referenced() const { return n_pending_ops; } + + /** Acquire a tablespace reference for I/O. 
*/ + void acquire_for_io() { n_pending_ios++; } + /** Release a tablespace reference for I/O. */ + void release_for_io() { ut_ad(pending_io()); n_pending_ios--; } + /** @return whether I/O is pending */ + bool pending_io() const { return n_pending_ios; } +#endif /* !UNIV_INNOCHECKSUM */ + /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags; + see fsp0types.h for more info about flags. */ + ulint flags; + + /** Determine if full_crc32 is used for a data file + @param[in] flags tablespace flags (FSP_FLAGS) + @return whether the full_crc32 algorithm is active */ + static bool full_crc32(ulint flags) { + return flags & FSP_FLAGS_FCRC32_MASK_MARKER; + } + /** @return whether innodb_checksum_algorithm=full_crc32 is active */ + bool full_crc32() const { return full_crc32(flags); } + /** Determine the logical page size. + @param flags tablespace flags (FSP_FLAGS) + @return the logical page size + @retval 0 if the flags are invalid */ + static unsigned logical_size(ulint flags) { + + ulint page_ssize = 0; + + if (full_crc32(flags)) { + page_ssize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags); + } else { + page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + } + + switch (page_ssize) { + case 3: return 4096; + case 4: return 8192; + case 5: + { ut_ad(full_crc32(flags)); return 16384; } + case 0: + { ut_ad(!full_crc32(flags)); return 16384; } + case 6: return 32768; + case 7: return 65536; + default: return 0; + } + } + /** Determine the ROW_FORMAT=COMPRESSED page size. + @param flags tablespace flags (FSP_FLAGS) + @return the ROW_FORMAT=COMPRESSED page size + @retval 0 if ROW_FORMAT=COMPRESSED is not used */ + static unsigned zip_size(ulint flags) { + + if (full_crc32(flags)) { + return 0; + } + + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + return zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize : 0; + } + /** Determine the physical page size. 
+ @param flags tablespace flags (FSP_FLAGS) + @return the physical page size */ + static unsigned physical_size(ulint flags) { + + if (full_crc32(flags)) { + return logical_size(flags); + } + + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + return zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize + : unsigned(srv_page_size); + } + /** @return the ROW_FORMAT=COMPRESSED page size + @retval 0 if ROW_FORMAT=COMPRESSED is not used */ + unsigned zip_size() const { return zip_size(flags); } + /** @return the physical page size */ + unsigned physical_size() const { return physical_size(flags); } + /** Check whether the compression enabled in tablespace. + @param[in] flags tablespace flags */ + static bool is_compressed(ulint flags) { + + if (full_crc32(flags)) { + ulint algo = FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO( + flags); + DBUG_ASSERT(algo <= PAGE_ALGORITHM_LAST); + return algo > 0; + } + + return FSP_FLAGS_HAS_PAGE_COMPRESSION(flags); + } + /** @return whether the compression enabled for the tablespace. */ + bool is_compressed() const { return is_compressed(flags); } + + /** Get the compression algorithm for full crc32 format. + @param[in] flags tablespace flags + @return algorithm type of tablespace */ + static ulint get_compression_algo(ulint flags) { - ut_ad(referenced()); - my_atomic_addlint(&n_pending_ops, ulint(-1)); + return full_crc32(flags) + ? 
FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) + : 0; } - /** @return whether references are being held */ - bool referenced() { return my_atomic_loadlint(&n_pending_ops); } - /** @return whether references are being held */ - bool referenced() const + /** @return the page_compressed algorithm + @retval 0 if not page_compressed */ + ulint get_compression_algo() const { + return fil_space_t::get_compression_algo(flags); + } + /** Determine if the page_compressed page contains an extra byte + for exact compressed stream length + @param[in] flags tablespace flags + @return whether the extra byte is needed */ + static bool full_crc32_page_compressed_len(ulint flags) { - return const_cast<fil_space_t*>(this)->referenced(); + DBUG_ASSERT(full_crc32(flags)); + switch (get_compression_algo(flags)) { + case PAGE_LZ4_ALGORITHM: + case PAGE_LZO_ALGORITHM: + case PAGE_SNAPPY_ALGORITHM: + return true; + } + return false; } - /** Acquire a tablespace reference for I/O. */ - void acquire_for_io() { my_atomic_addlint(&n_pending_ios, 1); } - /** Release a tablespace reference for I/O. */ - void release_for_io() + /** Whether the full checksum matches with non full checksum flags. + @param[in] flags flags present + @param[in] expected expected flags + @return true if it is equivalent */ + static bool is_flags_full_crc32_equal(ulint flags, ulint expected) { - ut_ad(pending_io()); - my_atomic_addlint(&n_pending_ios, ulint(-1)); + ut_ad(full_crc32(flags)); + ulint page_ssize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags); + + if (full_crc32(expected)) { + /* The data file may have been created with a + different innodb_compression_algorithm. But + we only support one innodb_page_size for all files. 
*/ + return page_ssize + == FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected); + } + + ulint space_page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(expected); + + if (page_ssize == 5) { + if (space_page_ssize) { + return false; + } + } else if (space_page_ssize != page_ssize) { + return false; + } + + return true; } - /** @return whether I/O is pending */ - bool pending_io() { return my_atomic_loadlint(&n_pending_ios); } - /** @return whether I/O is pending */ - bool pending_io() const + /** Whether old tablespace flags match full_crc32 flags. + @param[in] flags flags present + @param[in] expected expected flags + @return true if it is equivalent */ + static bool is_flags_non_full_crc32_equal(ulint flags, ulint expected) + { + ut_ad(!full_crc32(flags)); + + if (!full_crc32(expected)) { + return false; + } + + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + ulint space_page_ssize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE( + expected); + + if (page_ssize) { + if (space_page_ssize != 5) { + return false; + } + } else if (space_page_ssize != page_ssize) { + return false; + } + + return true; + } + /** Whether both fsp flags are equivalent */ + static bool is_flags_equal(ulint flags, ulint expected) + { + if (!((flags ^ expected) & ~(1U << FSP_FLAGS_POS_RESERVED))) { + return true; + } + + return full_crc32(flags) + ? is_flags_full_crc32_equal(flags, expected) + : is_flags_non_full_crc32_equal(flags, expected); + } + /** Validate the tablespace flags for full crc32 format. + @param[in] flags the content of FSP_SPACE_FLAGS + @return whether the flags are correct in full crc32 format */ + static bool is_fcrc32_valid_flags(ulint flags) + { + ut_ad(flags & FSP_FLAGS_FCRC32_MASK_MARKER); + const ulint page_ssize = physical_size(flags); + if (page_ssize < 3 || page_ssize & 8) { + return false; + } + + flags >>= FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + + return flags <= PAGE_ALGORITHM_LAST; + } + /** Validate the tablespace flags. 
+ @param[in] flags content of FSP_SPACE_FLAGS + @param[in] is_ibd whether this is an .ibd file + (not system tablespace) + @return whether the flags are correct. */ + static bool is_valid_flags(ulint flags, bool is_ibd) { - return const_cast<fil_space_t*>(this)->pending_io(); + DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", + return false;); + + if (full_crc32(flags)) { + return is_fcrc32_valid_flags(flags); + } + + if (flags == 0) { + return true; + } + + if (flags & ~FSP_FLAGS_MASK) { + return false; + } + + if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) + == FSP_FLAGS_MASK_ATOMIC_BLOBS) { + /* If the "atomic blobs" flag (indicating + ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) flag + is set, then the "post Antelope" + (ROW_FORMAT!=REDUNDANT) flag must also be set. */ + return false; + } + + /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag + of MySQL 5.6 and MariaDB 10.0, which we ignore. + In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20, + bits 10..14 would be nonzero 0bsssaa where sss is + nonzero PAGE_SSIZE (3, 4, 6, or 7) + and aa is ATOMIC_WRITES (not 0b11). */ + if (FSP_FLAGS_GET_RESERVED(flags) & ~1U) { + return false; + } + + const ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) { + /* the page_size is not between 4k and 64k; + 16k should be encoded as 0, not 5 */ + return false; + } + + const ulint zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + if (zssize == 0) { + /* not ROW_FORMAT=COMPRESSED */ + } else if (zssize > (ssize ? ssize : 5)) { + /* Invalid KEY_BLOCK_SIZE */ + return false; + } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) { + /* both these flags should be set for + ROW_FORMAT=COMPRESSED */ + return false; + } + + /* The flags do look valid. 
But, avoid misinterpreting + buggy MariaDB 10.1 format flags for + PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3} + as valid-looking PAGE_SSIZE if this is known to be + an .ibd file and we are using the default innodb_page_size=16k. */ + return(ssize == 0 || !is_ibd + || srv_page_size != UNIV_PAGE_SIZE_ORIG); } }; +#ifndef UNIV_INNOCHECKSUM /** Value of fil_space_t::magic_n */ #define FIL_SPACE_MAGIC_N 89472 @@ -302,6 +557,8 @@ struct fil_node_t { pfs_os_file_t handle; /** whether the file actually is a raw device or disk partition */ bool is_raw_disk; + /** whether the file is on non-rotational media (SSD) */ + bool on_ssd; /** size of the file in database pages (0 if not known yet); the possible last incomplete megabyte may be ignored if space->id == 0 */ @@ -344,6 +601,14 @@ struct fil_node_t { @return whether the page was found valid */ bool read_page0(bool first); + /** Determine some file metadata when creating or reading the file. + @param file the file that is being created, or OS_FILE_CLOSED */ + void find_metadata(os_file_t file = OS_FILE_CLOSED +#ifdef UNIV_LINUX + , struct stat* statbuf = NULL +#endif + ); + /** Close the file handle. 
*/ void close(); }; @@ -351,6 +616,24 @@ struct fil_node_t { /** Value of fil_node_t::magic_n */ #define FIL_NODE_MAGIC_N 89389 +inline void fil_space_t::set_imported() +{ + ut_ad(purpose == FIL_TYPE_IMPORT); + purpose = FIL_TYPE_TABLESPACE; + UT_LIST_GET_FIRST(chain)->find_metadata(); +} + +inline bool fil_space_t::is_rotational() const +{ + for (const fil_node_t* node = UT_LIST_GET_FIRST(chain); + node != NULL; node = UT_LIST_GET_NEXT(chain, node)) { + if (!node->on_ssd) { + return true; + } + } + return false; +} + /** Common InnoDB file extensions */ enum ib_extention { NO_EXT = 0, @@ -389,19 +672,12 @@ typedef byte fil_faddr_t; /*!< 'type' definition in C: an address #define FIL_ADDR_BYTE 4U /* then comes 2-byte byte offset within page*/ #define FIL_ADDR_SIZE 6U /* address size is 6 bytes */ -#ifndef UNIV_INNOCHECKSUM - /** File space address */ struct fil_addr_t { ulint page; /*!< page number within a space */ ulint boffset; /*!< byte offset within the page */ }; -/** The null file address */ -extern const fil_addr_t fil_addr_null; - -#endif /* !UNIV_INNOCHECKSUM */ - /** The byte offsets on a file page for various variables @{ */ #define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the page belongs to (== 0) but in later @@ -442,19 +718,19 @@ extern const fil_addr_t fil_addr_null; MySQL/InnoDB 5.1.7 or later, the contents of this field is valid for all uncompressed pages. 
*/ -#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U /*!< for the first page - in a system tablespace data file - (ibdata*, not *.ibd): the file has - been flushed to disk at least up - to this lsn - for other pages: a 32-bit key version - used to encrypt the page + 32-bit checksum - or 64 bits of zero if no encryption - */ + +/** For the first page in a system tablespace data file (ibdata*, not *.ibd): +the file has been flushed to disk at least up to this lsn +For other pages: 32-bit key version used to encrypt the page + 32-bit checksum +or 64 bits of zero if no encryption */ +#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U /** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */ #define FIL_RTREE_SPLIT_SEQ_NUM FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION +/** Start of the page_compressed content */ +#define FIL_PAGE_COMP_ALGO FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + /** starting from 4.1.x this contains the space id of the page */ #define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34U @@ -462,25 +738,45 @@ extern const fil_addr_t fil_addr_null; #define FIL_PAGE_DATA 38U /*!< start of the data on the page */ -/* Following are used when page compression is used */ -#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store - actual payload data size on - compressed pages. */ -#define FIL_PAGE_COMPRESSION_METHOD_SIZE 2 - /*!< Number of bytes used to store - actual compression method. */ +/** 32-bit key version used to encrypt the page in full_crc32 format. +For non-encrypted page, it contains 0. */ +#define FIL_PAGE_FCRC32_KEY_VERSION 0 + +/** page_compressed without innodb_checksum_algorithm=full_crc32 @{ */ +/** Number of bytes used to store actual payload data size on +page_compressed pages when not using full_crc32. 
*/ +#define FIL_PAGE_COMP_SIZE 0 + +/** Number of bytes for FIL_PAGE_COMP_SIZE */ +#define FIL_PAGE_COMP_METADATA_LEN 2 + +/** Number of bytes used to store actual compression method +for encrypted tables when not using full_crc32. */ +#define FIL_PAGE_ENCRYPT_COMP_ALGO 2 + +/** Extra header size for encrypted page_compressed pages when +not using full_crc32 */ +#define FIL_PAGE_ENCRYPT_COMP_METADATA_LEN 4 /* @} */ + /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used to store the page checksum, the last 4 bytes should be identical to the last 4 bytes of FIL_PAGE_LSN */ #define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */ + +/** Store the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_FCRC32_END_LSN 8 + +/** Store crc32 checksum at the end of the page */ +#define FIL_PAGE_FCRC32_CHECKSUM 4 /* @} */ /** File page types (values of FIL_PAGE_TYPE) @{ */ -#define FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED 37401 /*!< Page is compressed and - then encrypted */ +/** page_compressed, encrypted=YES (not used for full_crc32) */ +#define FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED 37401 +/** page_compressed (not used for full_crc32) */ #define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_RTREE 17854 /*!< R-tree node (SPATIAL INDEX) */ @@ -513,6 +809,12 @@ extern const fil_addr_t fil_addr_null; Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */ #define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_UNKNOWN /*!< Last page type */ +/** Set in FIL_PAGE_TYPE for full_crc32 pages in page_compressed format. +If the flag is set, then the following holds for the remaining bits +of FIL_PAGE_TYPE: +Bits 0..7 will contain the compressed page size in bytes. +Bits 8..14 are reserved and must be 0. 
*/ +#define FIL_PAGE_COMPRESS_FCRC32_MARKER 15 /* @} */ /** @return whether the page type is B-tree or R-tree index */ @@ -597,6 +899,22 @@ struct fil_system_t { private: bool m_initialised; +#ifdef UNIV_LINUX + /** available block devices that reside on non-rotational storage */ + std::vector<dev_t> ssd; +public: + /** @return whether a file system device is on non-rotational storage */ + bool is_ssd(dev_t dev) const + { + /* Linux seems to allow up to 15 partitions per block device. + If the detected ssd carries "partition number 0" (it is the whole device), + compare the candidate file system number without the partition number. */ + for (const auto s : ssd) + if (dev == s || (dev & ~15U) == s) + return true; + return false; + } +#endif public: ib_mutex_t mutex; /*!< The mutex protecting the cache */ fil_space_t* sys_space; /*!< The innodb_system tablespace */ @@ -749,16 +1067,6 @@ fil_space_get_flags( /*================*/ ulint id); /*!< in: space id */ -/** Returns the page size of the space and whether it is compressed or not. -The tablespace must be cached in the memory cache. -@param[in] id space id -@param[out] found true if tablespace was found -@return page size */ -const page_size_t -fil_space_get_page_size( - ulint id, - bool* found); - /*******************************************************************//** Opens all log files and system tablespace data files. They stay open until the database server shutdown. This should be called at a server startup after the @@ -804,10 +1112,8 @@ for concurrency control. @param[in] id tablespace ID @param[in] silent whether to silently ignore missing tablespaces @return the tablespace -@retval NULL if missing or being deleted or truncated */ -UNIV_INTERN -fil_space_t* -fil_space_acquire_low(ulint id, bool silent) +@retval NULL if missing or being deleted */ +fil_space_t* fil_space_acquire_low(ulint id, bool silent) MY_ATTRIBUTE((warn_unused_result)); /** Acquire a tablespace when it could be dropped concurrently. 
@@ -1085,7 +1391,7 @@ fil_space_extend( @param[in] type IO context @param[in] sync true if synchronous aio is desired @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] byte_offset remainder of offset in bytes; in aio this must be divisible by the OS block size @param[in] len how many bytes to read or write; this must @@ -1097,14 +1403,14 @@ fil_space_extend( @param[in] message message for aio handler if non-sync aio used, else ignored @param[in] ignore_missing_space true=ignore missing space during read -@return DB_SUCCESS, DB_TABLESPACE_DELETED or DB_TABLESPACE_TRUNCATED +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ dberr_t fil_io( const IORequest& type, bool sync, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, ulint byte_offset, ulint len, void* buf, diff --git a/storage/innobase/include/fil0fil.ic b/storage/innobase/include/fil0fil.ic index 31466f38546..24e4157d1f3 100644 --- a/storage/innobase/include/fil0fil.ic +++ b/storage/innobase/include/fil0fil.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2015, 2018, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -75,17 +75,25 @@ fil_get_page_type_name( } } -/****************************************************************//** -Validate page type. +#ifdef UNIV_DEBUG +/** Validate page type. 
+@param[in] space Tablespace object +@param[in] page page to validate @return true if valid, false if not */ UNIV_INLINE bool fil_page_type_validate( - const byte* page) /*!< in: page */ + fil_space_t* space, + const byte* page) { -#ifdef UNIV_DEBUG ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE); + if ((page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) + && space->full_crc32() + && space->is_compressed()) { + return true; + } + /* Validate page type */ if (!((page_type == FIL_PAGE_PAGE_COMPRESSED || page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED || @@ -106,25 +114,31 @@ fil_page_type_validate( page_type == FIL_PAGE_TYPE_ZBLOB2 || page_type == FIL_PAGE_TYPE_UNKNOWN))) { - ulint space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + ulint space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + ulint offset = mach_read_from_4(page + FIL_PAGE_OFFSET); - fil_system_enter(); - fil_space_t* rspace = fil_space_get_by_id(space); - fil_system_exit(); + + ulint key_version = mach_read_from_4( + page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + if (space && space->full_crc32()) { + key_version = mach_read_from_4( + page + FIL_PAGE_FCRC32_KEY_VERSION); + } /* Dump out the page info */ - ib::fatal() << "Page " << space << ":" << offset - << " name " << (rspace ? rspace->name : "???") + ib::fatal() << "Page " << space_id << ":" << offset + << " name " << (space ? 
space->name : "???") << " page_type " << page_type - << " key_version " - << mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) + << " key_version " << key_version << " lsn " << mach_read_from_8(page + FIL_PAGE_LSN) << " compressed_len " << mach_read_from_2(page + FIL_PAGE_DATA); return false; } -#endif /* UNIV_DEBUG */ return true; } +#endif /* UNIV_DEBUG */ #endif /* fil0fil_ic */ diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h index 545e05da769..9baf3289380 100644 --- a/storage/innobase/include/fil0pagecompress.h +++ b/storage/innobase/include/fil0pagecompress.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (C) 2013, 2018 MariaDB Corporation. +Copyright (C) 2013, 2019 MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,21 +33,29 @@ Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com /** Compress a page_compressed page before writing to a data file. @param[in] buf page to be compressed @param[out] out_buf compressed page -@param[in] level compression level +@param[in] flags tablespace flags @param[in] block_size file system block size @param[in] encrypted whether the page will be subsequently encrypted @return actual length of compressed page @retval 0 if the page was not compressed */ -ulint fil_page_compress(const byte* buf, byte* out_buf, ulint level, - ulint block_size, bool encrypted) +ulint fil_page_compress( + const byte* buf, + byte* out_buf, + ulint flags, + ulint block_size, + bool encrypted) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Decompress a page that may be subject to page_compressed compression. 
@param[in,out] tmp_buf temporary buffer (of innodb_page_size) @param[in,out] buf compressed page buffer +@param[in] flags tablespace flags @return size of the compressed data @retval 0 if decompression failed @retval srv_page_size if the page was not compressed */ -ulint fil_page_decompress(byte* tmp_buf, byte* buf) +ulint fil_page_decompress( + byte* tmp_buf, + byte* buf, + ulint flags) MY_ATTRIBUTE((nonnull, warn_unused_result)); #endif diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h index 8c5b24fbadb..15485769429 100644 --- a/storage/innobase/include/fsp0file.h +++ b/storage/innobase/include/fsp0file.h @@ -504,13 +504,13 @@ public: /* No op - base constructor is called. */ } - ~RemoteDatafile() + ~RemoteDatafile() override { shutdown(); } /** Release the resources. */ - void shutdown(); + void shutdown() override; /** Get the link filepath. @return m_link_filepath */ @@ -532,7 +532,7 @@ public: in read-only mode so that it can be validated. @param[in] strict whether to issue error messages @return DB_SUCCESS or error code */ - dberr_t open_read_only(bool strict); + dberr_t open_read_only(bool strict) override; /** Opens a handle to the file linked to in an InnoDB Symbolic Link file in read-write mode so that it can be restored from doublewrite @@ -540,7 +540,7 @@ public: @param[in] read_only_mode If true, then readonly mode checks are enforced. 
@return DB_SUCCESS or error code */ - dberr_t open_read_write(bool read_only_mode) + dberr_t open_read_write(bool read_only_mode) override MY_ATTRIBUTE((warn_unused_result)); /****************************************************************** diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 2ba85803eb5..8e1acfe1805 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -28,15 +28,15 @@ Created 12/18/1995 Heikki Tuuri #define fsp0fsp_h #include "fsp0types.h" +#include "fut0lst.h" +#include "ut0byte.h" #ifndef UNIV_INNOCHECKSUM - -#include "fut0lst.h" #include "mtr0mtr.h" #include "page0types.h" #include "rem0types.h" -#include "ut0byte.h" - +#else +# include "mach0data.h" #endif /* !UNIV_INNOCHECKSUM */ /** @return the PAGE_SSIZE flags for the current innodb_page_size */ @@ -45,6 +45,12 @@ Created 12/18/1995 Heikki Tuuri 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ << FSP_FLAGS_POS_PAGE_SSIZE) +/** @return the PAGE_SSIZE flags for the current innodb_page_size in +full checksum format */ +#define FSP_FLAGS_FCRC32_PAGE_SSIZE() \ + ((srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + /* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20; see the table in fsp0types.h @{ */ /** Zero relative shift position of the PAGE_COMPRESSION field */ @@ -201,11 +207,6 @@ typedef byte fseg_inode_t; (16 + 3 * FLST_BASE_NODE_SIZE \ + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE) -#define FSP_SEG_INODES_PER_PAGE(page_size) \ - ((page_size.physical() - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE) - /* Number of segment inodes which fit on a - single page */ - #define FSEG_MAGIC_N_VALUE 97937874 #define FSEG_FILLFACTOR 8 /* If this value is x, then if @@ -290,33 +291,6 @@ the extent are free and which contain old tuple version to clean. */ #ifndef UNIV_INNOCHECKSUM /* @} */ -/** Calculate the number of pages to extend a datafile. 
-We extend single-table tablespaces first one extent at a time, -but 4 at a time for bigger tablespaces. It is not enough to extend always -by one extent, because we need to add at least one extent to FSP_FREE. -A single extent descriptor page will track many extents. And the extent -that uses its extent descriptor page is put onto the FSP_FREE_FRAG list. -Extents that do not use their extent descriptor page are added to FSP_FREE. -The physical page size is used to determine how many extents are tracked -on one extent descriptor page. See xdes_calc_descriptor_page(). -@param[in] page_size page_size of the datafile -@param[in] size current number of pages in the datafile -@return number of pages to extend the file. */ -ulint -fsp_get_pages_to_extend_ibd( - const page_size_t& page_size, - ulint size); - -/** Calculate the number of physical pages in an extent for this file. -@param[in] page_size page_size of the datafile -@return number of pages in an extent for this file. */ -UNIV_INLINE -ulint -fsp_get_extent_size_in_pages(const page_size_t& page_size) -{ - return (FSP_EXTENT_SIZE << srv_page_size_shift) / page_size.physical(); -} - /**********************************************************************//** Reads the space id from the first page of a tablespace. @return space id, ULINT UNDEFINED if error */ @@ -347,13 +321,15 @@ fsp_header_get_flags(const page_t* page) } /** Get the byte offset of encryption information in page 0. -@param[in] ps page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @return byte offset relative to FSP_HEADER_OFFSET */ inline MY_ATTRIBUTE((pure, warn_unused_result)) -ulint -fsp_header_get_encryption_offset(const page_size_t& ps) +ulint fsp_header_get_encryption_offset(ulint zip_size) { - return XDES_ARR_OFFSET + XDES_SIZE * ps.physical() / FSP_EXTENT_SIZE; + return zip_size + ? 
XDES_ARR_OFFSET + XDES_SIZE * zip_size / FSP_EXTENT_SIZE + : XDES_ARR_OFFSET + (XDES_SIZE << srv_page_size_shift) + / FSP_EXTENT_SIZE; } /** Check the encryption key from the first page of a tablespace. @@ -512,12 +488,14 @@ fsp_reserve_free_extents( @param[in,out] seg_header file segment header @param[in,out] space tablespace @param[in] offset page number +@param[in] log whether to write MLOG_INIT_FREE_PAGE record @param[in,out] mtr mini-transaction */ void fseg_free_page( fseg_header_t* seg_header, fil_space_t* space, ulint offset, + bool log, mtr_t* mtr); /** Determine whether a page is free. @param[in,out] space tablespace @@ -591,13 +569,12 @@ fil_block_check_type( /** Checks if a page address is an extent descriptor page address. @param[in] page_id page id -@param[in] page_size page size -@return TRUE if a descriptor page */ -UNIV_INLINE -ibool -fsp_descr_page( - const page_id_t page_id, - const page_size_t& page_size); +@param[in] physical_size page size +@return whether a descriptor page */ +inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size) +{ + return (page_id.page_no() & (physical_size - 1)) == FSP_XDES_OFFSET; +} /** Initialize a file page whose prior contents should be ignored. 
@param[in,out] block buffer pool block */ @@ -644,7 +621,7 @@ fsp_flags_convert_from_101(ulint flags) { DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(ULINT_UNDEFINED);); - if (flags == 0) { + if (flags == 0 || fil_space_t::full_crc32(flags)) { return(flags); } @@ -739,7 +716,7 @@ fsp_flags_convert_from_101(ulint flags) flags = ((flags & 0x3f) | ssize << FSP_FLAGS_POS_PAGE_SSIZE | FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) << FSP_FLAGS_POS_PAGE_COMPRESSION); - ut_ad(fsp_flags_is_valid(flags, false)); + ut_ad(fil_space_t::is_valid_flags(flags, false)); return(flags); } @@ -753,7 +730,7 @@ bool fsp_flags_match(ulint expected, ulint actual) { expected &= ~FSP_FLAGS_MEM_MASK; - ut_ad(fsp_flags_is_valid(expected, false)); + ut_ad(fil_space_t::is_valid_flags(expected, false)); if (actual == expected) { return(true); @@ -763,16 +740,6 @@ fsp_flags_match(ulint expected, ulint actual) return(actual == expected); } -/** Calculates the descriptor index within a descriptor page. -@param[in] page_size page size -@param[in] offset page offset -@return descriptor index */ -UNIV_INLINE -ulint -xdes_calc_descriptor_index( - const page_size_t& page_size, - ulint offset); - /**********************************************************************//** Gets a descriptor bit of a page. @return TRUE if free */ @@ -785,15 +752,42 @@ xdes_get_bit( ulint offset);/*!< in: page offset within extent: 0 ... FSP_EXTENT_SIZE - 1 */ -/** Calculates the page where the descriptor of a page resides. -@param[in] page_size page size +/** Determine the descriptor index within a descriptor page. +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] offset page offset +@return descriptor index */ +inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset) +{ + return ut_2pow_remainder<ulint>(offset, + zip_size ? zip_size : srv_page_size) + / FSP_EXTENT_SIZE; +} + +/** Determine the descriptor page number for a page. 
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] offset page offset @return descriptor page offset */ -UNIV_INLINE -ulint -xdes_calc_descriptor_page( - const page_size_t& page_size, - ulint offset); +inline ulint xdes_calc_descriptor_page(ulint zip_size, ulint offset) +{ + compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) + * XDES_SIZE_MAX); + compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN) + * XDES_SIZE_MIN); + + ut_ad(srv_page_size > XDES_ARR_OFFSET + + (srv_page_size / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(!zip_size + || zip_size > XDES_ARR_OFFSET + + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE); + return ut_2pow_round<ulint>(offset, + zip_size ? zip_size : srv_page_size); +} #endif /* UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index 9f28aacaff5..31b9d8c5dbe 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,37 +24,6 @@ File space management Created 12/18/1995 Heikki Tuuri *******************************************************/ -#ifndef UNIV_INNOCHECKSUM - -/** Checks if a page address is an extent descriptor page address. 
-@param[in] page_id page id -@param[in] page_size page size -@return TRUE if a descriptor page */ -UNIV_INLINE -ibool -fsp_descr_page( - const page_id_t page_id, - const page_size_t& page_size) -{ - return((page_id.page_no() & (page_size.physical() - 1)) - == FSP_XDES_OFFSET); -} - -/** Calculates the descriptor index within a descriptor page. -@param[in] page_size page size -@param[in] offset page offset -@return descriptor index */ -UNIV_INLINE -ulint -xdes_calc_descriptor_index( - const page_size_t& page_size, - ulint offset) -{ - return(ut_2pow_remainder(offset, page_size.physical()) - / FSP_EXTENT_SIZE); -} -#endif /*!UNIV_INNOCHECKSUM */ - /**********************************************************************//** Gets a descriptor bit of a page. @return TRUE if free */ @@ -75,44 +44,5 @@ xdes_get_bit( ulint bit_index = index % 8; ulint byte_index = index / 8; - return(ut_bit_get_nth( - mach_read_ulint(descr + XDES_BITMAP + byte_index, - MLOG_1BYTE), - bit_index)); -} - -#ifndef UNIV_INNOCHECKSUM -/** Calculates the page where the descriptor of a page resides. 
-@param[in] page_size page size -@param[in] offset page offset -@return descriptor page offset */ -UNIV_INLINE -ulint -xdes_calc_descriptor_page( - const page_size_t& page_size, - ulint offset) -{ - compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET - + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) - * XDES_SIZE_MAX); - compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET - + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN) - * XDES_SIZE_MIN); - - ut_ad(srv_page_size > XDES_ARR_OFFSET - + (srv_page_size / FSP_EXTENT_SIZE) - * XDES_SIZE); - ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET - + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) - * XDES_SIZE); - -#ifdef UNIV_DEBUG - if (page_size.is_compressed()) { - ut_a(page_size.physical() > XDES_ARR_OFFSET - + (page_size.physical() / FSP_EXTENT_SIZE) * XDES_SIZE); - } -#endif /* UNIV_DEBUG */ - - return(ut_2pow_round(offset, page_size.physical())); + return ut_bit_get_nth(descr[XDES_BITMAP + byte_index], bit_index); } -#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h index fc0b907dfa7..27423858435 100644 --- a/storage/innobase/include/fsp0pagecompress.h +++ b/storage/innobase/include/fsp0pagecompress.h @@ -27,17 +27,6 @@ Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com #ifndef fsp0pagecompress_h #define fsp0pagecompress_h -/* Supported page compression methods */ - -#define PAGE_UNCOMPRESSED 0 -#define PAGE_ZLIB_ALGORITHM 1 -#define PAGE_LZ4_ALGORITHM 2 -#define PAGE_LZO_ALGORITHM 3 -#define PAGE_LZMA_ALGORITHM 4 -#define PAGE_BZIP2_ALGORITHM 5 -#define PAGE_SNAPPY_ALGORITHM 6 -#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM - /**********************************************************************//** Reads the page compression level from the first page of a tablespace. 
@return page compression level, or 0 if uncompressed */ diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h index 5bd70e4f80d..632c65e14cc 100644 --- a/storage/innobase/include/fsp0space.h +++ b/storage/innobase/include/fsp0space.h @@ -127,7 +127,7 @@ public: @param[in] fsp_flags tablespace flags */ void set_flags(ulint fsp_flags) { - ut_ad(fsp_flags_is_valid(fsp_flags, false)); + ut_ad(fil_space_t::is_valid_flags(fsp_flags, false)); m_flags = fsp_flags; } diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h index d3a79ec23a9..bcb8dd5e5e9 100644 --- a/storage/innobase/include/fsp0sysspace.h +++ b/storage/innobase/include/fsp0sysspace.h @@ -49,7 +49,7 @@ public: /* No op */ } - ~SysTablespace() + ~SysTablespace() override { shutdown(); } diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index 5c77b62723a..69c5346a4f9 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -27,10 +27,6 @@ Created May 26, 2009 Vasil Dimov #ifndef fsp0types_h #define fsp0types_h -#include "univ.i" - -#ifndef UNIV_INNOCHECKSUM - /** The fil_space_t::id of the redo log. All persistent tablespaces have a smaller fil_space_t::id. */ #define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0U @@ -39,6 +35,16 @@ have a smaller fil_space_t::id. 
*/ #include "ut0byte.h" +/* Possible values of innodb_compression_algorithm */ +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_LZMA_ALGORITHM 4 +#define PAGE_BZIP2_ALGORITHM 5 +#define PAGE_SNAPPY_ALGORITHM 6 +#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM + /** @name Flags for inserting records in order If records are inserted in order, there are the following flags to tell this (their type is made byte for the compiler @@ -50,7 +56,6 @@ fseg_alloc_free_page) */ #define FSP_NO_DIR ((byte)113) /*!< no order */ /* @} */ -#endif /* !UNIV_INNOCHECKSUM */ /** File space extent size in pages page size | file space extent size ----------+----------------------- @@ -73,7 +78,6 @@ page size | file space extent size offset */ #define FSEG_PAGE_DATA FIL_PAGE_DATA -#ifndef UNIV_INNOCHECKSUM /** @name File segment header The file segment header points to the inode describing the file segment. */ /* @{ */ @@ -88,6 +92,7 @@ typedef byte fseg_header_t; header, in bytes */ /* @} */ +#ifndef UNIV_INNOCHECKSUM #ifdef UNIV_DEBUG struct mtr_t; @@ -224,6 +229,15 @@ to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. */ /** A mask of all the known/used bits in FSP_SPACE_FLAGS */ #define FSP_FLAGS_MASK (~(~0U << FSP_FLAGS_WIDTH)) +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE 4 + +/** Marker to indicate whether tablespace is in full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_MARKER 1 + +/** Stores the compressed algo for full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO 3 + /* FSP_SPACE_FLAGS position and name in MySQL 5.6/MariaDB 10.0 or older and MariaDB 10.1.20 or older MariaDB 10.1 and in MariaDB 10.1.21 or newer. @@ -286,6 +300,19 @@ these are only used in MySQL 5.7 and used for compatibility. 
*/ #define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_RESERVED \ + FSP_FLAGS_WIDTH_RESERVED) +/** Zero relative shift position of the PAGE_SIZE field +in full crc32 format */ +#define FSP_FLAGS_FCRC32_POS_PAGE_SSIZE 0 + +/** Zero relative shift position of the MARKER field in full crc32 format. */ +#define FSP_FLAGS_FCRC32_POS_MARKER (FSP_FLAGS_FCRC32_POS_PAGE_SSIZE \ + + FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE) + +/** Zero relative shift position of the compressed algorithm stored +in full crc32 format. */ +#define FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO (FSP_FLAGS_FCRC32_POS_MARKER \ + + FSP_FLAGS_FCRC32_WIDTH_MARKER) + /** Bit mask of the POST_ANTELOPE field */ #define FSP_FLAGS_MASK_POST_ANTELOPE \ ((~(~0U << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \ @@ -315,6 +342,21 @@ these are only used in MySQL 5.7 and used for compatibility. */ #define FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL \ (15U << FSP_FLAGS_MEM_COMPRESSION_LEVEL) +/** Bit mask of the PAGE_SIZE field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + +/** Bit mask of the MARKER field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_MARKER \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_MARKER)) \ + << FSP_FLAGS_FCRC32_POS_MARKER) + +/** Bit mask of the COMPRESSED ALGO field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO)) \ + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) + /** Return the value of the POST_ANTELOPE field */ #define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \ @@ -339,10 +381,14 @@ these are only used in MySQL 5.7 and used for compatibility. 
*/ #define FSP_FLAGS_HAS_PAGE_COMPRESSION(flags) \ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ >> FSP_FLAGS_POS_PAGE_COMPRESSION) - -/** Return the contents of the UNUSED bits */ -#define FSP_FLAGS_GET_UNUSED(flags) \ - (flags >> FSP_FLAGS_POS_UNUSED) +/** @return the PAGE_SSIZE flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) +/** @return the COMPRESSED_ALGO flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO) \ + >> FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) /** @return the value of the DATA_DIR field */ #define FSP_FLAGS_HAS_DATA_DIR(flags) \ @@ -354,67 +400,4 @@ these are only used in MySQL 5.7 and used for compatibility. */ /* @} */ -/** Validate the tablespace flags, which are stored in the -tablespace header at offset FSP_SPACE_FLAGS. -@param[in] flags the contents of FSP_SPACE_FLAGS -@param[in] is_ibd whether this is an .ibd file (not system tablespace) -@return whether the flags are correct (not in the buggy 10.1) format */ -MY_ATTRIBUTE((warn_unused_result, const)) -UNIV_INLINE -bool -fsp_flags_is_valid(ulint flags, bool is_ibd) -{ - DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", - return(false);); - if (flags == 0) { - return(true); - } - if (flags & ~FSP_FLAGS_MASK) { - return(false); - } - if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS)) - == FSP_FLAGS_MASK_ATOMIC_BLOBS) { - /* If the "atomic blobs" flag (indicating - ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) flag - is set, then the "post Antelope" (ROW_FORMAT!=REDUNDANT) flag - must also be set. */ - return(false); - } - /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag - of MySQL 5.6 and MariaDB 10.0, which we ignore. 
- In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20, - bits 10..14 would be nonzero 0bsssaa where sss is - nonzero PAGE_SSIZE (3, 4, 6, or 7) - and aa is ATOMIC_WRITES (not 0b11). */ - if (FSP_FLAGS_GET_RESERVED(flags) & ~1U) { - return(false); - } - - const ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); - if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) { - /* the page_size is not between 4k and 64k; - 16k should be encoded as 0, not 5 */ - return(false); - } - const ulint zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); - if (zssize == 0) { - /* not ROW_FORMAT=COMPRESSED */ - } else if (zssize > (ssize ? ssize : 5)) { - /* invalid KEY_BLOCK_SIZE */ - return(false); - } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE - | FSP_FLAGS_MASK_ATOMIC_BLOBS)) { - /* both these flags should be set for - ROW_FORMAT=COMPRESSED */ - return(false); - } - - /* The flags do look valid. But, avoid misinterpreting - buggy MariaDB 10.1 format flags for - PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3} - as valid-looking PAGE_SSIZE if this is known to be - an .ibd file and we are using the default innodb_page_size=16k. 
*/ - return(ssize == 0 || !is_ibd || srv_page_size != UNIV_PAGE_SIZE_ORIG); -} - #endif /* fsp0types_h */ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index 354c36aba50..950d978e073 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -323,7 +323,7 @@ public: /** Whether the ADDED table record sync-ed after crash recovery; protected by bg_threads_mutex */ unsigned added_synced:1; - /** Whether the table holds dict_sys->mutex; + /** Whether the table holds dict_sys.mutex; protected by bg_threads_mutex */ unsigned dict_locked:1; @@ -384,9 +384,9 @@ extern bool fts_need_sync; #define fts_que_graph_free(graph) \ do { \ - mutex_enter(&dict_sys->mutex); \ + mutex_enter(&dict_sys.mutex); \ que_graph_free(graph); \ - mutex_exit(&dict_sys->mutex); \ + mutex_exit(&dict_sys.mutex); \ } while (0) /******************************************************************//** @@ -584,17 +584,15 @@ fts_get_doc_id_from_row( want to extract.*/ /** Extract the doc id from the record that belongs to index. -@param[in] table table -@param[in] rec record contains FTS_DOC_ID +@param[in] rec record containing FTS_DOC_ID @param[in] index index of rec -@param[in] heap heap memory +@param[in] offsets rec_get_offsets(rec,index) @return doc id that was extracted from rec */ doc_id_t fts_get_doc_id_from_rec( - dict_table_t* table, - const rec_t* rec, - const dict_index_t* index, - mem_heap_t* heap); + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets); /** Add new fts doc id to the update vector. @param[in] table the table that contains the FTS index. @@ -751,7 +749,7 @@ FTS auxiliary INDEX table and clear the cache at the end. 
dberr_t fts_sync_table(dict_table_t* table, bool wait = true); /****************************************************************//** -Free the query graph but check whether dict_sys->mutex is already +Free the query graph but check whether dict_sys.mutex is already held */ void fts_que_graph_free_check_lock( diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h index 3c6bc0e14d7..e0b0d27bebf 100644 --- a/storage/innobase/include/fts0priv.h +++ b/storage/innobase/include/fts0priv.h @@ -135,7 +135,7 @@ fts_eval_sql( /** Construct the name of an internal FTS table for the given table. @param[in] fts_table metadata on fulltext-indexed table @param[out] table_name a name up to MAX_FULL_NAME_LEN -@param[in] dict_locked whether dict_sys->mutex is being held */ +@param[in] dict_locked whether dict_sys.mutex is being held */ void fts_get_table_name(const fts_table_t* fts_table, char* table_name, bool dict_locked = false) MY_ATTRIBUTE((nonnull)); @@ -490,7 +490,7 @@ fts_get_table_id( MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Construct the name of an internal FTS table for the given table. @param[in] fts_table metadata on fulltext-indexed table -@param[in] dict_locked whether dict_sys->mutex is being held +@param[in] dict_locked whether dict_sys.mutex is being held @return the prefix, must be freed with ut_free() */ char* fts_get_table_name_prefix(const fts_table_t* fts_table) MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); diff --git a/storage/innobase/include/fut0fut.h b/storage/innobase/include/fut0fut.h index 3c3f118bd68..a52fc256efa 100644 --- a/storage/innobase/include/fut0fut.h +++ b/storage/innobase/include/fut0fut.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,12 +28,11 @@ Created 12/13/1995 Heikki Tuuri #ifndef fut0fut_h #define fut0fut_h -#include "fil0fil.h" #include "mtr0mtr.h" /** Gets a pointer to a file address and latches the page. @param[in] space space id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] addr file address @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_SX_LATCH @param[out] ptr_block file page @@ -43,13 +43,32 @@ UNIV_INLINE byte* fut_get_ptr( ulint space, - const page_size_t& page_size, + ulint zip_size, fil_addr_t addr, rw_lock_type_t rw_latch, mtr_t* mtr, buf_block_t** ptr_block = NULL) - MY_ATTRIBUTE((warn_unused_result)); +{ + buf_block_t* block; + byte* ptr = NULL; -#include "fut0fut.ic" + ut_ad(addr.boffset < srv_page_size); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_SX_LATCH)); + + block = buf_page_get(page_id_t(space, addr.page), zip_size, + rw_latch, mtr); + + ptr = buf_block_get_frame(block) + addr.boffset; + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + if (ptr_block != NULL) { + *ptr_block = block; + } + + return(ptr); +} #endif /* fut0fut_h */ diff --git a/storage/innobase/include/fut0fut.ic b/storage/innobase/include/fut0fut.ic deleted file mode 100644 index b5c1e15e059..00000000000 --- a/storage/innobase/include/fut0fut.ic +++ /dev/null @@ -1,68 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. 
- -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/******************************************************************//** -@file include/fut0fut.ic -File-based utilities - -Created 12/13/1995 Heikki Tuuri -***********************************************************************/ - -#include "sync0rw.h" -#include "buf0buf.h" - -/** Gets a pointer to a file address and latches the page. -@param[in] space space id -@param[in] page_size page size -@param[in] addr file address -@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_SX_LATCH -@param[in,out] mtr mini-transaction -@param[out] ptr_block file page -@return pointer to a byte in (*ptr_block)->frame; the *ptr_block is -bufferfixed and latched */ -UNIV_INLINE -byte* -fut_get_ptr( - ulint space, - const page_size_t& page_size, - fil_addr_t addr, - rw_lock_type_t rw_latch, - mtr_t* mtr, - buf_block_t** ptr_block) -{ - buf_block_t* block; - byte* ptr = NULL; - - ut_ad(addr.boffset < srv_page_size); - ut_ad((rw_latch == RW_S_LATCH) - || (rw_latch == RW_X_LATCH) - || (rw_latch == RW_SX_LATCH)); - - block = buf_page_get(page_id_t(space, addr.page), page_size, - rw_latch, mtr); - - ptr = buf_block_get_frame(block) + addr.boffset; - - buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); - - if (ptr_block != NULL) { - *ptr_block = block; - } - - return(ptr); -} diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h index 187b673d2fd..9fa928eda23 100644 --- a/storage/innobase/include/fut0lst.h +++ 
b/storage/innobase/include/fut0lst.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,11 +27,11 @@ Created 11/28/1995 Heikki Tuuri #ifndef fut0lst_h #define fut0lst_h -#ifndef UNIV_INNOCHECKSUM - -#include "fil0fil.h" -#include "mtr0mtr.h" - +#ifdef UNIV_INNOCHECKSUM +# include "fil0fil.h" +#else +#include "fut0fut.h" +#include "mtr0log.h" /* The C 'types' of base node and list node: these should be used to write self-documenting code. Of course, the sizeof macro cannot be @@ -39,14 +40,59 @@ applied to these types! */ typedef byte flst_base_node_t; typedef byte flst_node_t; -/* The physical size of a list base node in bytes */ -#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE) #endif /* !UNIV_INNOCHECKSUM */ +/* The physical size of a list base node in bytes */ +#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE) /* The physical size of a list node in bytes */ #define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE) #ifndef UNIV_INNOCHECKSUM +/* We define the field offsets of a node for the list */ +#define FLST_PREV 0 /* 6-byte address of the previous list element; + the page part of address is FIL_NULL, if no + previous element */ +#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next + list element; the page part of address + is FIL_NULL, if no next element */ + +/* We define the field offsets of a base node for the list */ +#define FLST_LEN 0 /* 32-bit list length field */ +#define FLST_FIRST 4 /* 6-byte address of the first element + of the list; undefined if empty list */ +#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the + last element of the list; undefined + if empty list */ + +/** Initialize a zero-initialized list base 
node. +@param[in,out] block file page +@param[in] ofs byte offset of the list base node +@param[in,out] mtr mini-transaction */ +inline void flst_init(buf_block_t* block, uint16_t ofs, mtr_t* mtr) +{ + ut_ad(0 == mach_read_from_2(FLST_LEN + ofs + block->frame)); + ut_ad(0 == mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + + block->frame)); + ut_ad(0 == mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + + block->frame)); + compile_time_assert(FIL_NULL == 0xffU * 0x1010101U); + mlog_memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff, mtr); + mlog_memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff, mtr); +} + +/** Write a null file address. +@param[in,out] faddr file address to be zeroed otu +@param[in,out] mtr mini-transaction */ +inline void flst_zero_addr(fil_faddr_t* faddr, mtr_t* mtr) +{ + if (mach_read_from_4(faddr + FIL_ADDR_PAGE) != FIL_NULL) { + mlog_memset(faddr + FIL_ADDR_PAGE, 4, 0xff, mtr); + } + if (mach_read_from_2(faddr + FIL_ADDR_BYTE)) { + mlog_write_ulint(faddr + FIL_ADDR_BYTE, 0, MLOG_2BYTES, mtr); + } +} + /********************************************************************//** Initializes a list base node. */ UNIV_INLINE @@ -83,7 +129,7 @@ flst_remove( @param[in] base base node @return length */ UNIV_INLINE -ulint +uint32_t flst_get_len( const flst_base_node_t* base); /********************************************************************//** diff --git a/storage/innobase/include/fut0lst.ic b/storage/innobase/include/fut0lst.ic index 00bb3fe8e9c..ec4181b2c93 100644 --- a/storage/innobase/include/fut0lst.ic +++ b/storage/innobase/include/fut0lst.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,26 +24,8 @@ File-based list utilities Created 11/28/1995 Heikki Tuuri ***********************************************************************/ -#include "fut0fut.h" -#include "mtr0log.h" #include "buf0buf.h" -/* We define the field offsets of a node for the list */ -#define FLST_PREV 0 /* 6-byte address of the previous list element; - the page part of address is FIL_NULL, if no - previous element */ -#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next - list element; the page part of address - is FIL_NULL, if no next element */ - -/* We define the field offsets of a base node for the list */ -#define FLST_LEN 0 /* 32-bit list length field */ -#define FLST_FIRST 4 /* 6-byte address of the first element - of the list; undefined if empty list */ -#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the - last element of the list; undefined - if empty list */ - /********************************************************************//** Writes a file address. 
*/ UNIV_INLINE @@ -79,9 +62,8 @@ flst_read_addr( ut_ad(faddr && mtr); - addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr); - addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES, - mtr); + addr.page = mach_read_from_4(faddr + FIL_ADDR_PAGE); + addr.boffset = mach_read_from_2(faddr + FIL_ADDR_BYTE); ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); return(addr); @@ -100,16 +82,18 @@ flst_init( MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr); - flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr); - flst_write_addr(base + FLST_LAST, fil_addr_null, mtr); + if (mach_read_from_4(base + FLST_LEN)) { + mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr); + } + flst_zero_addr(base + FLST_FIRST, mtr); + flst_zero_addr(base + FLST_LAST, mtr); } /** Get the length of a list. @param[in] base base node @return length */ UNIV_INLINE -ulint +uint32_t flst_get_len( const flst_base_node_t* base) { diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h index e189b6a7f28..01fcc2943d2 100644 --- a/storage/innobase/include/gis0rtree.h +++ b/storage/innobase/include/gis0rtree.h @@ -191,23 +191,8 @@ rtr_non_leaf_insert_stack_push( double mbr_inc); /*!< in: MBR needs to be enlarged */ -/*****************************************************************//** -Allocates a new Split Sequence Number. -@return new SSN id */ -UNIV_INLINE -node_seq_t -rtr_get_new_ssn_id( -/*===============*/ - dict_index_t* index); /*!< in: the index struct */ - -/*****************************************************************//** -Get the current Split Sequence Number. 
-@return current SSN id */ -UNIV_INLINE -node_seq_t -rtr_get_current_ssn_id( -/*===================*/ - dict_index_t* index); /*!< in/out: the index struct */ +#define rtr_get_new_ssn_id(index) (index)->assign_ssn() +#define rtr_get_current_ssn_id(index) (index)->ssn() /********************************************************************//** Create a RTree search info structure */ diff --git a/storage/innobase/include/gis0rtree.ic b/storage/innobase/include/gis0rtree.ic index a22164931b2..2076b24b9b1 100644 --- a/storage/innobase/include/gis0rtree.ic +++ b/storage/innobase/include/gis0rtree.ic @@ -123,31 +123,6 @@ rtr_non_leaf_stack_push( #endif /* RTR_SEARCH_DIAGNOSTIC */ } -/*****************************************************************//** -Allocates a new Split Sequence Number. -@return new SSN id */ -UNIV_INLINE -node_seq_t -rtr_get_new_ssn_id( -/*===============*/ - dict_index_t* index) /*!< in/out: the index struct */ -{ - node_seq_t ssn= my_atomic_add32_explicit(&index->rtr_ssn, 1, - MY_MEMORY_ORDER_RELAXED); - return ssn + 1; -} -/*****************************************************************//** -Get the current Split Sequence Number. -@return current SSN id */ -UNIV_INLINE -node_seq_t -rtr_get_current_ssn_id( -/*===================*/ - dict_index_t* index) /*!< in: index struct */ -{ - return my_atomic_load32_explicit(&index->rtr_ssn, MY_MEMORY_ORDER_RELAXED); -} - /*********************************************************************//** Sets pointer to the data and length in a field. */ UNIV_INLINE diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h index c5ea817c6bf..a1e0a878cb2 100644 --- a/storage/innobase/include/gis0type.h +++ b/storage/innobase/include/gis0type.h @@ -35,7 +35,7 @@ Created 2013/03/27 Jimmy Yang #include "gis0geo.h" #include <vector> -#include <list> +#include <forward_list> /* Node Sequence Number. 
Only updated when page splits */ typedef ib_uint32_t node_seq_t; @@ -133,15 +133,14 @@ typedef struct rtr_info{ /*!< current search mode */ } rtr_info_t; -typedef std::list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_info_active; - -/* Tracking structure for all onoging search for an index */ -typedef struct rtr_info_track { - rtr_info_active* rtr_active; /*!< Active search info */ - ib_mutex_t rtr_active_mutex; +/* Tracking structure for all ongoing search for an index */ +struct rtr_info_track_t { + /** Active search info */ + std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active; + ib_mutex_t rtr_active_mutex; /*!< mutex to protect rtr_active */ -} rtr_info_track_t; +}; /* This is to record the record movement between pages. Used for corresponding lock movement */ diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index 51a34b91418..28e5d1d4f56 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -232,10 +232,11 @@ innobase_casedn_str( #ifdef WITH_WSREP UNIV_INTERN int -wsrep_innobase_kill_one_trx(void * const thd_ptr, - const trx_t * const bf_trx, - trx_t *victim_trx, - ibool signal); +wsrep_innobase_kill_one_trx( + THD* bf_thd, + trx_t *victim_trx, + bool signal); + ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, unsigned char* str, unsigned int str_length, unsigned int buf_length); @@ -512,18 +513,6 @@ normalize_table_name_c_low( const char* name, /*!< in: table name string */ ibool set_lower_case); /*!< in: TRUE if we want to set name to lower case */ -/*************************************************************//** -InnoDB index push-down condition check defined in ha_innodb.cc -@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */ - -#include <my_compare.h> - -ICP_RESULT -innobase_index_cond( -/*================*/ - void* file) /*!< in/out: pointer to ha_innobase */ - MY_ATTRIBUTE((warn_unused_result)); - /** Update the 
system variable with the given value of the InnoDB buffer pool size. @param[in] buf_pool_size given value of buffer pool size.*/ diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h index e496c65e46a..ce0e911dbb4 100644 --- a/storage/innobase/include/ib0mutex.h +++ b/storage/innobase/include/ib0mutex.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,13 +29,12 @@ Created 2013-03-26 Sunny Bains. #ifndef ib0mutex_h #define ib0mutex_h -#include "my_atomic.h" #include "my_cpu.h" #include "os0event.h" #include "sync0arr.h" /** OS mutex for tracking lock/unlock for debugging */ -template <template <typename> class Policy = NoPolicy> +template <template <typename> class Policy> struct OSTrackMutex { typedef Policy<OSTrackMutex> MutexPolicy; @@ -152,7 +151,7 @@ private: #include <sys/syscall.h> /** Mutex implementation that used the Linux futex. */ -template <template <typename> class Policy = NoPolicy> +template <template <typename> class Policy> struct TTASFutexMutex { typedef Policy<TTASFutexMutex> MutexPolicy; @@ -167,21 +166,24 @@ struct TTASFutexMutex { ~TTASFutexMutex() { - ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. */ void init(latch_id_t, const char*, uint32_t) UNIV_NOTHROW { - ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Destroy the mutex. 
*/ void destroy() UNIV_NOTHROW { /* The destructor can be called at shutdown. */ - ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Acquire the mutex. @@ -202,9 +204,8 @@ struct TTASFutexMutex { } for (n_waits= 0;; n_waits++) { - if (my_atomic_fas32_explicit(&m_lock_word, - MUTEX_STATE_WAITERS, - MY_MEMORY_ORDER_ACQUIRE) + if (m_lock_word.exchange(MUTEX_STATE_WAITERS, + std::memory_order_acquire) == MUTEX_STATE_UNLOCKED) { break; } @@ -220,9 +221,8 @@ struct TTASFutexMutex { /** Release the mutex. */ void exit() UNIV_NOTHROW { - if (my_atomic_fas32_explicit(&m_lock_word, - MUTEX_STATE_UNLOCKED, - MY_MEMORY_ORDER_RELEASE) + if (m_lock_word.exchange(MUTEX_STATE_UNLOCKED, + std::memory_order_release) == MUTEX_STATE_WAITERS) { syscall(SYS_futex, &m_lock_word, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0); @@ -234,10 +234,11 @@ struct TTASFutexMutex { bool try_lock() UNIV_NOTHROW { int32 oldval = MUTEX_STATE_UNLOCKED; - return(my_atomic_cas32_strong_explicit(&m_lock_word, &oldval, - MUTEX_STATE_LOCKED, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)); + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); } /** @return non-const version of the policy */ @@ -257,12 +258,12 @@ private: /** lock_word is the target of the atomic test-and-set instruction when atomic operations are enabled. */ - int32 m_lock_word; + std::atomic<int32> m_lock_word; }; #endif /* HAVE_IB_LINUX_FUTEX */ -template <template <typename> class Policy = NoPolicy> +template <template <typename> class Policy> struct TTASMutex { typedef Policy<TTASMutex> MutexPolicy; @@ -277,40 +278,45 @@ struct TTASMutex { ~TTASMutex() { - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. 
*/ void init(latch_id_t) UNIV_NOTHROW { - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Destroy the mutex. */ void destroy() UNIV_NOTHROW { /* The destructor can be called at shutdown. */ - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_UNLOCKED); } /** Try and lock the mutex. @return true on success */ bool try_lock() UNIV_NOTHROW { - int32 oldval = MUTEX_STATE_UNLOCKED; - return(my_atomic_cas32_strong_explicit(&m_lock_word, &oldval, - MUTEX_STATE_LOCKED, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)); + uint32_t oldval = MUTEX_STATE_UNLOCKED; + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); } /** Release the mutex. */ void exit() UNIV_NOTHROW { - ut_ad(m_lock_word == MUTEX_STATE_LOCKED); - my_atomic_store32_explicit(&m_lock_word, MUTEX_STATE_UNLOCKED, - MY_MEMORY_ORDER_RELEASE); + ut_ad(m_lock_word.load(std::memory_order_relaxed) + == MUTEX_STATE_LOCKED); + m_lock_word.store(MUTEX_STATE_UNLOCKED, + std::memory_order_release); } /** Acquire the mutex. @@ -353,12 +359,11 @@ private: /** Policy data */ MutexPolicy m_policy; - /** lock_word is the target of the atomic test-and-set instruction - when atomic operations are enabled. */ - int32 m_lock_word; + /** mutex state */ + std::atomic<uint32_t> m_lock_word; }; -template <template <typename> class Policy = NoPolicy> +template <template <typename> class Policy> struct TTASEventMutex { typedef Policy<TTASEventMutex> MutexPolicy; @@ -376,7 +381,7 @@ struct TTASEventMutex { ~TTASEventMutex() UNIV_NOTHROW { - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(state() == MUTEX_STATE_UNLOCKED); } /** Called when the mutex is "created". 
Note: Not from the constructor @@ -385,7 +390,7 @@ struct TTASEventMutex { void init(latch_id_t id, const char*, uint32_t) UNIV_NOTHROW { ut_a(m_event == 0); - ut_a(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(state() == MUTEX_STATE_UNLOCKED); m_event = os_event_create(sync_latch_get_name(id)); } @@ -396,7 +401,7 @@ struct TTASEventMutex { void destroy() UNIV_NOTHROW { - ut_ad(m_lock_word == MUTEX_STATE_UNLOCKED); + ut_ad(state() == MUTEX_STATE_UNLOCKED); /* We have to free the event before InnoDB shuts down. */ os_event_destroy(m_event); @@ -408,20 +413,20 @@ struct TTASEventMutex { bool try_lock() UNIV_NOTHROW { - int32 oldval = MUTEX_STATE_UNLOCKED; - return(my_atomic_cas32_strong_explicit(&m_lock_word, &oldval, - MUTEX_STATE_LOCKED, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)); + uint32_t oldval = MUTEX_STATE_UNLOCKED; + return m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_LOCKED, + std::memory_order_acquire, + std::memory_order_relaxed); } /** Release the mutex. 
*/ void exit() UNIV_NOTHROW { - if (my_atomic_fas32_explicit(&m_lock_word, - MUTEX_STATE_UNLOCKED, - MY_MEMORY_ORDER_RELEASE) + if (m_lock_word.exchange(MUTEX_STATE_UNLOCKED, + std::memory_order_release) == MUTEX_STATE_WAITERS) { os_event_set(m_event); sync_array_object_signalled(); @@ -459,11 +464,12 @@ struct TTASEventMutex { : SYNC_MUTEX, filename, line, &cell); - int32 oldval = MUTEX_STATE_LOCKED; - my_atomic_cas32_strong_explicit(&m_lock_word, &oldval, - MUTEX_STATE_WAITERS, - MY_MEMORY_ORDER_RELAXED, - MY_MEMORY_ORDER_RELAXED); + uint32_t oldval = MUTEX_STATE_LOCKED; + m_lock_word.compare_exchange_strong( + oldval, + MUTEX_STATE_WAITERS, + std::memory_order_relaxed, + std::memory_order_relaxed); if (oldval == MUTEX_STATE_UNLOCKED) { sync_array_free_cell(sync_arr, cell); @@ -482,9 +488,7 @@ struct TTASEventMutex { int32 state() const UNIV_NOTHROW { - return(my_atomic_load32_explicit(const_cast<int32*> - (&m_lock_word), - MY_MEMORY_ORDER_RELAXED)); + return m_lock_word.load(std::memory_order_relaxed); } /** The event that the mutex will wait in sync0arr.cc @@ -514,9 +518,8 @@ private: TTASEventMutex(const TTASEventMutex&); TTASEventMutex& operator=(const TTASEventMutex&); - /** lock_word is the target of the atomic test-and-set instruction - when atomic operations are enabled. */ - int32 m_lock_word; + /** mutex state */ + std::atomic<uint32_t> m_lock_word; /** Used by sync0arr.cc for the wait queue */ os_event_t m_event; @@ -530,7 +533,6 @@ with the Performance Schema instrumentation. 
*/ template <typename MutexImpl> struct PolicyMutex { - typedef MutexImpl MutexType; typedef typename MutexImpl::MutexPolicy Policy; PolicyMutex() UNIV_NOTHROW : m_impl() @@ -561,7 +563,7 @@ struct PolicyMutex pfs_exit(); #endif /* UNIV_PFS_MUTEX */ - policy().release(m_impl); + ut_d(policy().context.release(m_impl)); m_impl.exit(); } @@ -587,11 +589,11 @@ struct PolicyMutex locker = pfs_begin_lock(&state, name, line); #endif /* UNIV_PFS_MUTEX */ - policy().enter(m_impl, name, line); + ut_d(policy().context.enter(m_impl, name, line)); m_impl.enter(n_spins, n_delay, name, line); - policy().locked(m_impl, name, line); + ut_d(policy().context.locked(m_impl, name, line)); #ifdef UNIV_PFS_MUTEX pfs_end(locker, 0); #endif /* UNIV_PFS_MUTEX */ @@ -620,9 +622,9 @@ struct PolicyMutex if (ret == 0) { - policy().enter(m_impl, name, line); + ut_d(policy().context.enter(m_impl, name, line)); - policy().locked(m_impl, name, line); + ut_d(policy().context.locked(m_impl, name, line)); } #ifdef UNIV_PFS_MUTEX @@ -636,7 +638,7 @@ struct PolicyMutex /** @return true if the thread owns the mutex. 
*/ bool is_owned() const UNIV_NOTHROW { - return(policy().is_owned()); + return(policy().context.is_owned()); } #endif /* UNIV_DEBUG */ @@ -658,6 +660,7 @@ struct PolicyMutex m_impl.init(id, filename, line); policy().init(m_impl, id, filename, line); + ut_d(policy().context.init(id)); } /** Free resources (if any) */ @@ -668,6 +671,7 @@ struct PolicyMutex #endif /* UNIV_PFS_MUTEX */ m_impl.destroy(); policy().destroy(); + ut_d(policy().context.destroy()); } /** Required for os_event_t */ diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h index 00fe1d3b02a..02d38069d94 100644 --- a/storage/innobase/include/ibuf0ibuf.h +++ b/storage/innobase/include/ibuf0ibuf.h @@ -119,13 +119,6 @@ ibuf_mtr_commit( /*============*/ mtr_t* mtr) /*!< in/out: mini-transaction */ MY_ATTRIBUTE((nonnull)); -/*********************************************************************//** -Initializes an ibuf bitmap page. */ -void -ibuf_bitmap_page_init( -/*==================*/ - buf_block_t* block, /*!< in: bitmap page */ - mtr_t* mtr); /*!< in: mtr */ /************************************************************************//** Resets the free bits of the page in the ibuf bitmap. This is done in a separate mini-transaction, hence this operation does not restrict @@ -241,18 +234,19 @@ ibuf_inside( /** Checks if a page address is an ibuf bitmap page (level 3 page) address. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @return TRUE if a bitmap page */ -UNIV_INLINE -ibool -ibuf_bitmap_page( - const page_id_t page_id, - const page_size_t& page_size); +inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size) +{ + ut_ad(ut_is_2pow(zip_size)); + ulint size = zip_size ? zip_size : srv_page_size; + return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET; +} /** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. 
Must not be called when recv_no_ibuf_operations==true. @param[in] page_id page id -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] x_latch FALSE if relaxed check (avoid latching the bitmap page) @param[in] file file name @@ -260,13 +254,13 @@ bitmap page) @param[in,out] mtr mtr which will contain an x-latch to the bitmap page if the page is not one of the fixed address ibuf pages, or NULL, in which case a new transaction is created. -@return TRUE if level 2 or level 3 page */ -ibool +@return true if level 2 or level 3 page */ +bool ibuf_page_low( const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, #ifdef UNIV_DEBUG - ibool x_latch, + bool x_latch, #endif /* UNIV_DEBUG */ const char* file, unsigned line, @@ -278,22 +272,22 @@ ibuf_page_low( /** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. Must not be called when recv_no_ibuf_operations==true. @param[in] page_id tablespace/page identifier -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] mtr mini-transaction or NULL @return TRUE if level 2 or level 3 page */ -# define ibuf_page(page_id, page_size, mtr) \ - ibuf_page_low(page_id, page_size, TRUE, __FILE__, __LINE__, mtr) +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, true, __FILE__, __LINE__, mtr) #else /* UVIV_DEBUG */ /** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. Must not be called when recv_no_ibuf_operations==true. 
@param[in] page_id tablespace/page identifier -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] mtr mini-transaction or NULL @return TRUE if level 2 or level 3 page */ -# define ibuf_page(page_id, page_size, mtr) \ - ibuf_page_low(page_id, page_size, __FILE__, __LINE__, mtr) +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, __FILE__, __LINE__, mtr) #endif /* UVIV_DEBUG */ /***********************************************************************//** @@ -304,23 +298,23 @@ void ibuf_free_excess_pages(void); /*========================*/ -/** Buffer an operation in the insert/delete buffer, instead of doing it -directly to the disk page, if this is possible. Does not do it if the index +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index is clustered or unique. @param[in] op operation type @param[in] entry index entry to insert @param[in,out] index index where to insert @param[in] page_id page id where to insert -@param[in] page_size page size +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] thr query thread -@return TRUE if success */ -ibool +@return true if success */ +bool ibuf_insert( ibuf_op_t op, const dtuple_t* entry, dict_index_t* index, const page_id_t page_id, - const page_size_t& page_size, + ulint zip_size, que_thr_t* thr); /** @@ -340,25 +334,22 @@ subsequently was dropped. 
@param[in,out] block if page has been read from disk, pointer to the page x-latched, else NULL @param[in] page_id page id of the index page -@param[in] update_ibuf_bitmap normally this is set to TRUE, but +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] update_ibuf_bitmap normally this is set, but if we have deleted or are deleting the tablespace, then we naturally do not want to update a non-existent bitmap page */ void ibuf_merge_or_delete_for_page( buf_block_t* block, const page_id_t page_id, - const page_size_t* page_size, - ibool update_ibuf_bitmap); + ulint zip_size, + bool update_ibuf_bitmap); + +/** Delete all change buffer entries for a tablespace, +in DISCARD TABLESPACE, IMPORT TABLESPACE, or crash recovery. +@param[in] space missing or to-be-discarded tablespace */ +void ibuf_delete_for_discarded_space(ulint space); -/*********************************************************************//** -Deletes all entries in the insert buffer for a given space id. This is used -in DISCARD TABLESPACE and IMPORT TABLESPACE. -NOTE: this does not update the page free bitmaps in the space. The space will -become CORRUPT when you call this function! */ -void -ibuf_delete_for_discarded_space( -/*============================*/ - ulint space); /*!< in: space id */ /** Contract the change buffer by reading pages to the buffer pool. @param[in] full If true, do a full contraction based on PCT_IO(100). If false, the size of contract batch is determined @@ -378,16 +369,8 @@ ibuf_merge_space( /*=============*/ ulint space); /*!< in: space id */ -/*********************************************************************//** -Parses a redo log record of an ibuf bitmap page init. 
-@return end of log record or NULL */ -byte* -ibuf_parse_bitmap_init( -/*===================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr,/*!< in: buffer end */ - buf_block_t* block, /*!< in: block or NULL */ - mtr_t* mtr); /*!< in: mtr or NULL */ +/** Apply MLOG_IBUF_BITMAP_INIT when crash-upgrading */ +ATTRIBUTE_COLD void ibuf_bitmap_init_apply(buf_block_t* block); /******************************************************************//** Looks if the insert buffer is empty. diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic index b3e04ee1661..db8c122c0f7 100644 --- a/storage/innobase/include/ibuf0ibuf.ic +++ b/storage/innobase/include/ibuf0ibuf.ic @@ -78,11 +78,12 @@ struct ibuf_t{ ulint height; /*!< tree height */ dict_index_t* index; /*!< insert buffer index */ - ulint n_merges; /*!< number of pages merged */ - ulint n_merged_ops[IBUF_OP_COUNT]; + /** number of pages merged */ + Atomic_counter<ulint> n_merges; + Atomic_counter<ulint> n_merged_ops[IBUF_OP_COUNT]; /*!< number of operations of each type merged to index pages */ - ulint n_discarded_ops[IBUF_OP_COUNT]; + Atomic_counter<ulint> n_discarded_ops[IBUF_OP_COUNT]; /*!< number of operations of each type discarded without merging due to the tablespace being deleted or the @@ -149,20 +150,6 @@ ibuf_inside( return(mtr->is_inside_ibuf()); } -/** Checks if a page address is an ibuf bitmap page (level 3 page) address. -@param[in] page_id page id -@param[in] page_size page size -@return TRUE if a bitmap page */ -UNIV_INLINE -ibool -ibuf_bitmap_page( - const page_id_t page_id, - const page_size_t& page_size) -{ - return((page_id.page_no() & (page_size.physical() - 1)) - == FSP_IBUF_BITMAP_OFFSET); -} - /** Translates the free space on a page to a value in the ibuf bitmap. 
@param[in] page_size page size in bytes @param[in] max_ins_size maximum insert size after reorganize for @@ -191,29 +178,6 @@ ibuf_index_page_calc_free_bits( return(n); } -/** Translates the ibuf free bits to the free space on a page in bytes. -@param[in] page_size page_size -@param[in] bits value for ibuf bitmap bits -@return maximum insert size after reorganize for the page */ -UNIV_INLINE -ulint -ibuf_index_page_calc_free_from_bits( - const page_size_t& page_size, - ulint bits) -{ - ut_ad(bits < 4); - ut_ad(!page_size.is_compressed() - || page_size.physical() > IBUF_PAGE_SIZE_PER_FREE_SPACE); - - if (bits == 3) { - return(4 * page_size.physical() - / IBUF_PAGE_SIZE_PER_FREE_SPACE); - } - - return(bits * (page_size.physical() - / IBUF_PAGE_SIZE_PER_FREE_SPACE)); -} - /*********************************************************************//** Translates the free space on a compressed page to a value in the ibuf bitmap. @return value for ibuf bitmap bits */ @@ -227,7 +191,7 @@ ibuf_index_page_calc_free_zip( const page_zip_des_t* page_zip; lint zip_max_ins; - ut_ad(block->page.size.is_compressed()); + ut_ad(block->page.zip.data); /* Consider the maximum insert size on the uncompressed page without reorganizing the page. 
We must not assume anything @@ -250,7 +214,7 @@ ibuf_index_page_calc_free_zip( max_ins_size = (ulint) zip_max_ins; } - return(ibuf_index_page_calc_free_bits(block->page.size.physical(), + return(ibuf_index_page_calc_free_bits(block->physical_size(), max_ins_size)); } @@ -263,14 +227,14 @@ ibuf_index_page_calc_free( /*======================*/ const buf_block_t* block) /*!< in: buffer block */ { - if (!block->page.size.is_compressed()) { + if (!block->page.zip.data) { ulint max_ins_size; max_ins_size = page_get_max_insert_size_after_reorganize( buf_block_get_frame(block), 1); return(ibuf_index_page_calc_free_bits( - block->page.size.physical(), max_ins_size)); + block->physical_size(), max_ins_size)); } else { return(ibuf_index_page_calc_free_zip(block)); } @@ -311,12 +275,12 @@ ibuf_update_free_bits_if_full( ut_ad(buf_block_get_page_zip(block) == NULL); before = ibuf_index_page_calc_free_bits( - block->page.size.physical(), max_ins_size); + srv_page_size, max_ins_size); if (max_ins_size >= increase) { compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX); after = ibuf_index_page_calc_free_bits( - block->page.size.physical(), max_ins_size - increase); + srv_page_size, max_ins_size - increase); #ifdef UNIV_IBUF_DEBUG ut_a(after <= ibuf_index_page_calc_free(block)); #endif diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h index 7a14b022e66..8d26ccb2ba3 100644 --- a/storage/innobase/include/log0crypt.h +++ b/storage/innobase/include/log0crypt.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. -Copyright (C) 2014, 2017, MariaDB Corporation. All Rights Reserved. +Copyright (C) 2014, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -73,14 +73,23 @@ UNIV_INTERN bool log_crypt_read_checkpoint_buf(const byte* buf); +/** log_crypt() operation code */ +enum log_crypt_t { + /** encrypt a log block without rotating key */ + LOG_ENCRYPT, + /** decrypt a log block */ + LOG_DECRYPT, + /** attempt to rotate the key, and encrypt a log block */ + LOG_ENCRYPT_ROTATE_KEY +}; + /** Encrypt or decrypt log blocks. @param[in,out] buf log blocks to encrypt or decrypt @param[in] lsn log sequence number of the start of the buffer @param[in] size size of the buffer, in bytes -@param[in] decrypt whether to decrypt instead of encrypting */ -UNIV_INTERN -void -log_crypt(byte* buf, lsn_t lsn, ulint size, bool decrypt = false); +@param[in] op whether to decrypt, encrypt, or rotate key and encrypt +@return whether the operation succeeded (encrypt always does) */ +bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op = LOG_ENCRYPT); /** Encrypt or decrypt a temporary file block. @param[in] src block to encrypt or decrypt diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 133b1692d31..399319537c8 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -164,19 +164,16 @@ bool log_set_capacity(ulonglong file_size) MY_ATTRIBUTE((warn_unused_result)); -/******************************************************//** -This function is called, e.g., when a transaction wants to commit. It checks -that the log has been written to the log file up to the last log entry written -by the transaction. If there is a flush running, it waits and checks if the -flush flushed enough. If not, starts a new flush. 
*/ -void -log_write_up_to( -/*============*/ - lsn_t lsn, /*!< in: log sequence number up to which - the log should be written, LSN_MAX if not specified */ - bool flush_to_disk); - /*!< in: true if we want the written log - also to be flushed to disk */ +/** Ensure that the log has been written to the log file up to a given +log entry (such as that of a transaction commit). Start a new write, or +wait and check if an already running write is covering the request. +@param[in] lsn log sequence number that should be +included in the redo log file write +@param[in] flush_to_disk whether the written log should also +be flushed to the file system +@param[in] rotate_key whether to rotate the encryption key */ +void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false); + /** write to the log file up to the last log entry. @param[in] sync whether we want the written log also to be flushed to disk. */ @@ -406,13 +403,14 @@ extern my_bool innodb_log_checksums; #define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in bytes */ -/* Offsets of a log block trailer from the end of the block */ +#define LOG_BLOCK_KEY 4 /* encryption key version + before LOG_BLOCK_CHECKSUM; + in log_t::FORMAT_ENC_10_4 only */ #define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block contents; in InnoDB versions < 3.23.52 this did not contain the checksum but the same value as - .._HDR_NO */ -#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */ + LOG_BLOCK_HDR_NO */ /** Offsets inside the checkpoint pages (redo log format version 1) @{ */ /** Checkpoint number */ @@ -463,25 +461,6 @@ or the MySQL version that created the redo log file. */ IB_TO_STR(MYSQL_VERSION_MINOR) "." \ IB_TO_STR(MYSQL_VERSION_PATCH) -/** The original (not version-tagged) InnoDB redo log format */ -#define LOG_HEADER_FORMAT_3_23 0 -/** The MySQL 5.7.9/MariaDB 10.2.2 log format */ -#define LOG_HEADER_FORMAT_10_2 1 -/** The MariaDB 10.3.2 log format. 
-To prevent crash-downgrade to earlier 10.2 due to the inability to -roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record, -MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT -1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2 -(MDEV-13564 backup-friendly TRUNCATE). */ -#define LOG_HEADER_FORMAT_10_3 103 -/** The redo log format identifier corresponding to the current format version. -Stored in LOG_HEADER_FORMAT. */ -#define LOG_HEADER_FORMAT_CURRENT LOG_HEADER_FORMAT_10_3 -/** Future MariaDB 10.4 log format */ -#define LOG_HEADER_FORMAT_10_4 104 -/** Encrypted MariaDB redo log */ -#define LOG_HEADER_FORMAT_ENCRYPTED (1U<<31) - /* @} */ #define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE @@ -506,6 +485,24 @@ typedef ib_mutex_t FlushOrderMutex; /** Redo log buffer */ struct log_t{ + /** The original (not version-tagged) InnoDB redo log format */ + static constexpr uint32_t FORMAT_3_23 = 0; + /** The MySQL 5.7.9/MariaDB 10.2.2 log format */ + static constexpr uint32_t FORMAT_10_2 = 1; + /** The MariaDB 10.3.2 log format. + To prevent crash-downgrade to earlier 10.2 due to the inability to + roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record, + MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT + 1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2 + (MDEV-13564 backup-friendly TRUNCATE). */ + static constexpr uint32_t FORMAT_10_3 = 103; + /** The MariaDB 10.4.0 log format. 
*/ + static constexpr uint32_t FORMAT_10_4 = 104; + /** Encrypted MariaDB redo log */ + static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31; + /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */ + static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED; + MY_ALIGNED(CACHE_LINE_SIZE) lsn_t lsn; /*!< log sequence number */ ulong buf_free; /*!< first free offset within the log @@ -546,7 +543,7 @@ struct log_t{ struct files { /** number of files */ ulint n_files; - /** format of the redo log: e.g., LOG_HEADER_FORMAT_CURRENT */ + /** format of the redo log: e.g., FORMAT_10_4 */ uint32_t format; /** redo log subformat: 0 with separately logged TRUNCATE, 2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */ @@ -564,7 +561,7 @@ struct log_t{ lsn_t scanned_lsn; /** @return whether the redo log is encrypted */ - bool is_encrypted() const { return format & LOG_HEADER_FORMAT_ENCRYPTED; } + bool is_encrypted() const { return format & FORMAT_ENCRYPTED; } /** @return capacity in bytes */ lsn_t capacity() const{ return (file_size - LOG_FILE_HDR_SIZE) * n_files; } /** Calculate the offset of a log sequence number. @@ -699,11 +696,34 @@ public: /** @return whether the redo log is encrypted */ bool is_encrypted() const { return(log.is_encrypted()); } - bool is_initialised() { return m_initialised; } + bool is_initialised() const { return m_initialised; } /** Complete an asynchronous checkpoint write. */ void complete_checkpoint(); + /** @return the log block header + trailer size */ + unsigned framing_size() const + { + return log.format == FORMAT_ENC_10_4 + ? LOG_BLOCK_HDR_SIZE + LOG_BLOCK_KEY + LOG_BLOCK_CHECKSUM + : LOG_BLOCK_HDR_SIZE + LOG_BLOCK_CHECKSUM; + } + /** @return the log block payload size */ + unsigned payload_size() const + { + return log.format == FORMAT_ENC_10_4 + ? 
OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM - + LOG_BLOCK_KEY + : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM; + } + /** @return the log block trailer offset */ + unsigned trailer_offset() const + { + return log.format == FORMAT_ENC_10_4 + ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY + : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM; + } + /** Initialise the redo log subsystem. */ void create(); diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic index 722e658a24b..7dfa7c0db68 100644 --- a/storage/innobase/include/log0log.ic +++ b/storage/innobase/include/log0log.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -217,7 +217,7 @@ log_block_calc_checksum_format_0( sum = 1; sh = 0; - for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) { + for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM; i++) { ulint b = (ulint) block[i]; sum &= 0x7FFFFFFFUL; sum += b; @@ -239,7 +239,7 @@ ulint log_block_calc_checksum_crc32( const byte* block) { - return(ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE)); + return ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM); } /** Calculates the checksum for a log block using the "no-op" algorithm. 
@@ -340,7 +340,7 @@ log_reserve_and_write_fast( #endif /* UNIV_LOG_LSN_DEBUG */ + log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; - if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + if (data_len >= log_sys.trailer_offset()) { /* The string does not fit within the current log block or the log block would become full */ @@ -485,9 +485,9 @@ log_free_check(void) #ifdef UNIV_DEBUG static const latch_level_t latches[] = { - SYNC_DICT, /* dict_sys->mutex during + SYNC_DICT, /* dict_sys.mutex during commit_try_rebuild() */ - SYNC_DICT_OPERATION, /* dict_operation_lock X-latch during + SYNC_DICT_OPERATION, /* dict_sys.latch X-latch during commit_try_rebuild() */ SYNC_FTS_CACHE, /* fts_cache_t::lock */ SYNC_INDEX_TREE /* index->lock */ diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 866102e6f3d..21ddd2b0388 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -33,8 +33,7 @@ Created 9/20/1997 Heikki Tuuri #include "log0log.h" #include "mtr0types.h" -#include <list> -#include <vector> +#include <deque> /** Is recv_writer_thread active? */ extern bool recv_writer_thread_active; @@ -49,7 +48,7 @@ dberr_t recv_find_max_checkpoint(ulint* max_field) MY_ATTRIBUTE((nonnull, warn_unused_result)); -/** Reduces recv_sys->n_addrs for the corrupted page. +/** Reduces recv_sys.n_addrs for the corrupted page. This function should called when srv_force_recovery > 0. @param[in] page_id page id of the corrupted page */ void recv_recover_corrupt_page(page_id_t page_id); @@ -74,17 +73,6 @@ Initiates the rollback of active transactions. */ void recv_recovery_rollback_active(void); /*===============================*/ -/** Clean up after recv_sys_init() */ -void -recv_sys_close(); -/** Initialize the redo log recovery subsystem. */ -void -recv_sys_init(); -/********************************************************//** -Frees the recovery system. 
*/ -void -recv_sys_debug_free(void); -/*=====================*/ /********************************************************//** Reset the state of the recovery system variables. */ @@ -110,7 +98,7 @@ enum store_t { /** Adds data from a new log block to the parsing buffer of recv_sys if -recv_sys->parse_start_lsn is non-zero. +recv_sys.parse_start_lsn is non-zero. @param[in] log_block log block to add @param[in] scanned_lsn lsn of how far we were able to find data in this log block @@ -140,10 +128,6 @@ corresponding to MLOG_INDEX_LOAD. */ extern void (*log_optimized_ddl_op)(ulint space_id); -/** Report backup-unfriendly TRUNCATE operation (with separate log file), -corresponding to MLOG_TRUNCATE. */ -extern void (*log_truncate)(); - /** Report an operation to create, delete, or rename a file during backup. @param[in] space_id tablespace identifier @param[in] flags tablespace flags (NULL if not create) @@ -184,7 +168,7 @@ struct recv_t{ struct recv_dblwr_t { /** Add a page frame to the doublewrite recovery buffer. */ void add(byte* page) { - pages.push_back(page); + pages.push_front(page); } /** Find a doublewrite copy of a page. @@ -194,7 +178,7 @@ struct recv_dblwr_t { @retval NULL if no page was found */ const byte* find_page(ulint space_id, ulint page_no); - typedef std::list<byte*, ut_allocator<byte*> > list; + typedef std::deque<byte*, ut_allocator<byte*> > list; /** Recovered doublewrite buffer page frames */ list pages; @@ -215,14 +199,11 @@ struct recv_sys_t{ buf_flush_t flush_type;/*!< type of the flush request. BUF_FLUSH_LRU: flush end of LRU, keeping free blocks. BUF_FLUSH_LIST: flush all of blocks. 
*/ - ibool apply_log_recs; - /*!< this is TRUE when log rec application to - pages is allowed; this flag tells the - i/o-handler if it should do log record - application */ - ibool apply_batch_on; - /*!< this is TRUE when a log rec application - batch is running */ + /** whether recv_recover_page(), invoked from buf_page_io_complete(), + should apply log records*/ + bool apply_log_recs; + /** whether recv_apply_hashed_log_recs() is running */ + bool apply_batch_on; byte* buf; /*!< buffer for parsing log records */ size_t buf_size; /*!< size of buf */ ulint len; /*!< amount of data in buf */ @@ -276,6 +257,32 @@ struct recv_sys_t{ /** Lastly added LSN to the hash table of log records. */ lsn_t last_stored_lsn; + /** Initialize the redo log recovery subsystem. */ + void create(); + + /** Free most recovery data structures. */ + void debug_free(); + + /** Clean up after create() */ + void close(); + + bool is_initialised() const { return buf_size != 0; } + + /** Store a redo log record for applying. + @param type record type + @param space tablespace identifier + @param page_no page number + @param body record body + @param rec_end end of record + @param lsn start LSN of the mini-transaction + @param end_lsn end LSN of the mini-transaction */ + inline void add(mlog_id_t type, ulint space, ulint page_no, + byte* body, byte* rec_end, lsn_t lsn, + lsn_t end_lsn); + + /** Empty a fully processed set of stored redo log records. */ + inline void empty(); + /** Determine whether redo log recovery progress should be reported. @param[in] time the current time @return whether progress should be reported @@ -292,7 +299,7 @@ struct recv_sys_t{ }; /** The recovery system */ -extern recv_sys_t* recv_sys; +extern recv_sys_t recv_sys; /** TRUE when applying redo log records during crash recovery; FALSE otherwise. 
Note that this is FALSE while a background thread is diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h index 8141c8a91e0..3d0e48253eb 100644 --- a/storage/innobase/include/mach0data.h +++ b/storage/innobase/include/mach0data.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,11 +29,10 @@ Created 11/28/1995 Heikki Tuuri #define mach0data_h #include "univ.i" +#include "mtr0types.h" #ifndef UNIV_INNOCHECKSUM -#include "mtr0types.h" - /* The data and all fields are always stored in a database file in the same format: ascii, big-endian, ... . All data in the files MUST be accessed using the functions in this @@ -368,17 +367,6 @@ mach_write_ulonglong( #endif /* !UNIV_INNOCHECKSUM */ -/** Read 1 to 4 bytes from a file page buffered in the buffer pool. -@param[in] ptr pointer where to read -@param[in] type MLOG_1BYTE, MLOG_2BYTES, or MLOG_4BYTES -@return value read */ -UNIV_INLINE -ulint -mach_read_ulint( - const byte* ptr, - mlog_id_t type) - MY_ATTRIBUTE((warn_unused_result)); - #include "mach0data.ic" #endif diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic index 408044292a5..80bd925d70b 100644 --- a/storage/innobase/include/mach0data.ic +++ b/storage/innobase/include/mach0data.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -866,28 +866,3 @@ mach_write_ulonglong( } #endif /* !UNIV_INNOCHECKSUM */ - -/** Read 1 to 4 bytes from a file page buffered in the buffer pool. -@param[in] ptr pointer where to read -@param[in] type MLOG_1BYTE, MLOG_2BYTES, or MLOG_4BYTES -@return value read */ -UNIV_INLINE -ulint -mach_read_ulint( - const byte* ptr, - mlog_id_t type) -{ - switch (type) { - case MLOG_1BYTE: - return(mach_read_from_1(ptr)); - case MLOG_2BYTES: - return(mach_read_from_2(ptr)); - case MLOG_4BYTES: - return(mach_read_from_4(ptr)); - default: - break; - } - - ut_error; - return(0); -} diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index dc76b40a3db..0c58f524015 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -71,6 +72,23 @@ mlog_log_string( byte* ptr, /*!< in: pointer written to */ ulint len, /*!< in: string length */ mtr_t* mtr); /*!< in: mini-transaction handle */ + +/** Initialize a string of bytes. +@param[in,out] b buffer page +@param[in] ofs byte offset from block->frame +@param[in] len length of the data to write +@param[in] val the data byte to write +@param[in,out] mtr mini-transaction */ +void +mlog_memset(buf_block_t* b, ulint ofs, ulint len, byte val, mtr_t* mtr); + +/** Initialize a string of bytes. 
+@param[in,out] byte byte address +@param[in] len length of the data to write +@param[in] val the data byte to write +@param[in,out] mtr mini-transaction */ +void mlog_memset(byte* b, ulint len, byte val, mtr_t* mtr); + /********************************************************//** Writes initial part of a log record consisting of one-byte item type and four-byte space and page numbers. */ @@ -180,7 +198,7 @@ mlog_parse_initial_log_record( ulint* space, /*!< out: space id */ ulint* page_no);/*!< out: page number */ /********************************************************//** -Parses a log record written by mlog_write_ulint or mlog_write_ull. +Parses a log record written by mlog_write_ulint, mlog_write_ull, mlog_memset. @return parsed record end, NULL if not a complete record */ byte* mlog_parse_nbytes( diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 074f55971b3..f364730b21f 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -54,10 +54,6 @@ savepoint. */ @return old mode */ #define mtr_set_log_mode(m, d) (m)->set_log_mode((d)) -/** Read 1 - 4 bytes from a file page buffered in the buffer pool. -@return value read */ -#define mtr_read_ulint(p, t, m) (m)->read_ulint((p), (t)) - /** Release an object in the memo stack. @return true if released */ #define mtr_memo_release(m, o, t) \ @@ -239,13 +235,6 @@ struct mtr_t { bool is_named_space(const fil_space_t* space) const; #endif /* UNIV_DEBUG */ - /** Read 1 - 4 bytes from a file page buffered in the buffer pool. - @param ptr pointer from where to read - @param type) MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES - @return value read */ - inline ulint read_ulint(const byte* ptr, mlog_id_t type) const - MY_ATTRIBUTE((warn_unused_result)); - /** Acquire a tablespace X-latch. 
@param[in] space_id tablespace ID @param[in] file file name from where called diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index 4cc55ed13ec..0fe56f960b7 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -170,7 +170,7 @@ mtr_t::release_block_at_savepoint( ut_a(slot->object == block); - buf_block_unfix(reinterpret_cast<buf_block_t*>(block)); + reinterpret_cast<buf_block_t*>(block)->unfix(); buf_page_release_latch(block, slot->type); @@ -227,21 +227,3 @@ mtr_t::set_log_mode(mtr_log_t mode) ut_ad(0); return(old_mode); } - -/** -Reads 1 - 4 bytes from a file page buffered in the buffer pool. -@return value read */ - -ulint -mtr_t::read_ulint(const byte* ptr, mlog_id_t type) const -{ - ut_ad(is_active()); - - ut_ad(memo_contains_page_flagged( - ptr, - MTR_MEMO_PAGE_S_FIX - | MTR_MEMO_PAGE_X_FIX - | MTR_MEMO_PAGE_SX_FIX)); - - return(mach_read_ulint(ptr, type)); -} diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index da6686d77c8..bf7484b2337 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -120,7 +120,7 @@ enum mlog_id_t { /** mark an index record as the predefined minimum record */ MLOG_REC_MIN_MARK = 26, - /** initialize an ibuf bitmap page */ + /** initialize an ibuf bitmap page (used in MariaDB 10.2 and 10.3) */ MLOG_IBUF_BITMAP_INIT = 27, #ifdef UNIV_LOG_LSN_DEBUG @@ -218,7 +218,8 @@ enum mlog_id_t { /** initialize a file page */ MLOG_INIT_FILE_PAGE2 = 59, - /** Table is being truncated. (Marked only for file-per-table) */ + /** Table is being truncated. (Was used in 10.2 and 10.3; + not supported for crash-upgrade to 10.4 or later.) 
*/ MLOG_TRUNCATE = 60, /** notify that an index tree is being loaded without writing @@ -229,8 +230,14 @@ enum mlog_id_t { of a ROW_FORMAT=COMPRESSED table */ MLOG_ZIP_WRITE_TRX_ID = 62, + /** initialize a page with a string of identical bytes */ + MLOG_MEMSET = 63, + + /** Zero-fill a page that is not allocated. */ + MLOG_INIT_FREE_PAGE = 64, + /** biggest value (used in assertions) */ - MLOG_BIGGEST_TYPE = MLOG_ZIP_WRITE_TRX_ID, + MLOG_BIGGEST_TYPE = MLOG_INIT_FREE_PAGE, /** log record for writing/updating crypt data of a tablespace */ diff --git a/storage/innobase/include/os0api.h b/storage/innobase/include/os0api.h index 6f42d968c8e..3be7c0afaa4 100644 --- a/storage/innobase/include/os0api.h +++ b/storage/innobase/include/os0api.h @@ -1,6 +1,6 @@ /*********************************************************************** -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -54,22 +54,4 @@ buf_page_get_trim_length( ulint write_length) MY_ATTRIBUTE((warn_unused_result)); -/** -Get should we punch hole to tablespace. -@param[in] space Tablespace -@return true, if punch hole should be tried, false if not. */ -bool -fil_node_should_punch_hole( - const fil_node_t* node) - MY_ATTRIBUTE((warn_unused_result)); - -/** -Set punch hole to tablespace to given value. -@param[in] space Tablespace -@param[in] val value to be set. 
*/ -void -fil_space_set_punch_hole( - fil_node_t* node, - bool val); - #endif /* OS_API_H */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index c896d9da6a2..a87ce5ec07b 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -36,7 +36,7 @@ Created 10/21/1995 Heikki Tuuri #ifndef os0file_h #define os0file_h -#include "page0size.h" +#include "fsp0types.h" #include "os0api.h" #ifndef _WIN32 @@ -360,17 +360,8 @@ public: /** Set the pointer to file node for IO @param[in] node File node */ - void set_fil_node(fil_node_t* node) - { - if (node && !fil_node_should_punch_hole(node)) { - clear_punch_hole(); - } - - m_fil_node = node; - } + inline void set_fil_node(fil_node_t* node); - /** Compare two requests - @reutrn true if the are equal */ bool operator==(const IORequest& rhs) const { return(m_type == rhs.m_type); @@ -414,17 +405,7 @@ public: : 0); } - bool should_punch_hole() const { - return (m_fil_node ? - fil_node_should_punch_hole(m_fil_node) - : false); - } - - void space_no_punch_hole() const { - if (m_fil_node) { - fil_space_set_punch_hole(m_fil_node, false); - } - } + inline bool should_punch_hole() const; /** Free storage space associated with a section of the file. @param[in] fh Open file handle @@ -1585,19 +1566,6 @@ os_file_change_size_win32( #endif /*_WIN32 */ -/** Check if the file system supports sparse files. - -Warning: On POSIX systems we try and punch a hole from offset 0 to -the system configured page size. This should only be called on an empty -file. - -@param[in] fh File handle for the file - if opened -@return true if the file system supports sparse files */ -bool -os_is_sparse_file_supported( - os_file_t fh) - MY_ATTRIBUTE((warn_unused_result)); - /** Free storage space associated with a section of the file. 
@param[in] fh Open file handle @param[in] off Starting offset (SEEK_SET) @@ -1637,16 +1605,6 @@ is_absolute_path( return(false); } -/***********************************************************************//** -Try to get number of bytes per sector from file system. -@return file block size */ -UNIV_INTERN -ulint -os_file_get_block_size( -/*===================*/ - os_file_t file, /*!< in: handle to a file */ - const char* name); /*!< in: file name */ - #include "os0file.ic" #endif /* os0file_h */ diff --git a/storage/innobase/include/os0once.h b/storage/innobase/include/os0once.h deleted file mode 100644 index a818b451830..00000000000 --- a/storage/innobase/include/os0once.h +++ /dev/null @@ -1,120 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/os0once.h -A class that aids executing a given function exactly once in a multi-threaded -environment. 
- -Created Feb 20, 2014 Vasil Dimov -*******************************************************/ - -#ifndef os0once_h -#define os0once_h - -#include "univ.i" - -#include "ut0ut.h" -#include "my_cpu.h" - -/** Execute a given function exactly once in a multi-threaded environment -or wait for the function to be executed by another thread. - -Example usage: -First the user must create a control variable of type os_once::state_t and -assign it os_once::NEVER_DONE. -Then the user must pass this variable, together with a function to be -executed to os_once::do_or_wait_for_done(). - -Multiple threads can call os_once::do_or_wait_for_done() simultaneously with -the same (os_once::state_t) control variable. The provided function will be -called exactly once and when os_once::do_or_wait_for_done() returns then this -function has completed execution, by this or another thread. In other words -os_once::do_or_wait_for_done() will either execute the provided function or -will wait for its execution to complete if it is already called by another -thread or will do nothing if the function has already completed its execution -earlier. - -This mimics pthread_once(3), but unfortunatelly pthread_once(3) does not -support passing arguments to the init_routine() function. We should use -std::call_once() when we start compiling with C++11 enabled. */ -class os_once { -public: - /** Control variables' state type */ - typedef ib_uint32_t state_t; - - /** Not yet executed. */ - static const state_t NEVER_DONE = 0; - - /** Currently being executed by this or another thread. */ - static const state_t IN_PROGRESS = 1; - - /** Finished execution. */ - static const state_t DONE = 2; - - /** Call a given function or wait its execution to complete if it is - already called by another thread. - @param[in,out] state control variable - @param[in] do_func function to call - @param[in,out] do_func_arg an argument to pass to do_func(). 
*/ - static - void - do_or_wait_for_done( - volatile state_t* state, - void (*do_func)(void*), - void* do_func_arg) - { - int32 oldval = NEVER_DONE; - - /* Avoid calling my_atomic_cas32() in the most common case. */ - if (*state == DONE) { - return; - } - - if (my_atomic_cas32((int32*) state, &oldval, IN_PROGRESS)) { - /* We are the first. Call the function. */ - - do_func(do_func_arg); - - my_atomic_store32((int32*) state, DONE); - } else { - /* The state is not NEVER_DONE, so either it is - IN_PROGRESS (somebody is calling the function right - now or DONE (it has already been called and completed). - Wait for it to become DONE. */ - for (;;) { - const state_t s = *state; - - switch (s) { - case DONE: - return; - case IN_PROGRESS: - break; - case NEVER_DONE: - /* fall through */ - default: - ut_error; - } - - MY_RELAX_CPU(); - } - } - } -}; - -#endif /* os0once_h */ diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h index 9b0b3cbf628..d8952a56cc9 100644 --- a/storage/innobase/include/os0proc.h +++ b/storage/innobase/include/os0proc.h @@ -40,7 +40,7 @@ typedef unsigned long int os_process_id_t; /** The total amount of memory currently allocated from the operating system with os_mem_alloc_large(). */ -extern ulint os_total_large_mem_allocated; +extern Atomic_counter<ulint> os_total_large_mem_allocated; /** Converts the current process id to a number. @return process id as a number */ diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h index d99bc841de9..67ee3097274 100644 --- a/storage/innobase/include/os0thread.h +++ b/storage/innobase/include/os0thread.h @@ -73,7 +73,7 @@ typedef unsigned int mysql_pfs_key_t; #endif /* HAVE_PSI_INTERFACE */ /** Number of threads active. */ -extern ulint os_thread_count; +extern Atomic_counter<ulint> os_thread_count; /***************************************************************//** Compares two thread ids for equality. 
diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic index 4d7b5c3a42f..f0844ee1f73 100644 --- a/storage/innobase/include/page0cur.ic +++ b/storage/innobase/include/page0cur.ic @@ -24,12 +24,7 @@ The page cursor Created 10/4/1994 Heikki Tuuri *************************************************************************/ -#include "page0page.h" -#include "buf0types.h" - #ifdef UNIV_DEBUG -# include "rem0cmp.h" - /*********************************************************//** Gets pointer to the page frame where the cursor is positioned. @return page */ @@ -280,6 +275,7 @@ page_cur_tuple_insert( *offsets = rec_get_offsets(rec, index, *offsets, page_is_leaf(cursor->block->frame), ULINT_UNDEFINED, heap); + ut_ad(size == rec_offs_size(*offsets)); if (buf_block_get_page_zip(cursor->block)) { rec = page_cur_insert_rec_zip( diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index 22f4bd5d8c4..0de7f50f8c2 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -27,29 +27,23 @@ Created 2/2/1994 Heikki Tuuri #define page0page_h #include "page0types.h" -#ifndef UNIV_INNOCHECKSUM +#include "fsp0fsp.h" #include "fil0fil.h" #include "buf0buf.h" -#include "data0data.h" -#include "dict0dict.h" -#include "rem0types.h" #include "rem0rec.h" -#endif /* !UNIV_INNOCHECKSUM*/ -#include "fsp0fsp.h" #ifndef UNIV_INNOCHECKSUM +#include "dict0dict.h" +#include "data0data.h" #include "mtr0mtr.h" -#ifdef UNIV_MATERIALIZE -#undef UNIV_INLINE -#define UNIV_INLINE -#endif - /* PAGE HEADER =========== Index page header starts at the first offset left free by the FIL-module */ typedef byte page_header_t; +#else +# include "mach0data.h" #endif /* !UNIV_INNOCHECKSUM */ #define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this @@ -164,12 +158,12 @@ Otherwise written as 0. 
@see PAGE_ROOT_AUTO_INC */ not necessarily collation order; this record may have been deleted */ -/* Directions of cursor movement */ -#define PAGE_LEFT 1 -#define PAGE_RIGHT 2 -#define PAGE_SAME_REC 3 -#define PAGE_SAME_PAGE 4 -#define PAGE_NO_DIRECTION 5 +/* Directions of cursor movement (stored in PAGE_DIRECTION field) */ +constexpr uint16_t PAGE_LEFT= 1; +constexpr uint16_t PAGE_RIGHT= 2; +constexpr uint16_t PAGE_SAME_REC= 3; +constexpr uint16_t PAGE_SAME_PAGE= 4; +constexpr uint16_t PAGE_NO_DIRECTION= 5; #ifndef UNIV_INNOCHECKSUM @@ -1013,13 +1007,6 @@ page_get_direction(const page_t* page) inline uint16_t page_get_instant(const page_t* page); -/** Assign the PAGE_INSTANT field. -@param[in,out] page clustered index root page -@param[in] n original number of clustered index fields -@param[in,out] mtr mini-transaction */ -inline -void -page_set_instant(page_t* page, unsigned n, mtr_t* mtr); /**********************************************************//** Create an uncompressed B-tree index page. 
@@ -1041,16 +1028,10 @@ page_create_zip( buf_block_t* block, /*!< in/out: a buffer frame where the page is created */ dict_index_t* index, /*!< in: the index of the - page, or NULL when applying - TRUNCATE log - record during recovery */ + page */ ulint level, /*!< in: the B-tree level of the page */ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ - const redo_page_compress_t* page_comp_info, - /*!< in: used for applying - TRUNCATE log - record during recovery */ mtr_t* mtr); /*!< in/out: mini-transaction handle */ /**********************************************************//** @@ -1338,11 +1319,6 @@ const rec_t* page_find_rec_max_not_deleted( const page_t* page); -#ifdef UNIV_MATERIALIZE -#undef UNIV_INLINE -#define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif - #endif /* !UNIV_INNOCHECKSUM */ #include "page0page.ic" diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic index d1bf382c1d5..c0a3c86c737 100644 --- a/storage/innobase/include/page0page.ic +++ b/storage/innobase/include/page0page.ic @@ -29,18 +29,10 @@ Created 2/2/1994 Heikki Tuuri #ifndef UNIV_INNOCHECKSUM #include "mach0data.h" -#ifdef UNIV_DEBUG -# include "log0recv.h" -#endif /* !UNIV_DEBUG */ #include "rem0cmp.h" #include "mtr0log.h" #include "page0zip.h" -#ifdef UNIV_MATERIALIZE -#undef UNIV_INLINE -#define UNIV_INLINE -#endif - /*************************************************************//** Returns the max trx id field value. */ UNIV_INLINE @@ -1103,29 +1095,6 @@ page_get_instant(const page_t* page) #endif /* UNIV_DEBUG */ return(i >> 3); } - -/** Assign the PAGE_INSTANT field. 
-@param[in,out] page clustered index root page -@param[in] n original number of clustered index fields -@param[in,out] mtr mini-transaction */ -inline -void -page_set_instant(page_t* page, unsigned n, mtr_t* mtr) -{ - ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_INSTANT); - ut_ad(n > 0); - ut_ad(n < REC_MAX_N_FIELDS); - uint16_t i = page_header_get_field(page, PAGE_INSTANT); - ut_ad(i <= PAGE_NO_DIRECTION); - i |= n << 3; - mlog_write_ulint(PAGE_HEADER + PAGE_INSTANT + page, i, - MLOG_2BYTES, mtr); -} #endif /* !UNIV_INNOCHECKSUM */ -#ifdef UNIV_MATERIALIZE -#undef UNIV_INLINE -#define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif - #endif diff --git a/storage/innobase/include/page0size.h b/storage/innobase/include/page0size.h deleted file mode 100644 index ca1e704eda1..00000000000 --- a/storage/innobase/include/page0size.h +++ /dev/null @@ -1,197 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/page0size.h -A class describing a page size. 
- -Created Nov 14, 2013 Vasil Dimov -*******************************************************/ - -#ifndef page0size_t -#define page0size_t - -#include "fsp0types.h" - -#define FIELD_REF_SIZE 20U - -/** A BLOB field reference full of zero, for use in assertions and -tests.Initially, BLOB field references are set to zero, in -dtuple_convert_big_rec(). */ -extern const byte field_ref_zero[UNIV_PAGE_SIZE_MAX]; - -#define PAGE_SIZE_T_SIZE_BITS 17 - -/** Page size descriptor. Contains the physical and logical page size, as well -as whether the page is compressed or not. */ -class page_size_t { -public: - /** Constructor from (physical, logical, is_compressed). - @param[in] physical physical (on-disk/zipped) page size - @param[in] logical logical (in-memory/unzipped) page size - @param[in] is_compressed whether the page is compressed */ - page_size_t(ulint physical, ulint logical, bool is_compressed) - { - if (physical == 0) { - physical = UNIV_PAGE_SIZE_ORIG; - } - if (logical == 0) { - logical = UNIV_PAGE_SIZE_ORIG; - } - - m_physical = static_cast<unsigned>(physical); - m_logical = static_cast<unsigned>(logical); - m_is_compressed = static_cast<unsigned>(is_compressed); - - ut_ad(physical <= (1 << PAGE_SIZE_T_SIZE_BITS)); - ut_ad(logical <= (1 << PAGE_SIZE_T_SIZE_BITS)); - - ut_ad(ut_is_2pow(physical)); - ut_ad(ut_is_2pow(logical)); - - ut_ad(logical <= UNIV_PAGE_SIZE_MAX); - ut_ad(logical >= physical); - ut_ad(!is_compressed || physical <= UNIV_ZIP_SIZE_MAX); - } - - /** Constructor from (fsp_flags). - @param[in] fsp_flags filespace flags */ - explicit page_size_t(ulint fsp_flags) - { - ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags); - - /* If the logical page size is zero in fsp_flags, then use the - legacy 16k page size. */ - ssize = (0 == ssize) ? UNIV_PAGE_SSIZE_ORIG : ssize; - - /* Convert from a 'log2 minus 9' to a page size in bytes. 
*/ - const unsigned size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize); - - ut_ad(size <= UNIV_PAGE_SIZE_MAX); - ut_ad(size <= (1 << PAGE_SIZE_T_SIZE_BITS)); - - m_logical = size; - - ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); - - /* If the fsp_flags have zero in the zip_ssize field, then it means - that the tablespace does not have compressed pages and the physical - page size is the same as the logical page size. */ - if (ssize == 0) { - m_is_compressed = false; - m_physical = m_logical; - } else { - m_is_compressed = true; - - /* Convert from a 'log2 minus 9' to a page size - in bytes. */ - const unsigned phy - = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize); - - ut_ad(phy <= UNIV_ZIP_SIZE_MAX); - ut_ad(phy <= (1 << PAGE_SIZE_T_SIZE_BITS)); - - m_physical = phy; - } - } - - /** Retrieve the physical page size (on-disk). - @return physical page size in bytes */ - inline ulint physical() const - { - ut_ad(m_physical > 0); - - return(m_physical); - } - - /** Retrieve the logical page size (in-memory). - @return logical page size in bytes */ - inline ulint logical() const - { - ut_ad(m_logical > 0); - return(m_logical); - } - - /** Check whether the page is compressed on disk. - @return true if compressed */ - inline bool is_compressed() const - { - return(m_is_compressed); - } - - /** Copy the values from a given page_size_t object. - @param[in] src page size object whose values to fetch */ - inline void copy_from(const page_size_t& src) - { - *this = src; - } - - /** Check if a given page_size_t object is equal to the current one. 
- @param[in] a page_size_t object to compare - @return true if equal */ - inline bool equals_to(const page_size_t& a) const - { - return(a.physical() == m_physical - && a.logical() == m_logical - && a.is_compressed() == m_is_compressed); - } - -private: - - /* For non compressed tablespaces, physical page size is equal to - the logical page size and the data is stored in buf_page_t::frame - (and is also always equal to univ_page_size (--innodb-page-size=)). - - For compressed tablespaces, physical page size is the compressed - page size as stored on disk and in buf_page_t::zip::data. The logical - page size is the uncompressed page size in memory - the size of - buf_page_t::frame (currently also always equal to univ_page_size - (--innodb-page-size=)). */ - - /** Physical page size. */ - unsigned m_physical:PAGE_SIZE_T_SIZE_BITS; - - /** Logical page size. */ - unsigned m_logical:PAGE_SIZE_T_SIZE_BITS; - - /** Flag designating whether the physical page is compressed, which is - true IFF the whole tablespace where the page belongs is compressed. */ - unsigned m_is_compressed:1; -}; - -/* Overloading the global output operator to conveniently print an object -of type the page_size_t. 
-@param[in,out] out the output stream -@param[in] obj an object of type page_size_t to be printed -@retval the output stream */ -inline -std::ostream& -operator<<( - std::ostream& out, - const page_size_t& obj) -{ - out << "[page size: physical=" << obj.physical() - << ", logical=" << obj.logical() - << ", compressed=" << obj.is_compressed() << "]"; - return(out); -} - -extern page_size_t univ_page_size; - -#endif /* page0size_t */ diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h index 0fcaebd0e43..14ccc2eae36 100644 --- a/storage/innobase/include/page0types.h +++ b/storage/innobase/include/page0types.h @@ -84,18 +84,6 @@ enum page_cur_mode_t { PAGE_CUR_RTREE_GET_FATHER = 14 }; - -/** The information used for compressing a page when applying -TRUNCATE log record during recovery */ -struct redo_page_compress_t { - ulint type; /*!< index type */ - index_id_t index_id; /*!< index id */ - ulint n_fields; /*!< number of index fields */ - ulint field_len; /*!< the length of index field */ - const byte* fields; /*!< index field information */ - ulint trx_id_pos; /*!< position of trx-id column. 
*/ -}; - /** Compressed page descriptor */ struct page_zip_des_t { diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index bf6ad5c860f..ec205fd79bf 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -28,28 +28,11 @@ Created June 2005 by Marko Makela #ifndef page0zip_h #define page0zip_h -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE -#endif - -#ifdef UNIV_INNOCHECKSUM -#include "buf0buf.h" -#include "ut0crc32.h" -#include "buf0checksum.h" -#include "mach0data.h" -#include "zlib.h" -#endif /* UNIV_INNOCHECKSUM */ +#include "buf0types.h" #ifndef UNIV_INNOCHECKSUM #include "mtr0types.h" #include "page0types.h" -#endif /* !UNIV_INNOCHECKSUM */ - -#include "buf0types.h" -#include "rem0types.h" - -#ifndef UNIV_INNOCHECKSUM #include "dict0types.h" #include "srv0srv.h" #include "trx0types.h" @@ -103,15 +86,10 @@ page_zip_set_size( @param[in] comp nonzero=compact format @param[in] n_fields number of fields in the record; ignored if tablespace is not compressed -@param[in] page_size page size -@return FALSE if the entire record can be stored locally on the page */ -UNIV_INLINE -ibool -page_zip_rec_needs_ext( - ulint rec_size, - ulint comp, - ulint n_fields, - const page_size_t& page_size) +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return false if the entire record can be stored locally on the page */ +inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields, + ulint zip_size) MY_ATTRIBUTE((warn_unused_result)); /**********************************************************************//** @@ -164,10 +142,6 @@ page_zip_compress( dict_index_t* index, /*!< in: index of the B-tree node */ ulint level, /*!< in: commpression level */ - const redo_page_compress_t* page_comp_info, - /*!< in: used for applying - TRUNCATE log - record during recovery */ mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */ @@ -516,12 +490,7 @@ uint32_t 
page_zip_calc_checksum( const void* data, ulint size, - srv_checksum_algorithm_t algo -#ifdef INNODB_BUG_ENDIAN_CRC32 - /** for crc32, use the big-endian bug-compatible crc32 variant */ - , bool use_legacy_big_endian = false -#endif -); + srv_checksum_algorithm_t algo); /** Validate the checksum on a ROW_FORMAT=COMPRESSED page. @param data ROW_FORMAT=COMPRESSED page @@ -562,11 +531,6 @@ void page_zip_reset_stat_per_index(); /*===========================*/ -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif - #include "page0zip.ic" #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic index 4e4ccdb492f..337debd30e9 100644 --- a/storage/innobase/include/page0zip.ic +++ b/storage/innobase/include/page0zip.ic @@ -2,7 +2,7 @@ Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -25,11 +25,6 @@ Compressed page interface Created June 2005 by Marko Makela *******************************************************/ -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE -#endif - #include "page0zip.h" #include "mtr0log.h" #include "page0page.h" @@ -154,22 +149,17 @@ page_zip_set_size( @param[in] comp nonzero=compact format @param[in] n_fields number of fields in the record; ignored if tablespace is not compressed -@param[in] page_size page size -@return FALSE if the entire record can be stored locally on the page */ -UNIV_INLINE -ibool -page_zip_rec_needs_ext( - ulint rec_size, - ulint comp, - ulint n_fields, - const page_size_t& page_size) +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return false if the entire record can be stored locally on the page */ +inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields, + ulint zip_size) { /* FIXME: row size check is this function seems to be the most correct. Put it in a separate function and use in more places of InnoDB */ ut_ad(rec_size > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)); - ut_ad(comp || !page_size.is_compressed()); + ut_ad(comp || !zip_size); #if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE : @@ -178,7 +168,7 @@ page_zip_rec_needs_ext( } #endif - if (page_size.is_compressed()) { + if (zip_size) { ut_ad(comp); /* On a compressed page, there is a two-byte entry in the dense page directory for every record. But there @@ -187,7 +177,7 @@ page_zip_rec_needs_ext( the encoded heap number. Check also the available space on the uncompressed page. 
*/ return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1) - >= page_zip_empty_size(n_fields, page_size.physical()) + >= page_zip_empty_size(n_fields, zip_size) || rec_size >= page_get_free_space_of_empty(TRUE) / 2); } @@ -417,7 +407,7 @@ page_zip_parse_compress_no_data( was successful. Crash in this case. */ if (page - && !page_zip_compress(page_zip, page, index, level, NULL, NULL)) { + && !page_zip_compress(page_zip, page, index, level, NULL)) { ut_error; } @@ -440,8 +430,3 @@ page_zip_reset_stat_per_index() mutex_exit(&page_zip_stat_per_index_mutex); } - -#ifdef UNIV_MATERIALIZE -# undef UNIV_INLINE -# define UNIV_INLINE UNIV_INLINE_ORIGINAL -#endif diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h index b46393e37d2..2df2b33f5c8 100644 --- a/storage/innobase/include/que0que.h +++ b/storage/innobase/include/que0que.h @@ -287,9 +287,9 @@ que_eval_sql( /*=========*/ pars_info_t* info, /*!< in: info struct, or NULL */ const char* sql, /*!< in: SQL string */ - ibool reserve_dict_mutex, - /*!< in: if TRUE, acquire/release - dict_sys->mutex around call to pars_sql. */ + bool reserve_dict_mutex, + /*!< in: whether to acquire/release + dict_sys.mutex around call to pars_sql. */ trx_t* trx); /*!< in: trx */ /**********************************************************************//** diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h index c0faf84cfbe..48575feda10 100644 --- a/storage/innobase/include/read0types.h +++ b/storage/innobase/include/read0types.h @@ -66,7 +66,14 @@ class ReadView Close view: READ_VIEW_STATE_OPEN -> READ_VIEW_STATE_CLOSED */ - int32_t m_state; + std::atomic<uint32_t> m_state; + + + /** m_state getter for ReadView owner thread */ + uint32_t state() const + { + return m_state.load(std::memory_order_relaxed); + } public: @@ -134,35 +141,36 @@ loop: Closes the view. View becomes not visible to purge thread. 
+ + This method is intended to be called by ReadView owner thread, thus + m_state cannot change. */ void close() { - ut_ad(m_state == READ_VIEW_STATE_CLOSED || - m_state == READ_VIEW_STATE_OPEN); - if (m_state == READ_VIEW_STATE_OPEN) - my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_CLOSED, - MY_MEMORY_ORDER_RELAXED); + ut_ad(state() == READ_VIEW_STATE_CLOSED || + state() == READ_VIEW_STATE_OPEN); + m_state.store(READ_VIEW_STATE_CLOSED, std::memory_order_relaxed); } /** m_state getter for trx_sys::clone_oldest_view() trx_sys::size(). */ - int32_t get_state() const + uint32_t get_state() const { - return my_atomic_load32_explicit(const_cast<int32*>(&m_state), - MY_MEMORY_ORDER_ACQUIRE); + return m_state.load(std::memory_order_acquire); } /** Returns true if view is open. - Only used by view owner thread, thus we can omit atomic operations. + This method is intended to be called by ReadView owner thread, thus + m_state cannot change. */ bool is_open() const { - ut_ad(m_state == READ_VIEW_STATE_OPEN || - m_state == READ_VIEW_STATE_CLOSED); - return m_state == READ_VIEW_STATE_OPEN; + ut_ad(state() == READ_VIEW_STATE_OPEN || + state() == READ_VIEW_STATE_CLOSED); + return state() == READ_VIEW_STATE_OPEN; } diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 2a522ae4837..23c25f76362 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -38,15 +38,6 @@ Created 5/30/1994 Heikki Tuuri #include <ostream> #include <sstream> -/* Info bit denoting the predefined minimum record: this bit is set -if and only if the record is the first user record on a non-leaf -B-tree page that is the leftmost page on its level -(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). 
*/ -#define REC_INFO_MIN_REC_FLAG 0x10UL -/* The deleted flag in info bits */ -#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the - record has been delete marked */ - /* Number of extra bytes in an old-style record, in addition to the data and the offsets */ #define REC_N_OLD_EXTRA_BYTES 6 @@ -54,26 +45,6 @@ in addition to the data and the offsets */ in addition to the data and the offsets */ #define REC_N_NEW_EXTRA_BYTES 5 -/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */ -enum rec_comp_status_t { - /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */ - REC_STATUS_ORDINARY = 0, - /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */ - REC_STATUS_NODE_PTR = 1, - /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */ - REC_STATUS_INFIMUM = 2, - /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */ - REC_STATUS_SUPREMUM = 3, - /** Clustered index record that has been inserted or updated - after instant ADD COLUMN (more than dict_index_t::n_core_fields) */ - REC_STATUS_COLUMNS_ADDED = 4 -}; - -/** The dtuple_t::info_bits of the metadata pseudo-record. -@see rec_is_metadata() */ -static const byte REC_INFO_METADATA - = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED; - #define REC_NEW_STATUS 3 /* This is single byte bit-field */ #define REC_NEW_STATUS_MASK 0x7UL #define REC_NEW_STATUS_SHIFT 0 @@ -336,7 +307,7 @@ rec_comp_status_t rec_get_status(const rec_t* rec) { byte bits = rec[-REC_NEW_STATUS] & REC_NEW_STATUS_MASK; - ut_ad(bits <= REC_STATUS_COLUMNS_ADDED); + ut_ad(bits <= REC_STATUS_INSTANT); return static_cast<rec_comp_status_t>(bits); } @@ -347,12 +318,12 @@ inline void rec_set_status(rec_t* rec, byte bits) { - ut_ad(bits <= REC_STATUS_COLUMNS_ADDED); + ut_ad(bits <= REC_STATUS_INSTANT); rec[-REC_NEW_STATUS] = (rec[-REC_NEW_STATUS] & ~REC_NEW_STATUS_MASK) | bits; } -/** Get the length of added field count in a REC_STATUS_COLUMNS_ADDED record. 
+/** Get the length of added field count in a REC_STATUS_INSTANT record. @param[in] n_add_field number of added fields, minus one @return storage size of the field count, in bytes */ inline unsigned rec_get_n_add_field_len(ulint n_add_field) @@ -361,8 +332,26 @@ inline unsigned rec_get_n_add_field_len(ulint n_add_field) return n_add_field < 0x80 ? 1 : 2; } -/** Set the added field count in a REC_STATUS_COLUMNS_ADDED record. -@param[in,out] header variable header of a REC_STATUS_COLUMNS_ADDED record +/** Get the added field count in a REC_STATUS_INSTANT record. +@param[in,out] header variable header of a REC_STATUS_INSTANT record +@return number of added fields */ +inline unsigned rec_get_n_add_field(const byte*& header) +{ + unsigned n_fields_add = *--header; + if (n_fields_add < 0x80) { + ut_ad(rec_get_n_add_field_len(n_fields_add) == 1); + return n_fields_add; + } + + n_fields_add &= 0x7f; + n_fields_add |= unsigned(*--header) << 7; + ut_ad(n_fields_add < REC_MAX_N_FIELDS); + ut_ad(rec_get_n_add_field_len(n_fields_add) == 2); + return n_fields_add; +} + +/** Set the added field count in a REC_STATUS_INSTANT record. +@param[in,out] header variable header of a REC_STATUS_INSTANT record @param[in] n_add number of added fields, minus 1 @return record header before the number of added fields */ inline void rec_set_n_add_field(byte*& header, ulint n_add) @@ -799,20 +788,89 @@ inline ulint rec_offs_comp(const rec_offs *offsets) } /** Determine if the record is the metadata pseudo-record -in the clustered index. +in the clustered index for instant ADD COLUMN or ALTER TABLE. 
+@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, ulint comp) +{ + bool is = !!(rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG); + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN or ALTER TABLE. +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_instant()); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_add_metadata(const rec_t* rec, ulint comp) +{ + bool is = rec_get_info_bits(rec, comp) == REC_INFO_MIN_REC_FLAG; + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). @param[in] rec leaf page record @param[in] index index of the record @return whether the record is the metadata pseudo-record */ -inline bool rec_is_metadata(const rec_t* rec, const dict_index_t* index) +inline bool rec_is_add_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_add_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_instant()); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). 
+@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, ulint comp) +{ + bool is = !(~rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)); + ut_ad(!is || rec_is_metadata(rec, comp)); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, const dict_index_t& index) { - bool is = rec_get_info_bits(rec, dict_table_is_comp(index->table)) - & REC_INFO_MIN_REC_FLAG; - ut_ad(!is || index->is_instant()); - ut_ad(!is || !dict_table_is_comp(index->table) - || rec_get_status(rec) == REC_STATUS_COLUMNS_ADDED); + bool is = rec_is_alter_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_dummy || index.is_instant()); return is; } +/** Determine if a record is delete-marked (not a metadata pseudo-record). +@param[in] rec record +@param[in] comp nonzero if ROW_FORMAT!=REDUNDANT +@return whether the record is a delete-marked user record */ +inline bool rec_is_delete_marked(const rec_t* rec, ulint comp) +{ + return (rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) + == REC_INFO_DELETED_FLAG; +} + /** Get the nth field from an index. 
@param[in] rec index record @param[in] index index @@ -830,6 +888,7 @@ rec_get_nth_cfield( ulint* len) { ut_ad(rec_offs_validate(rec, index, offsets)); + if (!rec_offs_nth_default(offsets, n)) { return rec_get_nth_field(rec, offsets, n, len); } @@ -976,7 +1035,7 @@ rec_copy( @param[in] fields data fields @param[in] n_fields number of data fields @param[out] extra record header size -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT @return total size, in bytes */ ulint rec_get_converted_size_temp( @@ -993,7 +1052,7 @@ rec_get_converted_size_temp( @param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) @param[in] n_core number of core fields (index->n_core_fields) @param[in] def_val default values for non-core fields -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED */ +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */ void rec_init_offsets_temp( const rec_t* rec, @@ -1020,8 +1079,7 @@ rec_init_offsets_temp( @param[in] index clustered or secondary index @param[in] fields data fields @param[in] n_fields number of data fields -@param[in] status REC_STATUS_ORDINARY or REC_STATUS_COLUMNS_ADDED -*/ +@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */ void rec_convert_dtuple_to_temp( rec_t* rec, @@ -1084,21 +1142,20 @@ rec_get_converted_size_comp_prefix( ulint n_fields,/*!< in: number of data fields */ ulint* extra) /*!< out: extra size */ MY_ATTRIBUTE((warn_unused_result, nonnull(1,2))); -/**********************************************************//** -Determines the size of a data tuple in ROW_FORMAT=COMPACT. + +/** Determine the size of a record in ROW_FORMAT=COMPACT. +@param[in] index record descriptor. 
dict_table_is_comp() + is assumed to hold, even if it doesn't +@param[in] tuple logical record +@param[out] extra extra size @return total size */ ulint rec_get_converted_size_comp( -/*========================*/ - const dict_index_t* index, /*!< in: record descriptor; - dict_table_is_comp() is - assumed to hold, even if - it does not */ - rec_comp_status_t status, /*!< in: status bits of the record */ - const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields,/*!< in: number of data fields */ - ulint* extra) /*!< out: extra size */ - MY_ATTRIBUTE((nonnull(1,3))); + const dict_index_t* index, + const dtuple_t* tuple, + ulint* extra) + MY_ATTRIBUTE((nonnull(1,2))); + /**********************************************************//** The following function returns the size of a data tuple when converted to a physical record. @@ -1273,7 +1330,7 @@ public: } /** Destructor */ - virtual ~rec_printer() {} + ~rec_printer() override {} private: /** Copy constructor */ diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic index 48898b1f916..6cecd9f1f08 100644 --- a/storage/innobase/include/rem0rec.ic +++ b/storage/innobase/include/rem0rec.ic @@ -67,7 +67,7 @@ most significant bytes and bits are written below less significant. 001=REC_STATUS_NODE_PTR 010=REC_STATUS_INFIMUM 011=REC_STATUS_SUPREMUM - 100=REC_STATUS_COLUMNS_ADDED + 100=REC_STATUS_INSTANT 1xx=reserved 5 bits heap number 4 8 bits heap number @@ -451,7 +451,7 @@ rec_get_n_fields( } switch (rec_get_status(rec)) { - case REC_STATUS_COLUMNS_ADDED: + case REC_STATUS_INSTANT: case REC_STATUS_ORDINARY: return(dict_index_get_n_fields(index)); case REC_STATUS_NODE_PTR: @@ -547,19 +547,6 @@ rec_set_n_owned_new( } } -#ifdef UNIV_DEBUG -/** Check if the info bits are valid. 
-@param[in] bits info bits to check -@return true if valid */ -inline -bool -rec_info_bits_valid( - ulint bits) -{ - return(0 == (bits & ~(REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG))); -} -#endif /* UNIV_DEBUG */ - /******************************************************//** The following function is used to retrieve the info bits of a record. @return info bits */ @@ -573,7 +560,6 @@ rec_get_info_bits( const ulint val = rec_get_bit_field_1( rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); - ut_ad(rec_info_bits_valid(val)); return(val); } @@ -586,7 +572,6 @@ rec_set_info_bits_old( rec_t* rec, /*!< in: old-style physical record */ ulint bits) /*!< in: info bits */ { - ut_ad(rec_info_bits_valid(bits)); rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS, REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); } @@ -599,7 +584,6 @@ rec_set_info_bits_new( rec_t* rec, /*!< in/out: new-style physical record */ ulint bits) /*!< in: info bits */ { - ut_ad(rec_info_bits_valid(bits)); rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS, REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); } @@ -1388,24 +1372,20 @@ rec_get_converted_size( } else if (index->table->id == DICT_INDEXES_ID) { /* The column SYS_INDEXES.MERGE_THRESHOLD was instantly added in MariaDB 10.2.2 (MySQL 5.7). 
*/ + ut_ad(!index->table->is_temporary()); ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES); ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES || dtuple->n_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD); } else { ut_ad(dtuple->n_fields >= index->n_core_fields); - ut_ad(dtuple->n_fields <= index->n_fields); + ut_ad(dtuple->n_fields <= index->n_fields + || dtuple->is_alter_metadata()); } #endif if (dict_table_is_comp(index->table)) { - return(rec_get_converted_size_comp( - index, - static_cast<rec_comp_status_t>( - dtuple->info_bits - & REC_NEW_STATUS_MASK), - dtuple->fields, - dtuple->n_fields, NULL)); + return rec_get_converted_size_comp(index, dtuple, NULL); } data_size = dtuple_get_data_size(dtuple, 0); diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h index 11a6bfa4667..251f3125667 100644 --- a/storage/innobase/include/row0ext.h +++ b/storage/innobase/include/row0ext.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,7 +30,7 @@ Created September 2006 Marko Makela #include "data0types.h" #include "mem0mem.h" #include "dict0types.h" -#include "page0size.h" +#include "fsp0types.h" #include "row0types.h" /********************************************************************//** @@ -43,7 +44,7 @@ row_ext_create( in the InnoDB table object, as reported by dict_col_get_no(); NOT relative to the records in the clustered index */ - ulint flags, /*!< in: table->flags */ + const dict_table_t& table, /*!< in: table */ const dtuple_t* tuple, /*!< in: data tuple containing the field references of the externally stored columns; must be indexed by col_no; @@ -91,9 +92,7 @@ struct row_ext_t{ REC_ANTELOPE_MAX_INDEX_COL_LEN or REC_VERSION_56_MAX_INDEX_COL_LEN depending on row format */ - page_size_t page_size; - /*!< page size of the externally stored - columns */ + ulint zip_size;/*!< ROW_FORMAT=COMPRESSED page size, or 0 */ ulint len[1]; /*!< prefix lengths; 0 if not cached */ }; diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h index 3a65e1c58da..0189bb7a4ff 100644 --- a/storage/innobase/include/row0ftsort.h +++ b/storage/innobase/include/row0ftsort.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2018, MariaDB Corporation. +Copyright (c) 2015, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -60,8 +60,8 @@ struct fts_psort_t; struct fts_psort_common_t { row_merge_dup_t* dup; /*!< descriptor of FTS index */ dict_table_t* new_table; /*!< source table */ - /* Old table page size */ - page_size_t old_page_size; + /** Old table page size */ + ulint old_zip_size; trx_t* trx; /*!< transaction */ fts_psort_t* all_info; /*!< all parallel sort info */ os_event_t sort_event; /*!< sort event */ @@ -199,19 +199,19 @@ row_merge_create_fts_sort_index( @param[in] new_table table where indexes are created @param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes integer to store Doc ID during sort -@param[in] old_page_size page size of the old table during alter +@param[in] old_zip_size page size of the old table during alter @param[out] psort parallel sort info to be instantiated @param[out] merge parallel merge info to be instantiated -@return TRUE if all successful */ -ibool +@return true if all successful */ +bool row_fts_psort_info_init( - trx_t* trx, - row_merge_dup_t* dup, - const dict_table_t* new_table, - ibool opt_doc_id_size, - const page_size_t old_page_size, - fts_psort_t** psort, - fts_psort_t** merge) + trx_t* trx, + row_merge_dup_t*dup, + dict_table_t* new_table, + bool opt_doc_id_size, + ulint old_zip_size, + fts_psort_t** psort, + fts_psort_t** merge) MY_ATTRIBUTE((nonnull)); /********************************************************************//** diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h index fac1b950e2e..63fd877691c 100644 --- a/storage/innobase/include/row0log.h +++ b/storage/innobase/include/row0log.h @@ -36,7 +36,7 @@ Created 2011-05-26 Marko Makela class ut_stage_alter_t; -extern ulint onlineddl_rowlog_rows; +extern Atomic_counter<ulint> onlineddl_rowlog_rows; extern ulint onlineddl_rowlog_pct_used; extern ulint 
onlineddl_pct_progress; diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h index 8c9b5325c5f..e5798f1f673 100644 --- a/storage/innobase/include/row0mysql.h +++ b/storage/innobase/include/row0mysql.h @@ -43,6 +43,7 @@ Created 9/17/2000 Heikki Tuuri extern ibool row_rollback_on_timeout; struct row_prebuilt_t; +class ha_innobase; /*******************************************************************//** Frees the blob heap in prebuilt when no longer needed. */ @@ -417,7 +418,7 @@ will remain locked. @param[in] create_failed true=create table failed because e.g. foreign key column @param[in] nonatomic Whether it is permitted to release - and reacquire dict_operation_lock + and reacquire dict_sys.latch @return error code */ dberr_t row_drop_table_for_mysql( @@ -777,10 +778,14 @@ struct row_prebuilt_t { store it here so that we can return it to MySQL */ /*----------------------*/ - void* idx_cond; /*!< In ICP, pointer to a ha_innobase, - passed to innobase_index_cond(). - NULL if index condition pushdown is - not used. */ + + /** Argument of handler_rowid_filter_check(), + or NULL if no PRIMARY KEY filter is pushed */ + ha_innobase* pk_filter; + + /** Argument to handler_index_cond_check(), + or NULL if no index condition pushdown (ICP) is used. */ + ha_innobase* idx_cond; ulint idx_cond_n_cols;/*!< Number of fields in idx_cond_cols. 0 if and only if idx_cond == NULL. */ /*----------------------*/ diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h index 5268d684529..b4dab3c2f1b 100644 --- a/storage/innobase/include/row0row.h +++ b/storage/innobase/include/row0row.h @@ -74,6 +74,7 @@ row_get_rec_roll_ptr( #define ROW_BUILD_FOR_PURGE 1 /*!< build row for purge. */ #define ROW_BUILD_FOR_UNDO 2 /*!< build row for undo. */ #define ROW_BUILD_FOR_INSERT 3 /*!< build row for insert. 
*/ + /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. @@ -223,6 +224,24 @@ row_rec_to_index_entry( mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ MY_ATTRIBUTE((warn_unused_result)); + +/** Convert a metadata record to a data tuple. +@param[in] rec metadata record +@param[in] index clustered index after instant ALTER TABLE +@param[in] offsets rec_get_offsets(rec) +@param[in,out] heap memory heap for allocations +@param[in] info_bits the info_bits after an update +@param[in] pad whether to pad to index->n_fields */ +dtuple_t* +row_metadata_to_tuple( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits, + bool pad) + MY_ATTRIBUTE((nonnull,warn_unused_result)); + /*******************************************************************//** Builds from a secondary index record a row reference with which we can search the clustered index record. diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic index 18e6959e6f3..e89adb581f4 100644 --- a/storage/innobase/include/row0row.ic +++ b/storage/innobase/include/row0row.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,16 +39,12 @@ row_get_trx_id_offset( const dict_index_t* index, /*!< in: clustered index */ const rec_offs* offsets)/*!< in: record offsets */ { - ulint pos; ulint offset; ulint len; - ut_ad(dict_index_is_clust(index)); ut_ad(rec_offs_validate(NULL, index, offsets)); - pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); - - offset = rec_get_nth_field_offs(offsets, pos, &len); + offset = rec_get_nth_field_offs(offsets, index->db_trx_id(), &len); ut_ad(len == DATA_TRX_ID_LEN); diff --git a/storage/innobase/include/row0trunc.h b/storage/innobase/include/row0trunc.h deleted file mode 100644 index c5f89f7cfdb..00000000000 --- a/storage/innobase/include/row0trunc.h +++ /dev/null @@ -1,416 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/row0trunc.h -TRUNCATE implementation - -Created 2013-04-25 Krunal Bauskar -*******************************************************/ - -#ifndef row0trunc_h -#define row0trunc_h - -#include "row0mysql.h" -#include "dict0boot.h" -#include "fil0fil.h" -#include "srv0start.h" - -#include <vector> - -/** The information of TRUNCATE log record. -This class handles the recovery stage of TRUNCATE table. */ -class truncate_t { - -public: - /** - Constructor - - @param old_table_id old table id assigned to table before truncate - @param new_table_id new table id that will be assigned to table - after truncate - @param dir_path directory path */ - truncate_t( - table_id_t old_table_id, - table_id_t new_table_id, - const char* dir_path); - - /** - Constructor - - @param log_file_name parse the log file during recovery to populate - information related to table to truncate */ - truncate_t(const char* log_file_name); - - /** - Consturctor - - @param space_id space in which table reisde - @param name table name - @param tablespace_flags tablespace flags use for recreating tablespace - @param log_flags page format flag - @param recv_lsn lsn of redo log record. */ - truncate_t( - ulint space_id, - const char* name, - ulint tablespace_flags, - ulint log_flags, - lsn_t recv_lsn); - - /** Destructor */ - ~truncate_t(); - - /** The index information of MLOG_FILE_TRUNCATE redo record */ - struct index_t { - - /* Default copy constructor and destructor should be OK. */ - - index_t(); - - /** - Set the truncate log values for a compressed table. 
- @return DB_CORRUPTION or error code */ - dberr_t set(const dict_index_t* index); - - typedef std::vector<byte, ut_allocator<byte> > fields_t; - - /** Index id */ - index_id_t m_id; - - /** Index type */ - ulint m_type; - - /** Root Page Number */ - ulint m_root_page_no; - - /** New Root Page Number. - Note: This field is not persisted to TRUNCATE log but used - during truncate table fix-up for updating SYS_XXXX tables. */ - ulint m_new_root_page_no; - - /** Number of index fields */ - ulint m_n_fields; - - /** DATA_TRX_ID column position. */ - ulint m_trx_id_pos; - - /** Compressed table field meta data, encode by - page_zip_fields_encode. Empty for non-compressed tables. - Should be NUL terminated. */ - fields_t m_fields; - }; - - /** - @return the directory path, can be NULL */ - const char* get_dir_path() const - { - return(m_dir_path); - } - - /** - Register index information - - @param index index information logged as part of truncate log. */ - void add(index_t& index) - { - m_indexes.push_back(index); - } - - /** - Add table to truncate post recovery. - - @param ptr table information need to complete truncate of table. */ - static void add(truncate_t* ptr) - { - s_tables.push_back(ptr); - } - - /** - Clear registered index vector */ - void clear() - { - m_indexes.clear(); - } - - /** - @return old table id of the table to truncate */ - table_id_t old_table_id() const - { - return(m_old_table_id); - } - - /** - @return new table id of the table to truncate */ - table_id_t new_table_id() const - { - return(m_new_table_id); - } - - /** - Update root page number in SYS_XXXX tables. - - @param trx transaction object - @param table_id table id for which information needs to - be updated. - @param reserve_dict_mutex if TRUE, acquire/release - dict_sys->mutex around call to pars_sql. 
- @param mark_index_corrupted if true, then mark index corrupted - @return DB_SUCCESS or error code */ - dberr_t update_root_page_no( - trx_t* trx, - table_id_t table_id, - ibool reserve_dict_mutex, - bool mark_index_corrupted) const; - - /** Create an index for a table. - @param[in] table_name table name, for which to create - the index - @param[in,out] space tablespace - @param[in] index_type type of index to truncate - @param[in] index_id id of index to truncate - @param[in] btr_redo_create_info control info for ::btr_create() - @param[in,out] mtr mini-transaction covering the - create index - @return root page no or FIL_NULL on failure */ - inline ulint create_index( - const char* table_name, - fil_space_t* space, - ulint index_type, - index_id_t index_id, - const btr_create_t& btr_redo_create_info, - mtr_t* mtr) const; - - /** Create the indexes for a table - @param[in] table_name table name, for which to create the - indexes - @param[in,out] space tablespace - @param[in] format_flags page format flags - @return DB_SUCCESS or error code. */ - inline dberr_t create_indexes( - const char* table_name, - fil_space_t* space, - ulint format_flags); - - /** Check if index has been modified since TRUNCATE log snapshot - was recorded. - @param[in] space tablespace - @param[in] root_page_no index root page number - @return true if modified else false */ - inline bool is_index_modified_since_logged( - const fil_space_t* space, - ulint root_page_no) const; - - /** Drop indexes for a table. - @param[in,out] space tablespace - @return DB_SUCCESS or error code. */ - void drop_indexes(fil_space_t* space) const; - - /** - Parses log record during recovery - @param start_ptr buffer containing log body to parse - @param end_ptr buffer end - - @return DB_SUCCESS or error code */ - dberr_t parse( - byte* start_ptr, - const byte* end_ptr); - - /** Parse MLOG_TRUNCATE log record from REDO log file during recovery. 
- @param[in,out] start_ptr buffer containing log body to parse - @param[in] end_ptr buffer end - @param[in] space_id tablespace identifier - @return parsed upto or NULL. */ - static byte* parse_redo_entry( - byte* start_ptr, - const byte* end_ptr, - ulint space_id); - - /** - Write a log record for truncating a single-table tablespace. - - @param start_ptr buffer to write log record - @param end_ptr buffer end - @param space_id space id - @param tablename the table name in the usual - databasename/tablename format of InnoDB - @param flags tablespace flags - @param format_flags page format - @param lsn lsn while logging */ - dberr_t write( - byte* start_ptr, - byte* end_ptr, - ulint space_id, - const char* tablename, - ulint flags, - ulint format_flags, - lsn_t lsn) const; - - /** - @return number of indexes parsed from the truncate log record */ - size_t indexes() const; - - /** - Truncate a single-table tablespace. The tablespace must be cached - in the memory cache. - - Note: This is defined in fil0fil.cc because it needs to access some - types that are local to that file. - - @param space_id space id - @param dir_path directory path - @param tablename the table name in the usual - databasename/tablename format of InnoDB - @param flags tablespace flags - @param default_size if true, truncate to default size if tablespace - is being newly re-initialized. - @return DB_SUCCESS or error */ - static dberr_t truncate( - ulint space_id, - const char* dir_path, - const char* tablename, - ulint flags, - bool default_size); - - /** - Fix the table truncate by applying information parsed from TRUNCATE log. - Fix-up includes re-creating table (drop and re-create indexes) - @return error code or DB_SUCCESS */ - static dberr_t fixup_tables_in_system_tablespace(); - - /** - Fix the table truncate by applying information parsed from TRUNCATE log. - Fix-up includes re-creating tablespace. 
- @return error code or DB_SUCCESS */ - static dberr_t fixup_tables_in_non_system_tablespace(); - - /** - Check whether a tablespace was truncated during recovery - @param space_id tablespace id to check - @return true if the tablespace was truncated */ - static bool is_tablespace_truncated(ulint space_id); - - /** Was tablespace truncated (on crash before checkpoint). - If the MLOG_TRUNCATE redo-record is still available then tablespace - was truncated and checkpoint is yet to happen. - @param[in] space_id tablespace id to check. - @return true if tablespace was truncated. */ - static bool was_tablespace_truncated(ulint space_id); - - /** Get the lsn associated with space. - @param[in] space_id tablespace id to check. - @return associated lsn. */ - static lsn_t get_truncated_tablespace_init_lsn(ulint space_id); - -private: - typedef std::vector<index_t, ut_allocator<index_t> > indexes_t; - - /** Space ID of tablespace */ - ulint m_space_id; - - /** ID of table that is being truncated. */ - table_id_t m_old_table_id; - - /** New ID that will be assigned to table on truncation. */ - table_id_t m_new_table_id; - - /** Data dir path of tablespace */ - char* m_dir_path; - - /** Table name */ - char* m_tablename; - - /** Tablespace Flags */ - ulint m_tablespace_flags; - - /** Format flags (log flags; stored in page-no field of header) */ - ulint m_format_flags; - - /** Index meta-data */ - indexes_t m_indexes; - - /** LSN of TRUNCATE log record. */ - lsn_t m_log_lsn; - - /** Log file name. */ - char* m_log_file_name; - - /** Encryption information of the table */ - fil_encryption_t m_encryption; - uint32_t m_key_id; - - /** Vector of tables to truncate. */ - typedef std::vector<truncate_t*, ut_allocator<truncate_t*> > - tables_t; - - /** Information about tables to truncate post recovery */ - static tables_t s_tables; - - /** Information about truncated table - This is case when truncate is complete but checkpoint hasn't. 
*/ - typedef std::map<ulint, lsn_t> truncated_tables_t; - static truncated_tables_t s_truncated_tables; - -public: - /** If true then fix-up of table is active and so while creating - index instead of grabbing information from dict_index_t, grab it - from parsed truncate log record. */ - static bool s_fix_up_active; -}; - -/** -Parse truncate log file. */ -class TruncateLogParser { - -public: - - /** - Scan and Parse truncate log files. - - @param dir_path look for log directory in following path - @return DB_SUCCESS or error code. */ - static dberr_t scan_and_parse( - const char* dir_path); - -private: - typedef std::vector<char*, ut_allocator<char*> > - trunc_log_files_t; - -private: - /** - Scan to find out truncate log file from the given directory path. - - @param dir_path look for log directory in following path. - @param log_files cache to hold truncate log file name found. - @return DB_SUCCESS or error code. */ - static dberr_t scan( - const char* dir_path, - trunc_log_files_t& log_files); - - /** - Parse the log file and populate table to truncate information. - (Add this table to truncate information to central vector that is then - used by truncate fix-up routine to fix-up truncate action of the table.) - - @param log_file_name log file to parse - @return DB_SUCCESS or error code. */ - static dberr_t parse( - const char* log_file_name); -}; - -#endif /* row0trunc_h */ diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h index 5f1e46c6a4d..048b161b884 100644 --- a/storage/innobase/include/row0types.h +++ b/storage/innobase/include/row0types.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, MariaDB Corporation. +Copyright (c) 2018, 2020, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,8 +24,8 @@ Row operation global types Created 12/27/1996 Heikki Tuuri *******************************************************/ -#ifndef row0types_h -#define row0types_h +#pragma once +#include "buf0types.h" struct plan_t; @@ -146,5 +146,3 @@ public: return first_use; } }; - -#endif diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h index a18d154c132..4357a908ca3 100644 --- a/storage/innobase/include/row0undo.h +++ b/storage/innobase/include/row0undo.h @@ -82,17 +82,20 @@ that index record. */ enum undo_exec { UNDO_NODE_FETCH_NEXT = 1, /*!< we should fetch the next undo log record */ - UNDO_NODE_INSERT, /*!< undo a fresh insert of a - row to a table */ - UNDO_NODE_MODIFY /*!< undo a modify operation - (DELETE or UPDATE) on a row - of a table */ + /** rollback an insert into persistent table */ + UNDO_INSERT_PERSISTENT, + /** rollback an update (or delete) in a persistent table */ + UNDO_UPDATE_PERSISTENT, + /** rollback an insert into temporary table */ + UNDO_INSERT_TEMPORARY, + /** rollback an update (or delete) in a temporary table */ + UNDO_UPDATE_TEMPORARY, }; /** Undo node structure */ struct undo_node_t{ que_common_t common; /*!< node type: QUE_NODE_UNDO */ - enum undo_exec state; /*!< node execution state */ + undo_exec state; /*!< rollback execution state */ trx_t* trx; /*!< trx for which undo is done */ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */ trx_undo_rec_t* undo_rec;/*!< undo log record */ diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h index b34acfd8dc1..677af76c561 100644 --- a/storage/innobase/include/row0upd.h +++ b/storage/innobase/include/row0upd.h @@ -101,19 +101,6 @@ upd_get_field_by_field_no( bool is_virtual) /*!< in: if it is a virtual column */ MY_ATTRIBUTE((warn_unused_result)); 
/*********************************************************************//** -Writes into the redo log the values of trx id and roll ptr and enough info -to determine their positions within a clustered index record. -@return new pointer to mlog */ -byte* -row_upd_write_sys_vals_to_log( -/*==========================*/ - dict_index_t* index, /*!< in: clustered index */ - trx_id_t trx_id, /*!< in: transaction id */ - roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ - byte* log_ptr,/*!< pointer to a buffer of size > 20 opened - in mlog */ - mtr_t* mtr); /*!< in: mtr */ -/*********************************************************************//** Updates the trx id and roll ptr field in a clustered index record when a row is updated or marked deleted. */ UNIV_INLINE @@ -128,18 +115,6 @@ row_upd_rec_sys_fields( const trx_t* trx, /*!< in: transaction */ roll_ptr_t roll_ptr);/*!< in: DB_ROLL_PTR to the undo log */ /*********************************************************************//** -Sets the trx id or roll ptr field of a clustered index entry. */ -void -row_upd_index_entry_sys_field( -/*==========================*/ - dtuple_t* entry, /*!< in/out: index entry, where the memory - buffers for sys fields are already allocated: - the function just copies the new values to - them */ - dict_index_t* index, /*!< in: clustered index */ - ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */ - ib_uint64_t val); /*!< in: value to write */ -/*********************************************************************//** Creates an update node for a query graph. 
@return own: update node */ upd_node_t* @@ -482,6 +457,14 @@ struct upd_t{ return false; } + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE */ + bool is_metadata() const { return dtuple_t::is_metadata(info_bits); } + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const + { return dtuple_t::is_alter_metadata(info_bits); } + #ifdef UNIV_DEBUG bool validate() const { @@ -495,7 +478,6 @@ struct upd_t{ return(true); } #endif // UNIV_DEBUG - }; /** Kinds of update operation */ diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic index e1368a14e63..fffb7650da3 100644 --- a/storage/innobase/include/row0upd.ic +++ b/storage/innobase/include/row0upd.ic @@ -167,13 +167,13 @@ row_upd_rec_sys_fields( const trx_t* trx, /*!< in: transaction */ roll_ptr_t roll_ptr)/*!< in: DB_ROLL_PTR to the undo log */ { - ut_ad(dict_index_is_clust(index)); + ut_ad(index->is_primary()); ut_ad(rec_offs_validate(rec, index, offsets)); - if (page_zip) { - ulint pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + if (UNIV_LIKELY_NULL(page_zip)) { page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets, - pos, trx->id, roll_ptr); + index->db_trx_id(), + trx->id, roll_ptr); } else { ulint offset = index->trx_id_offset; diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 84e8ece2d77..10730366401 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -37,6 +37,8 @@ Created 12/15/2009 Jimmy Yang #endif /* __STDC_LIMIT_MACROS */ #include <stdint.h> +#include "my_atomic.h" +#include "my_atomic_wrapper.h" /** Possible status values for "mon_status" in "struct monitor_value" */ enum monitor_running_status { @@ -177,7 +179,6 @@ enum monitor_id_t { MONITOR_OVLD_INDEX_PAGES_WRITTEN, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN, MONITOR_OVLD_PAGES_READ, - MONITOR_OVLD_PAGES0_READ, 
MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED, MONITOR_OVLD_BYTE_READ, @@ -300,7 +301,6 @@ enum monitor_id_t { MONITOR_TRX_COMMIT_UNDO, MONITOR_TRX_ROLLBACK, MONITOR_TRX_ROLLBACK_SAVEPOINT, - MONITOR_TRX_ROLLBACK_ACTIVE, MONITOR_TRX_ACTIVE, MONITOR_RSEG_HISTORY_LEN, MONITOR_NUM_UNDO_SLOT_USED, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 6c575733710..e1d37613dc9 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -47,7 +47,6 @@ Created 10/10/1995 Heikki Tuuri #include "que0types.h" #include "trx0types.h" #include "srv0conc.h" -#include "buf0checksum.h" #include "fil0fil.h" #include "mysql/psi/mysql_stage.h" @@ -144,7 +143,8 @@ struct srv_stats_t ulint_ctr_1_t n_lock_wait_count; /** Number of threads currently waiting on database locks */ - simple_atomic_counter<> n_lock_wait_current_count; + MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<ulint> + n_lock_wait_current_count; /** Number of rows read. 
*/ ulint_ctr_64_t n_rows_read; @@ -176,9 +176,6 @@ struct srv_stats_t /** Number of times prefix optimization avoided triggering cluster lookup */ ulint_ctr_64_t n_sec_rec_cluster_reads_avoided; - /** Number of times page 0 is read from tablespace */ - ulint_ctr_64_t page0_read; - /** Number of encryption_get_latest_key_version calls */ ulint_ctr_64_t n_key_requests; @@ -457,7 +454,7 @@ extern uint srv_fast_shutdown; /*!< If this is 1, do not do a /** Signal to shut down InnoDB (NULL if shutdown was signaled, or if running in innodb_read_only mode, srv_read_only_mode) */ -extern st_my_thread_var *srv_running; +extern std::atomic<st_my_thread_var *> srv_running; extern ibool srv_innodb_status; @@ -565,7 +562,6 @@ extern uint srv_sys_space_size_debug; extern bool srv_log_files_created; #endif /* UNIV_DEBUG */ -#define SRV_SEMAPHORE_WAIT_EXTENSION 7200 extern ulint srv_dml_needed_delay; #define SRV_MAX_N_IO_THREADS 130 @@ -929,23 +925,6 @@ srv_purge_wakeup(); /** Shut down the purge threads. */ void srv_purge_shutdown(); -/** Check if tablespace is being truncated. -(Ignore system-tablespace as we don't re-create the tablespace -and so some of the action that are suppressed by this function -for independent tablespace are not applicable to system-tablespace). -@param space_id space_id to check for truncate action -@return true if being truncated, false if not being - truncated or tablespace is system-tablespace. */ -bool -srv_is_tablespace_truncated(ulint space_id); - -/** Check if tablespace was truncated. -@param[in] space space object to check for truncate action -@return true if tablespace was truncated and we still have an active -MLOG_TRUNCATE REDO log record. */ -bool -srv_was_tablespace_truncated(const fil_space_t* space); - #ifdef UNIV_DEBUG /** Disables master thread. It's used by: SET GLOBAL innodb_master_thread_disabled_debug = 1 (0). 
@@ -1000,7 +979,6 @@ struct export_var_t{ ulint innodb_page_size; /*!< srv_page_size */ ulint innodb_pages_created; /*!< buf_pool->stat.n_pages_created */ ulint innodb_pages_read; /*!< buf_pool->stat.n_pages_read*/ - ulint innodb_page0_read; /*!< srv_stats.page0_read */ ulint innodb_pages_written; /*!< buf_pool->stat.n_pages_written */ ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */ diff --git a/storage/innobase/include/sync0arr.ic b/storage/innobase/include/sync0arr.ic index 9163d5b6614..962226b4934 100644 --- a/storage/innobase/include/sync0arr.ic +++ b/storage/innobase/include/sync0arr.ic @@ -44,8 +44,7 @@ sync_array_get() return(sync_wait_array[0]); } - return(sync_wait_array[default_indexer_t<>::get_rnd_index() - % sync_array_size]); + return(sync_wait_array[get_rnd_value() % sync_array_size]); } /******************************************************************//** diff --git a/storage/innobase/include/sync0policy.h b/storage/innobase/include/sync0policy.h index 4e48f1e2720..94f49ff628c 100644 --- a/storage/innobase/include/sync0policy.h +++ b/storage/innobase/include/sync0policy.h @@ -30,247 +30,176 @@ Created 2012-08-21 Sunny Bains. 
#include "ut0rnd.h" #include "os0thread.h" #include "srv0mon.h" +#include "sync0debug.h" #ifdef UNIV_DEBUG -# define MUTEX_MAGIC_N 979585UL - -template <typename Mutex> -class MutexDebug { -public: - - /** For passing context to SyncDebug */ - struct Context : public latch_t { - - /** Constructor */ - Context() - : - m_mutex(), - m_filename(), - m_line(), - m_thread_id(ULINT_UNDEFINED) - { - /* No op */ - } - - /** Create the context for SyncDebug - @param[in] id ID of the latch to track */ - Context(latch_id_t id) - : - latch_t(id) - { - ut_ad(id != LATCH_ID_NONE); - } - - /** Set to locked state - @param[in] mutex The mutex to acquire - @param[in] filename File name from where to acquire - @param[in] line Line number in filename */ - void locked( - const Mutex* mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - m_mutex = mutex; - - my_atomic_storelint(&m_thread_id, - ulint(os_thread_get_curr_id())); - - m_filename = filename; - - m_line = line; - } - - /** Reset to unlock state */ - void release() - UNIV_NOTHROW - { - m_mutex = NULL; - - my_atomic_storelint(&m_thread_id, ULINT_UNDEFINED); - - m_filename = NULL; - - m_line = 0; - } - - /** Print information about the latch - @return the string representation */ - virtual std::string to_string() const - UNIV_NOTHROW - { - std::ostringstream msg; - - msg << m_mutex->policy().to_string(); - - if (m_thread_id != ULINT_UNDEFINED) { - - msg << " addr: " << m_mutex - << " acquired: " << locked_from().c_str(); - - } else { - msg << "Not locked"; - } - - return(msg.str()); - } - - /** @return the name of the file and line number in the file - from where the mutex was acquired "filename:line" */ - virtual std::string locked_from() const - { - std::ostringstream msg; - - msg << sync_basename(m_filename) << ":" << m_line; - - return(std::string(msg.str())); - } - - /** Mutex to check for lock order violation */ - const Mutex* m_mutex; - - /** Filename from where enter was called */ - const char* m_filename; 
- - /** Line mumber in filename */ - unsigned m_line; - - /** Thread ID of the thread that own(ed) the mutex */ - ulint m_thread_id; - }; - - /** Constructor. */ - MutexDebug() - : - m_magic_n(), - m_context() - UNIV_NOTHROW - { - /* No op */ - } - - /* Destructor */ - virtual ~MutexDebug() { } - - /** Mutex is being destroyed. */ - void destroy() UNIV_NOTHROW - { - ut_ad((ulint)my_atomic_loadlint(&m_context.m_thread_id) == ULINT_UNDEFINED); - - m_magic_n = 0; - - m_context.m_thread_id = 0; - } - - /** Called when the mutex is "created". Note: Not from the constructor - but when the mutex is initialised. - @param[in] id Mutex ID */ - void init(latch_id_t id) UNIV_NOTHROW; - - /** Called when an attempt is made to lock the mutex - @param[in] mutex Mutex instance to be locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void enter( - const Mutex* mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW; - - /** Called when the mutex is locked - @param[in] mutex Mutex instance that was locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void locked( - const Mutex* mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW; - - /** Called when the mutex is released - @param[in] mutx Mutex that was released */ - void release(const Mutex* mutex) - UNIV_NOTHROW; - - /** @return true if thread owns the mutex */ - bool is_owned() const UNIV_NOTHROW - { - return(os_thread_eq( - (os_thread_id_t)my_atomic_loadlint(&m_context.m_thread_id), - os_thread_get_curr_id())); - } - - /** @return the name of the file from the mutex was acquired */ - const char* get_enter_filename() const - UNIV_NOTHROW - { - return(m_context.m_filename); - } - - /** @return the name of the file from the mutex was acquired */ - unsigned get_enter_line() const - UNIV_NOTHROW - { - return(m_context.m_line); - } - - /** @return id of the thread that was 
trying to acquire the mutex */ - os_thread_id_t get_thread_id() const - UNIV_NOTHROW - { - return((os_thread_id_t)my_atomic_loadlint(&m_context.m_thread_id)); - } +template <typename Mutex> class MutexDebug: public latch_t +{ + /** Mutex to check for lock order violation */ + const Mutex *m_mutex; + /** Filename from where enter was called */ + const char *m_filename; + /** Line mumber in filename */ + unsigned m_line; + /** Thread ID of the thread that owns the mutex */ + os_thread_id_t m_thread_id; + /** Mutex protecting the above members */ + mutable OSMutex m_debug_mutex; + + + void set(const Mutex *mutex, const char *filename, unsigned line, + os_thread_id_t thread_id) + { + m_debug_mutex.enter(); + m_mutex= mutex; + m_filename= filename; + m_line= line; + m_thread_id= thread_id; + m_debug_mutex.exit(); + } + + + const MutexDebug get() const + { + MutexDebug ret; + m_debug_mutex.enter(); + ret.m_mutex= m_mutex; + ret.m_filename= m_filename; + ret.m_line= m_line; + ret.m_thread_id= m_thread_id; + m_debug_mutex.exit(); + return ret; + } + + + /** + Called either when mutex is locked or destroyed. Thus members are protected + from concurrent modification. + */ + void assert_clean_context() + { + ut_ad(!m_mutex); + ut_ad(!m_filename); + ut_ad(!m_line); + ut_ad(m_thread_id == os_thread_id_t(ULINT_UNDEFINED)); + } - /** Magic number to check for memory corruption. */ - ulint m_magic_n; - /** Latch state of the mutex owner */ - Context m_context; +public: + /** + Called when the mutex is "created". Note: Not from the constructor + but when the mutex is initialised. + @param[in] id Mutex ID + */ + void init(latch_id_t id) + { + ut_ad(id != LATCH_ID_NONE); + m_id= id; + m_debug_mutex.init(); + set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + } + + + /** Mutex is being destroyed. 
*/ + void destroy() + { + assert_clean_context(); + m_debug_mutex.destroy(); + } + + + /** + Called when an attempt is made to lock the mutex + @param[in] mutex Mutex instance to be locked + @param[in] filename Filename from where it was called + @param[in] line Line number from where it was called + */ + void enter(const Mutex &mutex, const char *filename, unsigned line) + { + MutexDebug context; + ut_ad(!is_owned()); + context.init(m_id); + context.set(&mutex, filename, line, os_thread_get_curr_id()); + /* Check for latch order violation. */ + sync_check_lock_validate(&context); + context.set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + context.destroy(); + } + + + /** + Called when the mutex is locked + @param[in] mutex Mutex instance that was locked + @param[in] filename Filename from where it was called + @param[in] line Line number from where it was called + */ + void locked(const Mutex &mutex, const char *filename, unsigned line) + { + assert_clean_context(); + set(&mutex, filename, line, os_thread_get_curr_id()); + sync_check_lock_granted(this); + } + + + /** + Called when the mutex is released + @param[in] mutex Mutex that was released + */ + void release(const Mutex &mutex) + { + ut_ad(is_owned()); + set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED)); + sync_check_unlock(this); + } + + + /** @return true if thread owns the mutex */ + bool is_owned() const + { + return os_thread_eq(get_thread_id(), os_thread_get_curr_id()); + } + + + /** @return the name of the file from the mutex was acquired */ + const char* get_enter_filename() const { return get().m_filename; } + + + /** @return the name of the file from the mutex was acquired */ + unsigned get_enter_line() const { return get().m_line; } + + + /** @return id of the thread that was trying to acquire the mutex */ + os_thread_id_t get_thread_id() const { return get().m_thread_id; } + + + /** + Print information about the latch + @return the string representation + */ + virtual std::string to_string() const + { 
+ std::ostringstream msg; + const MutexDebug ctx= get(); + + msg << m_mutex->policy().to_string(); + if (ctx.m_mutex) + msg << " addr: " << ctx.m_mutex << " acquired: " + << sync_basename(ctx.get_enter_filename()) << ":" + << ctx.get_enter_line(); + else + msg << "Not locked"; + + return(msg.str()); + } }; #endif /* UNIV_DEBUG */ -/* Do nothing */ -template <typename Mutex> -struct NoPolicy { - /** Default constructor. */ - NoPolicy() { } - - void init(const Mutex&, latch_id_t, const char*, uint32_t) - UNIV_NOTHROW { } - void destroy() UNIV_NOTHROW { } - void enter(const Mutex&, const char*, unsigned) UNIV_NOTHROW { } - void add(uint32_t, uint32_t) UNIV_NOTHROW { } - void locked(const Mutex&, const char*, ulint) UNIV_NOTHROW { } - void release(const Mutex&) UNIV_NOTHROW { } - std::string to_string() const { return(""); }; - latch_id_t get_id() const; -}; - /** Collect the metrics per mutex instance, no aggregation. */ template <typename Mutex> struct GenericPolicy -#ifdef UNIV_DEBUG -: public MutexDebug<Mutex> -#endif /* UNIV_DEBUG */ { public: - typedef Mutex MutexType; - - /** Constructor. */ - GenericPolicy() - UNIV_NOTHROW - : -#ifdef UNIV_DEBUG - MutexDebug<MutexType>(), -#endif /* UNIV_DEBUG */ - m_count(), - m_id() - { } - - /** Destructor */ - ~GenericPolicy() { } - /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. @param[in] id Mutex ID @@ -292,8 +221,6 @@ public: meta.get_counter()->single_register(&m_count); sync_file_created_register(this, filename, uint16_t(line)); - - ut_d(MutexDebug<MutexType>::init(m_id)); } /** Called when the mutex is destroyed. */ @@ -305,8 +232,6 @@ public: meta.get_counter()->single_deregister(&m_count); sync_file_created_deregister(this); - - ut_d(MutexDebug<MutexType>::destroy()); } /** Called after a successful mutex acquire. 
@@ -332,40 +257,6 @@ public: ++m_count.m_calls; } - /** Called when an attempt is made to lock the mutex - @param[in] mutex Mutex instance to be locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void enter( - const MutexType& mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::enter(&mutex, filename, line)); - } - - /** Called when the mutex is locked - @param[in] mutex Mutex instance that is locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void locked( - const MutexType& mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::locked(&mutex, filename, line)); - } - - /** Called when the mutex is released - @param[in] mutex Mutex instance that is released */ - void release(const MutexType& mutex) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::release(&mutex)); - } - /** Print the information about the latch @return the string representation */ std::string print() const @@ -378,14 +269,18 @@ public: return(m_id); } - /** @return the string representation */ - std::string to_string() const; -private: - typedef latch_meta_t::CounterType Counter; + /** @return the string representation */ + std::string to_string() const + { return sync_mutex_to_string(get_id(), sync_file_created_get(this)); } - /** The user visible counters, registered with the meta-data. */ - Counter::Count m_count; +#ifdef UNIV_DEBUG + MutexDebug<Mutex> context; +#endif + +private: + /** The user visible counters, registered with the meta-data. */ + latch_meta_t::CounterType::Count m_count; /** Latch meta data ID */ latch_id_t m_id; @@ -395,29 +290,8 @@ private: too many of them to count individually. 
*/ template <typename Mutex> class BlockMutexPolicy -#ifdef UNIV_DEBUG -: public MutexDebug<Mutex> -#endif /* UNIV_DEBUG */ { public: - typedef Mutex MutexType; - typedef typename latch_meta_t::CounterType::Count Count; - - /** Default constructor. */ - BlockMutexPolicy() - : -#ifdef UNIV_DEBUG - MutexDebug<MutexType>(), -#endif /* UNIV_DEBUG */ - m_count(), - m_id() - { - /* Do nothing */ - } - - /** Destructor */ - ~BlockMutexPolicy() { } - /** Called when the mutex is "created". Note: Not from the constructor but when the mutex is initialised. @param[in] id Mutex ID */ @@ -436,8 +310,6 @@ public: ut_ad(meta.get_id() == id); m_count = meta.get_counter()->sum_register(); - - ut_d(MutexDebug<MutexType>::init(m_id)); } /** Called when the mutex is destroyed. */ @@ -445,7 +317,6 @@ public: UNIV_NOTHROW { m_count = NULL; - ut_d(MutexDebug<MutexType>::destroy()); } /** Called after a successful mutex acquire. @@ -469,40 +340,6 @@ public: ++m_count->m_calls; } - /** Called when the mutex is locked - @param[in] mutex Mutex instance that is locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void locked( - const MutexType& mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::locked(&mutex, filename, line)); - } - - /** Called when the mutex is released - @param[in] mutex Mutex instance that is released */ - void release(const MutexType& mutex) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::release(&mutex)); - } - - /** Called when an attempt is made to lock the mutex - @param[in] mutex Mutex instance to be locked - @param[in] filename Filename from where it was called - @param[in] line Line number from where it was called */ - void enter( - const MutexType& mutex, - const char* filename, - unsigned line) - UNIV_NOTHROW - { - ut_d(MutexDebug<MutexType>::enter(&mutex, filename, line)); - } - /** Print the information about the latch @return the string 
representation */ std::string print() const @@ -514,19 +351,26 @@ public: return(m_id); } - /** @return the string representation */ - std::string to_string() const; -private: - typedef latch_meta_t::CounterType Counter; + /** + I don't think it makes sense to keep track of the file name + and line number for each block mutex. Too much of overhead. Use the + latch id to figure out the location from the source. + + @return the string representation + */ + std::string to_string() const + { return(sync_mutex_to_string(get_id(), "buf0buf.cc:0")); } + +#ifdef UNIV_DEBUG + MutexDebug<Mutex> context; +#endif - /** The user visible counters, registered with the meta-data. */ - Counter::Count* m_count; +private: + /** The user visible counters, registered with the meta-data. */ + latch_meta_t::CounterType::Count *m_count; /** Latch meta data ID */ latch_id_t m_id; }; - -#include "sync0policy.ic" - #endif /* sync0policy_h */ diff --git a/storage/innobase/include/sync0policy.ic b/storage/innobase/include/sync0policy.ic deleted file mode 100644 index e7aeb2e16bb..00000000000 --- a/storage/innobase/include/sync0policy.ic +++ /dev/null @@ -1,101 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/******************************************************************//** -@file include/sync0policy.ic -Policy for mutexes. - -Created 2012-08-21 Sunny Bains. -***********************************************************************/ - -#include "sync0debug.h" - -template <typename Mutex> -std::string GenericPolicy<Mutex>::to_string() const -{ - return(sync_mutex_to_string(get_id(), sync_file_created_get(this))); -} - -template <typename Mutex> -std::string BlockMutexPolicy<Mutex>::to_string() const -{ - /* I don't think it makes sense to keep track of the file name - and line number for each block mutex. Too much of overhead. Use the - latch id to figure out the location from the source. */ - return(sync_mutex_to_string(get_id(), "buf0buf.cc:0")); -} - -#ifdef UNIV_DEBUG - -template <typename Mutex> -void MutexDebug<Mutex>::init(latch_id_t id) - UNIV_NOTHROW -{ - m_context.m_id = id; - - m_context.release(); - - m_magic_n = MUTEX_MAGIC_N; -} - -template <typename Mutex> -void MutexDebug<Mutex>::enter( - const Mutex* mutex, - const char* name, - unsigned line) - UNIV_NOTHROW -{ - ut_ad(!is_owned()); - - Context context(m_context.get_id()); - - context.locked(mutex, name, line); - - /* Check for latch order violation. 
*/ - - sync_check_lock_validate(&context); -} - -template <typename Mutex> -void MutexDebug<Mutex>::locked( - const Mutex* mutex, - const char* name, - unsigned line) - UNIV_NOTHROW -{ - ut_ad(!is_owned()); - ut_ad(m_context.m_thread_id == ULINT_UNDEFINED); - - m_context.locked(mutex, name, line); - - sync_check_lock_granted(&m_context); -} - -template <typename Mutex> -void MutexDebug<Mutex>::release(const Mutex*) - UNIV_NOTHROW -{ - ut_ad(is_owned()); - - m_context.release(); - - sync_check_unlock(&m_context); -} - -#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h index 5de22c74fa1..48528eb4d30 100644 --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -568,11 +568,11 @@ struct rw_lock_t : public latch_t #endif /* UNIV_DEBUG */ { - /** Holds the state of the lock. */ - int32_t lock_word; + /** Holds the state of the lock. */ + Atomic_relaxed<int32_t> lock_word; - /** 1: there are waiters */ - int32_t waiters; + /** 0=no waiters, 1=waiters for X or SX lock exist */ + Atomic_relaxed<uint32_t> waiters; /** number of granted SX locks. */ volatile ulint sx_recursive; @@ -625,8 +625,7 @@ struct rw_lock_t #endif /* UNIV_PFS_RWLOCK */ #ifdef UNIV_DEBUG - virtual std::string to_string() const; - virtual std::string locked_from() const; + std::string to_string() const override; /** In the debug version: pointer to the debug info list of the lock */ UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list; @@ -634,7 +633,6 @@ struct rw_lock_t /** Level in the global latching order. 
*/ latch_level_t level; #endif /* UNIV_DEBUG */ - }; #ifdef UNIV_DEBUG /** The structure for storing debug info of an rw-lock. All access to this diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic index 15f8ff3fe62..603e902d01c 100644 --- a/storage/innobase/include/sync0rw.ic +++ b/storage/innobase/include/sync0rw.ic @@ -2,7 +2,7 @@ Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -77,8 +77,7 @@ rw_lock_get_writer( /*===============*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - int32_t lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = lock->lock_word; ut_ad(lock_word <= X_LOCK_DECR); if (lock_word > X_LOCK_HALF_DECR) { @@ -110,8 +109,7 @@ rw_lock_get_reader_count( /*=====================*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - int32_t lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = lock->lock_word; ut_ad(lock_word <= X_LOCK_DECR); if (lock_word > X_LOCK_HALF_DECR) { @@ -147,8 +145,7 @@ rw_lock_get_x_lock_count( /*=====================*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - int32_t lock_copy = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t lock_copy = lock->lock_word; ut_ad(lock_copy <= X_LOCK_DECR); if (lock_copy == 0 || lock_copy == -X_LOCK_HALF_DECR) { @@ -181,8 +178,7 @@ rw_lock_get_sx_lock_count( const rw_lock_t* lock) /*!< in: rw-lock */ { #ifdef UNIV_DEBUG - int32_t lock_copy = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), - MY_MEMORY_ORDER_RELAXED); + int32_t 
lock_copy = lock->lock_word; ut_ad(lock_copy <= X_LOCK_DECR); @@ -213,14 +209,15 @@ rw_lock_lock_word_decr( int32_t amount, /*!< in: amount to decrement */ int32_t threshold) /*!< in: threshold of judgement */ { - int32_t lock_copy = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); + int32_t lock_copy = lock->lock_word; + while (lock_copy > threshold) { - if (my_atomic_cas32_strong_explicit(&lock->lock_word, - &lock_copy, - lock_copy - amount, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)) { + if (lock->lock_word.compare_exchange_strong( + lock_copy, + lock_copy - amount, + std::memory_order_acquire, + std::memory_order_relaxed)) { + return(true); } } @@ -304,9 +301,9 @@ rw_lock_x_lock_func_nowait( { int32_t oldval = X_LOCK_DECR; - if (my_atomic_cas32_strong_explicit(&lock->lock_word, &oldval, 0, - MY_MEMORY_ORDER_ACQUIRE, - MY_MEMORY_ORDER_RELAXED)) { + if (lock->lock_word.compare_exchange_strong(oldval, 0, + std::memory_order_acquire, + std::memory_order_relaxed)) { lock->writer_thread = os_thread_get_curr_id(); } else if (os_thread_eq(lock->writer_thread, os_thread_get_curr_id())) { @@ -316,12 +313,12 @@ rw_lock_x_lock_func_nowait( observe consistent values. 
*/ if (oldval == 0 || oldval == -X_LOCK_HALF_DECR) { /* There are 1 x-locks */ - my_atomic_add32_explicit(&lock->lock_word, -X_LOCK_DECR, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_sub(X_LOCK_DECR, + std::memory_order_relaxed); } else if (oldval <= -X_LOCK_DECR) { /* There are 2 or more x-locks */ - my_atomic_add32_explicit(&lock->lock_word, -1, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_sub(1, + std::memory_order_relaxed); /* Watch for too many recursive locks */ ut_ad(oldval < 1); } else { @@ -355,27 +352,21 @@ rw_lock_s_unlock_func( #endif /* UNIV_DEBUG */ rw_lock_t* lock) /*!< in/out: rw-lock */ { -#ifdef UNIV_DEBUG - int32_t dbg_lock_word = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); - ut_ad(dbg_lock_word > -X_LOCK_DECR); - ut_ad(dbg_lock_word != 0); - ut_ad(dbg_lock_word < X_LOCK_DECR); -#endif - ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_S)); /* Increment lock_word to indicate 1 less reader */ - int32_t lock_word = my_atomic_add32_explicit(&lock->lock_word, 1, - MY_MEMORY_ORDER_RELEASE) + 1; - if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) { + int32_t lock_word = lock->lock_word.fetch_add( + 1, std::memory_order_release); + if (lock_word == -1 || lock_word == -X_LOCK_HALF_DECR - 1) { /* wait_ex waiter exists. It may not be asleep, but we signal anyway. 
We do not wake other waiters, because they can't exist without wait_ex waiter and wait_ex waiter goes first.*/ os_event_set(lock->wait_ex_event); sync_array_object_signalled(); - + } else { + ut_ad(lock_word > -X_LOCK_DECR); + ut_ad(lock_word < X_LOCK_DECR); } ut_ad(rw_lock_validate(lock)); @@ -393,11 +384,7 @@ rw_lock_x_unlock_func( #endif /* UNIV_DEBUG */ rw_lock_t* lock) /*!< in/out: rw-lock */ { - int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); - - ut_ad(lock_word == 0 || lock_word == -X_LOCK_HALF_DECR - || lock_word <= -X_LOCK_DECR); + int32_t lock_word = lock->lock_word; if (lock_word == 0) { /* Last caller in a possible recursive chain. */ @@ -411,31 +398,27 @@ rw_lock_x_unlock_func( ACQ_REL due to... RELEASE: we release rw-lock ACQUIRE: we want waiters to be loaded after lock_word is stored */ - my_atomic_add32_explicit(&lock->lock_word, X_LOCK_DECR, - MY_MEMORY_ORDER_ACQ_REL); + lock->lock_word.fetch_add(X_LOCK_DECR, + std::memory_order_acq_rel); /* This no longer has an X-lock but it may still have an SX-lock. So it is now free for S-locks by other threads. We need to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is a writer. */ - if (my_atomic_load32_explicit(&lock->waiters, - MY_MEMORY_ORDER_RELAXED)) { - my_atomic_store32_explicit(&lock->waiters, 0, - MY_MEMORY_ORDER_RELAXED); + if (lock->waiters) { + lock->waiters = 0; os_event_set(lock->event); sync_array_object_signalled(); } } else if (lock_word == -X_LOCK_DECR || lock_word == -(X_LOCK_DECR + X_LOCK_HALF_DECR)) { /* There are 2 x-locks */ - my_atomic_add32_explicit(&lock->lock_word, X_LOCK_DECR, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_add(X_LOCK_DECR); } else { /* There are more than 2 x-locks. 
*/ ut_ad(lock_word < -X_LOCK_DECR); - my_atomic_add32_explicit(&lock->lock_word, 1, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_add(1); } ut_ad(rw_lock_validate(lock)); @@ -461,8 +444,7 @@ rw_lock_sx_unlock_func( ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_SX)); if (lock->sx_recursive == 0) { - int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = lock->lock_word; /* Last caller in a possible recursive chain. */ if (lock_word > 0) { lock->writer_thread = 0; @@ -472,17 +454,15 @@ rw_lock_sx_unlock_func( ACQ_REL due to... RELEASE: we release rw-lock ACQUIRE: we want waiters to be loaded after lock_word is stored */ - my_atomic_add32_explicit(&lock->lock_word, X_LOCK_HALF_DECR, - MY_MEMORY_ORDER_ACQ_REL); + lock->lock_word.fetch_add(X_LOCK_HALF_DECR, + std::memory_order_acq_rel); /* Lock is now free. May have to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is an sx-lock holder. 
*/ - if (my_atomic_load32_explicit(&lock->waiters, - MY_MEMORY_ORDER_RELAXED)) { - my_atomic_store32_explicit(&lock->waiters, 0, - MY_MEMORY_ORDER_RELAXED); + if (lock->waiters) { + lock->waiters = 0; os_event_set(lock->event); sync_array_object_signalled(); } @@ -490,8 +470,7 @@ rw_lock_sx_unlock_func( /* still has x-lock */ ut_ad(lock_word == -X_LOCK_HALF_DECR || lock_word <= -(X_LOCK_DECR + X_LOCK_HALF_DECR)); - my_atomic_add32_explicit(&lock->lock_word, X_LOCK_HALF_DECR, - MY_MEMORY_ORDER_RELAXED); + lock->lock_word.fetch_add(X_LOCK_HALF_DECR); } } diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h index 8fcb1abb0e2..4d2a7c8ff28 100644 --- a/storage/innobase/include/sync0types.h +++ b/storage/innobase/include/sync0types.h @@ -28,7 +28,6 @@ Created 9/5/1995 Heikki Tuuri #define sync0types_h #include <vector> -#include <my_atomic.h> #include "ut0new.h" @@ -998,9 +997,6 @@ struct latch_t { @return the string representation */ virtual std::string to_string() const = 0; - /** @return "filename:line" from where the latch was last locked */ - virtual std::string locked_from() const = 0; - /** @return the latch level */ latch_level_t get_level() const UNIV_NOTHROW @@ -1041,7 +1037,7 @@ struct sync_checker : public sync_check_functor_t /** Check the latching constraints @param[in] level The level held by the thread @return whether a latch violation was detected */ - bool operator()(const latch_level_t level) const + bool operator()(const latch_level_t level) const override { if (some_allowed) { switch (level) { @@ -1085,7 +1081,7 @@ struct sync_allowed_latches : public sync_check_functor_t { @param[in] latch The latch level to check @return true if there is a latch violation */ - bool operator()(const latch_level_t level) const + bool operator()(const latch_level_t level) const override { return(std::find(begin, end, level) == end); } @@ -1116,51 +1112,6 @@ enum rw_lock_flag_t { #endif /* UNIV_INNOCHECKSUM */ -static inline ulint 
my_atomic_addlint(ulint *A, ulint B) -{ -#ifdef _WIN64 - return ulint(my_atomic_add64((volatile int64*)A, B)); -#else - return ulint(my_atomic_addlong(A, B)); -#endif -} - -static inline ulint my_atomic_loadlint(const ulint *A) -{ -#ifdef _WIN64 - return ulint(my_atomic_load64((volatile int64*)A)); -#else - return ulint(my_atomic_loadlong(A)); -#endif -} - -static inline lint my_atomic_addlint(volatile lint *A, lint B) -{ -#ifdef _WIN64 - return my_atomic_add64((volatile int64*)A, B); -#else - return my_atomic_addlong(A, B); -#endif -} - -static inline lint my_atomic_loadlint(const lint *A) -{ -#ifdef _WIN64 - return lint(my_atomic_load64((volatile int64*)A)); -#else - return my_atomic_loadlong(A); -#endif -} - -static inline void my_atomic_storelint(ulint *A, ulint B) -{ -#ifdef _WIN64 - my_atomic_store64((volatile int64*)A, B); -#else - my_atomic_storelong(A, B); -#endif -} - /** Simple non-atomic counter aligned to CACHE_LINE_SIZE @tparam Type the integer type of the counter */ template <typename Type> @@ -1183,28 +1134,4 @@ private: /** The counter */ Type m_counter; }; - -/** Simple atomic counter aligned to CACHE_LINE_SIZE -@tparam Type lint or ulint */ -template <typename Type = ulint> -struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) simple_atomic_counter -{ - /** Increment the counter */ - Type inc() { return add(1); } - /** Decrement the counter */ - Type dec() { return add(Type(~0)); } - - /** Add to the counter - @param[in] i amount to be added - @return the value of the counter before adding */ - Type add(Type i) { return my_atomic_addlint(&m_counter, i); } - - /** @return the value of the counter (non-atomic access)! 
*/ - operator Type() const { return m_counter; } - -private: - /** The counter */ - Type m_counter; -}; - #endif /* sync0types_h */ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 4bc5aded341..7c3343ce7d2 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -140,202 +140,6 @@ private: TrxUndoRsegs::const_iterator m_iter; }; -/* Namespace to hold all the related functions and variables need for truncate -of undo tablespace. */ -namespace undo { - - typedef std::vector<ulint> undo_spaces_t; - typedef std::vector<trx_rseg_t*> rseg_for_trunc_t; - - /** Mark completion of undo truncate action by writing magic number to - the log file and then removing it from the disk. - If we are going to remove it from disk then why write magic number ? - This is to safeguard from unlink (file-system) anomalies that will keep - the link to the file even after unlink action is successfull and - ref-count = 0. - @param[in] space_id id of the undo tablespace to truncate.*/ - void done(ulint space_id); - - /** Check if TRUNCATE_DDL_LOG file exist. - @param[in] space_id id of the undo tablespace. - @return true if exist else false. */ - bool is_log_present(ulint space_id); - - /** Track UNDO tablespace mark for truncate. */ - class Truncate { - public: - void create() - { - m_undo_for_trunc = ULINT_UNDEFINED; - m_scan_start = 1; - m_purge_rseg_truncate_frequency = - ulint(srv_purge_rseg_truncate_frequency); - } - - /** Clear the cached rollback segment. Normally done - when purge is about to shutdown. */ - void clear() - { - reset(); - rseg_for_trunc_t temp; - m_rseg_for_trunc.swap(temp); - } - - /** Is tablespace selected for truncate. - @return true if undo tablespace is marked for truncate */ - bool is_marked() const - { - return(!(m_undo_for_trunc == ULINT_UNDEFINED)); - } - - /** Mark the tablespace for truncate. - @param[in] undo_id tablespace for truncate. 
*/ - void mark(ulint undo_id) - { - m_undo_for_trunc = undo_id; - - m_scan_start = (undo_id + 1) - % (srv_undo_tablespaces_active + 1); - if (m_scan_start == 0) { - /* Note: UNDO tablespace ids starts from 1. */ - m_scan_start = 1; - } - - /* We found an UNDO-tablespace to truncate so set the - local purge rseg truncate frequency to 1. This will help - accelerate the purge action and in turn truncate. */ - m_purge_rseg_truncate_frequency = 1; - } - - /** Get the tablespace marked for truncate. - @return tablespace id marked for truncate. */ - ulint get_marked_space_id() const - { - return(m_undo_for_trunc); - } - - /** Add rseg to truncate vector. - @param[in,out] rseg rseg for truncate */ - void add_rseg_to_trunc(trx_rseg_t* rseg) - { - m_rseg_for_trunc.push_back(rseg); - } - - /** Get number of rsegs registered for truncate. - @return return number of rseg that belongs to tablespace mark - for truncate. */ - ulint rsegs_size() const - { - return(m_rseg_for_trunc.size()); - } - - /** Get ith registered rseg. - @param[in] id index of rseg to get. - @return reference to registered rseg. */ - trx_rseg_t* get_ith_rseg(ulint id) - { - ut_ad(id < m_rseg_for_trunc.size()); - return(m_rseg_for_trunc.at(id)); - } - - /** Reset for next rseg truncate. */ - void reset() - { - m_undo_for_trunc = ULINT_UNDEFINED; - m_rseg_for_trunc.clear(); - - /* Sync with global value as we are done with - truncate now. */ - m_purge_rseg_truncate_frequency = static_cast<ulint>( - srv_purge_rseg_truncate_frequency); - } - - /** Get the tablespace id to start scanning from. - @return id of UNDO tablespace to start scanning from. 
*/ - ulint get_scan_start() const - { - return(m_scan_start); - } - - /** Check if the tablespace needs fix-up (based on presence of - DDL truncate log) - @param space_id space id of the undo tablespace to check - @return true if fix up is needed else false */ - bool needs_fix_up(ulint space_id) const - { - return(is_log_present(space_id)); - } - - /** Add undo tablespace to truncate vector. - @param[in] space_id space id of tablespace to - truncate */ - static void add_space_to_trunc_list(ulint space_id) - { - s_spaces_to_truncate.push_back(space_id); - } - - /** Clear the truncate vector. */ - static void clear_trunc_list() - { - s_spaces_to_truncate.clear(); - } - - /** Is tablespace marked for truncate. - @param[in] space_id space id to check - @return true if marked for truncate, else false. */ - static bool is_tablespace_truncated(ulint space_id) - { - return(std::find(s_spaces_to_truncate.begin(), - s_spaces_to_truncate.end(), space_id) - != s_spaces_to_truncate.end()); - } - - /** Was a tablespace truncated at startup - @param[in] space_id space id to check - @return whether space_id was truncated at startup */ - static bool was_tablespace_truncated(ulint space_id) - { - return(std::find(s_fix_up_spaces.begin(), - s_fix_up_spaces.end(), - space_id) - != s_fix_up_spaces.end()); - } - - /** Get local rseg purge truncate frequency - @return rseg purge truncate frequency. */ - ulint get_rseg_truncate_frequency() const - { - return(m_purge_rseg_truncate_frequency); - } - - private: - /** UNDO tablespace is mark for truncate. */ - ulint m_undo_for_trunc; - - /** rseg that resides in UNDO tablespace is marked for - truncate. */ - rseg_for_trunc_t m_rseg_for_trunc; - - /** Start scanning for UNDO tablespace from this space_id. - This is to avoid bias selection of one tablespace always. */ - ulint m_scan_start; - - /** Rollback segment(s) purge frequency. This is local - value maintained along with global value. 
It is set to global - value on start but when tablespace is marked for truncate it - is updated to 1 and then minimum value among 2 is used by - purge action. */ - ulint m_purge_rseg_truncate_frequency; - - /** List of UNDO tablespace(s) to truncate. */ - static undo_spaces_t s_spaces_to_truncate; - public: - /** Undo tablespaces that were truncated at startup */ - static undo_spaces_t s_fix_up_spaces; - }; /* class Truncate */ - -}; /* namespace undo */ - /** The control structure used in the purge operation */ class purge_sys_t { @@ -348,22 +152,19 @@ public: MY_ALIGNED(CACHE_LINE_SIZE) rw_lock_t latch; private: - /** whether purge is enabled; protected by latch and my_atomic */ - int32_t m_enabled; + /** whether purge is enabled; protected by latch and std::atomic */ + std::atomic<bool> m_enabled; /** number of pending stop() calls without resume() */ - int32_t m_paused; + Atomic_counter<int32_t> m_paused; public: que_t* query; /*!< The query graph which will do the parallelized purge operation */ MY_ALIGNED(CACHE_LINE_SIZE) ReadView view; /*!< The purge will not remove undo logs which are >= this view (purge view) */ - /** Total number of tasks submitted by srv_purge_coordinator_thread. - Not accessed by other threads. */ - ulint n_submitted; - /** Number of completed tasks. Accessed by srv_purge_coordinator - and srv_worker_thread by my_atomic. */ - ulint n_completed; + /** Number of not completed tasks. Accessed by srv_purge_coordinator + and srv_worker_thread by std::atomic. */ + std::atomic<ulint> n_tasks; /** Iterator to the undo log records of committed transactions */ struct iterator @@ -417,9 +218,14 @@ public: by the pq_mutex */ PQMutex pq_mutex; /*!< Mutex protecting purge_queue */ - undo::Truncate undo_trunc; /*!< Track UNDO tablespace marked - for truncate. 
*/ - + /** Undo tablespace file truncation (only accessed by the + srv_purge_coordinator_thread) */ + struct { + /** The undo tablespace that is currently being truncated */ + fil_space_t* current; + /** The undo tablespace that was last truncated */ + fil_space_t* last; + } truncate; /** Constructor. @@ -428,7 +234,7 @@ public: uninitialised. Real initialisation happens in create(). */ - purge_sys_t() : event(NULL), m_enabled(false) {} + purge_sys_t() : event(NULL), m_enabled(false), n_tasks(0) {} /** Create the instance */ @@ -438,39 +244,24 @@ public: void close(); /** @return whether purge is enabled */ - bool enabled() - { - return my_atomic_load32_explicit(&m_enabled, MY_MEMORY_ORDER_RELAXED); - } - /** @return whether purge is enabled */ - bool enabled_latched() - { - ut_ad(rw_lock_own_flagged(&latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - return bool(m_enabled); - } + bool enabled() { return m_enabled.load(std::memory_order_relaxed); } /** @return whether the purge coordinator is paused */ bool paused() - { return my_atomic_load32_explicit(&m_paused, MY_MEMORY_ORDER_RELAXED); } - /** @return whether the purge coordinator is paused */ - bool paused_latched() - { - ut_ad(rw_lock_own_flagged(&latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - return m_paused != 0; - } + { return m_paused != 0; } /** Enable purge at startup. 
Not protected by latch; the main thread will wait for purge_sys.enabled() in srv_start() */ void coordinator_startup() { ut_ad(!enabled()); - my_atomic_store32_explicit(&m_enabled, true, MY_MEMORY_ORDER_RELAXED); + m_enabled.store(true, std::memory_order_relaxed); } /** Disable purge at shutdown */ void coordinator_shutdown() { ut_ad(enabled()); - my_atomic_store32_explicit(&m_enabled, false, MY_MEMORY_ORDER_RELAXED); + m_enabled.store(false, std::memory_order_relaxed); } /** @return whether the purge coordinator thread is active */ diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h index a23b57ccc3e..d9ea6c19d11 100644 --- a/storage/innobase/include/trx0roll.h +++ b/storage/innobase/include/trx0roll.h @@ -42,16 +42,6 @@ trx_savept_take( /*============*/ trx_t* trx); /*!< in: transaction */ -/** Get the last undo log record of a transaction (for rollback). -@param[in,out] trx transaction -@param[out] roll_ptr DB_ROLL_PTR to the undo record -@param[in,out] heap memory heap for allocation -@return undo log record copied to heap -@retval NULL if none left or the roll_limit (savepoint) was reached */ -trx_undo_rec_t* -trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - /** Report progress when rolling back a row of a recovered transaction. 
*/ void trx_roll_report_progress(); /*******************************************************************//** diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic index 687a1d5b8d8..0cff8fa1f5c 100644 --- a/storage/innobase/include/trx0rseg.ic +++ b/storage/innobase/include/trx0rseg.ic @@ -41,7 +41,7 @@ trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr) || !srv_was_started); buf_block_t* block = buf_page_get(page_id_t(space->id, page_no), - univ_page_size, RW_X_LATCH, mtr); + 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_RSEG_HEADER); @@ -67,8 +67,7 @@ trx_rsegf_get_new( || !srv_was_started); ut_ad(space <= TRX_SYS_MAX_UNDO_SPACES || space == SRV_TMP_SPACE_ID); - block = buf_page_get( - page_id_t(space, page_no), univ_page_size, RW_X_LATCH, mtr); + block = buf_page_get(page_id_t(space, page_no), 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 913e2d25172..73f05eb5d48 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -68,17 +68,12 @@ trx_sys_rseg_find_free(const buf_block_t* sys_header); @param[in] rw whether to lock the page for writing @return the TRX_SYS page @retval NULL if the page cannot be read */ -inline -buf_block_t* -trx_sysf_get(mtr_t* mtr, bool rw = true) +inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true) { - buf_block_t* block = buf_page_get( - page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), - univ_page_size, rw ? RW_X_LATCH : RW_S_LATCH, mtr); - if (block) { - buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); - } - return block; + buf_block_t* block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), + 0, rw ? 
RW_X_LATCH : RW_S_LATCH, mtr); + ut_d(if (block) buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);) + return block; } #ifdef UNIV_DEBUG @@ -200,14 +195,13 @@ trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id) @param[in] sys_header TRX_SYS page @param[in] rseg_id rollback segment identifier @return undo page number */ -inline -uint32_t -trx_sysf_rseg_get_page_no(const buf_block_t* sys_header, ulint rseg_id) +inline uint32_t +trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id) { - ut_ad(rseg_id < TRX_SYS_N_RSEGS); - return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO - + rseg_id * TRX_SYS_RSEG_SLOT_SIZE - + sys_header->frame); + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame); } /** Maximum length of MySQL binlog file name, in bytes. @@ -344,9 +338,9 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ /*-------------------------------------------------------------*/ /** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */ -#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855 +constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855; /** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */ -#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386 +constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386; /** Size of the doublewrite block in pages */ #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE @@ -369,7 +363,7 @@ struct rw_trx_hash_element_t trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */ - trx_id_t no; + Atomic_counter<trx_id_t> no; trx_t *trx; ib_mutex_t mutex; }; @@ -716,11 +710,7 @@ public: because it may change even before this method returns. 
*/ - uint32_t size() - { - return uint32_t(my_atomic_load32_explicit(&hash.count, - MY_MEMORY_ORDER_RELAXED)); - } + uint32_t size() { return uint32_t(lf_hash_size(&hash)); } /** @@ -802,7 +792,7 @@ class trx_sys_t The smallest number not yet assigned as a transaction id or transaction number. Accessed and updated with atomic operations. */ - MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_max_trx_id; + MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<trx_id_t> m_max_trx_id; /** @@ -813,17 +803,17 @@ class trx_sys_t @sa assign_new_trx_no() @sa snapshot_ids() */ - MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_rw_trx_hash_version; + MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<trx_id_t> m_rw_trx_hash_version; + bool m_initialised; + +public: /** TRX_RSEG_HISTORY list length (number of committed transactions to purge) */ - MY_ALIGNED(CACHE_LINE_SIZE) int32 rseg_history_len; - - bool m_initialised; + MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<uint32_t> rseg_history_len; -public: /** Mutex protecting trx_list. */ MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex; @@ -899,9 +889,7 @@ public: trx_id_t get_max_trx_id() { - return static_cast<trx_id_t> - (my_atomic_load64_explicit(reinterpret_cast<int64*>(&m_max_trx_id), - MY_MEMORY_ORDER_RELAXED)); + return m_max_trx_id; } @@ -943,9 +931,7 @@ public: void assign_new_trx_no(trx_t *trx) { trx->no= get_new_trx_id_no_refresh(); - my_atomic_store64_explicit(reinterpret_cast<int64*> - (&trx->rw_trx_hash_element->no), - trx->no, MY_MEMORY_ORDER_RELAXED); + trx->rw_trx_hash_element->no= trx->no; refresh_rw_trx_hash_version(); } @@ -996,7 +982,8 @@ public: /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. 
*/ void init_max_trx_id(trx_id_t value) { - m_max_trx_id= m_rw_trx_hash_version= value; + m_max_trx_id= value; + m_rw_trx_hash_version.store(value, std::memory_order_relaxed); } @@ -1118,22 +1105,6 @@ public: return count; } - /** @return number of committed transactions waiting for purge */ - ulint history_size() const - { - return uint32(my_atomic_load32(&const_cast<trx_sys_t*>(this) - ->rseg_history_len)); - } - /** Add to the TRX_RSEG_HISTORY length (on database startup). */ - void history_add(int32 len) - { - my_atomic_add32(&rseg_history_len, len); - } - /** Register a committed transaction. */ - void history_insert() { history_add(1); } - /** Note that a committed transaction was purged. */ - void history_remove() { history_add(-1); } - private: static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element, trx_id_t *id) @@ -1164,8 +1135,7 @@ private: { if (element->id < arg->m_id) { - trx_id_t no= static_cast<trx_id_t>(my_atomic_load64_explicit( - reinterpret_cast<int64*>(&element->no), MY_MEMORY_ORDER_RELAXED)); + trx_id_t no= element->no; arg->m_ids->push_back(element->id); if (no < arg->m_no) arg->m_no= no; @@ -1177,18 +1147,14 @@ private: /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */ trx_id_t get_rw_trx_hash_version() { - return static_cast<trx_id_t> - (my_atomic_load64_explicit(reinterpret_cast<int64*> - (&m_rw_trx_hash_version), - MY_MEMORY_ORDER_ACQUIRE)); + return m_rw_trx_hash_version.load(std::memory_order_acquire); } /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. 
*/ void refresh_rw_trx_hash_version() { - my_atomic_add64_explicit(reinterpret_cast<int64*>(&m_rw_trx_hash_version), - 1, MY_MEMORY_ORDER_RELEASE); + m_rw_trx_hash_version.fetch_add(1, std::memory_order_release); } @@ -1207,8 +1173,7 @@ private: trx_id_t get_new_trx_id_no_refresh() { - return static_cast<trx_id_t>(my_atomic_add64_explicit( - reinterpret_cast<int64*>(&m_max_trx_id), 1, MY_MEMORY_ORDER_RELAXED)); + return m_max_trx_id++; } }; diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index feb27e56115..70df62d0d03 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -181,17 +181,6 @@ trx_start_for_ddl_low( trx_start_for_ddl_low((t), (o)) #endif /* UNIV_DEBUG */ -/****************************************************************//** -Commits a transaction. */ -void -trx_commit( -/*=======*/ - trx_t* trx); /*!< in/out: transaction */ - -/** Commit a transaction and a mini-transaction. -@param[in,out] trx transaction -@param[in,out] mtr mini-transaction (NULL if no modifications) */ -void trx_commit_low(trx_t* trx, mtr_t* mtr); /**********************************************************************//** Does the transaction commit for MySQL. @return DB_SUCCESS or error number */ @@ -447,31 +436,6 @@ Check transaction state */ ut_error; \ } while (0) -/** Check if transaction is free so that it can be re-initialized. 
-@param t transaction handle */ -#define assert_trx_is_free(t) do { \ - ut_ad(trx_state_eq((t), TRX_STATE_NOT_STARTED)); \ - ut_ad(!(t)->id); \ - ut_ad(!(t)->has_logged()); \ - ut_ad(!(t)->is_referenced()); \ - ut_ad(!(t)->is_wsrep()); \ - ut_ad(!(t)->read_view.is_open()); \ - ut_ad((t)->lock.wait_thr == NULL); \ - ut_ad(UT_LIST_GET_LEN((t)->lock.trx_locks) == 0); \ - ut_ad((t)->lock.table_locks.empty()); \ - ut_ad(!(t)->autoinc_locks \ - || ib_vector_is_empty((t)->autoinc_locks)); \ - ut_ad((t)->dict_operation == TRX_DICT_OP_NONE); \ -} while(0) - -/** Check if transaction is in-active so that it can be freed and put back to -transaction pool. -@param t transaction handle */ -#define assert_trx_is_inactive(t) do { \ - assert_trx_is_free((t)); \ - ut_ad((t)->dict_operation_lock_mode == 0); \ -} while(0) - #ifdef UNIV_DEBUG /*******************************************************************//** Assert that an autocommit non-locking select cannot be in the @@ -559,6 +523,11 @@ struct trx_lock_t { lock_sys.mutex. Otherwise, this may only be modified by the thread that is serving the running transaction. */ +#ifdef WITH_WSREP + bool was_chosen_as_wsrep_victim; + /*!< high priority wsrep thread has + marked this trx to abort */ +#endif /* WITH_WSREP */ /** Pre-allocated record locks */ struct { @@ -585,6 +554,9 @@ struct trx_lock_t { lock_list table_locks; /*!< All table locks requested by this transaction, including AUTOINC locks */ + /** List of pending trx_t::evict_table() */ + UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables; + bool cancel; /*!< true if the transaction is being rolled back either via deadlock detection or due to lock timeout. The @@ -675,7 +647,7 @@ with exactly one user transaction. There are some exceptions to this: * For DDL operations, a subtransaction is allocated that modifies the data dictionary tables. 
Lock waits and deadlocks are prevented by -acquiring the dict_operation_lock before starting the subtransaction +acquiring the dict_sys.latch before starting the subtransaction and releasing it after committing the subtransaction. * The purge system uses a special transaction that is not associated @@ -751,7 +723,7 @@ private: that it is no longer "active". */ - int32_t n_ref; + Atomic_counter<int32_t> n_ref; public: @@ -890,10 +862,10 @@ public: defer flush of the logs to disk until after we release the mutex. */ - bool must_flush_log_later;/*!< this flag is set to TRUE in - trx_commit() if flush_log_later was - TRUE, and there were modifications by - the transaction; in that case we must + bool must_flush_log_later;/*!< set in commit() + if flush_log_later was + set and redo log was written; + in that case we will flush the log in trx_commit_complete_for_mysql() */ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ @@ -913,8 +885,8 @@ public: ib_uint32_t dict_operation_lock_mode; /*!< 0, RW_S_LATCH, or RW_X_LATCH: the latch mode trx currently holds - on dict_operation_lock. Protected - by dict_operation_lock. */ + on dict_sys.latch. Protected + by dict_sys.latch. */ /** wall-clock time of the latest transition to TRX_STATE_ACTIVE; used for diagnostic purposes only */ @@ -1120,19 +1092,32 @@ public: /** Release any explicit locks of a committing transaction. */ inline void release_locks(); + /** Evict a table definition due to the rollback of ALTER TABLE. + @param[in] table_id table identifier */ + void evict_table(table_id_t table_id); + +private: + /** Mark a transaction committed in the main memory data structures. */ + inline void commit_in_memory(const mtr_t *mtr); +public: + /** Commit the transaction. */ + void commit(); + + /** Commit the transaction in a mini-transaction. 
+ @param mtr mini-transaction (if there are any persistent modifications) */ + void commit_low(mtr_t *mtr= nullptr); - bool is_referenced() - { - return my_atomic_load32_explicit(&n_ref, MY_MEMORY_ORDER_RELAXED) > 0; - } + + + bool is_referenced() const { return n_ref > 0; } void reference() { #ifdef UNIV_DEBUG - int32_t old_n_ref= + auto old_n_ref= #endif - my_atomic_add32_explicit(&n_ref, 1, MY_MEMORY_ORDER_RELAXED); + n_ref++; ut_ad(old_n_ref >= 0); } @@ -1140,13 +1125,33 @@ public: void release_reference() { #ifdef UNIV_DEBUG - int32_t old_n_ref= + auto old_n_ref= #endif - my_atomic_add32_explicit(&n_ref, -1, MY_MEMORY_ORDER_RELAXED); + n_ref--; ut_ad(old_n_ref > 0); } + void assert_freed() const + { + ut_ad(state == TRX_STATE_NOT_STARTED); + ut_ad(!id); + ut_ad(!has_logged()); + ut_ad(!is_referenced()); + ut_ad(!is_wsrep()); +#ifdef WITH_WSREP + ut_ad(!lock.was_chosen_as_wsrep_victim); +#endif + ut_ad(!read_view.is_open()); + ut_ad(!lock.wait_thr); + ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0); + ut_ad(lock.table_locks.empty()); + ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks)); + ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0); + ut_ad(dict_operation == TRX_DICT_OP_NONE); + } + + private: /** Assign a rollback segment for modifying temporary tables. @return the assigned rollback segment */ diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h index 7be4314ecbc..ce92e5de5e1 100644 --- a/storage/innobase/include/trx0undo.h +++ b/storage/innobase/include/trx0undo.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -79,27 +79,22 @@ trx_undo_trx_id_is_insert( /*======================*/ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ MY_ATTRIBUTE((warn_unused_result)); -/*****************************************************************//** -Writes a roll ptr to an index page. In case that the size changes in -some future version, this function should be used instead of -mach_write_... */ -UNIV_INLINE -void -trx_write_roll_ptr( -/*===============*/ - byte* ptr, /*!< in: pointer to memory where - written */ - roll_ptr_t roll_ptr); /*!< in: roll ptr */ -/*****************************************************************//** -Reads a roll ptr from an index page. In case that the roll ptr size -changes in some future version, this function should be used instead of -mach_read_... +/** Write DB_ROLL_PTR. +@param[out] ptr buffer +@param[in] roll_ptr DB_ROLL_PTR value */ +inline void trx_write_roll_ptr(byte* ptr, roll_ptr_t roll_ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + mach_write_to_7(ptr, roll_ptr); +} +/** Read DB_ROLL_PTR. +@param[in] ptr buffer @return roll ptr */ -UNIV_INLINE -roll_ptr_t -trx_read_roll_ptr( -/*==============*/ - const byte* ptr); /*!< in: pointer to memory from where to read */ +inline roll_ptr_t trx_read_roll_ptr(const byte* ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + return mach_read_from_7(ptr); +} /** Gets an undo log page and x-latches it. 
@param[in] page_id page id @@ -185,9 +180,7 @@ trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr) @param[in,out] undo undo log @param[in] limit all undo logs after this limit will be discarded @param[in] is_temp whether this is temporary undo log */ -void -trx_undo_truncate_end(trx_undo_t* undo, undo_no_t limit, bool is_temp) - MY_ATTRIBUTE((nonnull)); +void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp); /** Truncate the head of an undo log. NOTE that only whole pages are freed; the header page is not @@ -315,16 +308,17 @@ trx_undo_mem_create_at_db_start(trx_rseg_t* rseg, ulint id, ulint page_no, and delete markings: in short, modifys (the name 'UPDATE' is a historical relic) */ -/* States of an undo log segment */ -#define TRX_UNDO_ACTIVE 1 /* contains an undo log of an active - transaction */ -#define TRX_UNDO_CACHED 2 /* cached for quick reuse */ -#define TRX_UNDO_TO_FREE 3 /* insert undo segment can be freed */ -#define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be - reused: it can be freed in purge when - all undo data in it is removed */ -#define TRX_UNDO_PREPARED 5 /* contains an undo log of an - prepared transaction */ +/* TRX_UNDO_STATE values of an undo log segment */ +/** contains an undo log of an active transaction */ +constexpr uint16_t TRX_UNDO_ACTIVE = 1; +/** cached for quick reuse */ +constexpr uint16_t TRX_UNDO_CACHED = 2; +/** old_insert undo segment that can be freed */ +constexpr uint16_t TRX_UNDO_TO_FREE = 3; +/** can be freed in purge when all undo data in it is removed */ +constexpr uint16_t TRX_UNDO_TO_PURGE = 4; +/** contains an undo log of a prepared transaction */ +constexpr uint16_t TRX_UNDO_PREPARED = 5; #ifndef UNIV_INNOCHECKSUM diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic index 19697c6054c..6d1ec16869e 100644 --- a/storage/innobase/include/trx0undo.ic +++ b/storage/innobase/include/trx0undo.ic @@ -1,7 +1,7 @@ 
/***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -103,37 +103,6 @@ trx_undo_trx_id_is_insert( return bool(trx_id[DATA_TRX_ID_LEN] >> 7); } -/*****************************************************************//** -Writes a roll ptr to an index page. In case that the size changes in -some future version, this function should be used instead of -mach_write_... */ -UNIV_INLINE -void -trx_write_roll_ptr( -/*===============*/ - byte* ptr, /*!< in: pointer to memory where - written */ - roll_ptr_t roll_ptr) /*!< in: roll ptr */ -{ - compile_time_assert(DATA_ROLL_PTR_LEN == 7); - mach_write_to_7(ptr, roll_ptr); -} - -/*****************************************************************//** -Reads a roll ptr from an index page. In case that the roll ptr size -changes in some future version, this function should be used instead of -mach_read_... -@return roll ptr */ -UNIV_INLINE -roll_ptr_t -trx_read_roll_ptr( -/*==============*/ - const byte* ptr) /*!< in: pointer to memory from where to read */ -{ - compile_time_assert(DATA_ROLL_PTR_LEN == 7); - return(mach_read_from_7(ptr)); -} - /** Gets an undo log page and x-latches it. 
@param[in] page_id page id @param[in,out] mtr mini-transaction @@ -142,8 +111,7 @@ UNIV_INLINE page_t* trx_undo_page_get(const page_id_t page_id, mtr_t* mtr) { - buf_block_t* block = buf_page_get(page_id, univ_page_size, - RW_X_LATCH, mtr); + buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); @@ -158,8 +126,7 @@ UNIV_INLINE page_t* trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr) { - buf_block_t* block = buf_page_get(page_id, univ_page_size, - RW_S_LATCH, mtr); + buf_block_t* block = buf_page_get(page_id, 0, RW_S_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 001690a47a1..99e493acfb4 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -77,6 +77,7 @@ used throughout InnoDB but do not include too much themselves. They support cross-platform development and expose comonly used SQL names. */ #include <my_global.h> +#include "my_counter.h" /* JAN: TODO: missing 5.7 header */ #ifdef HAVE_MY_THREAD_H diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h index a04a674751c..646a5f367c2 100644 --- a/storage/innobase/include/ut0counter.h +++ b/storage/innobase/include/ut0counter.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2018, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,7 +30,6 @@ Created 2012/04/12 by Sunny Bains #include "os0thread.h" #include "my_rdtsc.h" -#include "my_atomic.h" /** CPU cache line size */ #ifdef CPU_LEVEL1_DCACHE_LINESIZE @@ -42,120 +41,85 @@ Created 2012/04/12 by Sunny Bains /** Default number of slots to use in ib_counter_t */ #define IB_N_SLOTS 64 -/** Get the offset into the counter array. */ -template <typename Type, int N> -struct generic_indexer_t { - /** @return offset within m_counter */ - static size_t offset(size_t index) UNIV_NOTHROW - { - return(((index % N) + 1) * (CACHE_LINE_SIZE / sizeof(Type))); - } -}; +/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles +as a random value. See the comments for my_timer_cycles() */ +/** @return result from RDTSC or similar functions. */ +static inline size_t +get_rnd_value() +{ + size_t c = static_cast<size_t>(my_timer_cycles()); + + if (c != 0) { + return c; + } -/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles, -to index into the counter array. See the comments for my_timer_cycles() */ -template <typename Type=ulint, int N=1> -struct counter_indexer_t : public generic_indexer_t<Type, N> { - /** @return result from RDTSC or similar functions. */ - static size_t get_rnd_index() UNIV_NOTHROW - { - size_t c = static_cast<size_t>(my_timer_cycles()); - - if (c != 0) { - return(c); - } else { - /* We may go here if my_timer_cycles() returns 0, - so we have to have the plan B for the counter. */ + /* We may go here if my_timer_cycles() returns 0, + so we have to have the plan B for the counter. 
*/ #if !defined(_WIN32) - return(size_t(os_thread_get_curr_id())); + return (size_t)os_thread_get_curr_id(); #else - LARGE_INTEGER cnt; - QueryPerformanceCounter(&cnt); + LARGE_INTEGER cnt; + QueryPerformanceCounter(&cnt); - return(static_cast<size_t>(cnt.QuadPart)); + return static_cast<size_t>(cnt.QuadPart); #endif /* !_WIN32 */ - } - } +} - /** @return a random offset to the array */ - static size_t get_rnd_offset() UNIV_NOTHROW - { - return(generic_indexer_t<Type, N>::offset(get_rnd_index())); - } -}; - -#define default_indexer_t counter_indexer_t - -/** Class for using fuzzy counters. The counter is relaxed atomic +/** Class for using fuzzy counters. The counter is multi-instance relaxed atomic so the results are not guaranteed to be 100% accurate but close enough. Creates an array of counters and separates each element by the CACHE_LINE_SIZE bytes */ -template < - typename Type, - int N = IB_N_SLOTS, - template<typename, int> class Indexer = default_indexer_t> -struct MY_ALIGNED(CACHE_LINE_SIZE) ib_counter_t -{ +template <typename Type, int N = IB_N_SLOTS> +struct ib_counter_t { /** Increment the counter by 1. */ - void inc() UNIV_NOTHROW { add(1); } + void inc() { add(1); } /** Increment the counter by 1. @param[in] index a reasonably thread-unique identifier */ - void inc(size_t index) UNIV_NOTHROW { add(index, 1); } + void inc(size_t index) { add(index, 1); } /** Add to the counter. @param[in] n amount to be added */ - void add(Type n) UNIV_NOTHROW { add(m_policy.get_rnd_offset(), n); } + void add(Type n) { add(get_rnd_value(), n); } /** Add to the counter. 
@param[in] index a reasonably thread-unique identifier @param[in] n amount to be added */ - void add(size_t index, Type n) UNIV_NOTHROW { - size_t i = m_policy.offset(index); - - ut_ad(i < UT_ARR_SIZE(m_counter)); - - if (sizeof(Type) == 8) { - my_atomic_add64_explicit( - reinterpret_cast<int64*>(&m_counter[i]), - static_cast<int64>(n), MY_MEMORY_ORDER_RELAXED); - } else if (sizeof(Type) == 4) { - my_atomic_add32_explicit( - reinterpret_cast<int32*>(&m_counter[i]), - static_cast<int32>(n), MY_MEMORY_ORDER_RELAXED); - } - compile_time_assert(sizeof(Type) == 8 || sizeof(Type) == 4); + void add(size_t index, Type n) { + index = index % N; + + ut_ad(index < UT_ARR_SIZE(m_counter)); + + m_counter[index].value.fetch_add(n, std::memory_order_relaxed); } - /* @return total value - not 100% accurate, since it is relaxed atomic. */ - operator Type() const UNIV_NOTHROW { + /* @return total value - not 100% accurate, since it is relaxed atomic*/ + operator Type() const { Type total = 0; - for (size_t i = 0; i < N; ++i) { - if (sizeof(Type) == 8) { - total += static_cast< - Type>(my_atomic_load64_explicit( - reinterpret_cast<int64*>(const_cast<Type*>( - &m_counter[m_policy.offset(i)])), - MY_MEMORY_ORDER_RELAXED)); - } else if (sizeof(Type) == 4) { - total += static_cast< - Type>(my_atomic_load32_explicit( - reinterpret_cast<int32*>(const_cast<Type*>( - &m_counter[m_policy.offset(i)])), - MY_MEMORY_ORDER_RELAXED)); - } + for (const auto &counter : m_counter) { + total += counter.value.load(std::memory_order_relaxed); } return(total); } private: - /** Indexer into the array */ - Indexer<Type, N>m_policy; - - /** Slot 0 is unused. */ - Type m_counter[(N + 1) * (CACHE_LINE_SIZE / sizeof(Type))]; + /** Atomic which occupies whole CPU cache line. + Note: We rely on the default constructor of std::atomic and + do not explicitly initialize the contents. 
This works for us, + because ib_counter_t is only intended for usage with global + memory that is allocated from the .bss and thus guaranteed to + be zero-initialized by the run-time environment. + @see srv_stats + @see rw_lock_stats */ + struct ib_counter_element_t { + MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<Type> value; + }; + static_assert(sizeof(ib_counter_element_t) == CACHE_LINE_SIZE, ""); + + /** Array of counter elements */ + MY_ALIGNED(CACHE_LINE_SIZE) ib_counter_element_t m_counter[N]; }; #endif /* ut0counter_h */ diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h index 68af6882155..f2c1b7e82b6 100644 --- a/storage/innobase/include/ut0crc32.h +++ b/storage/innobase/include/ut0crc32.h @@ -47,12 +47,6 @@ typedef uint32_t (*ut_crc32_func_t)(const byte* ptr, ulint len); /** Pointer to CRC32 calculation function. */ extern ut_crc32_func_t ut_crc32; -#ifdef INNODB_BUG_ENDIAN_CRC32 -/** Pointer to CRC32 calculation function, which uses big-endian byte order -when converting byte strings to integers internally. */ -extern uint32_t ut_crc32_legacy_big_endian(const byte* buf, ulint len); -#endif /* INNODB_BUG_ENDIAN_CRC32 */ - /** Text description of CRC32 implementation */ extern const char* ut_crc32_implementation; diff --git a/storage/innobase/include/ut0mutex.h b/storage/innobase/include/ut0mutex.h index 1f99ee17a24..d7d48cd1f28 100644 --- a/storage/innobase/include/ut0mutex.h +++ b/storage/innobase/include/ut0mutex.h @@ -38,8 +38,6 @@ Created 2012-03-24 Sunny Bains. 
@param[in] T The resulting typedef alias */ #define UT_MUTEX_TYPE(M, P, T) typedef PolicyMutex<M<P> > T; -typedef OSMutex EventMutex; - # ifdef HAVE_IB_LINUX_FUTEX UT_MUTEX_TYPE(TTASFutexMutex, GenericPolicy, FutexMutex); UT_MUTEX_TYPE(TTASFutexMutex, BlockMutexPolicy, BlockFutexMutex); diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h index 9af8687bfd0..5b1ae5bc0da 100644 --- a/storage/innobase/include/ut0rnd.h +++ b/storage/innobase/include/ut0rnd.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -32,7 +32,7 @@ Created 1/20/1994 Heikki Tuuri #ifndef UNIV_INNOCHECKSUM /** Seed value of ut_rnd_gen() */ -extern int32 ut_rnd_current; +extern std::atomic<uint32_t> ut_rnd_current; /** @return a pseudo-random 32-bit number */ inline uint32_t ut_rnd_gen() @@ -45,8 +45,7 @@ inline uint32_t ut_rnd_gen() x^19+x^18+x^14+x^13+x^11+x^10+x^9+x^8+x^6+1 */ const uint32_t crc32c= 0x1edc6f41; - uint32_t rnd= my_atomic_load32_explicit(&ut_rnd_current, - MY_MEMORY_ORDER_RELAXED); + uint32_t rnd= ut_rnd_current.load(std::memory_order_relaxed); if (UNIV_UNLIKELY(rnd == 0)) { @@ -61,7 +60,7 @@ inline uint32_t ut_rnd_gen() rnd^= crc32c; } - my_atomic_store32_explicit(&ut_rnd_current, rnd, MY_MEMORY_ORDER_RELAXED); + ut_rnd_current.store(rnd, std::memory_order_relaxed); return rnd; } diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index a6a70c99ecf..430b99d7667 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -46,7 +46,6 @@ Created 1/20/1994 Heikki Tuuri #include <stdarg.h> #include <string> -#include <my_atomic.h> /** 
Index name prefix in fast index creation, as a string constant */ #define TEMP_INDEX_PREFIX_STR "\377" @@ -146,12 +145,6 @@ ut_2_power_up( ulint n) /*!< in: number != 0 */ MY_ATTRIBUTE((const)); -/** Determine how many bytes (groups of 8 bits) are needed to -store the given number of bits. -@param b in: bits -@return number of bytes (octets) needed to represent b */ -#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8) - /**********************************************************//** Returns the number of milliseconds since some epoch. The value may wrap around. It should only be used for heuristic @@ -162,6 +155,12 @@ ut_time_ms(void); /*============*/ #endif /* !UNIV_INNOCHECKSUM */ +/** Determine how many bytes (groups of 8 bits) are needed to +store the given number of bits. +@param b in: bits +@return number of bytes (octets) needed to represent b */ +#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8) + /** Determines if a number is zero or a power of two. @param[in] n number @return nonzero if n is zero or a power of two; zero otherwise */ |