diff options
author | Daniel Black <danielgb@au.ibm.com> | 2018-02-25 15:25:54 +1100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-02-25 15:25:54 +1100 |
commit | 0805a9565f09a36104a420d9e229e44b1eea8879 (patch) | |
tree | 5c5dbf942479c2b7349f1b4505768ddf5a2f8b8d /storage/innobase/include | |
parent | 3188131b15c26509e4df3c4b15972d07a20be8bd (diff) | |
parent | 8936b175106a3fdfc560e9d33aa58a6372084c5f (diff) | |
download | mariadb-git-0805a9565f09a36104a420d9e229e44b1eea8879.tar.gz |
Merge branch '10.3' into 10.2-MDEV-10814-dont-dump-query-cache
Diffstat (limited to 'storage/innobase/include')
87 files changed, 2437 insertions, 2972 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index cff8bc7cbc9..336ee68ee59 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -37,6 +37,12 @@ Created 6/2/1994 Heikki Tuuri #include "btr0types.h" #include "gis0type.h" +#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level + (not really a hard limit). + Used in debug assertions + in btr_page_set_level and + btr_page_get_level */ + /** Maximum record size which can be stored on a page, without using the special big record storage structure */ #define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200) @@ -285,14 +291,22 @@ btr_page_get_index_id( MY_ATTRIBUTE((warn_unused_result)); /********************************************************//** Gets the node level field in an index page. +@param[in] page index page @return level, leaf level == 0 */ UNIV_INLINE ulint -btr_page_get_level_low( -/*===================*/ - const page_t* page) /*!< in: index page */ - MY_ATTRIBUTE((warn_unused_result)); -#define btr_page_get_level(page, mtr) btr_page_get_level_low(page) +btr_page_get_level(const page_t* page) +{ + ulint level; + + ut_ad(page); + + level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); + + ut_ad(level <= BTR_MAX_NODE_LEVEL); + + return(level); +} MY_ATTRIBUTE((warn_unused_result)) /********************************************************//** Gets the next index page number. @return next page number */ diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index bd4f2a40267..d24458beace 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -29,12 +29,6 @@ Created 6/2/1994 Heikki Tuuri #include "mtr0log.h" #include "page0zip.h" -#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level - (not really a hard limit). 
- Used in debug assertions - in btr_page_set_level and - btr_page_get_level_low */ - /** Gets a buffer page and declares its latching order level. @param[in] page_id page id @param[in] mode latch mode @@ -144,26 +138,6 @@ btr_page_get_index_id( } /********************************************************//** -Gets the node level field in an index page. -@return level, leaf level == 0 */ -UNIV_INLINE -ulint -btr_page_get_level_low( -/*===================*/ - const page_t* page) /*!< in: index page */ -{ - ulint level; - - ut_ad(page); - - level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); - - ut_ad(level <= BTR_MAX_NODE_LEVEL); - - return(level); -} - -/********************************************************//** Sets the node level field in an index page. */ UNIV_INLINE void diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index 76f13325e2a..8d8fe0bc236 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -177,8 +177,7 @@ Note that if mode is PAGE_CUR_LE, which is used in inserts, then cursor->up_match and cursor->low_match both will have sensible values. If mode is PAGE_CUR_GE, then up_match will a have a sensible value. 
*/ dberr_t -btr_cur_search_to_nth_level( -/*========================*/ +btr_cur_search_to_nth_level_func( dict_index_t* index, /*!< in: index */ ulint level, /*!< in: the tree level of search */ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in @@ -197,23 +196,29 @@ btr_cur_search_to_nth_level( cursor->left_block is used to store a pointer to the left neighbor page, in the cases BTR_SEARCH_PREV and BTR_MODIFY_PREV; - NOTE that if has_search_latch - is != 0, we maybe do not have a latch set - on the cursor page, we assume - the caller uses his search latch - to protect the record! */ + NOTE that if ahi_latch, we might not have a + cursor page latch, we assume that ahi_latch + protects the record! */ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is s- or x-latched, but see also above! */ - ulint has_search_latch, - /*!< in: latch mode the caller - currently has on search system: - RW_S_LATCH, or 0 */ +#ifdef BTR_CUR_HASH_ADAPT + rw_lock_t* ahi_latch, + /*!< in: currently held btr_search_latch + (in RW_S_LATCH mode), or NULL */ +#endif /* BTR_CUR_HASH_ADAPT */ const char* file, /*!< in: file name */ unsigned line, /*!< in: line where called */ mtr_t* mtr, /*!< in/out: mini-transaction */ ib_uint64_t autoinc = 0); /*!< in: PAGE_ROOT_AUTO_INC to be written (0 if none) */ +#ifdef BTR_CUR_HASH_ADAPT +# define btr_cur_search_to_nth_level(i,l,t,m,lm,c,a,fi,li,mtr) \ + btr_cur_search_to_nth_level_func(i,l,t,m,lm,c,a,fi,li,mtr) +#else /* BTR_CUR_HASH_ADAPT */ +# define btr_cur_search_to_nth_level(i,l,t,m,lm,c,a,fi,li,mtr) \ + btr_cur_search_to_nth_level_func(i,l,t,m,lm,c,fi,li,mtr) +#endif /* BTR_CUR_HASH_ADAPT */ /*****************************************************************//** Opens a cursor at either end of an index. 
diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic index 56868cca336..4ab3819ad75 100644 --- a/storage/innobase/include/btr0cur.ic +++ b/storage/innobase/include/btr0cur.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -137,10 +138,9 @@ btr_cur_compress_recommendation( LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2U, return(FALSE)); - if ((page_get_data_size(page) - < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) - || ((btr_page_get_next(page, mtr) == FIL_NULL) - && (btr_page_get_prev(page, mtr) == FIL_NULL))) { + if (page_get_data_size(page) + < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index) + || !page_has_siblings(page)) { /* The page fillfactor has dropped below a predefined minimum value OR the level in the B-tree contains just @@ -173,11 +173,9 @@ btr_cur_can_delete_without_compress( page = btr_cur_get_page(cursor); - if ((page_get_data_size(page) - rec_size - < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) - || ((btr_page_get_next(page, mtr) == FIL_NULL) - && (btr_page_get_prev(page, mtr) == FIL_NULL)) - || (page_get_n_recs(page) < 2)) { + if (page_get_data_size(page) - rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index) + || !page_has_siblings(page) || page_get_n_recs(page) < 2) { /* The page fillfactor will drop below a predefined minimum value, OR the level in the B-tree contains just diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h index fab934ca0ee..1d8690a3c90 100644 --- a/storage/innobase/include/btr0pcur.h +++ b/storage/innobase/include/btr0pcur.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright 
(c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -136,20 +136,25 @@ btr_pcur_open_with_no_init_func( may end up on the previous page of the record! */ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...; - NOTE that if has_search_latch != 0 then - we maybe do not acquire a latch on the cursor - page, but assume that the caller uses his - btr search latch to protect the record! */ + NOTE that if ahi_latch then we might not + acquire a cursor page latch, but assume + that the ahi_latch protects the record! */ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ - ulint has_search_latch, - /*!< in: latch mode the caller - currently has on search system: - RW_S_LATCH, or 0 */ +#ifdef BTR_CUR_HASH_ADAPT + rw_lock_t* ahi_latch, + /*!< in: adaptive hash index latch held + by the caller, or NULL if none */ +#endif /* BTR_CUR_HASH_ADAPT */ const char* file, /*!< in: file name */ unsigned line, /*!< in: line where called */ mtr_t* mtr); /*!< in: mtr */ -#define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m) \ - btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m) +#ifdef BTR_CUR_HASH_ADAPT +# define btr_pcur_open_with_no_init(ix,t,md,l,cur,ahi,m) \ + btr_pcur_open_with_no_init_func(ix,t,md,l,cur,ahi,__FILE__,__LINE__,m) +#else /* BTR_CUR_HASH_ADAPT */ +# define btr_pcur_open_with_no_init(ix,t,md,l,cur,ahi,m) \ + btr_pcur_open_with_no_init_func(ix,t,md,l,cur,__FILE__,__LINE__,m) +#endif /* BTR_CUR_HASH_ADAPT */ /*****************************************************************//** Opens a persistent cursor at either end of an index. 
*/ diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic index 4490942a2bb..e12564fe547 100644 --- a/storage/innobase/include/btr0pcur.ic +++ b/storage/innobase/include/btr0pcur.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2017, MariaDB Corporation. +Copyright (c) 2015, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -219,12 +219,8 @@ btr_pcur_is_before_first_in_tree( ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); ut_ad(cursor->latch_mode != BTR_NO_LATCHES); - if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) != FIL_NULL) { - - return(FALSE); - } - - return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor))); + return !page_has_prev(btr_pcur_get_page(cursor)) + && page_cur_is_before_first(btr_pcur_get_page_cur(cursor)); } /*********************************************************//** @@ -240,12 +236,8 @@ btr_pcur_is_after_last_in_tree( ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); ut_ad(cursor->latch_mode != BTR_NO_LATCHES); - if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) != FIL_NULL) { - - return(FALSE); - } - - return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor))); + return !page_has_next(btr_pcur_get_page(cursor)) + && page_cur_is_after_last(btr_pcur_get_page_cur(cursor)); } /*********************************************************//** @@ -454,9 +446,12 @@ btr_pcur_open_low( ut_ad(!dict_index_is_spatial(index)); - err = btr_cur_search_to_nth_level( - index, level, tuple, mode, latch_mode, - btr_cursor, 0, file, line, mtr, autoinc); + err = btr_cur_search_to_nth_level_func( + index, level, tuple, mode, latch_mode, btr_cursor, +#ifdef BTR_CUR_HASH_ADAPT + NULL, +#endif /* BTR_CUR_HASH_ADAPT */ + file, line, 
mtr, autoinc); if (err != DB_SUCCESS) { ib::warn() << " Error code: " << err @@ -491,15 +486,15 @@ btr_pcur_open_with_no_init_func( may end up on the previous page of the record! */ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...; - NOTE that if has_search_latch != 0 then - we maybe do not acquire a latch on the cursor - page, but assume that the caller uses his - btr search latch to protect the record! */ + NOTE that if ahi_latch then we might not + acquire a cursor page latch, but assume + that the ahi_latch protects the record! */ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ - ulint has_search_latch, - /*!< in: latch mode the caller - currently has on search system: - RW_S_LATCH, or 0 */ +#ifdef BTR_CUR_HASH_ADAPT + rw_lock_t* ahi_latch, + /*!< in: adaptive hash index latch held + by the caller, or NULL if none */ +#endif /* BTR_CUR_HASH_ADAPT */ const char* file, /*!< in: file name */ unsigned line, /*!< in: line where called */ mtr_t* mtr) /*!< in: mtr */ @@ -514,9 +509,12 @@ btr_pcur_open_with_no_init_func( btr_cursor = btr_pcur_get_btr_cur(cursor); - err = btr_cur_search_to_nth_level( + err = btr_cur_search_to_nth_level_func( index, 0, tuple, mode, latch_mode, btr_cursor, - has_search_latch, file, line, mtr); +#ifdef BTR_CUR_HASH_ADAPT + ahi_latch, +#endif /* BTR_CUR_HASH_ADAPT */ + file, line, mtr); cursor->pos_state = BTR_PCUR_IS_POSITIONED; diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h index bd1a72fc3ac..f32429800f8 100644 --- a/storage/innobase/include/btr0sea.h +++ b/storage/innobase/include/btr0sea.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -91,12 +91,11 @@ both have sensible values. we assume the caller uses his search latch to protect the record! @param[out] cursor tree cursor -@param[in] has_search_latch - latch mode the caller currently has on - search system: RW_S/X_LATCH or 0 +@param[in] ahi_latch the adaptive hash index latch being held, + or NULL @param[in] mtr mini transaction -@return TRUE if succeeded */ -ibool +@return whether the search succeeded */ +bool btr_search_guess_on_hash( dict_index_t* index, btr_search_t* info, @@ -104,7 +103,7 @@ btr_search_guess_on_hash( ulint mode, ulint latch_mode, btr_cur_t* cursor, - ulint has_search_latch, + rw_lock_t* ahi_latch, mtr_t* mtr); /** Move or delete hash entries for moved records, usually in a page split. @@ -140,17 +139,19 @@ btr_search_drop_page_hash_when_freed( /** Updates the page hash index when a single record is inserted on a page. @param[in] cursor cursor which was positioned to the place to insert using btr_cur_search_, and the new record has been - inserted next to the cursor. */ + inserted next to the cursor. +@param[in] ahi_latch the adaptive hash index latch */ void -btr_search_update_hash_node_on_insert(btr_cur_t* cursor); +btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch); /** Updates the page hash index when a single record is inserted on a page. -@param[in] cursor cursor which was positioned to the +@param[in,out] cursor cursor which was positioned to the place to insert using btr_cur_search_..., and the new record has been inserted next - to the cursor */ + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ void -btr_search_update_hash_on_insert(btr_cur_t* cursor); +btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch); /** Updates the page hash index when a single record is deleted from a page. 
@param[in] cursor cursor which was positioned on the record to delete @@ -163,18 +164,6 @@ btr_search_update_hash_on_delete(btr_cur_t* cursor); bool btr_search_validate(); -/** X-Lock the search latch (corresponding to given index) -@param[in] index index handler */ -UNIV_INLINE -void -btr_search_x_lock(const dict_index_t* index); - -/** X-Unlock the search latch (corresponding to given index) -@param[in] index index handler */ -UNIV_INLINE -void -btr_search_x_unlock(const dict_index_t* index); - /** Lock all search latches in exclusive mode. */ UNIV_INLINE void @@ -185,18 +174,6 @@ UNIV_INLINE void btr_search_x_unlock_all(); -/** S-Lock the search latch (corresponding to given index) -@param[in] index index handler */ -UNIV_INLINE -void -btr_search_s_lock(const dict_index_t* index); - -/** S-Unlock the search latch (corresponding to given index) -@param[in] index index handler */ -UNIV_INLINE -void -btr_search_s_unlock(const dict_index_t* index); - /** Lock all search latches in shared mode. 
*/ UNIV_INLINE void @@ -243,15 +220,11 @@ btr_get_search_table(const dict_index_t* index); #else /* BTR_CUR_HASH_ADAPT */ # define btr_search_sys_create(size) # define btr_search_drop_page_hash_index(block) -# define btr_search_s_lock(index) -# define btr_search_s_unlock(index) # define btr_search_s_lock_all(index) # define btr_search_s_unlock_all(index) -# define btr_search_x_lock(index) -# define btr_search_x_unlock(index) # define btr_search_info_update(index, cursor) # define btr_search_move_or_delete_hash_entries(new_block, block) -# define btr_search_update_hash_on_insert(cursor) +# define btr_search_update_hash_on_insert(cursor, ahi_latch) # define btr_search_update_hash_on_delete(cursor) # define btr_search_sys_resize(hash_size) #endif /* BTR_CUR_HASH_ADAPT */ @@ -312,7 +285,7 @@ struct btr_search_t{ ulint n_bytes; /*!< recommended prefix: number of bytes in an incomplete field @see BTR_PAGE_MAX_REC_SIZE */ - ibool left_side; /*!< TRUE or FALSE, depending on whether + bool left_side; /*!< true or false, depending on whether the leftmost record of several records with the same prefix should be indexed in the hash index */ diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic index b5a7536a2b4..e0052a98639 100644 --- a/storage/innobase/include/btr0sea.ic +++ b/storage/innobase/include/btr0sea.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -45,13 +46,11 @@ btr_search_info_create(mem_heap_t* heap) } #ifdef BTR_CUR_HASH_ADAPT -/*********************************************************************//** -Updates the search info. */ +/** Updates the search info. 
+@param[in,out] info search info +@param[in,out] cursor cursor which was just positioned */ void -btr_search_info_update_slow( -/*========================*/ - btr_search_t* info, /*!< in/out: search info */ - btr_cur_t* cursor);/*!< in: cursor which was just positioned */ +btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor); /*********************************************************************//** Updates the search info. */ @@ -62,8 +61,8 @@ btr_search_info_update( dict_index_t* index, /*!< in: index of the cursor */ btr_cur_t* cursor) /*!< in: cursor which was just positioned */ { - ut_ad(!rw_lock_own(btr_get_search_latch(index), RW_LOCK_S)); - ut_ad(!rw_lock_own(btr_get_search_latch(index), RW_LOCK_X)); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); if (dict_index_is_spatial(index) || !btr_search_enabled) { return; @@ -87,24 +86,6 @@ btr_search_info_update( btr_search_info_update_slow(info, cursor); } -/** X-Lock the search latch (corresponding to given index) -@param[in] index index handler */ -UNIV_INLINE -void -btr_search_x_lock(const dict_index_t* index) -{ - rw_lock_x_lock(btr_get_search_latch(index)); -} - -/** X-Unlock the search latch (corresponding to given index) -@param[in] index index handler */ -UNIV_INLINE -void -btr_search_x_unlock(const dict_index_t* index) -{ - rw_lock_x_unlock(btr_get_search_latch(index)); -} - /** Lock all search latches in exclusive mode. 
*/ UNIV_INLINE void @@ -125,24 +106,6 @@ btr_search_x_unlock_all() } } -/** S-Lock the search latch (corresponding to given index) -@param[in] index index handler */ -UNIV_INLINE -void -btr_search_s_lock(const dict_index_t* index) -{ - rw_lock_s_lock(btr_get_search_latch(index)); -} - -/** S-Unlock the search latch (corresponding to given index) -@param[in] index index handler */ -UNIV_INLINE -void -btr_search_s_unlock(const dict_index_t* index) -{ - rw_lock_s_unlock(btr_get_search_latch(index)); -} - /** Lock all search latches in shared mode. */ UNIV_INLINE void diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 4a54c30629b..0cef7862332 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1738,7 +1738,7 @@ struct buf_block_t{ used in debugging */ ibool in_withdraw_list; #endif /* UNIV_DEBUG */ - unsigned lock_hash_val:32;/*!< hashed value of the page address + uint32_t lock_hash_val; /*!< hashed value of the page address in the record lock hash table; protected by buf_block_t::lock (or buf_block_t::mutex, buf_pool->mutex @@ -1827,7 +1827,7 @@ struct buf_block_t{ } while (0) # define assert_block_ahi_valid(block) \ ut_a((block)->index \ - || my_atomic_addlint(&(block)->n_pointers, 0) == 0) + || my_atomic_loadlint(&(block)->n_pointers) == 0) # else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ # define assert_block_ahi_empty(block) /* nothing */ # define assert_block_ahi_empty_on_init(block) /* nothing */ @@ -2351,8 +2351,12 @@ Use these instead of accessing buf_pool->mutex directly. */ /** Get appropriate page_hash_lock. */ -# define buf_page_hash_lock_get(buf_pool, page_id) \ - hash_get_lock((buf_pool)->page_hash, (page_id).fold()) +UNIV_INLINE +rw_lock_t* +buf_page_hash_lock_get(const buf_pool_t* buf_pool, const page_id_t& page_id) +{ + return hash_get_lock(buf_pool->page_hash, page_id.fold()); +} /** If not appropriate page_hash_lock, relock until appropriate. 
*/ # define buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id)\ diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index 44cd5b5f772..81a1fb757c7 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -369,6 +369,12 @@ public: || m_interrupted); } + /** @return whether to flush only some pages of the tablespace */ + bool is_partial_flush() const { return m_stage != NULL; } + + /** @return whether the operation was interrupted */ + bool is_interrupted() const { return m_interrupted; } + /** Interrupt observer not to wait. */ void interrupted() { @@ -381,7 +387,6 @@ public: /** Flush dirty pages. */ void flush(); - /** Notify observer of flushing a page @param[in] buf_pool buffer pool instance @param[in] bpage buffer page to flush */ @@ -397,10 +402,10 @@ public: buf_page_t* bpage); private: /** Table space id */ - ulint m_space_id; + const ulint m_space_id; /** Trx instance */ - trx_t* m_trx; + trx_t* const m_trx; /** Performance schema accounting object, used by ALTER TABLE. If not NULL, then stage->begin_phase_flush() will be called initially, diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index 54c001ce478..f6a7695a2b5 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -52,12 +52,16 @@ These are low-level functions /** Empty the flush list for all pages belonging to a tablespace. 
@param[in] id tablespace identifier -@param[in] trx transaction, for checking for user interrupt; - or NULL if nothing is to be written -@param[in] drop_ahi whether to drop the adaptive hash index */ -UNIV_INTERN +@param[in,out] observer flush observer, + or NULL if nothing is to be written */ void -buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx, bool drop_ahi=false); +buf_LRU_flush_or_remove_pages( + ulint id, + FlushObserver* observer +#ifdef BTR_CUR_HASH_ADAPT + , bool drop_ahi = false /*!< whether to drop the adaptive hash index */ +#endif /* BTR_CUR_HASH_ADAPT */ + ); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /********************************************************************//** diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h index a0b3059ad40..d3361ad8b3b 100644 --- a/storage/innobase/include/data0data.h +++ b/storage/innobase/include/data0data.h @@ -591,6 +591,22 @@ struct dfield_t{ @param[in,out] heap memory heap in which the clone will be created @return the cloned object */ dfield_t* clone(mem_heap_t* heap) const; + + /** @return system field indicates history row */ + bool vers_history_row() const + { + ut_ad(type.vers_sys_end()); + if (type.mtype == DATA_FIXBINARY) { + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); + } else { + ut_ad(type.mtype == DATA_INT); + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(0); + return false; + } }; /** Structure for an SQL data tuple of fields (logical record) */ diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h index bd2a15fe881..7e1c362cf8d 100644 --- a/storage/innobase/include/data0type.h +++ b/storage/innobase/include/data0type.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. 
-Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -189,6 +189,12 @@ be less than 256 */ for shorter VARCHARs MySQL uses only 1 byte */ #define DATA_VIRTUAL 8192U /* Virtual column */ +/** System Versioning */ +#define DATA_VERS_START 16384U /* start system field */ +#define DATA_VERS_END 32768U /* end system field */ +/** system-versioned user data column */ +#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END) + /** Check whether locking is disabled (never). */ #define dict_table_is_locking_disabled(table) false @@ -203,16 +209,7 @@ store the charset-collation number; one byte is left unused, though */ #define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 /* Maximum multi-byte character length in bytes, plus 1 */ -#define DATA_MBMAX 5 - -/* Pack mbminlen, mbmaxlen to mbminmaxlen. */ -#define DATA_MBMINMAXLEN(mbminlen, mbmaxlen) \ - unsigned((mbmaxlen) * DATA_MBMAX + (mbminlen)) -/* Get mbminlen from mbminmaxlen. */ -#define DATA_MBMINLEN(mbminmaxlen) \ - unsigned(UNIV_EXPECT((mbminmaxlen) % DATA_MBMAX, 1)) -/* Get mbmaxlen from mbminmaxlen. 
*/ -#define DATA_MBMAXLEN(mbminmaxlen) unsigned((mbminmaxlen) / DATA_MBMAX) +#define DATA_MBMAX 8 /* For checking if mtype is GEOMETRY datatype */ #define DATA_GEOMETRY_MTYPE(mtype) ((mtype) == DATA_GEOMETRY) @@ -255,8 +252,10 @@ ulint dtype_get_at_most_n_mbchars( /*========================*/ ulint prtype, /*!< in: precise type */ - ulint mbminmaxlen, /*!< in: minimum and maximum length of - a multi-byte character */ + ulint mbminlen, /*!< in: minimum length of + a multi-byte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of + a multi-byte character, in bytes */ ulint prefix_len, /*!< in: length of the requested prefix, in characters, multiplied by dtype_get_mbmaxlen(dtype) */ @@ -399,19 +398,6 @@ ulint dtype_get_mbmaxlen( /*===============*/ const dtype_t* type); /*!< in: type */ -/*********************************************************************//** -Sets the minimum and maximum length of a character, in bytes. */ -UNIV_INLINE -void -dtype_set_mbminmaxlen( -/*==================*/ - dtype_t* type, /*!< in/out: type */ - ulint mbminlen, /*!< in: minimum length of a char, - in bytes, or 0 if this is not - a character type */ - ulint mbmaxlen); /*!< in: maximum length of a char, - in bytes, or 0 if this is not - a character type */ /***********************************************************************//** Returns the size of a fixed size data type, 0 if not a fixed size type. 
@return fixed size, or 0 */ @@ -422,7 +408,9 @@ dtype_get_fixed_size_low( ulint mtype, /*!< in: main type */ ulint prtype, /*!< in: precise type */ ulint len, /*!< in: length */ - ulint mbminmaxlen, /*!< in: minimum and maximum length of a + ulint mbminlen, /*!< in: minimum length of a + multibyte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of a multibyte character, in bytes */ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ @@ -436,8 +424,8 @@ dtype_get_min_size_low( ulint mtype, /*!< in: main type */ ulint prtype, /*!< in: precise type */ ulint len, /*!< in: length */ - ulint mbminmaxlen); /*!< in: minimum and maximum length of a - multibyte character */ + ulint mbminlen, /*!< in: minimum length of a character */ + ulint mbmaxlen); /*!< in: maximum length of a character */ /***********************************************************************//** Returns the maximum size of a data type. Note: types in system tables may be incomplete and return incorrect information. 
@@ -549,13 +537,30 @@ struct dtype_t{ string data (in addition to the string, MySQL uses 1 or 2 bytes to store the string length) */ - unsigned mbminmaxlen:5; /*!< minimum and maximum length of a - character, in bytes; - DATA_MBMINMAXLEN(mbminlen,mbmaxlen); - mbminlen=DATA_MBMINLEN(mbminmaxlen); - mbmaxlen=DATA_MBMINLEN(mbminmaxlen) */ + unsigned mbminlen:3; /*!< minimum length of a character, + in bytes */ + unsigned mbmaxlen:3; /*!< maximum length of a character, + in bytes */ + + /** @return whether this is system field */ + bool vers_sys_field() const { return prtype & DATA_VERSIONED; } + /** @return whether this is system versioned user field */ + bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } + /** @return whether this is the system field start */ + bool vers_sys_start() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_START; + } + /** @return whether this is the system field end */ + bool vers_sys_end() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_END; + } }; +/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ +extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + #include "data0type.ic" #endif diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic index a68f4829561..c0b32953cff 100644 --- a/storage/innobase/include/data0type.ic +++ b/storage/innobase/include/data0type.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -101,27 +101,6 @@ dtype_get_mblen( } /*********************************************************************//** -Sets the minimum and maximum length of a character, in bytes. */ -UNIV_INLINE -void -dtype_set_mbminmaxlen( -/*==================*/ - dtype_t* type, /*!< in/out: type */ - ulint mbminlen, /*!< in: minimum length of a char, - in bytes, or 0 if this is not - a character type */ - ulint mbmaxlen) /*!< in: maximum length of a char, - in bytes, or 0 if this is not - a character type */ -{ - ut_ad(mbminlen < DATA_MBMAX); - ut_ad(mbmaxlen < DATA_MBMAX); - ut_ad(mbminlen <= mbmaxlen); - - type->mbminmaxlen = DATA_MBMINMAXLEN(mbminlen, mbmaxlen); -} - -/*********************************************************************//** Compute the mbminlen and mbmaxlen members of a data type structure. */ UNIV_INLINE void @@ -133,7 +112,8 @@ dtype_set_mblen( ulint mbmaxlen; dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen); - dtype_set_mbminmaxlen(type, mbminlen, mbmaxlen); + type->mbminlen = mbminlen; + type->mbmaxlen = mbmaxlen; ut_ad(dtype_validate(type)); } @@ -225,8 +205,7 @@ dtype_get_mbminlen( /*===============*/ const dtype_t* type) /*!< in: type */ { - ut_ad(type); - return(DATA_MBMINLEN(type->mbminmaxlen)); + return type->mbminlen; } /*********************************************************************//** Gets the maximum length of a character, in bytes. 
@@ -238,8 +217,7 @@ dtype_get_mbmaxlen( /*===============*/ const dtype_t* type) /*!< in: type */ { - ut_ad(type); - return(DATA_MBMAXLEN(type->mbminmaxlen)); + return type->mbmaxlen; } /**********************************************************************//** @@ -387,79 +365,79 @@ dtype_sql_name( #define APPEND_UNSIGNED() \ do { \ if (prtype & DATA_UNSIGNED) { \ - ut_snprintf(name + strlen(name), \ + snprintf(name + strlen(name), \ name_sz - strlen(name), \ " UNSIGNED"); \ } \ } while (0) - ut_snprintf(name, name_sz, "UNKNOWN"); + snprintf(name, name_sz, "UNKNOWN"); switch (mtype) { case DATA_INT: switch (len) { case 1: - ut_snprintf(name, name_sz, "TINYINT"); + snprintf(name, name_sz, "TINYINT"); break; case 2: - ut_snprintf(name, name_sz, "SMALLINT"); + snprintf(name, name_sz, "SMALLINT"); break; case 3: - ut_snprintf(name, name_sz, "MEDIUMINT"); + snprintf(name, name_sz, "MEDIUMINT"); break; case 4: - ut_snprintf(name, name_sz, "INT"); + snprintf(name, name_sz, "INT"); break; case 8: - ut_snprintf(name, name_sz, "BIGINT"); + snprintf(name, name_sz, "BIGINT"); break; } APPEND_UNSIGNED(); break; case DATA_FLOAT: - ut_snprintf(name, name_sz, "FLOAT"); + snprintf(name, name_sz, "FLOAT"); APPEND_UNSIGNED(); break; case DATA_DOUBLE: - ut_snprintf(name, name_sz, "DOUBLE"); + snprintf(name, name_sz, "DOUBLE"); APPEND_UNSIGNED(); break; case DATA_FIXBINARY: - ut_snprintf(name, name_sz, "BINARY(%u)", len); + snprintf(name, name_sz, "BINARY(%u)", len); break; case DATA_CHAR: case DATA_MYSQL: - ut_snprintf(name, name_sz, "CHAR(%u)", len); + snprintf(name, name_sz, "CHAR(%u)", len); break; case DATA_VARCHAR: case DATA_VARMYSQL: - ut_snprintf(name, name_sz, "VARCHAR(%u)", len); + snprintf(name, name_sz, "VARCHAR(%u)", len); break; case DATA_BINARY: - ut_snprintf(name, name_sz, "VARBINARY(%u)", len); + snprintf(name, name_sz, "VARBINARY(%u)", len); break; case DATA_GEOMETRY: - ut_snprintf(name, name_sz, "GEOMETRY"); + snprintf(name, name_sz, "GEOMETRY"); break; case 
DATA_BLOB: switch (len) { case 9: - ut_snprintf(name, name_sz, "TINYBLOB"); + snprintf(name, name_sz, "TINYBLOB"); break; case 10: - ut_snprintf(name, name_sz, "BLOB"); + snprintf(name, name_sz, "BLOB"); break; case 11: - ut_snprintf(name, name_sz, "MEDIUMBLOB"); + snprintf(name, name_sz, "MEDIUMBLOB"); break; case 12: - ut_snprintf(name, name_sz, "LONGBLOB"); + snprintf(name, name_sz, "LONGBLOB"); break; } } if (prtype & DATA_NOT_NULL) { - ut_snprintf(name + strlen(name), + snprintf(name + strlen(name), name_sz - strlen(name), " NOT NULL"); } @@ -477,8 +455,10 @@ dtype_get_fixed_size_low( ulint mtype, /*!< in: main type */ ulint prtype, /*!< in: precise type */ ulint len, /*!< in: length */ - ulint mbminmaxlen, /*!< in: minimum and maximum length of - a multibyte character, in bytes */ + ulint mbminlen, /*!< in: minimum length of a + multibyte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of a + multibyte character, in bytes */ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ { switch (mtype) { @@ -518,11 +498,10 @@ dtype_get_fixed_size_low( dtype_get_charset_coll(prtype), &i_mbminlen, &i_mbmaxlen); - ut_ad(DATA_MBMINMAXLEN(i_mbminlen, i_mbmaxlen) - == mbminmaxlen); + ut_ad(i_mbminlen == mbminlen); + ut_ad(i_mbmaxlen == mbmaxlen); #endif /* UNIV_DEBUG */ - if (DATA_MBMINLEN(mbminmaxlen) - == DATA_MBMAXLEN(mbminmaxlen)) { + if (mbminlen == mbmaxlen) { return(len); } } @@ -552,8 +531,8 @@ dtype_get_min_size_low( ulint mtype, /*!< in: main type */ ulint prtype, /*!< in: precise type */ ulint len, /*!< in: length */ - ulint mbminmaxlen) /*!< in: minimum and maximum length of a - multi-byte character */ + ulint mbminlen, /*!< in: minimum length of a character */ + ulint mbmaxlen) /*!< in: maximum length of a character */ { switch (mtype) { case DATA_SYS: @@ -583,9 +562,6 @@ dtype_get_min_size_low( if (prtype & DATA_BINARY_TYPE) { return(len); } else { - ulint mbminlen = DATA_MBMINLEN(mbminmaxlen); - ulint mbmaxlen = DATA_MBMAXLEN(mbminmaxlen); - if 
(mbminlen == mbmaxlen) { return(len); } @@ -656,5 +632,5 @@ dtype_get_sql_null_size( ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ { return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, - type->mbminmaxlen, comp)); + type->mbminlen, type->mbmaxlen, comp)); } diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h index eb2a6e6824f..8d77a461dc9 100644 --- a/storage/innobase/include/dict0defrag_bg.h +++ b/storage/innobase/include/dict0defrag_bg.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2016, MariaDB Corporation. All rights Reserved. +Copyright (c) 2016, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -88,6 +88,5 @@ Save defragmentation stats for a given index. dberr_t dict_stats_save_defrag_stats( /*============================*/ - dict_index_t* index) /*!< in: index */ - MY_ATTRIBUTE((warn_unused_result)); + dict_index_t* index); /*!< in: index */ #endif /* dict0defrag_bg_h */ diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index f14487f09d0..4356ee113de 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -2,7 +2,7 @@ Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -182,18 +182,6 @@ dict_col_get_mbmaxlen( const dict_col_t* col) /*!< in: column */ MY_ATTRIBUTE((nonnull, warn_unused_result)); /*********************************************************************//** -Sets the minimum and maximum number of bytes per character. */ -UNIV_INLINE -void -dict_col_set_mbminmaxlen( -/*=====================*/ - dict_col_t* col, /*!< in/out: column */ - ulint mbminlen, /*!< in: minimum multi-byte - character size, in bytes */ - ulint mbmaxlen) /*!< in: minimum multi-byte - character size, in bytes */ - MY_ATTRIBUTE((nonnull)); -/*********************************************************************//** Gets the column data type. */ UNIV_INLINE void @@ -414,7 +402,7 @@ dict_table_rename_in_cache( /*!< in: in ALTER TABLE we want to preserve the original table name in constraints which reference it */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((nonnull)); /** Removes an index from the dictionary cache. @param[in,out] table table whose index to remove @@ -929,7 +917,7 @@ dict_index_get_min_size( Check whether the table uses the compact page format. @return TRUE if table uses the compact page format */ UNIV_INLINE -ibool +bool dict_table_is_comp( /*===============*/ const dict_table_t* table) /*!< in: table */ @@ -1275,7 +1263,7 @@ Returns TRUE if the index contains a column or a prefix of that column. 
@param[in] n column number @param[in] is_virtual whether it is a virtual col @return TRUE if contains the column or its prefix */ -ibool +bool dict_index_contains_col_or_prefix( /*==============================*/ const dict_index_t* index, /*!< in: index */ diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 26918251d8b..e1c2c71bc0a 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -41,7 +41,7 @@ dict_col_get_mbminlen( /*==================*/ const dict_col_t* col) /*!< in: column */ { - return(DATA_MBMINLEN(col->mbminmaxlen)); + return col->mbminlen; } /*********************************************************************//** Gets the maximum number of bytes per character. @@ -52,25 +52,7 @@ dict_col_get_mbmaxlen( /*==================*/ const dict_col_t* col) /*!< in: column */ { - return(DATA_MBMAXLEN(col->mbminmaxlen)); -} -/*********************************************************************//** -Sets the minimum and maximum number of bytes per character. 
*/ -UNIV_INLINE -void -dict_col_set_mbminmaxlen( -/*=====================*/ - dict_col_t* col, /*!< in/out: column */ - ulint mbminlen, /*!< in: minimum multi-byte - character size, in bytes */ - ulint mbmaxlen) /*!< in: minimum multi-byte - character size, in bytes */ -{ - ut_ad(mbminlen < DATA_MBMAX); - ut_ad(mbmaxlen < DATA_MBMAX); - ut_ad(mbminlen <= mbmaxlen); - - col->mbminmaxlen = DATA_MBMINMAXLEN(mbminlen, mbmaxlen); + return col->mbmaxlen; } /*********************************************************************//** Gets the column data type. */ @@ -87,7 +69,8 @@ dict_col_copy_type( type->mtype = col->mtype; type->prtype = col->prtype; type->len = col->len; - type->mbminmaxlen = col->mbminmaxlen; + type->mbminlen = col->mbminlen; + type->mbmaxlen = col->mbmaxlen; } #ifdef UNIV_DEBUG @@ -107,7 +90,8 @@ dict_col_type_assert_equal( ut_ad(col->mtype == type->mtype); ut_ad(col->prtype == type->prtype); //ut_ad(col->len == type->len); - ut_ad(col->mbminmaxlen == type->mbminmaxlen); + ut_ad(col->mbminlen == type->mbminlen); + ut_ad(col->mbmaxlen == type->mbmaxlen); return(TRUE); } @@ -123,7 +107,7 @@ dict_col_get_min_size( const dict_col_t* col) /*!< in: column */ { return(dtype_get_min_size_low(col->mtype, col->prtype, col->len, - col->mbminmaxlen)); + col->mbminlen, col->mbmaxlen)); } /***********************************************************************//** Returns the maximum size of the column. @@ -147,7 +131,7 @@ dict_col_get_fixed_size( ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ { return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len, - col->mbminmaxlen, comp)); + col->mbminlen, col->mbmaxlen, comp)); } /***********************************************************************//** Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. 
@@ -286,7 +270,7 @@ dict_index_is_clust( const dict_index_t* index) /*!< in: index */ { ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - return(index->is_clust()); + return(index->type & DICT_CLUSTERED); } /** Check if index is auto-generated clustered index. @@ -570,7 +554,7 @@ dict_table_get_sys_col_no( Check whether the table uses the compact page format. @return TRUE if table uses the compact page format */ UNIV_INLINE -ibool +bool dict_table_is_comp( /*===============*/ const dict_table_t* table) /*!< in: table */ @@ -581,7 +565,7 @@ dict_table_is_comp( #error "DICT_TF_COMPACT must be 1" #endif - return(table->flags & DICT_TF_COMPACT); + return (table->flags & DICT_TF_COMPACT) != 0; } /************************************************************************ @@ -657,7 +641,7 @@ dict_tf_is_valid( bit. For ROW_FORMAT=REDUNDANT, only the DATA_DIR flag (which we cleared above) can be set. If any other flags are set, the flags are invalid. */ - return(flags == 0); + return(flags == 0 || flags == DICT_TF_MASK_NO_ROLLBACK); } return(dict_tf_is_valid_not_redundant(flags)); diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h index 22fd27c2484..9ba42007568 100644 --- a/storage/innobase/include/dict0load.h +++ b/storage/innobase/include/dict0load.h @@ -193,8 +193,7 @@ dict_getnext_system( mtr_t* mtr); /*!< in: the mini-transaction */ /********************************************************************//** This function processes one SYS_TABLES record and populate the dict_table_t -struct for the table. Extracted out of dict_print() to be used by -both monitor table output and information schema innodb_sys_tables output. +struct for the table. 
@return error message, or NULL on success */ const char* dict_process_sys_tables_rec_and_mtr_commit( diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 8dcd6bf2606..0bab513d051 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -2,7 +2,7 @@ Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -298,7 +298,7 @@ result in recursive cascading calls. This defines the maximum number of such cascading deletes/updates allowed. When exceeded, the delete from parent table will fail, and user has to drop excessive foreign constraint before proceeds. */ -#define FK_MAX_CASCADE_DEL 255 +#define FK_MAX_CASCADE_DEL 15 /**********************************************************************//** Creates a table memory object. 
@@ -617,11 +617,10 @@ struct dict_col_t{ the string, MySQL uses 1 or 2 bytes to store the string length) */ - unsigned mbminmaxlen:5; /*!< minimum and maximum length of a - character, in bytes; - DATA_MBMINMAXLEN(mbminlen,mbmaxlen); - mbminlen=DATA_MBMINLEN(mbminmaxlen); - mbmaxlen=DATA_MBMINLEN(mbminmaxlen) */ + unsigned mbminlen:3; /*!< minimum length of a + character, in bytes */ + unsigned mbmaxlen:3; /*!< maximum length of a + character, in bytes */ /*----------------------*/ /* End of definitions copied from dtype_t */ /* @} */ @@ -652,6 +651,22 @@ struct dict_col_t{ bool is_virtual() const { return prtype & DATA_VIRTUAL; } /** @return whether NULL is an allowed value for this column */ bool is_nullable() const { return !(prtype & DATA_NOT_NULL); } + + /** @return whether this is system field */ + bool vers_sys_field() const { return prtype & DATA_VERSIONED; } + /** @return whether this is system versioned */ + bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } + /** @return whether this is the system version start */ + bool vers_sys_start() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_START; + } + /** @return whether this is the system version end */ + bool vers_sys_end() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_END; + } + /** @return whether this is an instantly-added column */ bool is_instant() const { @@ -1064,8 +1079,12 @@ struct dict_index_t{ /** @return whether instant ADD COLUMN is in effect */ inline bool is_instant() const; - /** @return whether the index is the clustered index */ - bool is_clust() const { return type & DICT_CLUSTERED; } + /** @return whether the index is the primary key index + (not the clustered index of the change buffer) */ + bool is_primary() const + { + return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF)); + } /** Determine how many fields of a given prefix can be set NULL. 
@param[in] n_prefix number of fields in the prefix @@ -1091,7 +1110,7 @@ struct dict_index_t{ @param[out] len value length (in bytes), or UNIV_SQL_NULL @return default value @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */ - const byte* instant_field_value(uint n, ulint* len) const + const byte* instant_field_value(ulint n, ulint* len) const { DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID); DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields); @@ -1107,7 +1126,7 @@ struct dict_index_t{ Protected by index root page x-latch or table X-lock. */ void remove_instant() { - DBUG_ASSERT(is_clust()); + DBUG_ASSERT(is_primary()); if (!is_instant()) { return; } @@ -1117,6 +1136,20 @@ struct dict_index_t{ n_core_fields = n_fields; n_core_null_bytes = UT_BITS_IN_BYTES(n_nullable); } + + /** Check if record in clustered index is historical row. + @param[in] rec clustered row + @param[in] offsets offsets + @return true if row is historical */ + bool + vers_history_row(const rec_t* rec, const ulint* offsets); + + /** Check if record in secondary index is historical row. + @param[in] rec record in a secondary index + @param[out] history_row true if row is historical + @return true on error */ + bool + vers_history_row(const rec_t* rec, bool &history_row); }; /** The status of online index creation */ @@ -1512,6 +1545,29 @@ struct dict_table_t { /** Add the table definition to the data dictionary cache */ void add_to_cache(); + bool versioned() const { return vers_start || vers_end; } + bool versioned_by_id() const + { + return vers_start && cols[vers_start].mtype == DATA_INT; + } + + void inc_fk_checks() + { +#ifdef UNIV_DEBUG + lint fk_checks= +#endif + my_atomic_addlint(&n_foreign_key_checks_running, 1); + ut_ad(fk_checks >= 0); + } + void dec_fk_checks() + { +#ifdef UNIV_DEBUG + lint fk_checks= +#endif + my_atomic_addlint(&n_foreign_key_checks_running, -1); + ut_ad(fk_checks > 0); + } + /** Id of the table. 
*/ table_id_t id; @@ -1556,6 +1612,13 @@ struct dict_table_t { Use DICT_TF2_FLAG_IS_SET() to parse this flag. */ unsigned flags2:DICT_TF2_BITS; + /** TRUE if the table is an intermediate table during copy alter + operation or a partition/subpartition which is required for copying + data and skip the undo log for insertion of row in the table. + This variable will be set and unset during extra(), or during the + process of altering partitions */ + unsigned skip_alter_undo:1; + /*!< whether this is in a single-table tablespace and the .ibd file is missing or page decryption failed and page is corrupted */ unsigned file_unreadable:1; @@ -1625,7 +1688,10 @@ struct dict_table_t { /** Virtual column names */ const char* v_col_names; - + unsigned vers_start:10; + /*!< System Versioning: row start col index */ + unsigned vers_end:10; + /*!< System Versioning: row end col index */ bool is_system_db; /*!< True if the table belongs to a system database (mysql, information_schema or @@ -1762,7 +1828,7 @@ struct dict_table_t { /** How many rows are modified since last stats recalc. When a row is inserted, updated, or deleted, we add 1 to this number; we calculate new estimates for the table and the indexes if the table has changed - too much, see row_update_statistics_if_needed(). The counter is reset + too much, see dict_stats_update_if_needed(). The counter is reset to zero at statistics calculation. This counter is not protected by any latch, because this is only used for heuristics. */ ib_uint64_t stat_modified_counter; @@ -1842,7 +1908,7 @@ struct dict_table_t { ulong n_waiting_or_granted_auto_inc_locks; /** The transaction that currently holds the the AUTOINC lock on this - table. Protected by lock_sys->mutex. */ + table. Protected by lock_sys.mutex. */ const trx_t* autoinc_trx; /* @} */ @@ -1857,7 +1923,7 @@ struct dict_table_t { /** Count of the number of record locks on this table. We use this to determine whether we can evict the table from the dictionary cache. 
- It is protected by lock_sys->mutex. */ + It is protected by lock_sys.mutex. */ ulint n_rec_locks; #ifndef DBUG_ASSERT_EXISTS @@ -1869,7 +1935,7 @@ private: ulint n_ref_count; public: - /** List of locks on the table. Protected by lock_sys->mutex. */ + /** List of locks on the table. Protected by lock_sys.mutex. */ table_lock_list_t locks; /** Timestamp of the last modification of this table. */ @@ -2032,6 +2098,19 @@ dict_col_get_spatial_status( return(spatial_status); } +/** Clear defragmentation summary. */ +inline void dict_stats_empty_defrag_summary(dict_index_t* index) +{ + index->stat_defrag_n_pages_freed = 0; +} + +/** Clear defragmentation related index stats. */ +inline void dict_stats_empty_defrag_stats(dict_index_t* index) +{ + index->stat_defrag_modified_counter = 0; + index->stat_defrag_n_page_split = 0; +} + #include "dict0mem.ic" #endif /* dict0mem_h */ diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 8846aeda7fd..5dd53c46d1b 100644 --- a/storage/innobase/include/dict0stats.h +++ b/storage/innobase/include/dict0stats.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2009, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -177,6 +177,7 @@ dict_stats_rename_table( char* errstr, /*!< out: error string if != DB_SUCCESS is returned */ size_t errstr_sz); /*!< in: errstr size */ +#ifdef MYSQL_RENAME_INDEX /*********************************************************************//** Renames an index in InnoDB persistent stats storage. This function creates its own transaction and commits it. 
@@ -190,52 +191,7 @@ dict_stats_rename_index( const char* old_index_name, /*!< in: old index name */ const char* new_index_name) /*!< in: new index name */ __attribute__((warn_unused_result)); -/*********************************************************************//** -Save defragmentation result. -@return DB_SUCCESS or error code */ -UNIV_INTERN -dberr_t -dict_stats_save_defrag_summary( - dict_index_t* index); /*!< in: index */ - -/*********************************************************************//** -Save defragmentation stats for a given index. -@return DB_SUCCESS or error code */ -UNIV_INTERN -dberr_t -dict_stats_save_defrag_stats( - dict_index_t* index); /*!< in: index */ - -/**********************************************************************//** -Clear defragmentation summary. */ -UNIV_INTERN -void -dict_stats_empty_defrag_summary( -/*==================*/ - dict_index_t* index); /*!< in: index to clear defragmentation stats */ - -/**********************************************************************//** -Clear defragmentation related index stats. */ -UNIV_INTERN -void -dict_stats_empty_defrag_stats( -/*==================*/ - dict_index_t* index); /*!< in: index to clear defragmentation stats */ - - -/*********************************************************************//** -Renames an index in InnoDB persistent stats storage. -This function creates its own transaction and commits it. -@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned -if the persistent stats do not exist. */ -dberr_t -dict_stats_rename_index( -/*====================*/ - const dict_table_t* table, /*!< in: table whose index - is renamed */ - const char* old_index_name, /*!< in: old index name */ - const char* new_index_name) /*!< in: new index name */ - MY_ATTRIBUTE((warn_unused_result)); +#endif /* MYSQL_RENAME_INDEX */ /** Save an individual index's statistic into the persistent statistics storage. 
@@ -252,7 +208,7 @@ rolled back only in the case of error, but not freed. dberr_t dict_stats_save_index_stat( dict_index_t* index, - lint last_update, + ib_time_t last_update, const char* stat_name, ib_uint64_t stat_value, ib_uint64_t* sample_size, diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic index 1efe5780b58..0d187ed90c7 100644 --- a/storage/innobase/include/dict0stats.ic +++ b/storage/innobase/include/dict0stats.ic @@ -79,7 +79,7 @@ dict_stats_is_persistent_enabled(const dict_table_t* table) protect the ::stat_persistent with dict_table_stats_lock() like the other ::stat_ members which would be too big performance penalty, especially when this function is called from - row_update_statistics_if_needed(). */ + dict_stats_update_if_needed(). */ /* we rely on this read to be atomic */ ib_uint32_t stat_persistent = table->stat_persistent; diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index 27b4cc0e694..6984351cc06 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -52,6 +52,13 @@ DICT_IBUF_ID_MIN plus the space id */ typedef ib_id_t table_id_t; typedef ib_id_t index_id_t; +/** Maximum transaction identifier */ +#define TRX_ID_MAX IB_ID_MAX + +/** The bit pattern corresponding to TRX_ID_MAX */ +extern const byte trx_id_max_bytes[8]; +extern const byte timestamp_max_bytes[7]; + /** Error to ignore when we load table dictionary into memory. However, the table and index will be marked as "corrupted", and caller will be responsible to deal with corrupted table or index. diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 695c490ea94..724deb4e6bc 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. 
All Rights Reserved. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -35,16 +35,11 @@ Created 10/25/1995 Heikki Tuuri #include "page0size.h" #include "ibuf0types.h" -#include <list> -#include <vector> - // Forward declaration struct trx_t; class page_id_t; class truncate_t; -typedef std::list<char*, ut_allocator<char*> > space_name_list_t; - /** Structure containing encryption specification */ struct fil_space_crypt_t; @@ -885,6 +880,15 @@ fil_create_directory_for_tablename( /*===============================*/ const char* name); /*!< in: name in the standard 'databasename/tablename' format */ +/** Write redo log for renaming a file. +@param[in] space_id tablespace id +@param[in] old_name tablespace file name +@param[in] new_name tablespace file name after renaming */ +void +fil_name_write_rename( + ulint space_id, + const char* old_name, + const char* new_name); /********************************************************//** Recreates table indexes by applying TRUNCATE log record during recovery. @@ -942,10 +946,14 @@ fil_table_accessible(const dict_table_t* table) /** Delete a tablespace and associated .ibd file. @param[in] id tablespace identifier -@param[in] drop_ahi whether to drop the adaptive hash index @return DB_SUCCESS or error */ dberr_t -fil_delete_tablespace(ulint id, bool drop_ahi = false); +fil_delete_tablespace( + ulint id +#ifdef BTR_CUR_HASH_ADAPT + , bool drop_ahi = false /*!< whether to drop the adaptive hash index */ +#endif /* BTR_CUR_HASH_ADAPT */ + ); /** Truncate the tablespace to needed size. 
@param[in] space_id id of tablespace to truncate @@ -1160,27 +1168,24 @@ fil_file_readdir_next_file( os_file_dir_t dir, /*!< in: directory stream */ os_file_stat_t* info); /*!< in/out: buffer where the info is returned */ -/*******************************************************************//** -Returns true if a matching tablespace exists in the InnoDB tablespace memory -cache. Note that if we have not done a crash recovery at the database startup, -there may be many tablespaces which are not yet in the memory cache. +/** Determine if a matching tablespace exists in the InnoDB tablespace +memory cache. Note that if we have not done a crash recovery at the database +startup, there may be many tablespaces which are not yet in the memory cache. +@param[in] id Tablespace ID +@param[in] name Tablespace name used in fil_space_create(). +@param[in] print_error_if_does_not_exist + Print detailed error information to the +error log if a matching tablespace is not found from memory. +@param[in] heap Heap memory +@param[in] table_flags table flags @return true if a matching tablespace exists in the memory cache */ bool fil_space_for_table_exists_in_mem( -/*==============================*/ - ulint id, /*!< in: space id */ - const char* name, /*!< in: table name in the standard - 'databasename/tablename' format */ + ulint id, + const char* name, bool print_error_if_does_not_exist, - /*!< in: print detailed error - information to the .err log if a - matching tablespace is not found from - memory */ - bool adjust_space, /*!< in: whether to adjust space id - when find table space mismatch */ - mem_heap_t* heap, /*!< in: heap memory */ - table_id_t table_id, /*!< in: table id */ - ulint table_flags); /*!< in: table flags */ + mem_heap_t* heap, + ulint table_flags); /** Try to extend a tablespace if it is smaller than the specified size. 
@param[in,out] space tablespace @@ -1503,18 +1508,6 @@ ulint fil_space_get_id_by_name( const char* tablespace); -/** -Iterate over all the spaces in the space list and fetch the -tablespace names. It will return a copy of the name that must be -freed by the caller using: delete[]. -@return DB_SUCCESS if all OK. */ -dberr_t -fil_get_space_names( -/*================*/ - space_name_list_t& space_name_list) - /*!< in/out: Vector for collecting the names. */ - MY_ATTRIBUTE((warn_unused_result)); - /** Generate redo log for swapping two .ibd files @param[in] old_table old table @param[in] new_table new table diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h index 1f057be0877..68e9f687fcd 100644 --- a/storage/innobase/include/fsp0file.h +++ b/storage/innobase/include/fsp0file.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -363,7 +364,7 @@ private: @param[in] read_only_mode if true, then readonly mode checks are enforced. @return DB_SUCCESS or DB_IO_ERROR if page cannot be read */ - dberr_t read_first_page(bool read_first_page) + dberr_t read_first_page(bool read_only_mode) MY_ATTRIBUTE((warn_unused_result)); /** Free the first page from memory when it is no longer needed. */ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index 813e34b43d3..362bdcb7fe6 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -1015,5 +1015,27 @@ fts_check_corrupt( dict_table_t* base_table, trx_t* trx); +/** Fetch the document from tuple, tokenize the text data and +insert the text data into fts auxiliary table and +its cache. 
Moreover this tuple fields doesn't contain any information +about externally stored field. This tuple contains data directly +converted from mysql. +@param[in] ftt FTS transaction table +@param[in] doc_id doc id +@param[in] tuple tuple from where data can be retrieved + and tuple should be arranged in table + schema order. */ +void +fts_add_doc_from_tuple( + fts_trx_table_t*ftt, + doc_id_t doc_id, + const dtuple_t* tuple); + +/** Create an FTS trx. +@param[in,out] trx InnoDB Transaction +@return FTS transaction. */ +fts_trx_t* +fts_trx_create( + trx_t* trx); #endif /*!< fts0fts.h */ diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h index db53b6c6580..f5be654f490 100644 --- a/storage/innobase/include/ha0ha.h +++ b/storage/innobase/include/ha0ha.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -198,13 +199,6 @@ ha_validate( ulint start_index, /*!< in: start index */ ulint end_index); /*!< in: end index */ #endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ -/*************************************************************//** -Prints info of a hash table. */ -void -ha_print_info( -/*==========*/ - FILE* file, /*!< in: file where to print */ - hash_table_t* table); /*!< in: hash table */ /** The hash table external chain node */ struct ha_node_t { @@ -217,7 +211,7 @@ struct ha_node_t { }; #endif /* BTR_CUR_HASH_ADAPT */ -#ifdef UNIV_DEBUG +#if defined UNIV_DEBUG && defined BTR_CUR_HASH_ADAPT /********************************************************************//** Assert that the synchronization object in a hash operation involving possible change in the hash table is held. 
diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h index 76f02cc1521..20703a4a933 100644 --- a/storage/innobase/include/ib0mutex.h +++ b/storage/innobase/include/ib0mutex.h @@ -225,7 +225,7 @@ struct TTASFutexMutex { return; } - ut_delay(ut_rnd_interval(0, max_delay)); + ut_delay(max_delay); } for (n_waits= 0;; n_waits++) { @@ -362,7 +362,7 @@ struct TTASMutex { uint32_t n_spins = 0; while (!try_lock()) { - ut_delay(ut_rnd_interval(0, max_delay)); + ut_delay(max_delay); if (++n_spins == max_spins) { os_thread_yield(); max_spins+= step; @@ -516,7 +516,7 @@ struct TTASEventMutex { sync_array_wait_event(sync_arr, cell); } } else { - ut_delay(ut_rnd_interval(0, max_delay)); + ut_delay(max_delay); } } diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 60b07f2fe72..462d0cd4051 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -65,23 +65,6 @@ ulint lock_get_size(void); /*===============*/ /*********************************************************************//** -Creates the lock system at database start. */ -void -lock_sys_create( -/*============*/ - ulint n_cells); /*!< in: number of slots in lock hash table */ -/** Resize the lock hash table. -@param[in] n_cells number of slots in lock hash table */ -void -lock_sys_resize( - ulint n_cells); - -/*********************************************************************//** -Closes the lock system at database shutdown. 
*/ -void -lock_sys_close(void); -/*================*/ -/*********************************************************************//** Gets the heap_no of the smallest user record on a page. @return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ UNIV_INLINE @@ -296,7 +279,7 @@ lock_rec_insert_check_and_lock( dict_index_t* index, /*!< in: index */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr, /*!< in/out: mini-transaction */ - ibool* inherit)/*!< out: set to TRUE if the new + bool* inherit)/*!< out: set to true if the new inserted record maybe should inherit LOCK_GAP type locks from the successor record */ @@ -511,18 +494,6 @@ void lock_trx_release_locks( /*===================*/ trx_t* trx); /*!< in/out: transaction */ -/*********************************************************************//** -Removes locks on a table to be dropped or truncated. -If remove_also_table_sx_locks is TRUE then table-level S and X locks are -also removed in addition to other table-level and record-level locks. -No lock, that is going to be removed, is allowed to be a wait lock. */ -void -lock_remove_all_on_table( -/*=====================*/ - dict_table_t* table, /*!< in: table to be dropped - or truncated */ - ibool remove_also_table_sx_locks);/*!< in: also removes - table S and X locks */ /*********************************************************************//** Calculates the fold value of a page file address: used in inserting or @@ -566,33 +537,9 @@ lock_rec_find_set_bit( bit set */ /*********************************************************************//** -Gets the source table of an ALTER TABLE transaction. The table must be -covered by an IX or IS table lock. 
-@return the source table of transaction, if it is covered by an IX or -IS table lock; dest if there is no source table, and NULL if the -transaction is locking more than two tables or an inconsistency is -found */ -dict_table_t* -lock_get_src_table( -/*===============*/ - trx_t* trx, /*!< in: transaction */ - dict_table_t* dest, /*!< in: destination of ALTER TABLE */ - lock_mode* mode); /*!< out: lock mode of the source table */ -/*********************************************************************//** -Determine if the given table is exclusively "owned" by the given -transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC -on the table. -@return TRUE if table is only locked by trx, with LOCK_IX, and -possibly LOCK_AUTO_INC */ -ibool -lock_is_table_exclusive( -/*====================*/ - const dict_table_t* table, /*!< in: table */ - const trx_t* trx); /*!< in: transaction */ -/*********************************************************************//** Checks if a lock request lock1 has to wait for request lock2. -@return TRUE if lock1 has to wait for lock2 to be removed */ -ibool +@return whether lock1 has to wait for lock2 to be removed */ +bool lock_has_to_wait( /*=============*/ const lock_t* lock1, /*!< in: waiting lock */ @@ -609,7 +556,7 @@ lock_report_trx_id_insanity( const rec_t* rec, /*!< in: user record */ dict_index_t* index, /*!< in: index */ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ - trx_id_t max_trx_id); /*!< in: trx_sys_get_max_trx_id() */ + trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */ /*********************************************************************//** Prints info of locks for all transactions. @return FALSE if not able to obtain lock mutex and exits without @@ -641,7 +588,7 @@ lock_print_info_all_transactions( Return approximate number or record locks (bits set in the bitmap) for this transaction. Since delete-marked records may be removed, the record count will not be precise. 
-The caller must be holding lock_sys->mutex. */ +The caller must be holding lock_sys.mutex. */ ulint lock_number_of_rows_locked( /*=======================*/ @@ -650,7 +597,7 @@ lock_number_of_rows_locked( /*********************************************************************//** Return the number of table locks for a transaction. -The caller must be holding lock_sys->mutex. */ +The caller must be holding lock_sys.mutex. */ ulint lock_number_of_tables_locked( /*=========================*/ @@ -827,7 +774,6 @@ Set the lock system timeout event. */ void lock_set_timeout_event(); /*====================*/ -#ifdef UNIV_DEBUG /*********************************************************************//** Checks that a transaction id is sensible, i.e., not in the future. @return true if ok */ @@ -837,8 +783,8 @@ lock_check_trx_id_sanity( trx_id_t trx_id, /*!< in: trx id */ const rec_t* rec, /*!< in: user record */ dict_index_t* index, /*!< in: index */ - const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */ - MY_ATTRIBUTE((warn_unused_result)); + const ulint* offsets); /*!< in: rec_get_offsets(rec, index) */ +#ifdef UNIV_DEBUG /*******************************************************************//** Check if the transaction holds any locks on the sys tables or its records. 
@@ -934,11 +880,12 @@ struct lock_op_t{ typedef ib_mutex_t LockMutex; /** The lock system struct */ -struct lock_sys_t{ - char pad1[CACHE_LINE_SIZE]; /*!< padding to prevent other - memory update hotspots from - residing on the same memory - cache line */ +class lock_sys_t +{ + bool m_initialised; + +public: + MY_ALIGNED(CACHE_LINE_SIZE) LockMutex mutex; /*!< Mutex protecting the locks */ hash_table_t* rec_hash; /*!< hash table of the record @@ -948,13 +895,13 @@ struct lock_sys_t{ hash_table_t* prdt_page_hash; /*!< hash table of the page lock */ - char pad2[CACHE_LINE_SIZE]; /*!< Padding */ + MY_ALIGNED(CACHE_LINE_SIZE) LockMutex wait_mutex; /*!< Mutex protecting the next two fields */ srv_slot_t* waiting_threads; /*!< Array of user threads suspended while waiting for locks within InnoDB, protected - by the lock_sys->wait_mutex; + by the lock_sys.wait_mutex; os_event_set() and os_event_reset() on waiting_threads[]->event @@ -963,12 +910,7 @@ struct lock_sys_t{ srv_slot_t* last_slot; /*!< highest slot ever used in the waiting_threads array, protected by - lock_sys->wait_mutex */ - ibool rollback_complete; - /*!< TRUE if rollback of all - recovered transactions is - complete. Protected by - lock_sys->mutex */ + lock_sys.wait_mutex */ ulint n_lock_max_wait_time; /*!< Max wait time */ @@ -980,6 +922,38 @@ struct lock_sys_t{ bool timeout_thread_active; /*!< True if the timeout thread is running */ + + + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ + lock_sys_t(): m_initialised(false) {} + + + bool is_initialised() { return m_initialised; } + + + /** + Creates the lock system at database start. + + @param[in] n_cells number of slots in lock hash table + */ + void create(ulint n_cells); + + + /** + Resize the lock hash table. 
+ + @param[in] n_cells number of slots in lock hash table + */ + void resize(ulint n_cells); + + + /** Closes the lock system at database shutdown. */ + void close(); }; /*************************************************************//** @@ -1024,36 +998,36 @@ lock_rec_trx_wait( ulint type); /** The lock system */ -extern lock_sys_t* lock_sys; +extern lock_sys_t lock_sys; -/** Test if lock_sys->mutex can be acquired without waiting. */ +/** Test if lock_sys.mutex can be acquired without waiting. */ #define lock_mutex_enter_nowait() \ - (lock_sys->mutex.trylock(__FILE__, __LINE__)) + (lock_sys.mutex.trylock(__FILE__, __LINE__)) -/** Test if lock_sys->mutex is owned. */ -#define lock_mutex_own() (lock_sys->mutex.is_owned()) +/** Test if lock_sys.mutex is owned. */ +#define lock_mutex_own() (lock_sys.mutex.is_owned()) -/** Acquire the lock_sys->mutex. */ +/** Acquire the lock_sys.mutex. */ #define lock_mutex_enter() do { \ - mutex_enter(&lock_sys->mutex); \ + mutex_enter(&lock_sys.mutex); \ } while (0) -/** Release the lock_sys->mutex. */ +/** Release the lock_sys.mutex. */ #define lock_mutex_exit() do { \ - lock_sys->mutex.exit(); \ + lock_sys.mutex.exit(); \ } while (0) -/** Test if lock_sys->wait_mutex is owned. */ -#define lock_wait_mutex_own() (lock_sys->wait_mutex.is_owned()) +/** Test if lock_sys.wait_mutex is owned. */ +#define lock_wait_mutex_own() (lock_sys.wait_mutex.is_owned()) -/** Acquire the lock_sys->wait_mutex. */ +/** Acquire the lock_sys.wait_mutex. */ #define lock_wait_mutex_enter() do { \ - mutex_enter(&lock_sys->wait_mutex); \ + mutex_enter(&lock_sys.wait_mutex); \ } while (0) -/** Release the lock_sys->wait_mutex. */ +/** Release the lock_sys.wait_mutex. 
*/ #define lock_wait_mutex_exit() do { \ - lock_sys->wait_mutex.exit(); \ + lock_sys.wait_mutex.exit(); \ } while (0) #ifdef WITH_WSREP diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic index b73843e7a1f..dad62c9685c 100644 --- a/storage/innobase/include/lock0lock.ic +++ b/storage/innobase/include/lock0lock.ic @@ -35,7 +35,6 @@ Created 5/7/1996 Heikki Tuuri #include "row0vers.h" #include "que0que.h" #include "btr0cur.h" -#include "read0read.h" #include "log0recv.h" /*********************************************************************//** @@ -64,7 +63,7 @@ lock_rec_hash( ulint page_no)/*!< in: page number */ { return(unsigned(hash_calc_hash(lock_rec_fold(space, page_no), - lock_sys->rec_hash))); + lock_sys.rec_hash))); } /*********************************************************************//** @@ -100,11 +99,11 @@ lock_hash_get( ulint mode) /*!< in: lock mode */ { if (mode & LOCK_PREDICATE) { - return(lock_sys->prdt_hash); + return(lock_sys.prdt_hash); } else if (mode & LOCK_PRDT_PAGE) { - return(lock_sys->prdt_page_hash); + return(lock_sys.prdt_page_hash); } else { - return(lock_sys->rec_hash); + return(lock_sys.rec_hash); } } diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h index 6bb75817ad6..ec596f6ca5b 100644 --- a/storage/innobase/include/lock0priv.h +++ b/storage/innobase/include/lock0priv.h @@ -111,7 +111,7 @@ operator<<(std::ostream& out, const lock_rec_t& lock) return(lock.print(out)); } -/** Lock struct; protected by lock_sys->mutex */ +/** Lock struct; protected by lock_sys.mutex */ struct lock_t { trx_t* trx; /*!< transaction owning the lock */ @@ -721,7 +721,7 @@ public: as a victim, and we got the lock immediately: no need to wait then */ dberr_t add_to_waitq( - const lock_t* wait_for, + lock_t* wait_for, const lock_prdt_t* prdt = NULL); @@ -731,21 +731,22 @@ public: @param[in] owns_trx_mutex true if caller owns the trx_t::mutex @param[in] add_to_hash add the lock to 
hash table @param[in] prdt Predicate lock (optional) + @param[in,out] c_lock Conflicting lock request or NULL + in Galera conflicting lock is selected + as deadlock victim if requester + is BF transaction. @return new lock instance */ lock_t* create( trx_t* trx, bool owns_trx_mutex, bool add_to_hash, const lock_prdt_t* - prdt = NULL); + prdt = NULL +#ifdef WITH_WSREP + ,lock_t* c_lock = NULL +#endif /* WITH_WSREP */ + ) const; - lock_t* create( - lock_t* const c_lock, - trx_t* trx, - bool owns_trx_mutex, - bool add_to_hash, - const lock_prdt_t* - prdt = NULL); /** Check of the lock is on m_rec_id. @param[in] lock Lock to compare with @@ -837,7 +838,7 @@ private: @param[in,out] lock Newly created record lock to add to the rec hash and the transaction lock list @param[in] add_to_hash If the lock should be added to the hash table */ - void lock_add(lock_t* lock, bool add_to_hash); + void lock_add(lock_t* lock, bool add_to_hash) const; /** Check and resolve any deadlocks diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h index d08eaabfb1e..792a5f21acb 100644 --- a/storage/innobase/include/lock0types.h +++ b/storage/innobase/include/lock0types.h @@ -31,7 +31,6 @@ Created 5/7/1996 Heikki Tuuri #define lock_t ib_lock_t struct lock_t; -struct lock_sys_t; struct lock_table_t; /* Basic lock modes */ diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 6a13d1d9640..716ca34b928 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -514,9 +514,11 @@ or the MySQL version that created the redo log file. 
*/ #define LOG_HEADER_FORMAT_3_23 0 /** The MySQL 5.7.9/MariaDB 10.2.2 log format */ #define LOG_HEADER_FORMAT_10_2 1 +/** The MariaDB 10.3.2 log format */ +#define LOG_HEADER_FORMAT_10_3 103 /** The redo log format identifier corresponding to the current format version. Stored in LOG_HEADER_FORMAT. */ -#define LOG_HEADER_FORMAT_CURRENT 103 +#define LOG_HEADER_FORMAT_CURRENT LOG_HEADER_FORMAT_10_3 /** Encrypted MariaDB redo log */ #define LOG_HEADER_FORMAT_ENCRYPTED (1U<<31) @@ -612,15 +614,15 @@ struct log_t{ mtr_commit and still ensure that insertions in the flush_list happen in the LSN order. */ - byte* buf_ptr; /*!< unaligned log buffer, which should - be of double of buf_size */ - byte* buf; /*!< log buffer currently in use; - this could point to either the first - half of the aligned(buf_ptr) or the + byte* buf; /*!< Memory of double the buf_size is + allocated here. This pointer will change + however to either the first half or the second half in turns, so that log write/flush to disk don't block concurrent mtrs which will write - log to this buffer */ + log to this buffer. Care to switch back + to the first half before freeing/resizing + must be undertaken. */ bool first_in_use; /*!< true if buf points to the first half of the aligned(buf_ptr), false if the second half */ diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 4bfbbb4bb7d..8bab2408605 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -99,14 +99,15 @@ recv_sys_debug_free(void); /** Read a log segment to a buffer. 
@param[out] buf buffer @param[in] group redo log files -@param[in] start_lsn read area start +@param[in, out] start_lsn in : read area start, out: the last read valid lsn @param[in] end_lsn read area end -@return valid end_lsn */ -lsn_t +@param[out] invalid_block - invalid, (maybe incompletely written) block encountered +@return false, if invalid block encountered (e.g checksum mismatch), true otherwise */ +bool log_group_read_log_seg( byte* buf, const log_group_t* group, - lsn_t start_lsn, + lsn_t* start_lsn, lsn_t end_lsn); /********************************************************//** @@ -216,6 +217,7 @@ struct recv_sys_t{ /*!< this is TRUE when a log rec application batch is running */ byte* buf; /*!< buffer for parsing log records */ + size_t buf_size; /*!< size of buf */ ulint len; /*!< amount of data in buf */ lsn_t parse_start_lsn; /*!< this is the lsn from which we were able to diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic index 9c996b375d5..dbad7cb6950 100644 --- a/storage/innobase/include/mem0mem.ic +++ b/storage/innobase/include/mem0mem.ic @@ -280,8 +280,7 @@ mem_heap_free_heap_top( mem_block_set_free(block, old_top - (byte*) block); ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); - UNIV_MEM_ASSERT_W(old_top, (byte*) block + block->len - old_top); - UNIV_MEM_ALLOC(old_top, (byte*) block + block->len - old_top); + UNIV_MEM_FREE(old_top, (byte*) block + block->len - old_top); /* If free == start, we may free the block if it is not the first one */ @@ -445,7 +444,6 @@ mem_heap_free_top( /* Subtract the free field of block */ mem_block_set_free(block, mem_block_get_free(block) - MEM_SPACE_NEEDED(n)); - UNIV_MEM_ASSERT_W((byte*) block + mem_block_get_free(block), n); /* If free == start, we may free the block if it is not the first one */ @@ -454,11 +452,7 @@ mem_heap_free_top( == mem_block_get_start(block))) { mem_heap_block_free(heap, block); } else { - /* Avoid a bogus UNIV_MEM_ASSERT_W() warning in 
a - subsequent invocation of mem_heap_free_top(). - Originally, this was UNIV_MEM_FREE(), to catch writes - to freed memory. */ - UNIV_MEM_ALLOC((byte*) block + mem_block_get_free(block), n); + UNIV_MEM_FREE((byte*) block + mem_block_get_free(block), n); } } diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 045a14221a3..6639a3448ea 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -32,13 +32,10 @@ Created 11/26/1995 Heikki Tuuri #include "log0types.h" #include "mtr0types.h" #include "buf0types.h" -#include "trx0types.h" #include "dyn0buf.h" /** Start a mini-transaction. */ #define mtr_start(m) (m)->start() -/** Start a mini-transaction. */ -#define mtr_start_trx(m, t) (m)->start((t)) /** Start a synchronous mini-transaction */ #define mtr_start_sync(m) (m)->start(true) @@ -217,9 +214,6 @@ struct mtr_t { /** Owning mini-transaction */ mtr_t* m_mtr; - - /* Transaction handle */ - trx_t* m_trx; }; mtr_t() @@ -239,15 +233,7 @@ struct mtr_t { /** Start a mini-transaction. @param sync true if it is a synchronous mini-transaction @param read_only true if read only mini-transaction */ - void start(bool sync = true, bool read_only = false) - { - start(NULL, sync, read_only); - } - - /** Start a mini-transaction. - @param sync true if it is a synchronous mini-transaction - @param read_only true if read only mini-transaction */ - void start(trx_t* trx, bool sync = true, bool read_only = false); + void start(bool sync = true, bool read_only = false); /** @return whether this is an asynchronous mini-transaction. 
*/ bool is_async() const @@ -333,7 +319,7 @@ struct mtr_t { the same set of tablespaces as this one */ void set_spaces(const mtr_t& mtr) { - ut_ad(m_impl.m_user_space_id == TRX_SYS_SPACE); + ut_ad(!m_impl.m_user_space_id); ut_ad(!m_impl.m_user_space); ut_ad(!m_impl.m_undo_space); ut_ad(!m_impl.m_sys_space); @@ -350,9 +336,9 @@ struct mtr_t { @return the tablespace */ fil_space_t* set_named_space(ulint space_id) { - ut_ad(m_impl.m_user_space_id == TRX_SYS_SPACE); + ut_ad(!m_impl.m_user_space_id); ut_d(m_impl.m_user_space_id = space_id); - if (space_id == TRX_SYS_SPACE) { + if (!space_id) { return(set_sys_modified()); } else { lookup_user_space(space_id); diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index ac24812cdfc..94d904e8efd 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -100,15 +100,18 @@ enum mlog_id_t { /** Create an index page */ MLOG_PAGE_CREATE = 19, - /** Insert entry in an undo log */ + /** insert an undo log record (used in MariaDB 10.2) */ MLOG_UNDO_INSERT = 20, - /** erase an undo log page end */ + /** erase an undo log page end (used in MariaDB 10.2) */ MLOG_UNDO_ERASE_END = 21, - /** initialize a page in an undo log */ + /** initialize a page in an undo log (used in MariaDB 10.2) */ MLOG_UNDO_INIT = 22, + /** reuse an insert undo log header (used in MariaDB 10.2) */ + MLOG_UNDO_HDR_REUSE = 24, + /** create an undo log header */ MLOG_UNDO_HDR_CREATE = 25, diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 064430cbf4b..13de798280a 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -1195,11 +1195,12 @@ to original un-instrumented file I/O APIs */ # define os_file_read_no_error_handling(type, file, buf, offset, n, o) \ os_file_read_no_error_handling_func(type, file, buf, offset, n, o) # define os_file_read_no_error_handling_int_fd(type, file, buf, offset, n) \ - os_file_read_no_error_handling_func(type, file, buf, offset, n, NULL) + os_file_read_no_error_handling_func(type, OS_FILE_FROM_FD(file), buf, offset, n, NULL) # define os_file_write(type, name, file, buf, offset, n) \ os_file_write_func(type, name, file, buf, offset, n) -# define os_file_write_int_fd os_file_write_func +# define os_file_write_int_fd(type, name, file, buf, offset, n) \ + os_file_write_func(type, name, OS_FILE_FROM_FD(file), buf, offset, n) # define os_file_flush(file) os_file_flush_func(file) diff --git a/storage/innobase/include/os0once.h b/storage/innobase/include/os0once.h index 05a45a69f33..551e78d24ba 100644 --- a/storage/innobase/include/os0once.h +++ b/storage/innobase/include/os0once.h @@ -30,6 +30,7 @@ 
Created Feb 20, 2014 Vasil Dimov #include "univ.i" #include "ut0ut.h" +#include "my_cpu.h" /** Execute a given function exactly once in a multi-threaded environment or wait for the function to be executed by another thread. @@ -110,7 +111,7 @@ public: ut_error; } - UT_RELAX_CPU(); + MY_RELAX_CPU(); } } } diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h index c240f5dacdd..c1b96ef7a1f 100644 --- a/storage/innobase/include/os0thread.h +++ b/storage/innobase/include/os0thread.h @@ -53,12 +53,8 @@ typedef LPTHREAD_START_ROUTINE os_thread_func_t; /** Macro for specifying a Windows thread start function. */ #define DECLARE_THREAD(func) WINAPI func -/** Required to get around a build error on Windows. Even though our functions -are defined/declared as WINAPI f(LPVOID a); the compiler complains that they -are defined as: os_thread_ret_t (__cdecl*)(void*). Because our functions -don't access the arguments and don't return any value, we should be safe. */ #define os_thread_create(f,a,i) \ - os_thread_create_func(reinterpret_cast<os_thread_func_t>(f), a, i) + os_thread_create_func(f, a, i) #else diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index c2b9a833bda..dee08605e58 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -1,6 +1,6 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -733,14 +733,52 @@ ulint page_rec_get_heap_no( /*=================*/ const rec_t* rec); /*!< in: the physical record */ +/** Determine whether a page has any siblings. 
+@param[in] page page frame +@return true if the page has any siblings */ +inline +bool +page_has_siblings(const page_t* page) +{ + compile_time_assert(!(FIL_PAGE_PREV % 8)); + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + return *reinterpret_cast<const uint64_t*>(page + FIL_PAGE_PREV) + != ~uint64_t(0); +} + /** Determine whether a page is an index root page. @param[in] page page frame @return true if the page is a root page of an index */ -UNIV_INLINE +inline bool -page_is_root( - const page_t* page) - MY_ATTRIBUTE((warn_unused_result)); +page_is_root(const page_t* page) +{ + return fil_page_index_page_check(page) && !page_has_siblings(page); +} + +/** Determine whether a page has a predecessor. +@param[in] page page frame +@return true if the page has a predecessor */ +inline +bool +page_has_prev(const page_t* page) +{ + return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_PREV) + != FIL_NULL; +} + +/** Determine whether a page has a successor. +@param[in] page page frame +@return true if the page has a successor */ +inline +bool +page_has_next(const page_t* page) +{ + return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_NEXT) + != FIL_NULL; +} + /************************************************************//** Gets the pointer to the next record on the page. @return pointer to next record */ diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic index ee908896050..da0cd8511af 100644 --- a/storage/innobase/include/page0page.ic +++ b/storage/innobase/include/page0page.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2017, MariaDB Corporation. +Copyright (c) 2016, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -192,13 +192,11 @@ page_header_get_offs( const page_t* page, /*!< in: page */ ulint field) /*!< in: PAGE_FREE, ... */ { - ulint offs; - ut_ad((field == PAGE_FREE) || (field == PAGE_LAST_INSERT) || (field == PAGE_HEAP_TOP)); - offs = page_header_get_field(page, field); + uint16_t offs = page_header_get_field(page, field); ut_ad((field != PAGE_HEAP_TOP) || offs); @@ -277,31 +275,6 @@ page_rec_get_heap_no( } } -/** Determine whether a page is an index root page. -@param[in] page page frame -@return true if the page is a root page of an index */ -UNIV_INLINE -bool -page_is_root( - const page_t* page) -{ -#if FIL_PAGE_PREV % 8 -# error FIL_PAGE_PREV must be 64-bit aligned -#endif -#if FIL_PAGE_NEXT != FIL_PAGE_PREV + 4 -# error FIL_PAGE_NEXT must be adjacent to FIL_PAGE_PREV -#endif -#if FIL_NULL != 0xffffffff -# error FIL_NULL != 0xffffffff -#endif - /* Check that this is an index page and both the PREV and NEXT - pointers are FIL_NULL, because the root page does not have any - siblings. */ - return(fil_page_index_page_check(page) - && *reinterpret_cast<const ib_uint64_t*>(page + FIL_PAGE_PREV) - == IB_UINT64_MAX); -} - /** Determine whether an index page record is a user record. 
@param[in] rec record in an index page @return true if a user record */ @@ -1116,7 +1089,7 @@ page_get_instant(const page_t* page) ut_ad(i <= PAGE_NO_DIRECTION || !page_is_comp(page)); break; case FIL_PAGE_RTREE: - ut_ad(i == PAGE_NO_DIRECTION || i == 0); + ut_ad(i <= PAGE_NO_DIRECTION); break; default: ut_ad(!"invalid page type"); diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h index 13ea38cc385..d9debcf325e 100644 --- a/storage/innobase/include/pars0opt.h +++ b/storage/innobase/include/pars0opt.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,7 +29,6 @@ Created 12/21/1997 Heikki Tuuri #include "univ.i" #include "que0types.h" -#include "usr0types.h" #include "pars0sym.h" #include "dict0types.h" #include "row0sel.h" diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h index dad7953424c..37498c1c638 100644 --- a/storage/innobase/include/pars0pars.h +++ b/storage/innobase/include/pars0pars.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,7 +29,6 @@ Created 11/19/1996 Heikki Tuuri #include "univ.i" #include "que0types.h" -#include "usr0types.h" #include "pars0types.h" #include "row0types.h" #include "trx0types.h" diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h index 4e511719639..920087b96c2 100644 --- a/storage/innobase/include/pars0sym.h +++ b/storage/innobase/include/pars0sym.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,7 +29,6 @@ Created 12/15/1997 Heikki Tuuri #include "univ.i" #include "que0types.h" -#include "usr0types.h" #include "dict0types.h" #include "pars0types.h" #include "row0types.h" diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h index 763b16820d8..ca06f5b09ba 100644 --- a/storage/innobase/include/que0que.h +++ b/storage/innobase/include/que0que.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,7 +33,6 @@ Created 5/27/1996 Heikki Tuuri #include "trx0trx.h" #include "trx0roll.h" #include "srv0srv.h" -#include "usr0types.h" #include "que0types.h" #include "row0types.h" #include "pars0types.h" @@ -336,13 +335,6 @@ enum que_thr_lock_t { QUE_THR_LOCK_TABLE }; -/** From where the cursor position is counted */ -enum que_cur_t { - QUE_CUR_NOT_DEFINED, - QUE_CUR_START, - QUE_CUR_END -}; - /* Query graph query thread node: the fields are protected by the trx_t::mutex with the exceptions named below */ @@ -381,9 +373,6 @@ struct que_thr_t{ thrs; /*!< list of thread nodes of the fork node */ UT_LIST_NODE_T(que_thr_t) - trx_thrs; /*!< lists of threads in wait list of - the trx */ - UT_LIST_NODE_T(que_thr_t) queue; /*!< list of runnable thread nodes in the server task queue */ ulint fk_cascade_depth; /*!< maximum cascading call depth @@ -419,18 +408,7 @@ struct que_fork_t{ generated by the parser, or NULL if the graph was created 'by hand' */ pars_info_t* info; /*!< info struct, or NULL */ - /* The following cur_... 
fields are relevant only in a select graph */ - ulint cur_end; /*!< QUE_CUR_NOT_DEFINED, QUE_CUR_START, - QUE_CUR_END */ - ulint cur_pos; /*!< if there are n rows in the result - set, values 0 and n + 1 mean before - first row, or after last row, depending - on cur_end; values 1...n mean a row - index */ - ibool cur_on_row; /*!< TRUE if cursor is on a row, i.e., - it is not before the first row or - after the last row */ sel_node_t* last_sel_node; /*!< last executed select node, or NULL if none */ UT_LIST_NODE_T(que_fork_t) diff --git a/storage/innobase/include/que0que.ic b/storage/innobase/include/que0que.ic index ec61081cfe2..545d5288298 100644 --- a/storage/innobase/include/que0que.ic +++ b/storage/innobase/include/que0que.ic @@ -23,8 +23,6 @@ Query graph Created 5/27/1996 Heikki Tuuri *******************************************************/ -#include "usr0sess.h" - /***********************************************************************//** Gets the trx of a query thread. */ UNIV_INLINE diff --git a/storage/innobase/include/read0read.h b/storage/innobase/include/read0read.h deleted file mode 100644 index 129341be77c..00000000000 --- a/storage/innobase/include/read0read.h +++ /dev/null @@ -1,125 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/read0read.h -Cursor read - -Created 2/16/1997 Heikki Tuuri -*******************************************************/ - -#ifndef read0read_h -#define read0read_h - -#include "univ.i" - -#include "read0types.h" - -#include <algorithm> - -/** The MVCC read view manager */ -class MVCC { -public: - /** Constructor - @param size Number of views to pre-allocate */ - explicit MVCC(ulint size); - - /** Destructor. - Free all the views in the m_free list */ - ~MVCC(); - - /** - Allocate and create a view. - @param view view owned by this class created for the - caller. Must be freed by calling close() - @param trx transaction creating the view */ - void view_open(ReadView*& view, trx_t* trx); - - /** - Close a view created by the above function. - @para view view allocated by trx_open. - @param own_mutex true if caller owns trx_sys_t::mutex */ - void view_close(ReadView*& view, bool own_mutex); - - /** - Release a view that is inactive but not closed. Caller must own - the trx_sys_t::mutex. - @param view View to release */ - void view_release(ReadView*& view); - - /** Clones the oldest view and stores it in view. No need to - call view_close(). The caller owns the view that is passed in. - It will also move the closed views from the m_views list to the - m_free list. This function is called by Purge to create it view. 
- @param view Preallocated view, owned by the caller */ - void clone_oldest_view(ReadView* view); - - /** - @return the number of active views */ - ulint size() const; - - /** - @return true if the view is active and valid */ - static bool is_view_active(ReadView* view) - { - ut_a(view != reinterpret_cast<ReadView*>(0x1)); - - return(view != NULL && !(intptr_t(view) & 0x1)); - } - - /** - Set the view creator transaction id. Note: This shouldbe set only - for views created by RW transactions. */ - static void set_view_creator_trx_id(ReadView* view, trx_id_t id); - -private: - - /** - Validates a read view list. */ - bool validate() const; - - /** - Find a free view from the active list, if none found then allocate - a new view. This function will also attempt to move delete marked - views from the active list to the freed list. - @return a view to use */ - inline ReadView* get_view(); - - /** - Get the oldest view in the system. It will also move the delete - marked read views from the views list to the freed list. - @return oldest view if found or NULL */ - inline ReadView* get_oldest_view() const; - -private: - // Prevent copying - MVCC(const MVCC&); - MVCC& operator=(const MVCC&); - -private: - typedef UT_LIST_BASE_NODE_T(ReadView) view_list_t; - - /** Free views ready for reuse. */ - view_list_t m_free; - - /** Active and closed views, the closed views will have the - creator trx id set to TRX_ID_MAX */ - view_list_t m_views; -}; - -#endif /* read0read_h */ diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h index 8056dbf437f..3a06190b61d 100644 --- a/storage/innobase/include/read0types.h +++ b/storage/innobase/include/read0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -31,122 +32,185 @@ Created 2/16/1997 Heikki Tuuri #include "trx0types.h" -// Friend declaration -class MVCC; -/** Read view lists the trx ids of those transactions for which a consistent -read should not see the modifications to the database. */ +/** View is not in MVCC and not visible to purge thread. */ +#define READ_VIEW_STATE_CLOSED 0 -class ReadView { - /** This is similar to a std::vector but it is not a drop - in replacement. It is specific to ReadView. */ - class ids_t { - typedef trx_ids_t::value_type value_type; +/** View is in MVCC, but not visible to purge thread. */ +#define READ_VIEW_STATE_REGISTERED 1 - /** - Constructor */ - ids_t() : m_ptr(), m_size(), m_reserved() { } +/** View is in MVCC, purge thread must wait for READ_VIEW_STATE_OPEN. */ +#define READ_VIEW_STATE_SNAPSHOT 2 - /** - Destructor */ - ~ids_t() { UT_DELETE_ARRAY(m_ptr); } +/** View is in MVCC and is visible to purge thread. */ +#define READ_VIEW_STATE_OPEN 3 - /** - Try and increase the size of the array. Old elements are - copied across. It is a no-op if n is < current size. - @param n Make space for n elements */ - void reserve(ulint n); +/** + Read view lists the trx ids of those transactions for which a consistent read + should not see the modifications to the database. +*/ +class ReadView +{ + /** + View state. - /** - Resize the array, sets the current element count. - @param n new size of the array, in elements */ - void resize(ulint n) - { - ut_ad(n <= capacity()); + It is not defined as enum as it has to be updated using atomic operations. + Possible values are READ_VIEW_STATE_CLOSED, READ_VIEW_STATE_REGISTERED, + READ_VIEW_STATE_SNAPSHOT and READ_VIEW_STATE_OPEN. 
- m_size = n; - } - - /** - Reset the size to 0 */ - void clear() { resize(0); } - - /** - @return the capacity of the array in elements */ - ulint capacity() const { return(m_reserved); } - - /** - Copy and overwrite the current array contents - - @param start Source array - @param end Pointer to end of array */ - void assign(const value_type* start, const value_type* end); - - /** - Insert the value in the correct slot, preserving the order. - Doesn't check for duplicates. */ - void insert(value_type value); - - /** - @return the value of the first element in the array */ - value_type front() const - { - ut_ad(!empty()); + Possible state transfers... - return(m_ptr[0]); - } - - /** - @return the value of the last element in the array */ - value_type back() const - { - ut_ad(!empty()); - - return(m_ptr[m_size - 1]); - } + Opening view for the first time: + READ_VIEW_STATE_CLOSED -> READ_VIEW_STATE_SNAPSHOT (non-atomic) - /** - Append a value to the array. - @param value the value to append */ - void push_back(value_type value); + Complete first time open or reopen: + READ_VIEW_STATE_SNAPSHOT -> READ_VIEW_STATE_OPEN (atomic) - /** - @return a pointer to the start of the array */ - trx_id_t* data() { return(m_ptr); }; + Close view but keep it in list: + READ_VIEW_STATE_OPEN -> READ_VIEW_STATE_REGISTERED (atomic) - /** - @return a const pointer to the start of the array */ - const trx_id_t* data() const { return(m_ptr); }; + Close view and remove it from list: + READ_VIEW_STATE_OPEN -> READ_VIEW_STATE_CLOSED (non-atomic) - /** - @return the number of elements in the array */ - ulint size() const { return(m_size); } + Reusing view: + READ_VIEW_STATE_REGISTERED -> READ_VIEW_STATE_SNAPSHOT (atomic) - /** - @return true if size() == 0 */ - bool empty() const { return(size() == 0); } + Removing closed view from list: + READ_VIEW_STATE_REGISTERED -> READ_VIEW_STATE_CLOSED (non-atomic) + */ + int32_t m_state; - private: - // Prevent copying - ids_t(const ids_t&); - ids_t& 
operator=(const ids_t&); - private: - /** Memory for the array */ - value_type* m_ptr; - - /** Number of active elements in the array */ - ulint m_size; +public: + ReadView(): m_state(READ_VIEW_STATE_CLOSED) {} + + + /** + Copy state from another view. + + This method is used to find min(m_low_limit_no), min(m_low_limit_id) and + all transaction ids below min(m_low_limit_id). These values effectively + form oldest view. + + @param other view to copy from + */ + void copy(const ReadView &other) + { + ut_ad(&other != this); + if (m_low_limit_no > other.m_low_limit_no) + m_low_limit_no= other.m_low_limit_no; + if (m_low_limit_id > other.m_low_limit_id) + m_low_limit_id= other.m_low_limit_id; + + trx_ids_t::iterator dst= m_ids.begin(); + for (trx_ids_t::const_iterator src= other.m_ids.begin(); + src != other.m_ids.end(); src++) + { + if (*src >= m_low_limit_id) + break; +loop: + if (dst == m_ids.end()) + { + m_ids.push_back(*src); + dst= m_ids.end(); + continue; + } + if (*dst < *src) + { + dst++; + goto loop; + } + else if (*dst > *src) + dst= m_ids.insert(dst, *src) + 1; + } + m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id), + m_ids.end()); + + m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front(); + ut_ad(m_up_limit_id <= m_low_limit_id); + } + + + /** + Opens a read view where exactly the transactions serialized before this + point in time are seen in the view. + + View becomes visible to purge thread via trx_sys.m_views. + + @param[in,out] trx transaction + */ + void open(trx_t *trx); + + + /** + Closes the view. + + View becomes not visible to purge thread via trx_sys.m_views. + */ + void close(); + + + /** + Marks view unused. + + View is still in trx_sys.m_views list, but is not visible to purge threads. 
+ */ + void unuse() + { + ut_ad(m_state == READ_VIEW_STATE_CLOSED || + m_state == READ_VIEW_STATE_REGISTERED || + m_state == READ_VIEW_STATE_OPEN); + if (m_state == READ_VIEW_STATE_OPEN) + my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_REGISTERED, + MY_MEMORY_ORDER_RELAXED); + } + + + /** m_state getter for trx_sys::clone_oldest_view() trx_sys::size(). */ + int32_t get_state() const + { + return my_atomic_load32_explicit(const_cast<int32*>(&m_state), + MY_MEMORY_ORDER_ACQUIRE); + } + + + /** + Returns true if view is open. + + Only used by view owner thread, thus we can omit atomic operations. + */ + bool is_open() const + { + ut_ad(m_state == READ_VIEW_STATE_OPEN || + m_state == READ_VIEW_STATE_CLOSED || + m_state == READ_VIEW_STATE_REGISTERED); + return m_state == READ_VIEW_STATE_OPEN; + } + + + /** + Creates a snapshot where exactly the transactions serialized before this + point in time are seen in the view. + + @param[in,out] trx transaction + */ + void snapshot(trx_t *trx); + + + /** + Sets the creator transaction id. + + This should be set only for views created by RW transactions. + */ + void set_creator_trx_id(trx_id_t id) + { + ut_ad(id > 0); + ut_ad(m_creator_trx_id == 0); + m_creator_trx_id= id; + } - /** Size of m_ptr in elements */ - ulint m_reserved; - friend class ReadView; - }; -public: - ReadView(); - ~ReadView(); /** Check whether transaction id is valid. @param[in] id transaction id to check @param[in] name table name */ @@ -179,9 +243,7 @@ public: return(true); } - const ids_t::value_type* p = m_ids.data(); - - return(!std::binary_search(p, p + m_ids.size(), id)); + return(!std::binary_search(m_ids.begin(), m_ids.end(), id)); } /** @@ -193,21 +255,6 @@ public: } /** - Mark the view as closed */ - void close() - { - ut_ad(m_creator_trx_id != TRX_ID_MAX); - m_creator_trx_id = TRX_ID_MAX; - } - - /** - @return true if the view is closed */ - bool is_closed() const - { - return(m_closed); - } - - /** Write the limits to the file. 
@param file file to write to */ void print_limits(FILE* file) const @@ -232,66 +279,6 @@ public: return(m_low_limit_id); } - /** - @return true if there are no transaction ids in the snapshot */ - bool empty() const - { - return(m_ids.empty()); - } - -#ifdef UNIV_DEBUG - /** - @param rhs view to compare with - @return truen if this view is less than or equal rhs */ - bool le(const ReadView* rhs) const - { - return(m_low_limit_no <= rhs->m_low_limit_no); - } - - trx_id_t up_limit_id() const - { - return(m_up_limit_id); - } -#endif /* UNIV_DEBUG */ -private: - /** - Copy the transaction ids from the source vector */ - inline void copy_trx_ids(const trx_ids_t& trx_ids); - - /** - Opens a read view where exactly the transactions serialized before this - point in time are seen in the view. - @param id Creator transaction id */ - inline void prepare(trx_id_t id); - - /** - Complete the read view creation */ - inline void complete(); - - /** - Copy state from another view. Must call copy_complete() to finish. 
- @param other view to copy from */ - inline void copy_prepare(const ReadView& other); - - /** - Complete the copy, insert the creator transaction id into the - m_trx_ids too and adjust the m_up_limit_id *, if required */ - inline void copy_complete(); - - /** - Set the creator transaction id, existing id must be 0 */ - void creator_trx_id(trx_id_t id) - { - ut_ad(m_creator_trx_id == 0); - m_creator_trx_id = id; - } - - friend class MVCC; - -private: - // Disable copying - ReadView(const ReadView&); - ReadView& operator=(const ReadView&); private: /** The read should not see any transaction with trx id >= this @@ -309,21 +296,16 @@ private: /** Set of RW transactions that was active when this snapshot was taken */ - ids_t m_ids; + trx_ids_t m_ids; /** The view does not need to see the undo logs for transactions whose transaction number is strictly smaller (<) than this value: they can be removed in purge if not needed by other views */ trx_id_t m_low_limit_no; - /** AC-NL-RO transaction view that has been "closed". */ - bool m_closed; - - typedef UT_LIST_NODE_T(ReadView) node_t; - - /** List of read views in trx_sys */ - byte pad1[64 - sizeof(node_t)]; - node_t m_view_list; + byte pad1[CACHE_LINE_SIZE]; +public: + UT_LIST_NODE_T(ReadView) m_view_list; }; #endif diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 062e4f8d8ab..da82361875c 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -367,7 +367,7 @@ rec_set_deleted_flag_new( The following function tells if a new-style record is a node pointer. 
@return TRUE if node pointer */ UNIV_INLINE -ibool +bool rec_get_node_ptr_flag( /*==================*/ const rec_t* rec) /*!< in: physical record */ diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic index cc66149945c..bc9006a66e8 100644 --- a/storage/innobase/include/rem0rec.ic +++ b/storage/innobase/include/rem0rec.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -731,7 +731,7 @@ rec_set_deleted_flag_new( The following function tells if a new-style record is a node pointer. @return TRUE if node pointer */ UNIV_INLINE -ibool +bool rec_get_node_ptr_flag( /*==================*/ const rec_t* rec) /*!< in: physical record */ @@ -895,7 +895,7 @@ rec_offs_set_n_alloc( { ut_ad(offsets); ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); - UNIV_MEM_ASSERT_AND_ALLOC(offsets, n_alloc * sizeof *offsets); + UNIV_MEM_ALLOC(offsets, n_alloc * sizeof *offsets); offsets[0] = n_alloc; } diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h index 8cb3a2f16cd..ed425390ed2 100644 --- a/storage/innobase/include/row0ins.h +++ b/storage/innobase/include/row0ins.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -198,10 +198,13 @@ struct ins_node_t{ this should be reset to NULL */ UT_LIST_BASE_NODE_T(dtuple_t) entry_list;/* list of entries, one for each index */ - byte* row_id_buf;/* buffer for the row id sys field in row */ + /** buffer for the system columns */ + byte sys_buf[DATA_ROW_ID_LEN + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; trx_id_t trx_id; /*!< trx id or the last trx which executed the node */ - byte* trx_id_buf;/* buffer for the trx id sys field in row */ + byte vers_start_buf[8]; /* Buffers for System Versioning */ + byte vers_end_buf[8]; /* system fields. */ mem_heap_t* entry_sys_heap; /* memory heap used as auxiliary storage; entry_list and sys fields are stored here; @@ -227,5 +230,4 @@ struct ins_node_t{ #define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */ #define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and inserted */ - #endif diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index ca620cbef59..eb4da62164b 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -59,9 +59,6 @@ Created 13/06/2005 Jan Lindstrom // Forward declaration struct ib_sequence_t; -/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ -extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; - /** @brief Block size for I/O operations in merge sort. The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty() @@ -325,6 +322,7 @@ this function and it will be passed to other functions for further accounting. @param[in] add_v new virtual columns added along with indexes @param[in] eval_table mysql table used to evaluate virtual column value, see innobase_get_computed_value(). 
+@param[in] drop_historical whether to drop historical system rows @return DB_SUCCESS or error code */ dberr_t row_merge_build_indexes( @@ -343,7 +341,8 @@ row_merge_build_indexes( bool skip_pk_sort, ut_stage_alter_t* stage, const dict_add_v_col_t* add_v, - struct TABLE* eval_table) + struct TABLE* eval_table, + bool drop_historical) MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h index a7a55d202e8..61a363d6de8 100644 --- a/storage/innobase/include/row0mysql.h +++ b/storage/innobase/include/row0mysql.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -219,30 +219,32 @@ row_lock_table_autoinc_for_mysql( table handle */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/*********************************************************************//** -Sets a table lock on the table mentioned in prebuilt. +/** Lock a table. 
+@param[in,out] prebuilt table handle @return error code or DB_SUCCESS */ dberr_t -row_lock_table_for_mysql( -/*=====================*/ - row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL - table handle */ - dict_table_t* table, /*!< in: table to lock, or NULL - if prebuilt->table should be - locked as - prebuilt->select_lock_type */ - ulint mode) /*!< in: lock mode of table - (ignored if table==NULL) */ - MY_ATTRIBUTE((nonnull(1))); +row_lock_table(row_prebuilt_t* prebuilt); + +/** System Versioning: row_insert_for_mysql() modes */ +enum ins_mode_t { + /* plain row (without versioning) */ + ROW_INS_NORMAL = 0, + /* row_start = TRX_ID, row_end = MAX */ + ROW_INS_VERSIONED, + /* row_end = TRX_ID */ + ROW_INS_HISTORICAL +}; /** Does an insert for MySQL. @param[in] mysql_rec row in the MySQL format @param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] ins_mode what row type we're inserting @return error code or DB_SUCCESS*/ dberr_t row_insert_for_mysql( const byte* mysql_rec, - row_prebuilt_t* prebuilt) + row_prebuilt_t* prebuilt, + ins_mode_t ins_mode) MY_ATTRIBUTE((warn_unused_result)); /*********************************************************************//** @@ -266,7 +268,8 @@ row_get_prebuilt_update_vector( @param[in,out] prebuilt prebuilt struct in MySQL handle @return error code or DB_SUCCESS */ dberr_t -row_update_for_mysql(row_prebuilt_t* prebuilt) +row_update_for_mysql( + row_prebuilt_t* prebuilt) MY_ATTRIBUTE((warn_unused_result)); /** This can only be used when srv_locks_unsafe_for_binlog is TRUE or this @@ -307,6 +310,18 @@ row_create_update_node_for_mysql( /*=============================*/ dict_table_t* table, /*!< in: table to update */ mem_heap_t* heap); /*!< in: mem heap from which allocated */ + +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. 
+@return error code or DB_SUCCESS */ +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); /*********************************************************************//** Locks the data dictionary exclusively for performing a table create or other data dictionary modification operation. */ @@ -420,6 +435,10 @@ ulint row_get_background_drop_list_len_low(void); /*======================================*/ +/** Drop garbage tables during recovery. */ +void +row_mysql_drop_garbage_tables(); + /*********************************************************************//** Sets an exclusive lock on a table. @return error code or DB_SUCCESS */ @@ -504,18 +523,6 @@ row_rename_table_for_mysql( bool commit) /*!< in: whether to commit trx */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/** Renames a partitioned table for MySQL. -@param[in] old_name Old table name. -@param[in] new_name New table name. -@param[in,out] trx Transaction. -@return error code or DB_SUCCESS */ -dberr_t -row_rename_partitions_for_mysql( - const char* old_name, - const char* new_name, - trx_t* trx) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - /*********************************************************************//** Scans an index for either COOUNT(*) or CHECK TABLE. 
If CHECK TABLE; Checks that the index contains entries in an ascending order, @@ -668,6 +675,8 @@ struct row_prebuilt_t { not to be confused with InnoDB externally stored columns (VARCHAR can be off-page too) */ + unsigned versioned_write:1;/*!< whether this is + a versioned write */ mysql_row_templ_t* mysql_template;/*!< template used to transform rows fast between MySQL and Innobase formats; memory for this template @@ -844,6 +853,20 @@ struct row_prebuilt_t { /** The MySQL table object */ TABLE* m_mysql_table; + + /** Get template by dict_table_t::cols[] number */ + const mysql_row_templ_t* get_template_by_col(ulint col) const + { + ut_ad(col < n_template); + ut_ad(mysql_template); + for (ulint i = col; i < n_template; ++i) { + const mysql_row_templ_t* templ = &mysql_template[i]; + if (!templ->is_virtual && templ->col_no == col) { + return templ; + } + } + return NULL; + } }; /** Callback for row_mysql_sys_index_iterate() */ diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h index 92b5942966b..01fc6cda6ae 100644 --- a/storage/innobase/include/row0upd.h +++ b/storage/innobase/include/row0upd.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -32,20 +32,10 @@ Created 12/27/1996 Heikki Tuuri #include "btr0types.h" #include "dict0types.h" #include "trx0types.h" -#include <stack> #include "btr0pcur.h" #include "que0types.h" #include "pars0types.h" -/** The std::deque to store cascade update nodes, that uses mem_heap_t -as allocator. 
*/ -typedef std::deque<upd_node_t*, mem_heap_allocator<upd_node_t*> > - deque_mem_heap_t; - -/** Double-ended queue of update nodes to be processed for cascade -operations */ -typedef deque_mem_heap_t upd_cascade_t; - /*********************************************************************//** Creates an update vector object. @return own: update vector object */ @@ -136,8 +126,7 @@ row_upd_rec_sys_fields( dict_index_t* index, /*!< in: clustered index */ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ const trx_t* trx, /*!< in: transaction */ - roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record, - can be 0 during IMPORT */ + roll_ptr_t roll_ptr);/*!< in: DB_ROLL_PTR to the undo log */ /*********************************************************************//** Sets the trx id or roll ptr field of a clustered index entry. */ void @@ -464,6 +453,7 @@ struct upd_t{ virtual column update now */ ulint n_fields; /*!< number of update fields */ upd_field_t* fields; /*!< array of update fields */ + byte vers_sys_value[8]; /*!< buffer for updating system fields */ /** Append an update field to the end of array @param[in] field an update field */ @@ -484,6 +474,17 @@ struct upd_t{ return(false); } + /** Determine if the update affects a system versioned column. 
*/ + bool affects_versioned() const + { + for (ulint i = 0; i < n_fields; i++) { + if (fields[i].new_val.type.vers_sys_field()) { + return true; + } + } + return false; + } + #ifdef UNIV_DEBUG bool validate() const { @@ -500,12 +501,19 @@ struct upd_t{ }; +/** Kinds of update operation */ +enum delete_mode_t { + NO_DELETE = 0, /*!< this operation does not delete */ + PLAIN_DELETE, /*!< ordinary delete */ + VERSIONED_DELETE /*!< update old and insert a new row */ +}; + /* Update node structure which also implements the delete operation of a row */ struct upd_node_t{ que_common_t common; /*!< node type: QUE_NODE_UPDATE */ - ibool is_delete;/* TRUE if delete, FALSE if update */ + delete_mode_t is_delete; /*!< kind of DELETE */ ibool searched_update; /* TRUE if searched update, FALSE if positioned */ @@ -515,38 +523,12 @@ struct upd_node_t{ dict_foreign_t* foreign;/* NULL or pointer to a foreign key constraint if this update node is used in doing an ON DELETE or ON UPDATE operation */ - - bool cascade_top; - /*!< true if top level in cascade */ - - upd_cascade_t* cascade_upd_nodes; - /*!< Queue of update nodes to handle the - cascade of update and delete operations in an - iterative manner. Their parent/child - relations are properly maintained. All update - nodes point to this same queue. All these - nodes are allocated in heap pointed to by - upd_node_t::cascade_heap. */ - - upd_cascade_t* new_upd_nodes; - /*!< Intermediate list of update nodes in a - cascading update/delete operation. After - processing one update node, this will be - concatenated to cascade_upd_nodes. This extra - list is needed so that retry because of - DB_LOCK_WAIT works corrrectly. */ - - upd_cascade_t* processed_cascades; - /*!< List of processed update nodes in a - cascading update/delete operation. All the - cascade nodes are stored here, so that memory - can be freed. 
*/ - + upd_node_t* cascade_node;/* NULL or an update node template which + is used to implement ON DELETE/UPDATE CASCADE + or ... SET NULL for foreign keys */ mem_heap_t* cascade_heap; - /*!< NULL or a mem heap where cascade_upd_nodes - are created. This heap is owned by the node - that has cascade_top=true. */ - + /*!< NULL or a mem heap where cascade + node is created.*/ sel_node_t* select; /*!< query graph subtree implementing a base table cursor: the rows returned will be updated */ @@ -593,25 +575,8 @@ struct upd_node_t{ sym_node_t* table_sym;/* table node in symbol table */ que_node_t* col_assign_list; /* column assignment list */ - - doc_id_t fts_doc_id; - /* The FTS doc id of the row that is now - pointed to by the pcur. */ - - doc_id_t fts_next_doc_id; - /* The new fts doc id that will be used - in update operation */ - ulint magic_n; -#ifndef DBUG_OFF - /** Print information about this object into the trace log file. */ - void dbug_trace(); - - /** Ensure that the member cascade_upd_nodes has only one update node - for each of the tables. This is useful for testing purposes. */ - void check_cascade_only_once(); -#endif /* !DBUG_OFF */ }; #define UPD_NODE_MAGIC_N 1579975 diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic index 11271d6e9af..364c876ecc7 100644 --- a/storage/innobase/include/row0upd.ic +++ b/storage/innobase/include/row0upd.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -165,8 +165,7 @@ row_upd_rec_sys_fields( dict_index_t* index, /*!< in: clustered index */ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ const trx_t* trx, /*!< in: transaction */ - roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record, - can be 0 during IMPORT */ + roll_ptr_t roll_ptr)/*!< in: DB_ROLL_PTR to the undo log */ { ut_ad(dict_index_is_clust(index)); ut_ad(rec_offs_validate(rec, index, offsets)); diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h index b28533578e1..645f11faaad 100644 --- a/storage/innobase/include/row0vers.h +++ b/storage/innobase/include/row0vers.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,19 +39,20 @@ Created 2/6/1997 Heikki Tuuri // Forward declaration class ReadView; -/*****************************************************************//** -Finds out if an active transaction has inserted or modified a secondary +/** Determine if an active transaction has inserted or modified a secondary index record. -@return 0 if committed, else the active transaction id; -NOTE that this function can return false positives but never false -negatives. The caller must confirm all positive results by calling -trx_is_active() while holding lock_sys->mutex. 
*/ +@param[in,out] caller_trx trx of current thread +@param[in] rec secondary index record +@param[in] index secondary index +@param[in] offsets rec_get_offsets(rec, index) +@return the active transaction; trx->release_reference() must be invoked +@retval NULL if the record was committed */ trx_t* row_vers_impl_x_locked( -/*===================*/ - const rec_t* rec, /*!< in: record in a secondary index */ - dict_index_t* index, /*!< in: the secondary index */ - const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ + trx_t* caller_trx, + const rec_t* rec, + dict_index_t* index, + const ulint* offsets); /*****************************************************************//** Finds out if we must preserve a delete marked earlier version of a clustered @@ -126,6 +128,7 @@ which should be seen by a semi-consistent read. */ void row_vers_build_for_semi_consistent_read( /*====================================*/ + trx_t* caller_trx,/*!<in/out: trx of current thread */ const rec_t* rec, /*!< in: record in a clustered index; the caller must have a latch on the page; this latch locks the top of the stack of versions diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index e4034f3a6ff..b91f7c1103b 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -608,8 +608,9 @@ Use MONITOR_INC if appropriate mutex protection exists. #define MONITOR_ATOMIC_INC_LOW(monitor, enabled) \ if (enabled) { \ ib_uint64_t value; \ - value = my_atomic_add64( \ - (int64*) &MONITOR_VALUE(monitor), 1) + 1; \ + value = my_atomic_add64_explicit( \ + (int64*) &MONITOR_VALUE(monitor), 1, \ + MY_MEMORY_ORDER_RELAXED) + 1; \ /* Note: This is not 100% accurate because of the \ inherent race, we ignore it due to performance. */ \ if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) { \ @@ -624,8 +625,9 @@ Use MONITOR_DEC if appropriate mutex protection exists. 
#define MONITOR_ATOMIC_DEC_LOW(monitor, enabled) \ if (enabled) { \ ib_uint64_t value; \ - value = my_atomic_add64( \ - (int64*) &MONITOR_VALUE(monitor), -1) - 1; \ + value = my_atomic_add64_explicit( \ + (int64*) &MONITOR_VALUE(monitor), -1, \ + MY_MEMORY_ORDER_RELAXED) - 1; \ /* Note: This is not 100% accurate because of the \ inherent race, we ignore it due to performance. */ \ if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) { \ diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 580a660cedc..be29b184387 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -381,8 +381,6 @@ extern ulong srv_n_page_hash_locks; /** Scan depth for LRU flush batch i.e.: number of blocks scanned*/ extern ulong srv_LRU_scan_depth; /** Whether or not to flush neighbors of a block */ -extern ulong srv_buf_pool_dump_pct; /*!< dump that may % of each buffer - pool during BP dump */ extern ulong srv_flush_neighbors; /** Previously requested size */ extern ulint srv_buf_pool_old_size; @@ -392,6 +390,10 @@ extern ulint srv_buf_pool_base_size; extern ulint srv_buf_pool_curr_size; /** Dump this % of each buffer pool during BP dump */ extern ulong srv_buf_pool_dump_pct; +#ifdef UNIV_DEBUG +/** Abort load after this amount of pages */ +extern ulong srv_buf_pool_load_pages_abort; +#endif /** Lock table size in bytes */ extern ulint srv_lock_table_size; @@ -611,16 +613,16 @@ extern mysql_pfs_key_t trx_rollback_clean_thread_key; schema */ # define pfs_register_thread(key) \ do { \ - struct PSI_thread* psi = PSI_THREAD_CALL(new_thread)(key, NULL, 0);\ + struct PSI_thread* psi = PSI_CALL_new_thread(key, NULL, 0);\ /* JAN: TODO: MYSQL 5.7 PSI \ - PSI_THREAD_CALL(set_thread_os_id)(psi); */ \ - PSI_THREAD_CALL(set_thread)(psi); \ + PSI_CALL_set_thread_os_id(psi); */ \ + PSI_CALL_set_thread(psi); \ } while (0) /* This macro delist the current thread from performance schema */ # define pfs_delete_thread() \ do { \ - 
PSI_THREAD_CALL(delete_current_thread)(); \ + PSI_CALL_delete_current_thread(); \ } while (0) # else # define pfs_register_thread(key) @@ -946,6 +948,7 @@ struct export_var_t{ char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */ char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */ char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */ + my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */ ulint innodb_buffer_pool_pages_data; /*!< Data pages */ ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */ @@ -1018,9 +1021,6 @@ struct export_var_t{ ulint innodb_onlineddl_pct_progress; /*!< Online alter progress */ #ifdef UNIV_DEBUG - ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */ - ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id - - purged view's min trx_id */ ulint innodb_ahi_drop_lookups; /*!< number of adaptive hash index lookups when freeing file pages */ diff --git a/storage/innobase/include/sync0policy.h b/storage/innobase/include/sync0policy.h index a91270bde58..78e77f93269 100644 --- a/storage/innobase/include/sync0policy.h +++ b/storage/innobase/include/sync0policy.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -105,7 +105,7 @@ public: msg << m_mutex->policy().to_string(); - if (os_thread_pf(m_thread_id) != ULINT_UNDEFINED) { + if (m_thread_id != ULINT_UNDEFINED) { msg << " addr: " << m_mutex << " acquired: " << locked_from().c_str(); @@ -454,14 +454,7 @@ public: void destroy() UNIV_NOTHROW { - latch_meta_t& meta = sync_latch_get_meta(m_id); - - ut_ad(meta.get_id() == m_id); - - meta.get_counter()->sum_deregister(m_count); - m_count = NULL; - ut_d(MutexDebug<MutexType>::destroy()); } diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h index 888a32007ce..b61553fc380 100644 --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -501,13 +501,13 @@ bool rw_lock_lock_word_decr( /*===================*/ rw_lock_t* lock, /*!< in/out: rw-lock */ - ulint amount, /*!< in: amount to decrement */ - lint threshold); /*!< in: threshold of judgement */ + int32_t amount, /*!< in: amount to decrement */ + int32_t threshold); /*!< in: threshold of judgement */ #ifdef UNIV_DEBUG /******************************************************************//** Checks if the thread has locked the rw-lock in the specified mode, with the pass value == 0. */ -ibool +bool rw_lock_own( /*========*/ rw_lock_t* lock, /*!< in: rw-lock */ @@ -571,10 +571,10 @@ struct rw_lock_t #endif /* UNIV_DEBUG */ { /** Holds the state of the lock. */ - volatile lint lock_word; + int32_t lock_word; /** 1: there are waiters */ - volatile uint32_t waiters; + int32_t waiters; /** number of granted SX locks. 
*/ volatile ulint sx_recursive; @@ -603,9 +603,6 @@ struct rw_lock_t /** File name where lock created */ const char* cfile_name; - /** last s-lock file/line is not guaranteed to be correct */ - const char* last_s_file_name; - /** File name where last x-locked */ const char* last_x_file_name; @@ -615,9 +612,6 @@ struct rw_lock_t /** If 1 then the rw-lock is a block lock */ unsigned is_block_lock:1; - /** Line number where last time s-locked */ - unsigned last_s_line:14; - /** Line number where last time x-locked */ unsigned last_x_line:14; diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic index a048476d0e8..8a1a3741b47 100644 --- a/storage/innobase/include/sync0rw.ic +++ b/storage/innobase/include/sync0rw.ic @@ -77,7 +77,8 @@ rw_lock_get_writer( /*===============*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - lint lock_word = lock->lock_word; + int32_t lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_word <= X_LOCK_DECR); if (lock_word > X_LOCK_HALF_DECR) { @@ -109,7 +110,8 @@ rw_lock_get_reader_count( /*=====================*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - lint lock_word = lock->lock_word; + int32_t lock_word = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_word <= X_LOCK_DECR); if (lock_word > X_LOCK_HALF_DECR) { @@ -145,7 +147,8 @@ rw_lock_get_x_lock_count( /*=====================*/ const rw_lock_t* lock) /*!< in: rw-lock */ { - lint lock_copy = lock->lock_word; + int32_t lock_copy = my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_copy <= X_LOCK_DECR); if (lock_copy == 0 || lock_copy == -X_LOCK_HALF_DECR) { @@ -178,7 +181,8 @@ rw_lock_get_sx_lock_count( const rw_lock_t* lock) /*!< in: rw-lock */ { #ifdef UNIV_DEBUG - lint lock_copy = lock->lock_word; + int32_t lock_copy = 
my_atomic_load32_explicit(const_cast<int32_t*>(&lock->lock_word), + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_copy <= X_LOCK_DECR); @@ -197,9 +201,7 @@ rw_lock_get_sx_lock_count( } /******************************************************************//** -Two different implementations for decrementing the lock_word of a rw_lock: -one for systems supporting atomic operations, one for others. This does -does not support recusive x-locks: they should be handled by the caller and +Recursive x-locks are not supported: they should be handled by the caller and need not be atomic since they are performed by the current lock holder. Returns true if the decrement was made, false if not. @return true if decr occurs */ @@ -208,17 +210,17 @@ bool rw_lock_lock_word_decr( /*===================*/ rw_lock_t* lock, /*!< in/out: rw-lock */ - ulint amount, /*!< in: amount to decrement */ - lint threshold) /*!< in: threshold of judgement */ + int32_t amount, /*!< in: amount to decrement */ + int32_t threshold) /*!< in: threshold of judgement */ { - lint local_lock_word; - - local_lock_word = my_atomic_loadlint_explicit(&lock->lock_word, + int32_t lock_copy = my_atomic_load32_explicit(&lock->lock_word, MY_MEMORY_ORDER_RELAXED); - while (local_lock_word > threshold) { - if (my_atomic_caslint(&lock->lock_word, - &local_lock_word, - local_lock_word - amount)) { + while (lock_copy > threshold) { + if (my_atomic_cas32_strong_explicit(&lock->lock_word, + &lock_copy, + lock_copy - amount, + MY_MEMORY_ORDER_ACQUIRE, + MY_MEMORY_ORDER_RELAXED)) { return(true); } } @@ -247,11 +249,6 @@ rw_lock_s_lock_low( ut_d(rw_lock_add_debug_info(lock, pass, RW_LOCK_S, file_name, line)); - /* These debugging values are not set safely: they may be incorrect - or even refer to a line that is invalid for the file name. 
*/ - lock->last_s_file_name = file_name; - lock->last_s_line = line; - return(TRUE); /* locking succeeded */ } @@ -306,29 +303,32 @@ rw_lock_x_lock_func_nowait( const char* file_name,/*!< in: file name where lock requested */ unsigned line) /*!< in: line where requested */ { - lint oldval = X_LOCK_DECR; + int32_t oldval = X_LOCK_DECR; - if (my_atomic_caslint(&lock->lock_word, &oldval, 0)) { + if (my_atomic_cas32_strong_explicit(&lock->lock_word, &oldval, 0, + MY_MEMORY_ORDER_ACQUIRE, + MY_MEMORY_ORDER_RELAXED)) { lock->writer_thread = os_thread_get_curr_id(); } else if (os_thread_eq(lock->writer_thread, os_thread_get_curr_id())) { - /* Relock: this lock_word modification is safe since no other - threads can modify (lock, unlock, or reserve) lock_word while - there is an exclusive writer and this is the writer thread. */ - if (lock->lock_word == 0 || lock->lock_word == -X_LOCK_HALF_DECR) { + /* Relock: even though no other thread can modify (lock, unlock + or reserve) lock_word while there is an exclusive writer and + this is the writer thread, we still want concurrent threads to + observe consistent values. 
*/ + if (oldval == 0 || oldval == -X_LOCK_HALF_DECR) { /* There are 1 x-locks */ - lock->lock_word -= X_LOCK_DECR; - } else if (lock->lock_word <= -X_LOCK_DECR) { + my_atomic_add32_explicit(&lock->lock_word, -X_LOCK_DECR, + MY_MEMORY_ORDER_RELAXED); + } else if (oldval <= -X_LOCK_DECR) { /* There are 2 or more x-locks */ - lock->lock_word--; + my_atomic_add32_explicit(&lock->lock_word, -1, + MY_MEMORY_ORDER_RELAXED); + /* Watch for too many recursive locks */ + ut_ad(oldval < 1); } else { /* Failure */ return(FALSE); } - - /* Watch for too many recursive locks */ - ut_ad(lock->lock_word < 0); - } else { /* Failure */ return(FALSE); @@ -357,8 +357,8 @@ rw_lock_s_unlock_func( rw_lock_t* lock) /*!< in/out: rw-lock */ { #ifdef UNIV_DEBUG - lint dbg_lock_word = my_atomic_loadlint_explicit( - &lock->lock_word, MY_MEMORY_ORDER_RELAXED); + int32_t dbg_lock_word = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); ut_ad(dbg_lock_word > -X_LOCK_DECR); ut_ad(dbg_lock_word != 0); ut_ad(dbg_lock_word < X_LOCK_DECR); @@ -367,7 +367,8 @@ rw_lock_s_unlock_func( ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_S)); /* Increment lock_word to indicate 1 less reader */ - lint lock_word = my_atomic_addlint(&lock->lock_word, 1) + 1; + int32_t lock_word = my_atomic_add32_explicit(&lock->lock_word, 1, + MY_MEMORY_ORDER_RELEASE) + 1; if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) { /* wait_ex waiter exists. 
It may not be asleep, but we signal @@ -393,9 +394,8 @@ rw_lock_x_unlock_func( #endif /* UNIV_DEBUG */ rw_lock_t* lock) /*!< in/out: rw-lock */ { - lint lock_word; - lock_word = my_atomic_loadlint_explicit(&lock->lock_word, - MY_MEMORY_ORDER_RELAXED); + int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); ut_ad(lock_word == 0 || lock_word == -X_LOCK_HALF_DECR || lock_word <= -X_LOCK_DECR); @@ -408,31 +408,35 @@ rw_lock_x_unlock_func( ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_X)); if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) { - /* There is 1 x-lock */ - /* atomic increment is needed, because it is last */ - if (my_atomic_addlint(&lock->lock_word, X_LOCK_DECR) <= -X_LOCK_DECR) { - ut_error; - } + /* Last X-lock owned by this thread, it may still hold SX-locks. + ACQ_REL due to... + RELEASE: we release rw-lock + ACQUIRE: we want waiters to be loaded after lock_word is stored */ + my_atomic_add32_explicit(&lock->lock_word, X_LOCK_DECR, + MY_MEMORY_ORDER_ACQ_REL); /* This no longer has an X-lock but it may still have an SX-lock. So it is now free for S-locks by other threads. We need to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is a writer. */ - if (my_atomic_load32_explicit((int32*) &lock->waiters, + if (my_atomic_load32_explicit(&lock->waiters, MY_MEMORY_ORDER_RELAXED)) { - my_atomic_store32((int32*) &lock->waiters, 0); + my_atomic_store32_explicit(&lock->waiters, 0, + MY_MEMORY_ORDER_RELAXED); os_event_set(lock->event); sync_array_object_signalled(); } } else if (lock_word == -X_LOCK_DECR || lock_word == -(X_LOCK_DECR + X_LOCK_HALF_DECR)) { /* There are 2 x-locks */ - lock->lock_word += X_LOCK_DECR; + my_atomic_add32_explicit(&lock->lock_word, X_LOCK_DECR, + MY_MEMORY_ORDER_RELAXED); } else { /* There are more than 2 x-locks. 
*/ ut_ad(lock_word < -X_LOCK_DECR); - lock->lock_word += 1; + my_atomic_add32_explicit(&lock->lock_word, 1, + MY_MEMORY_ORDER_RELAXED); } ut_ad(rw_lock_validate(lock)); @@ -458,28 +462,37 @@ rw_lock_sx_unlock_func( ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_SX)); if (lock->sx_recursive == 0) { + int32_t lock_word = my_atomic_load32_explicit(&lock->lock_word, + MY_MEMORY_ORDER_RELAXED); /* Last caller in a possible recursive chain. */ - if (lock->lock_word > 0) { + if (lock_word > 0) { lock->writer_thread = 0; + ut_ad(lock_word <= INT_MAX32 - X_LOCK_HALF_DECR); + + /* Last SX-lock owned by this thread, doesn't own X-lock. + ACQ_REL due to... + RELEASE: we release rw-lock + ACQUIRE: we want waiters to be loaded after lock_word is stored */ + my_atomic_add32_explicit(&lock->lock_word, X_LOCK_HALF_DECR, + MY_MEMORY_ORDER_ACQ_REL); - if (my_atomic_addlint(&lock->lock_word, X_LOCK_HALF_DECR) <= 0) { - ut_error; - } /* Lock is now free. May have to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is an sx-lock holder. 
*/ - if (lock->waiters) { - my_atomic_store32((int32*) &lock->waiters, 0); + if (my_atomic_load32_explicit(&lock->waiters, + MY_MEMORY_ORDER_RELAXED)) { + my_atomic_store32_explicit(&lock->waiters, 0, + MY_MEMORY_ORDER_RELAXED); os_event_set(lock->event); sync_array_object_signalled(); } } else { /* still has x-lock */ - ut_ad(lock->lock_word == -X_LOCK_HALF_DECR - || lock->lock_word <= -(X_LOCK_DECR - + X_LOCK_HALF_DECR)); - lock->lock_word += X_LOCK_HALF_DECR; + ut_ad(lock_word == -X_LOCK_HALF_DECR || + lock_word <= -(X_LOCK_DECR + X_LOCK_HALF_DECR)); + my_atomic_add32_explicit(&lock->lock_word, X_LOCK_HALF_DECR, + MY_MEMORY_ORDER_RELAXED); } } diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index aed8f769716..0d813b6bd87 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -93,9 +93,6 @@ extern mysql_pfs_key_t rw_lock_mutex_key; extern mysql_pfs_key_t srv_innodb_monitor_mutex_key; extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key; extern mysql_pfs_key_t srv_monitor_file_mutex_key; -# ifdef UNIV_DEBUG -extern mysql_pfs_key_t sync_thread_mutex_key; -# endif /* UNIV_DEBUG */ extern mysql_pfs_key_t buf_dblwr_mutex_key; extern mysql_pfs_key_t trx_undo_mutex_key; extern mysql_pfs_key_t trx_mutex_key; @@ -112,6 +109,7 @@ extern mysql_pfs_key_t sync_array_mutex_key; extern mysql_pfs_key_t thread_mutex_key; extern mysql_pfs_key_t zip_pad_mutex_key; extern mysql_pfs_key_t row_drop_list_mutex_key; +extern mysql_pfs_key_t rw_trx_hash_element_mutex_key; #endif /* UNIV_PFS_MUTEX */ #ifdef UNIV_PFS_RWLOCK diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h index 1f8e245569e..e4fc24a7ede 100644 --- a/storage/innobase/include/sync0types.h +++ b/storage/innobase/include/sync0types.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. 
-Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -160,7 +160,7 @@ V lock_sys_mutex Mutex protecting lock_sys_t | V -trx_sys->mutex Mutex protecting trx_sys_t +trx_sys.mutex Mutex protecting trx_sys_t | V Threads mutex Background thread scheduling mutex @@ -233,6 +233,7 @@ enum latch_level_t { SYNC_REC_LOCK, SYNC_THREADS, SYNC_TRX, + SYNC_RW_TRX_HASH_ELEMENT, SYNC_TRX_SYS, SYNC_LOCK_SYS, SYNC_LOCK_WAIT_SYS, @@ -336,7 +337,6 @@ enum latch_id_t { LATCH_ID_SRV_INNODB_MONITOR, LATCH_ID_SRV_MISC_TMPFILE, LATCH_ID_SRV_MONITOR_FILE, - LATCH_ID_SYNC_THREAD, LATCH_ID_BUF_DBLWR, LATCH_ID_TRX_UNDO, LATCH_ID_TRX_POOL, @@ -383,6 +383,7 @@ enum latch_id_t { LATCH_ID_FIL_CRYPT_STAT_MUTEX, LATCH_ID_FIL_CRYPT_DATA_MUTEX, LATCH_ID_FIL_CRYPT_THREADS_MUTEX, + LATCH_ID_RW_TRX_HASH_ELEMENT, LATCH_ID_TEST_MUTEX, LATCH_ID_MAX = LATCH_ID_TEST_MUTEX }; @@ -636,14 +637,6 @@ public: return(count); } - /** Deregister the count. 
We don't do anything - @param[in] count The count instance to deregister */ - void sum_deregister(Count* count) - UNIV_NOTHROW - { - /* Do nothing */ - } - /** Register a single instance counter */ void single_register(Count* count) UNIV_NOTHROW @@ -1161,17 +1154,34 @@ enum rw_lock_flag_t { #endif /* UNIV_INNOCHECKSUM */ #ifdef _WIN64 -#define my_atomic_addlint(A,B) my_atomic_add64((int64*) (A), (B)) -#define my_atomic_loadlint(A) my_atomic_load64((int64*) (A)) -#define my_atomic_loadlint_explicit(A,O) my_atomic_load64_explicit((int64*) (A), (O)) -#define my_atomic_storelint(A,B) my_atomic_store64((int64*) (A), (B)) -#define my_atomic_caslint(A,B,C) my_atomic_cas64((int64*) (A), (int64*) (B), (C)) +static inline ulint my_atomic_addlint(ulint *A, ulint B) +{ + return ulint(my_atomic_add64((volatile int64*)A, B)); +} + +static inline ulint my_atomic_loadlint(const ulint *A) +{ + return ulint(my_atomic_load64((volatile int64*)A)); +} + +static inline lint my_atomic_addlint(volatile lint *A, lint B) +{ + return my_atomic_add64((volatile int64*)A, B); +} + +static inline lint my_atomic_loadlint(const lint *A) +{ + return lint(my_atomic_load64((volatile int64*)A)); +} + +static inline void my_atomic_storelint(ulint *A, ulint B) +{ + my_atomic_store64((volatile int64*)A, B); +} #else #define my_atomic_addlint my_atomic_addlong #define my_atomic_loadlint my_atomic_loadlong -#define my_atomic_loadlint_explicit my_atomic_loadlong_explicit #define my_atomic_storelint my_atomic_storelong -#define my_atomic_caslint my_atomic_caslong #endif /** Simple counter aligned to CACHE_LINE_SIZE @@ -1197,7 +1207,7 @@ struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter #pragma warning (push) #pragma warning (disable : 4244) #endif - return Type(my_atomic_addlint(reinterpret_cast<lint*> + return Type(my_atomic_addlint(reinterpret_cast<ulint*> (&m_counter), i)); #ifdef _MSC_VER #pragma warning (pop) diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h 
index e02c5d88a29..ee7da7b74dc 100644 --- a/storage/innobase/include/trx0i_s.h +++ b/storage/innobase/include/trx0i_s.h @@ -264,10 +264,10 @@ trx_i_s_possibly_fetch_data_into_cache( trx_i_s_cache_t* cache); /*!< in/out: cache */ /*******************************************************************//** -Returns TRUE if the data in the cache is truncated due to the memory +Returns true, if the data in the cache is truncated due to the memory limit posed by TRX_I_S_MEM_LIMIT. @return TRUE if truncated */ -ibool +bool trx_i_s_cache_is_truncated( /*=======================*/ trx_i_s_cache_t* cache); /*!< in: cache */ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 0b42479cc21..2efcfc75a06 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,15 +27,8 @@ Created 3/26/1996 Heikki Tuuri #ifndef trx0purge_h #define trx0purge_h -#include "univ.i" -#include "trx0types.h" -#include "mtr0mtr.h" -#include "trx0sys.h" +#include "trx0rseg.h" #include "que0types.h" -#include "page0page.h" -#include "usr0sess.h" -#include "fil0fil.h" -#include "read0types.h" /** A dummy undo record used as a return value when we have a whole undo log which needs no purge */ @@ -66,8 +59,6 @@ trx_purge( /*======*/ ulint n_purge_threads, /*!< in: number of purge tasks to submit to task queue. 
*/ - ulint limit, /*!< in: the maximum number of - records to purge in one batch */ bool truncate); /*!< in: truncate history if true */ /*******************************************************************//** Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */ @@ -104,69 +95,28 @@ private: trx_rsegs_t; public: typedef trx_rsegs_t::iterator iterator; + typedef trx_rsegs_t::const_iterator const_iterator; /** Default constructor */ - TrxUndoRsegs() : m_trx_no() { } - - explicit TrxUndoRsegs(trx_id_t trx_no) - : - m_trx_no(trx_no) - { - // Do nothing - } - - /** Get transaction number - @return trx_id_t - get transaction number. */ - trx_id_t get_trx_no() const - { - return(m_trx_no); - } - - /** Add rollback segment. - @param rseg rollback segment to add. */ - void push_back(trx_rseg_t* rseg) - { - m_rsegs.push_back(rseg); - } - - /** Erase the element pointed by given iterator. - @param[in] iterator iterator */ - void erase(iterator& it) - { - m_rsegs.erase(it); - } - - /** Number of registered rsegs. - @return size of rseg list. */ - ulint size() const - { - return(m_rsegs.size()); - } - - /** - @return an iterator to the first element */ - iterator begin() - { - return(m_rsegs.begin()); - } - - /** - @return an iterator to the end */ - iterator end() - { - return(m_rsegs.end()); - } + TrxUndoRsegs() {} + /** Constructor */ + TrxUndoRsegs(trx_rseg_t& rseg) + : m_commit(rseg.last_commit), m_rsegs(1, &rseg) {} + /** Constructor */ + TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg) + : m_commit(trx_no << 1), m_rsegs(1, &rseg) {} - /** Append rollback segments from referred instance to current - instance. 
*/ - void append(const TrxUndoRsegs& append_from) - { - ut_ad(get_trx_no() == append_from.get_trx_no()); + /** @return the transaction commit identifier */ + trx_id_t trx_no() const { return m_commit >> 1; } - m_rsegs.insert(m_rsegs.end(), - append_from.m_rsegs.begin(), - append_from.m_rsegs.end()); - } + bool operator!=(const TrxUndoRsegs& other) const + { return m_commit != other.m_commit; } + bool empty() const { return m_rsegs.empty(); } + void erase(iterator& it) { m_rsegs.erase(it); } + iterator begin() { return(m_rsegs.begin()); } + iterator end() { return(m_rsegs.end()); } + const_iterator begin() const { return m_rsegs.begin(); } + const_iterator end() const { return m_rsegs.end(); } /** Compare two TrxUndoRsegs based on trx_no. @param elem1 first element to compare @@ -174,17 +124,12 @@ public: @return true if elem1 > elem2 else false.*/ bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs) { - return(lhs.m_trx_no > rhs.m_trx_no); + return(lhs.m_commit > rhs.m_commit); } - /** Compiler defined copy-constructor/assignment operator - should be fine given that there is no reference to a memory - object outside scope of class object.*/ - private: - /** The rollback segments transaction number. */ - trx_id_t m_trx_no; - + /** Copy trx_rseg_t::last_commit */ + trx_id_t m_commit; /** Rollback segments of a transaction, scheduled for purge. */ trx_rsegs_t m_rsegs; }; @@ -194,16 +139,14 @@ typedef std::priority_queue< std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >, TrxUndoRsegs> purge_pq_t; -/** -Chooses the rollback segment with the smallest trx_no. */ +/** Chooses the rollback segment with the oldest committed transaction */ struct TrxUndoRsegsIterator { - /** Constructor */ TrxUndoRsegsIterator(); - /** Sets the next rseg to purge in purge_sys. + Executed in the purge coordinator thread. 
@return whether anything is to be purged */ - bool set_next(); + inline bool set_next(); private: // Disable copying @@ -211,38 +154,11 @@ private: TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&); /** The current element to process */ - TrxUndoRsegs m_trx_undo_rsegs; - - /** Track the current element in m_trx_undo_rseg */ - TrxUndoRsegs::iterator m_iter; - - /** Sentinel value */ - static const TrxUndoRsegs NullElement; -}; - -/** This is the purge pointer/iterator. We need both the undo no and the -transaction no up to which purge has parsed and applied the records. */ -struct purge_iter_t { - purge_iter_t() - : - trx_no(), - undo_no(), - undo_rseg_space(ULINT_UNDEFINED) - { - // Do nothing - } - - trx_id_t trx_no; /*!< Purge has advanced past all - transactions whose number is less - than this */ - undo_no_t undo_no; /*!< Purge has advanced past all records - whose undo number is less than this */ - ulint undo_rseg_space; - /*!< Last undo record resided in this - space id. */ + TrxUndoRsegs m_rsegs; + /** Track the current element in m_rsegs */ + TrxUndoRsegs::const_iterator m_iter; }; - /* Namespace to hold all the related functions and variables need for truncate of undo tablespace. */ namespace undo { @@ -288,17 +204,12 @@ namespace undo { /** Track UNDO tablespace mark for truncate. */ class Truncate { public: - - Truncate() - : - m_undo_for_trunc(ULINT_UNDEFINED), - m_rseg_for_trunc(), - m_scan_start(1), - m_purge_rseg_truncate_frequency( - static_cast<ulint>( - srv_purge_rseg_truncate_frequency)) + void create() { - /* Do Nothing. */ + m_undo_for_trunc = ULINT_UNDEFINED; + m_scan_start = 1; + m_purge_rseg_truncate_frequency = + ulint(srv_purge_rseg_truncate_frequency); } /** Clear the cached rollback segment. Normally done @@ -485,14 +396,9 @@ namespace undo { /** The control structure used in the purge operation */ class purge_sys_t { + bool m_initialised; public: - /** Construct the purge system. 
*/ - purge_sys_t(); - /** Destruct the purge system. */ - ~purge_sys_t(); - - sess_t* sess; /*!< System session running the purge - query */ + MY_ALIGNED(CACHE_LINE_SIZE) rw_lock_t latch; /*!< The latch protecting the purge view. A purge operation must acquire an x-latch here for the instant at which @@ -500,11 +406,14 @@ public: log operation can prevent this by obtaining an s-latch here. It also protects state and running */ + MY_ALIGNED(CACHE_LINE_SIZE) os_event_t event; /*!< State signal event; os_event_set() and os_event_reset() are protected by purge_sys_t::latch X-lock */ + MY_ALIGNED(CACHE_LINE_SIZE) ulint n_stop; /*!< Counter to track number stops */ + volatile bool running; /*!< true, if purge is active, we check this without the latch too */ volatile purge_state_t state; /*!< Purge coordinator thread states, @@ -512,29 +421,40 @@ public: without holding the latch. */ que_t* query; /*!< The query graph which will do the parallelized purge operation */ + MY_ALIGNED(CACHE_LINE_SIZE) ReadView view; /*!< The purge will not remove undo logs which are >= this view (purge view) */ - volatile ulint n_submitted; /*!< Count of total tasks submitted + ulint n_submitted; /*!< Count of total tasks submitted to the task queue */ - volatile ulint n_completed; /*!< Count of total tasks completed */ - - /*------------------------------*/ - /* The following two fields form the 'purge pointer' which advances - during a purge, and which is used in history list truncation */ - - purge_iter_t iter; /* Limit up to which we have read and - parsed the UNDO log records. Not - necessarily purged from the indexes. - Note that this can never be less than - the limit below, we check for this - invariant in trx0purge.cc */ - purge_iter_t limit; /* The 'purge pointer' which advances - during a purge, and which is used in - history list truncation */ -#ifdef UNIV_DEBUG - purge_iter_t done; /* Indicate 'purge pointer' which have - purged already accurately. 
*/ -#endif /* UNIV_DEBUG */ + ulint n_completed; /*!< Count of total tasks completed */ + + /** Iterator to the undo log records of committed transactions */ + struct iterator + { + bool operator<=(const iterator& other) const + { + if (commit < other.commit) return true; + if (commit > other.commit) return false; + return undo_no <= other.undo_no; + } + + /** @return the commit number of the transaction */ + trx_id_t trx_no() const { return commit >> 1; } + void reset_trx_no(trx_id_t trx_no) { commit = trx_no << 1; } + + /** 2 * trx_t::no + old_insert of the committed transaction */ + trx_id_t commit; + /** The record number within the committed transaction's undo + log, increasing, purged from 0 onwards */ + undo_no_t undo_no; + }; + + /** The tail of the purge queue; the last parsed undo log of a + committed transaction. */ + iterator tail; + /** The head of the purge queue; any older undo logs of committed + transactions may be discarded (history list truncation). */ + iterator head; /*-----------------------------*/ bool next_stored; /*!< whether rseg holds the next record to purge */ @@ -562,10 +482,30 @@ public: undo::Truncate undo_trunc; /*!< Track UNDO tablespace marked for truncate. */ + + + /** + Constructor. + + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). 
+ */ + + purge_sys_t() : m_initialised(false) {} + + + bool is_initialised() const { return m_initialised; } + + + /** Create the instance */ + void create(); + + /** Close the purge system on shutdown */ + void close(); }; /** The global data structure coordinating a purge */ -extern purge_sys_t* purge_sys; +extern purge_sys_t purge_sys; /** Info required to purge a record */ struct trx_purge_rec_t { diff --git a/storage/innobase/include/trx0purge.ic b/storage/innobase/include/trx0purge.ic index c32651b7a00..cd519a8e64d 100644 --- a/storage/innobase/include/trx0purge.ic +++ b/storage/innobase/include/trx0purge.ic @@ -40,24 +40,3 @@ trx_purge_get_log_from_hist( return(node_addr); } - -/********************************************************************//** -address of its history list node. -@return true if purge_sys_t::limit <= purge_sys_t::iter */ -UNIV_INLINE -bool -trx_purge_check_limit(void) -/*=======================*/ -{ - /* limit is used to track till what point purge element has been - processed and so limit <= iter. - undo_no ordering is enforced only within the same rollback segment. - If a transaction uses multiple rollback segments then we need to - consider the rollback segment space id too. */ - return(purge_sys->iter.trx_no > purge_sys->limit.trx_no - || (purge_sys->iter.trx_no == purge_sys->limit.trx_no - && ((purge_sys->iter.undo_no >= purge_sys->limit.undo_no) - || (purge_sys->iter.undo_rseg_space - != purge_sys->limit.undo_rseg_space)))); -} - diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h index ed45e1de82e..955a726eb50 100644 --- a/storage/innobase/include/trx0rec.h +++ b/storage/innobase/include/trx0rec.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -155,6 +155,7 @@ trx_undo_rec_get_partial_row( used, as we do NOT copy the data in the record! */ dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: updated columns */ dtuple_t** row, /*!< out, own: partial row */ ibool ignore_prefix, /*!< in: flag to indicate if we expect blob prefixes in undo. Used @@ -162,6 +163,13 @@ trx_undo_rec_get_partial_row( mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Report a RENAME TABLE operation. +@param[in,out] trx transaction +@param[in] table table that is being renamed +@return DB_SUCCESS or error code */ +dberr_t +trx_undo_report_rename(trx_t* trx, const dict_table_t* table) + MY_ATTRIBUTE((nonnull, warn_unused_result)); /***********************************************************************//** Writes information to an undo log about an insert, update, or a delete marking of a clustered index record. This information is used in a rollback of the @@ -188,10 +196,8 @@ trx_undo_report_row_operation( marking, the record in the clustered index; NULL if insert */ const ulint* offsets, /*!< in: rec_get_offsets(rec) */ - roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the - inserted undo log record, - 0 if BTR_NO_UNDO_LOG - flag was specified */ + roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the + undo log record */ MY_ATTRIBUTE((nonnull(1,2,8), warn_unused_result)); /** status bit used for trx_undo_prev_version_build() */ @@ -238,25 +244,22 @@ trx_undo_prev_version_build( into this function by purge thread or not. And if we read "after image" of undo log */ -/***********************************************************//** -Parses a redo log record of adding an undo log record. 
-@return end of log record or NULL */ +/** Parse MLOG_UNDO_INSERT for crash-upgrade from MariaDB 10.2. +@param[in] ptr log record +@param[in] end_ptr end of log record buffer +@param[in,out] page page or NULL +@return end of log record +@retval NULL if the log record is incomplete */ byte* trx_undo_parse_add_undo_rec( -/*========================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr,/*!< in: buffer end */ - page_t* page); /*!< in: page or NULL */ -/***********************************************************//** -Parses a redo log record of erasing of an undo page end. -@return end of log record or NULL */ -byte* -trx_undo_parse_erase_page_end( -/*==========================*/ - byte* ptr, /*!< in: buffer */ - byte* end_ptr,/*!< in: buffer end */ - page_t* page, /*!< in: page or NULL */ - mtr_t* mtr); /*!< in: mtr or NULL */ + const byte* ptr, + const byte* end_ptr, + page_t* page); +/** Erase the unused undo log page end. +@param[in,out] undo_page undo log page +@return whether the page contained something */ +bool +trx_undo_erase_page_end(page_t* undo_page); /** Read from an undo log record a non-virtual column value. 
@param[in,out] ptr pointer to remaining part of the undo record @@ -307,7 +310,8 @@ trx_undo_read_v_idx( compilation info multiplied by 16 is ORed to this value in an undo log record */ -#define TRX_UNDO_INSERT_DEFAULT 10 /* insert a "default value" +#define TRX_UNDO_RENAME_TABLE 9 /*!< RENAME TABLE */ +#define TRX_UNDO_INSERT_DEFAULT 10 /*!< insert a "default value" pseudo-record for instant ALTER */ #define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */ #define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic index 136e0edb468..5ae34c486cc 100644 --- a/storage/innobase/include/trx0rec.ic +++ b/storage/innobase/include/trx0rec.ic @@ -66,5 +66,8 @@ trx_undo_rec_copy( len = mach_read_from_2(undo_rec) - ut_align_offset(undo_rec, UNIV_PAGE_SIZE); ut_ad(len < UNIV_PAGE_SIZE); - return((trx_undo_rec_t*) mem_heap_dup(heap, undo_rec, len)); + trx_undo_rec_t* rec = static_cast<trx_undo_rec_t*>( + mem_heap_dup(heap, undo_rec, len)); + mach_write_to_2(rec, len); + return rec; } diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h index 8908376bff1..ba9c901d4f7 100644 --- a/storage/innobase/include/trx0roll.h +++ b/storage/innobase/include/trx0roll.h @@ -33,7 +33,8 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0mtr.h" #include "trx0sys.h" -extern bool trx_rollback_or_clean_is_active; +extern bool trx_rollback_is_active; +extern const trx_t* trx_roll_crash_recv_trx; /*******************************************************************//** Determines if this transaction is rolling back an incomplete transaction @@ -62,16 +63,19 @@ trx_undo_rec_t* trx_roll_pop_top_rec_of_trx(trx_t* trx, roll_ptr_t* roll_ptr, mem_heap_t* heap) MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Report progress when rolling back a row of a recovered transaction. 
+@return whether the rollback should be aborted due to pending shutdown */ +bool +trx_roll_must_shutdown(); /*******************************************************************//** Rollback or clean up any incomplete transactions which were encountered in crash recovery. If the transaction already was committed, then we clean up a possible insert undo log. If the -transaction was not yet committed, then we roll it back. */ +transaction was not yet committed, then we roll it back. +@param all true=roll back all recovered active transactions; +false=roll back any incomplete dictionary transaction */ void -trx_rollback_or_clean_recovered( -/*============================*/ - ibool all); /*!< in: FALSE=roll back dictionary transactions; - TRUE=roll back all non-PREPARED transactions */ +trx_rollback_recovered(bool all); /*******************************************************************//** Rollback or clean up any incomplete transactions which were encountered in crash recovery. If the transaction already was @@ -81,11 +85,7 @@ Note: this is done in a background thread. @return a dummy parameter */ extern "C" os_thread_ret_t -DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( -/*================================================*/ - void* arg MY_ATTRIBUTE((unused))); - /*!< in: a dummy parameter required by - os_thread_create */ +DECLARE_THREAD(trx_rollback_all_recovered)(void*); /*********************************************************************//** Creates a rollback command node struct. @return own: rollback node struct */ diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index c1961bb4169..d68ece39911 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. 
+Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,10 +27,8 @@ Created 3/26/1996 Heikki Tuuri #ifndef trx0rseg_h #define trx0rseg_h -#include "trx0types.h" #include "trx0sys.h" #include "fut0lst.h" -#include <vector> /** Gets a rollback segment header. @param[in] space space where placed @@ -57,16 +55,6 @@ trx_rsegf_get_new( mtr_t* mtr); /***************************************************************//** -Gets the file page number of the nth undo log slot. -@return page number of the undo log segment */ -UNIV_INLINE -ulint -trx_rsegf_get_nth_undo( -/*===================*/ - trx_rsegf_t* rsegf, /*!< in: rollback segment header */ - ulint n, /*!< in: index of slot */ - mtr_t* mtr); /*!< in: mtr */ -/***************************************************************//** Sets the file page number of the nth undo log slot. */ UNIV_INLINE void @@ -81,24 +69,21 @@ Looks for a free slot for an undo log segment. @return slot index or ULINT_UNDEFINED if not found */ UNIV_INLINE ulint -trx_rsegf_undo_find_free( -/*=====================*/ - trx_rsegf_t* rsegf, /*!< in: rollback segment header */ - mtr_t* mtr); /*!< in: mtr */ +trx_rsegf_undo_find_free(const trx_rsegf_t* rsegf); /** Creates a rollback segment header. This function is called only when a new rollback segment is created in the database. 
@param[in] space space id -@param[in] max_size max size in pages -@param[in] rseg_slot_no rseg id == slot number in trx sys +@param[in] rseg_id rollback segment identifier +@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg) @param[in,out] mtr mini-transaction @return page number of the created segment, FIL_NULL if fail */ ulint trx_rseg_header_create( ulint space, - ulint max_size, - ulint rseg_slot_no, + ulint rseg_id, + buf_block_t* sys_header, mtr_t* mtr); /** Initialize the rollback segments in memory at database startup. */ @@ -155,9 +140,6 @@ struct trx_rseg_t { /** page number of the rollback segment header */ ulint page_no; - /** maximum allowed size in pages */ - ulint max_size; - /** current size in pages */ ulint curr_size; @@ -182,8 +164,8 @@ struct trx_rseg_t { /** Byte offset of the last not yet purged log header */ ulint last_offset; - /** Transaction number of the last not yet purged log */ - trx_id_t last_trx_no; + /** trx_t::no * 2 + old_insert of the last not yet purged log */ + trx_id_t last_commit; /** Whether the log segment needs purge */ bool needs_purge; @@ -195,6 +177,14 @@ struct trx_rseg_t { UNDO-tablespace marked for truncate. 
*/ bool skip_allocation; + /** @return the commit ID of the last committed transaction */ + trx_id_t last_trx_no() const { return last_commit >> 1; } + + void set_last_trx_no(trx_id_t trx_no, bool is_update) + { + last_commit = trx_no << 1 | trx_id_t(is_update); + } + /** @return whether the rollback segment is persistent */ bool is_persistent() const { @@ -228,19 +218,100 @@ struct trx_rseg_t { /* Transaction rollback segment header */ /*-------------------------------------------------------------*/ -#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback - segment in pages */ -#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied - by the logs in the history list */ -#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed - transactions */ +/** 0xfffffffe = pre-MariaDB 10.3.5 format; 0=MariaDB 10.3.5 or later */ +#define TRX_RSEG_FORMAT 0 +/** Number of pages in the TRX_RSEG_HISTORY list */ +#define TRX_RSEG_HISTORY_SIZE 4 +/** Committed transaction logs that have not been purged yet */ +#define TRX_RSEG_HISTORY 8 #define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE) /* Header for the file segment where this page is placed */ #define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE) /* Undo log segment slots */ +/** Maximum transaction ID (valid only if TRX_RSEG_FORMAT is 0) */ +#define TRX_RSEG_MAX_TRX_ID (TRX_RSEG_UNDO_SLOTS + TRX_RSEG_N_SLOTS \ + * TRX_RSEG_SLOT_SIZE) + +/** 8 bytes offset within the binlog file */ +#define TRX_RSEG_BINLOG_OFFSET TRX_RSEG_MAX_TRX_ID + 8 +/** MySQL log file name, 512 bytes, including terminating NUL +(valid only if TRX_RSEG_FORMAT is 0). +If no binlog information is present, the first byte is NUL. 
*/ +#define TRX_RSEG_BINLOG_NAME TRX_RSEG_MAX_TRX_ID + 16 +/** Maximum length of binlog file name, including terminating NUL, in bytes */ +#define TRX_RSEG_BINLOG_NAME_LEN 512 + +#ifdef WITH_WSREP +/** The offset to WSREP XID headers */ +#define TRX_RSEG_WSREP_XID_INFO TRX_RSEG_MAX_TRX_ID + 16 + 512 + +/** WSREP XID format (1 if present and valid, 0 if not present) */ +#define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO +/** WSREP XID GTRID length */ +#define TRX_RSEG_WSREP_XID_GTRID_LEN TRX_RSEG_WSREP_XID_INFO + 4 +/** WSREP XID bqual length */ +#define TRX_RSEG_WSREP_XID_BQUAL_LEN TRX_RSEG_WSREP_XID_INFO + 8 +/** WSREP XID data (XIDDATASIZE bytes) */ +#define TRX_RSEG_WSREP_XID_DATA TRX_RSEG_WSREP_XID_INFO + 12 +#endif /* WITH_WSREP*/ + /*-------------------------------------------------------------*/ +/** Read the page number of an undo log slot. +@param[in] rsegf rollback segment header +@param[in] n slot number */ +inline +uint32_t +trx_rsegf_get_nth_undo(const trx_rsegf_t* rsegf, ulint n) +{ + ut_ad(n < TRX_RSEG_N_SLOTS); + return mach_read_from_4(rsegf + TRX_RSEG_UNDO_SLOTS + + n * TRX_RSEG_SLOT_SIZE); +} + +#ifdef WITH_WSREP +/** Update the WSREP XID information in rollback segment header. +@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_wsrep_checkpoint( + trx_rsegf_t* rseg_header, + const XID* xid, + mtr_t* mtr); + +/** Update WSREP checkpoint XID in first rollback segment header. +@param[in] xid WSREP XID */ +void trx_rseg_update_wsrep_checkpoint(const XID* xid); + +/** Read the WSREP XID information in rollback segment header. +@param[in] rseg_header Rollback segment header +@param[out] xid Transaction XID +@return whether the WSREP XID was present */ +bool trx_rseg_read_wsrep_checkpoint(const trx_rsegf_t* rseg_header, XID& xid); + +/** Recover the latest WSREP checkpoint XID. 
+@param[out] xid WSREP XID +@return whether the WSREP XID was found */ +bool trx_rseg_read_wsrep_checkpoint(XID& xid); +#endif /* WITH_WSREP */ + +/** Upgrade a rollback segment header page to MariaDB 10.3 format. +@param[in,out] rseg_header rollback segment header page +@param[in,out] mtr mini-transaction */ +void trx_rseg_format_upgrade(trx_rsegf_t* rseg_header, mtr_t* mtr); + +/** Update the offset information about the end of the binlog entry +which corresponds to the transaction just being committed. +In a replication slave, this updates the master binlog position +up to which replication has proceeded. +@param[in,out] rseg_header rollback segment header +@param[in] trx committing transaction +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_binlog_offset(byte* rseg_header, const trx_t* trx, mtr_t* mtr); + #include "trx0rseg.ic" #endif diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic index 45ee3ef8d66..dd0ce8b3719 100644 --- a/storage/innobase/include/trx0rseg.ic +++ b/storage/innobase/include/trx0rseg.ic @@ -86,23 +86,6 @@ trx_rsegf_get_new( } /***************************************************************//** -Gets the file page number of the nth undo log slot. -@return page number of the undo log segment */ -UNIV_INLINE -ulint -trx_rsegf_get_nth_undo( -/*===================*/ - trx_rsegf_t* rsegf, /*!< in: rollback segment header */ - ulint n, /*!< in: index of slot */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_a(n < TRX_RSEG_N_SLOTS); - - return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS - + n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr)); -} - -/***************************************************************//** Sets the file page number of the nth undo log slot. */ UNIV_INLINE void @@ -124,10 +107,7 @@ Looks for a free slot for an undo log segment. 
@return slot index or ULINT_UNDEFINED if not found */ UNIV_INLINE ulint -trx_rsegf_undo_find_free( -/*=====================*/ - trx_rsegf_t* rsegf, /*!< in: rollback segment header */ - mtr_t* mtr) /*!< in: mtr */ +trx_rsegf_undo_find_free(const trx_rsegf_t* rsegf) { ulint i; ulint page_no; @@ -141,7 +121,7 @@ trx_rsegf_undo_find_free( #endif for (i = 0; i < max_slots; i++) { - page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr); + page_no = trx_rsegf_get_nth_undo(rsegf, i); if (page_no == FIL_NULL) { return(i); diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index dd8929911c6..00f245a05c0 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -35,7 +35,6 @@ Created 3/26/1996 Heikki Tuuri #include "mem0mem.h" #include "mtr0mtr.h" #include "ut0byte.h" -#include "mem0mem.h" #include "ut0lst.h" #include "read0types.h" #include "page0types.h" @@ -47,122 +46,61 @@ Created 3/26/1996 Heikki Tuuri typedef UT_LIST_BASE_NODE_T(trx_t) trx_ut_list_t; -// Forward declaration -class MVCC; -class ReadView; - -/** The transaction system */ -extern trx_sys_t* trx_sys; - /** Checks if a page address is the trx sys header page. @param[in] page_id page id @return true if trx sys header page */ -UNIV_INLINE +inline bool -trx_sys_hdr_page( - const page_id_t& page_id); - -/** Initialize the transaction system main-memory data structures. 
*/ -void trx_sys_init_at_db_start(); +trx_sys_hdr_page(const page_id_t& page_id) +{ + return(page_id.space() == TRX_SYS_SPACE + && page_id.page_no() == TRX_SYS_PAGE_NO); +} /*****************************************************************//** -Creates the trx_sys instance and initializes purge_queue and mutex. */ -void -trx_sys_create(void); -/*================*/ -/*****************************************************************//** Creates and initializes the transaction system at the database creation. */ void trx_sys_create_sys_pages(void); /*==========================*/ -/** @return an unallocated rollback segment slot in the TRX_SYS header +/** Find an available rollback segment. +@param[in] sys_header the TRX_SYS page +@return an unallocated rollback segment slot in the TRX_SYS header @retval ULINT_UNDEFINED if not found */ ulint -trx_sysf_rseg_find_free(mtr_t* mtr); -/**********************************************************************//** -Gets a pointer to the transaction system file copy and x-locks its page. -@return pointer to system file copy, page x-locked */ -UNIV_INLINE -trx_sysf_t* -trx_sysf_get( -/*=========*/ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Gets the space of the nth rollback segment slot in the trx system -file copy. -@return space id */ -UNIV_INLINE -ulint -trx_sysf_rseg_get_space( -/*====================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Gets the page number of the nth rollback segment slot in the trx system -file copy. 
-@return page number, FIL_NULL if slot unused */ -UNIV_INLINE -ulint -trx_sysf_rseg_get_page_no( -/*======================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Sets the space id of the nth rollback segment slot in the trx system -file copy. */ -UNIV_INLINE -void -trx_sysf_rseg_set_space( -/*====================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - ulint space, /*!< in: space id */ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Sets the page number of the nth rollback segment slot in the trx system -file copy. */ -UNIV_INLINE -void -trx_sysf_rseg_set_page_no( -/*======================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - ulint page_no, /*!< in: page number, FIL_NULL if - the slot is reset to unused */ - mtr_t* mtr); /*!< in: mtr */ -/*****************************************************************//** -Allocates a new transaction id. -@return new, allocated trx id */ -UNIV_INLINE -trx_id_t -trx_sys_get_new_trx_id(); -/*===================*/ -/*****************************************************************//** -Determines the maximum transaction id. -@return maximum currently allocated trx id; will be stale after the -next call to trx_sys_get_new_trx_id() */ -UNIV_INLINE -trx_id_t -trx_sys_get_max_trx_id(void); -/*========================*/ +trx_sys_rseg_find_free(const buf_block_t* sys_header); +/** Request the TRX_SYS page. 
+@param[in] rw whether to lock the page for writing +@return the TRX_SYS page +@retval NULL if the page cannot be read */ +inline +buf_block_t* +trx_sysf_get(mtr_t* mtr, bool rw = true) +{ + buf_block_t* block = buf_page_get( + page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), + univ_page_size, rw ? RW_X_LATCH : RW_S_LATCH, mtr); + if (block) { + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + } + return block; +} #ifdef UNIV_DEBUG /* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ extern uint trx_rseg_n_slots_debug; #endif -/*****************************************************************//** -Writes a trx id to an index page. In case that the id size changes in -some future version, this function should be used instead of -mach_write_... */ +/** Write DB_TRX_ID. +@param[out] db_trx_id the DB_TRX_ID field to be written to +@param[in] id transaction ID */ UNIV_INLINE void -trx_write_trx_id( -/*=============*/ - byte* ptr, /*!< in: pointer to memory where written */ - trx_id_t id); /*!< in: id */ +trx_write_trx_id(byte* db_trx_id, trx_id_t id) +{ + compile_time_assert(DATA_TRX_ID_LEN == 6); + ut_ad(id); + mach_write_to_6(db_trx_id, id); +} /** Read a transaction identifier. @return id */ @@ -170,9 +108,7 @@ inline trx_id_t trx_read_trx_id(const byte* ptr) { -#if DATA_TRX_ID_LEN != 6 -# error "DATA_TRX_ID_LEN != 6" -#endif + compile_time_assert(DATA_TRX_ID_LEN == 6); return(mach_read_from_6(ptr)); } @@ -188,59 +124,6 @@ inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id) } #endif -/****************************************************************//** -Looks for the trx instance with the given id in the rw trx_list. -@return the trx handle or NULL if not found */ -UNIV_INLINE -trx_t* -trx_get_rw_trx_by_id( -/*=================*/ - trx_id_t trx_id);/*!< in: trx id to search for */ -/****************************************************************//** -Returns the minimum trx id in rw trx list. 
This is the smallest id for which -the trx can possibly be active. (But, you must look at the trx->state to -find out if the minimum trx id transaction itself is active, or already -committed.) -@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */ -UNIV_INLINE -trx_id_t -trx_rw_min_trx_id(void); -/*===================*/ -/****************************************************************//** -Checks if a rw transaction with the given id is active. -@return transaction instance if active, or NULL */ -UNIV_INLINE -trx_t* -trx_rw_is_active_low( -/*=================*/ - trx_id_t trx_id, /*!< in: trx id of the transaction */ - ibool* corrupt); /*!< in: NULL or pointer to a flag - that will be set if corrupt */ -/****************************************************************//** -Checks if a rw transaction with the given id is active. If the caller is -not holding trx_sys->mutex, the transaction may already have been -committed. -@return transaction instance if active, or NULL; */ -UNIV_INLINE -trx_t* -trx_rw_is_active( -/*=============*/ - trx_id_t trx_id, /*!< in: trx id of the transaction */ - ibool* corrupt, /*!< in: NULL or pointer to a flag - that will be set if corrupt */ - bool do_ref_count); /*!< in: if true then increment the - trx_t::n_ref_count */ -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG -/***********************************************************//** -Assert that a transaction has been recovered. -@return TRUE */ -UNIV_INLINE -ibool -trx_assert_recovered( -/*=================*/ - trx_id_t trx_id) /*!< in: transaction identifier */ - MY_ATTRIBUTE((warn_unused_result)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /*****************************************************************//** Updates the offset information about the end of the MySQL binlog entry which corresponds to the transaction just being committed. 
In a MySQL @@ -251,72 +134,17 @@ trx_sys_update_mysql_binlog_offset( /*===============================*/ const char* file_name,/*!< in: MySQL log file name */ int64_t offset, /*!< in: position in that log file */ - trx_sysf_t* sys_header, /*!< in: trx sys header */ - mtr_t* mtr); /*!< in: mtr */ + buf_block_t* sys_header, /*!< in,out: trx sys header */ + mtr_t* mtr); /*!< in,out: mini-transaction */ /** Display the MySQL binlog offset info if it is present in the trx system header. */ void trx_sys_print_mysql_binlog_offset(); -#ifdef WITH_WSREP -/** Update WSREP XID info in sys_header of TRX_SYS_PAGE_NO = 5. -@param[in] xid Transaction XID -@param[in,out] sys_header sys_header -@param[in] mtr minitransaction */ -UNIV_INTERN -void -trx_sys_update_wsrep_checkpoint( - const XID* xid, - trx_sysf_t* sys_header, - mtr_t* mtr); - -/** Read WSREP checkpoint XID from sys header. -@param[out] xid WSREP XID -@return whether the checkpoint was present */ -UNIV_INTERN -bool -trx_sys_read_wsrep_checkpoint(XID* xid); -#endif /* WITH_WSREP */ - -/*****************************************************************//** -Shutdown/Close the transaction system. */ -void -trx_sys_close(void); -/*===============*/ /** Create the rollback segments. @return whether the creation succeeded */ bool trx_sys_create_rsegs(); -/*****************************************************************//** -Get the number of transaction in the system, independent of their state. -@return count of transactions in trx_sys_t::trx_list */ -UNIV_INLINE -ulint -trx_sys_get_n_rw_trx(void); -/*======================*/ - -/********************************************************************* -Check if there are any active (non-prepared) transactions. 
-@return total number of active transactions or 0 if none */ -ulint -trx_sys_any_active_transactions(void); -/*=================================*/ - -/** -Add the transaction to the RW transaction set -@param trx transaction instance to add */ -UNIV_INLINE -void -trx_sys_rw_trx_add(trx_t* trx); - -#ifdef UNIV_DEBUG -/*************************************************************//** -Validate the trx_sys_t::rw_trx_list. -@return true if the list is valid */ -bool -trx_sys_validate_trx_list(); -/*========================*/ -#endif /* UNIV_DEBUG */ /** The automatically created system rollback segment has this id */ #define TRX_SYS_SYSTEM_RSEG_ID 0 @@ -326,18 +154,13 @@ trx_sys_validate_trx_list(); /** Transaction system header */ /*------------------------------------------------------------- @{ */ -#define TRX_SYS_TRX_ID_STORE 0 /*!< the maximum trx id or trx - number modulo - TRX_SYS_TRX_ID_UPDATE_MARGIN - written to a file page by any - transaction; the assignment of - transaction ids continues from - this number rounded up by - TRX_SYS_TRX_ID_UPDATE_MARGIN - plus - TRX_SYS_TRX_ID_UPDATE_MARGIN - when the database is - started */ +/** In old versions of InnoDB, this persisted the value of +trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5, +the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages +and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages +are used instead. The field only exists for the purpose of upgrading +from older MySQL or MariaDB versions. 
*/ +#define TRX_SYS_TRX_ID_STORE 0 #define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the tablespace segment the trx system is created into */ @@ -347,16 +170,52 @@ trx_sys_validate_trx_list(); slots */ /*------------------------------------------------------------- @} */ -/* Max number of rollback segments: the number of segment specification slots -in the transaction system array; rollback segment id must fit in one (signed) -byte, therefore 128; each slot is currently 8 bytes in size. If you want -to raise the level to 256 then you will need to fix some assertions that -impose the 7 bit restriction. e.g., mach_write_to_3() */ +/** The number of rollback segments; rollback segment id must fit in +the 7 bits reserved for it in DB_ROLL_PTR. */ #define TRX_SYS_N_RSEGS 128 /** Maximum number of undo tablespaces (not counting the system tablespace) */ #define TRX_SYS_MAX_UNDO_SPACES (TRX_SYS_N_RSEGS - 1) -/** Maximum length of MySQL binlog file name, in bytes. */ +/* Rollback segment specification slot offsets */ + +/** the tablespace ID of an undo log header; starting with +MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */ +#define TRX_SYS_RSEG_SPACE 0 +/** the page number of an undo log header, or FIL_NULL if unused */ +#define TRX_SYS_RSEG_PAGE_NO 4 +/** Size of a rollback segment specification slot */ +#define TRX_SYS_RSEG_SLOT_SIZE 8 + +/** Read the tablespace ID of a rollback segment slot. +@param[in] sys_header TRX_SYS page +@param[in] rseg_id rollback segment identifier +@return undo tablespace id */ +inline +uint32_t +trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id) +{ + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame); +} + +/** Read the page number of a rollback segment slot. 
+@param[in] sys_header TRX_SYS page +@param[in] rseg_id rollback segment identifier +@return undo page number */ +inline +uint32_t +trx_sysf_rseg_get_page_no(const buf_block_t* sys_header, ulint rseg_id) +{ + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + + sys_header->frame); +} + +/** Maximum length of MySQL binlog file name, in bytes. +(Used before MariaDB 10.3.5.) */ #define TRX_SYS_MYSQL_LOG_NAME_LEN 512 /** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */ #define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344 @@ -433,7 +292,7 @@ FIXED WSREP XID info offsets for 4k page size 10.0.32-galera */ #ifdef WITH_WSREP -/** The offset to WSREP XID headers */ +/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */ #define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL) #define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0 #define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265 @@ -497,42 +356,466 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE /* @} */ +trx_t* current_trx(); + +struct rw_trx_hash_element_t +{ + rw_trx_hash_element_t(): trx(0) + { + mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex); + } + + + ~rw_trx_hash_element_t() + { + mutex_free(&mutex); + } + + + trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */ + trx_id_t no; + trx_t *trx; + ib_mutex_t mutex; +}; + + +/** + Wrapper around LF_HASH to store set of in memory read-write transactions. +*/ + +class rw_trx_hash_t +{ + LF_HASH hash; + + + /** + Constructor callback for lock-free allocator. + + Object is just allocated and is not yet accessible via rw_trx_hash by + concurrent threads. Object can be reused multiple times before it is freed. + Every time object is being reused initializer() callback is called. 
+ */ + + static void rw_trx_hash_constructor(uchar *arg) + { + new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t(); + } + + + /** + Destructor callback for lock-free allocator. + + Object is about to be freed and is not accessible via rw_trx_hash by + concurrent threads. + */ + + static void rw_trx_hash_destructor(uchar *arg) + { + reinterpret_cast<rw_trx_hash_element_t*> + (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t(); + } + + + /** + Destructor callback for lock-free allocator. + + This destructor is used at shutdown. It frees remaining transaction + objects. + + XA PREPARED transactions may remain if they haven't been committed or + rolled back. ACTIVE transactions may remain if startup was interrupted or + server is running in read-only mode or for certain srv_force_recovery + levels. + */ + + static void rw_trx_hash_shutdown_destructor(uchar *arg) + { + rw_trx_hash_element_t *element= + reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD); + if (trx_t *trx= element->trx) + { + ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) || + (trx_state_eq(trx, TRX_STATE_ACTIVE) && + (!srv_was_started || + srv_read_only_mode || + srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO))); + trx_free_at_shutdown(trx); + } + element->~rw_trx_hash_element_t(); + } + + + /** + Initializer callback for lock-free hash. + + Object is not yet accessible via rw_trx_hash by concurrent threads, but is + about to become such. Object id can be changed only by this callback and + remains the same until all pins to this object are released. + + Object trx can be changed to 0 by erase() under object mutex protection, + which indicates it is about to be removed from lock-free hash and become + not accessible by concurrent threads. 
+ */ + + static void rw_trx_hash_initializer(LF_HASH *, + rw_trx_hash_element_t *element, + trx_t *trx) + { + ut_ad(element->trx == 0); + element->trx= trx; + element->id= trx->id; + element->no= TRX_ID_MAX; + trx->rw_trx_hash_element= element; + } + + + /** + Gets LF_HASH pins. + + Pins are used to protect object from being destroyed or reused. They are + normally stored in trx object for quick access. If caller doesn't have trx + available, we try to get it using currnet_trx(). If caller doesn't have trx + at all, temporary pins are allocated. + */ + + LF_PINS *get_pins(trx_t *trx) + { + if (!trx->rw_trx_hash_pins) + { + trx->rw_trx_hash_pins= lf_hash_get_pins(&hash); + ut_a(trx->rw_trx_hash_pins); + } + return trx->rw_trx_hash_pins; + } + + + struct eliminate_duplicates_arg + { + trx_ids_t ids; + my_hash_walk_action action; + void *argument; + eliminate_duplicates_arg(size_t size, my_hash_walk_action act, void* arg): + action(act), argument(arg) { ids.reserve(size); } + }; + + + static my_bool eliminate_duplicates(rw_trx_hash_element_t *element, + eliminate_duplicates_arg *arg) + { + for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++) + { + if (*it == element->id) + return 0; + } + arg->ids.push_back(element->id); + return arg->action(element, arg->argument); + } + + +#ifdef UNIV_DEBUG + static void validate_element(trx_t *trx) + { + ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg); + ut_ad(!trx_is_autocommit_non_locking(trx)); + mutex_enter(&trx->mutex); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + mutex_exit(&trx->mutex); + } + + + struct debug_iterator_arg + { + my_hash_walk_action action; + void *argument; + }; + + + static my_bool debug_iterator(rw_trx_hash_element_t *element, + debug_iterator_arg *arg) + { + mutex_enter(&element->mutex); + if (element->trx) + validate_element(element->trx); + mutex_exit(&element->mutex); + return arg->action(element, arg->argument); + } +#endif + + 
+public: + void init() + { + lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0, + sizeof(trx_id_t), 0, &my_charset_bin); + hash.alloc.constructor= rw_trx_hash_constructor; + hash.alloc.destructor= rw_trx_hash_destructor; + hash.initializer= + reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer); + } + + + void destroy() + { + hash.alloc.destructor= rw_trx_hash_shutdown_destructor; + lf_hash_destroy(&hash); + } + + + /** + Releases LF_HASH pins. + + Must be called by thread that owns trx_t object when the latter is being + "detached" from thread (e.g. released to the pool by trx_free()). Can be + called earlier if thread is expected not to use rw_trx_hash. + + Since pins are not allowed to be transferred to another thread, + initialisation thread calls this for recovered transactions. + */ + + void put_pins(trx_t *trx) + { + if (trx->rw_trx_hash_pins) + { + lf_hash_put_pins(trx->rw_trx_hash_pins); + trx->rw_trx_hash_pins= 0; + } + } + + + /** + Finds trx object in lock-free hash with given id. + + Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless + the transaction may get committed before this method returns. + + With do_ref_count == false the caller may dereference returned trx pointer + only if lock_sys.mutex was acquired before calling find(). + + With do_ref_count == true caller may dereference trx even if it is not + holding lock_sys.mutex. Caller is responsible for calling + trx->release_reference() when it is done playing with trx. + + Ideally this method should get caller rw_trx_hash_pins along with trx + object as a parameter, similar to insert() and erase(). However most + callers lose trx early in their call chains and it is not that easy to pass + them through. + + So we take more expensive approach: get trx through current_thd()->ha_data. 
+ Some threads don't have trx attached to THD, and at least server + initialisation thread, fts_optimize_thread, srv_master_thread, + dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even + have THD at all. For such cases we allocate pins only for duration of + search and free them immediately. + + This has negative performance impact and should be fixed eventually (by + passing caller_trx as a parameter). Still stream of DML is more or less Ok. + + @return + @retval 0 not found + @retval pointer to trx + */ + + trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count= false) + { + /* + In MariaDB 10.3, purge will reset DB_TRX_ID to 0 + when the history is lost. Read/write transactions will + always have a nonzero trx_t::id; there the value 0 is + reserved for transactions that did not write or lock + anything yet. + */ + if (!trx_id) + return NULL; + + trx_t *trx= 0; + LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash); + ut_a(pins); + + rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*> + (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id), + sizeof(trx_id_t))); + if (element) + { + mutex_enter(&element->mutex); + lf_hash_search_unpin(pins); + if ((trx= element->trx)) + { + if (do_ref_count) + trx->reference(); + ut_d(validate_element(trx)); + } + mutex_exit(&element->mutex); + } + if (!caller_trx) + lf_hash_put_pins(pins); + return trx; + } + + + /** + Inserts trx to lock-free hash. + + Object becomes accessible via rw_trx_hash. + */ + + void insert(trx_t *trx) + { + ut_d(validate_element(trx)); + int res= lf_hash_insert(&hash, get_pins(trx), + reinterpret_cast<void*>(trx)); + ut_a(res == 0); + } + + + /** + Removes trx from lock-free hash. + + Object becomes not accessible via rw_trx_hash. But it still can be pinned + by concurrent find(), which is supposed to release it immediately after + it sees object trx is 0. 
+ */ + + void erase(trx_t *trx) + { + ut_d(validate_element(trx)); + mutex_enter(&trx->rw_trx_hash_element->mutex); + trx->rw_trx_hash_element->trx= 0; + mutex_exit(&trx->rw_trx_hash_element->mutex); + int res= lf_hash_delete(&hash, get_pins(trx), + reinterpret_cast<const void*>(&trx->id), + sizeof(trx_id_t)); + ut_a(res == 0); + } + + + /** + Returns the number of elements in the hash. + + The number is exact only if hash is protected against concurrent + modifications (e.g. single threaded startup or hash is protected + by some mutex). Otherwise the number may be used as a hint only, + because it may change even before this method returns. + */ + + int32_t size() + { + return my_atomic_load32_explicit(&hash.count, MY_MEMORY_ORDER_RELAXED); + } + + + /** + Iterates the hash. + + @param caller_trx used to get/set pins + @param action called for every element in hash + @param argument opque argument passed to action + + May return the same element multiple times if hash is under contention. + If caller doesn't like to see the same transaction multiple times, it has + to call iterate_no_dups() instead. + + May return element with committed transaction. If caller doesn't like to + see committed transactions, it has to skip those under element mutex: + + mutex_enter(&element->mutex); + if (trx_t trx= element->trx) + { + // trx is protected against commit in this branch + } + mutex_exit(&element->mutex); + + May miss concurrently inserted transactions. + + @return + @retval 0 iteration completed successfully + @retval 1 iteration was interrupted (action returned 1) + */ + + int iterate(trx_t *caller_trx, my_hash_walk_action action, void *argument) + { + LF_PINS *pins= caller_trx ? 
get_pins(caller_trx) : lf_hash_get_pins(&hash); + ut_a(pins); +#ifdef UNIV_DEBUG + debug_iterator_arg debug_arg= { action, argument }; + action= reinterpret_cast<my_hash_walk_action>(debug_iterator); + argument= &debug_arg; +#endif + int res= lf_hash_iterate(&hash, pins, action, argument); + if (!caller_trx) + lf_hash_put_pins(pins); + return res; + } + + + int iterate(my_hash_walk_action action, void *argument) + { + return iterate(current_trx(), action, argument); + } + + + /** + Iterates the hash and eliminates duplicate elements. + + @sa iterate() + */ + + int iterate_no_dups(trx_t *caller_trx, my_hash_walk_action action, + void *argument) + { + eliminate_duplicates_arg arg(size() + 32, action, argument); + return iterate(caller_trx, reinterpret_cast<my_hash_walk_action> + (eliminate_duplicates), &arg); + } + + + int iterate_no_dups(my_hash_walk_action action, void *argument) + { + return iterate_no_dups(current_trx(), action, argument); + } +}; + + /** The transaction system central memory data structure. */ -struct trx_sys_t { +class trx_sys_t +{ + /** + The smallest number not yet assigned as a transaction id or transaction + number. Accessed and updated with atomic operations. + */ + MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_max_trx_id; + + + /** + Solves race conditions between register_rw() and snapshot_ids() as well as + race condition between assign_new_trx_no() and snapshot_ids(). + + @sa register_rw() + @sa assign_new_trx_no() + @sa snapshot_ids() + */ + MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_rw_trx_hash_version; + + + /** + TRX_RSEG_HISTORY list length (number of committed transactions to purge) + */ + MY_ALIGNED(CACHE_LINE_SIZE) int32 rseg_history_len; + + /** Active views. 
*/ + MY_ALIGNED(CACHE_LINE_SIZE) UT_LIST_BASE_NODE_T(ReadView) m_views; + bool m_initialised; + +public: + MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex; /*!< mutex protecting most fields in this structure except when noted otherwise */ - - MVCC* mvcc; /*!< Multi version concurrency control - manager */ - volatile trx_id_t - max_trx_id; /*!< The smallest number not yet - assigned as a transaction id or - transaction number. This is declared - volatile because it can be accessed - without holding any mutex during - AC-NL-RO view creation. */ - trx_ut_list_t serialisation_list; - /*!< Ordered on trx_t::no of all the - currenrtly active RW transactions */ -#ifdef UNIV_DEBUG - trx_id_t rw_max_trx_id; /*!< Max trx id of read-write - transactions which exist or existed */ -#endif /* UNIV_DEBUG */ - - /** Avoid false sharing */ - const char pad1[CACHE_LINE_SIZE]; - trx_ut_list_t rw_trx_list; /*!< List of active and committed in - memory read-write transactions, sorted - on trx id, biggest first. Recovered - transactions are always on this list. */ - - /** Avoid false sharing */ - const char pad2[CACHE_LINE_SIZE]; + MY_ALIGNED(CACHE_LINE_SIZE) trx_ut_list_t mysql_trx_list; /*!< List of transactions created for MySQL. All user transactions are - on mysql_trx_list. The rw_trx_list + on mysql_trx_list. The rw_trx_hash can contain system transactions and recovered transactions that will not be in the mysql_trx_list. @@ -540,22 +823,11 @@ struct trx_sys_t { transactions that have not yet been started in InnoDB. */ - trx_ids_t rw_trx_ids; /*!< Array of Read write transaction IDs - for MVCC snapshot. A ReadView would take - a snapshot of these transactions whose - changes are not visible to it. We should - remove transactions from the list before - committing in memory and releasing locks - to ensure right order of removal and - consistent snapshot. 
*/ - - /** Avoid false sharing */ - const char pad3[CACHE_LINE_SIZE]; + MY_ALIGNED(CACHE_LINE_SIZE) /** Temporary rollback segments */ trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS]; - /** Avoid false sharing */ - const char pad4[CACHE_LINE_SIZE]; + MY_ALIGNED(CACHE_LINE_SIZE) trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS]; /*!< Pointer array to rollback segments; NULL if slot not in use; @@ -563,46 +835,378 @@ struct trx_sys_t { single-threaded mode; not protected by any mutex, because it is read-only during multi-threaded operation */ - ulint rseg_history_len; - /*!< Length of the TRX_RSEG_HISTORY - list (update undo logs for committed - transactions), protected by - rseg->mutex */ - - TrxIdSet rw_trx_set; /*!< Mapping from transaction id - to transaction instance */ - - ulint n_prepared_trx; /*!< Number of transactions currently - in the XA PREPARED state */ - - ulint n_prepared_recovered_trx; /*!< Number of transactions - currently in XA PREPARED state that are - also recovered. Such transactions cannot - be added during runtime. They can only - occur after recovery if mysqld crashed - while there were XA PREPARED - transactions. We disable query cache - if such transactions exist. */ -}; -/** When a trx id which is zero modulo this number (which must be a power of -two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system -page is updated */ -#define TRX_SYS_TRX_ID_WRITE_MARGIN ((trx_id_t) 256) + /** + Lock-free hash of in memory read-write transactions. + Works faster when it is on it's own cache line (tested). + */ + + MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash; + + +#ifdef WITH_WSREP + /** Latest recovered XID during startup */ + XID recovered_wsrep_xid; +#endif + /** Latest recovered binlog offset */ + int64_t recovered_binlog_offset; + /** Latest recovred binlog file name */ + char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN]; + + + /** + Constructor. 
+ + Some members may require late initialisation, thus we just mark object as + uninitialised. Real initialisation happens in create(). + */ + + trx_sys_t(): m_initialised(false) {} + + + /** + Returns the minimum trx id in rw trx list. + + This is the smallest id for which the trx can possibly be active. (But, you + must look at the trx->state to find out if the minimum trx id transaction + itself is active, or already committed.) + + @return the minimum trx id, or m_max_trx_id if the trx list is empty + */ + + trx_id_t get_min_trx_id() + { + trx_id_t id= get_max_trx_id(); + rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action> + (get_min_trx_id_callback), &id); + return id; + } + + + /** + Determines the maximum transaction id. + + @return maximum currently allocated trx id; will be stale after the + next call to trx_sys.get_new_trx_id() + */ + + trx_id_t get_max_trx_id() + { + return static_cast<trx_id_t> + (my_atomic_load64_explicit(reinterpret_cast<int64*>(&m_max_trx_id), + MY_MEMORY_ORDER_RELAXED)); + } + + + /** + Allocates a new transaction id. + @return new, allocated trx id + */ + + trx_id_t get_new_trx_id() + { + trx_id_t id= get_new_trx_id_no_refresh(); + refresh_rw_trx_hash_version(); + return id; + } -/** Test if trx_sys->mutex is owned. */ -#define trx_sys_mutex_own() (trx_sys->mutex.is_owned()) -/** Acquire the trx_sys->mutex. */ -#define trx_sys_mutex_enter() do { \ - mutex_enter(&trx_sys->mutex); \ -} while (0) + /** + Allocates and assigns new transaction serialisation number. -/** Release the trx_sys->mutex. */ -#define trx_sys_mutex_exit() do { \ - trx_sys->mutex.exit(); \ -} while (0) + There's a gap between m_max_trx_id increment and transaction serialisation + number becoming visible through rw_trx_hash. While we're in this gap + concurrent thread may come and do MVCC snapshot without seeing allocated + but not yet assigned serialisation number. Then at some point purge thread + may clone this view. 
As a result it won't see newly allocated serialisation + number and may remove "unnecessary" history data of this transaction from + rollback segments. -#include "trx0sys.ic" + m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has + to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively + means that all transaction serialisation numbers up to m_max_trx_id are + available through rw_trx_hash. + + We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so + that m_rw_trx_hash_version increment happens after + trx->rw_trx_hash_element->no becomes visible through rw_trx_hash. + + @param trx transaction + */ + void assign_new_trx_no(trx_t *trx) + { + trx->no= get_new_trx_id_no_refresh(); + my_atomic_store64_explicit(reinterpret_cast<int64*> + (&trx->rw_trx_hash_element->no), + trx->no, MY_MEMORY_ORDER_RELAXED); + refresh_rw_trx_hash_version(); + } + + + /** + Takes MVCC snapshot. + + To reduce malloc probablility we reserver rw_trx_hash.size() + 32 elements + in ids. + + For details about get_rw_trx_hash_version() != get_max_trx_id() spin + @sa register_rw() and @sa assign_new_trx_no(). + + We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so + that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash. + + To optimise snapshot creation rw_trx_hash.iterate() is being used instead + of rw_trx_hash.iterate_no_dups(). It means that some transaction + identifiers may appear multiple times in ids. 
+ + @param[in,out] caller_trx used to get access to rw_trx_hash_pins + @param[out] ids array to store registered transaction identifiers + @param[out] max_trx_id variable to store m_max_trx_id value + @param[out] mix_trx_no variable to store min(trx->no) value + */ + + void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id, + trx_id_t *min_trx_no) + { + ut_ad(!mutex_own(&mutex)); + snapshot_ids_arg arg(ids); + + while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id()) + ut_delay(1); + arg.m_no= arg.m_id; + + ids->clear(); + ids->reserve(rw_trx_hash.size() + 32); + rw_trx_hash.iterate(caller_trx, + reinterpret_cast<my_hash_walk_action>(copy_one_id), + &arg); + + *max_trx_id= arg.m_id; + *min_trx_no= arg.m_no; + } + + + /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */ + void init_max_trx_id(trx_id_t value) + { + m_max_trx_id= m_rw_trx_hash_version= value; + } + + + bool is_initialised() { return m_initialised; } + + + /** Initialise the purge subsystem. */ + void create(); + + /** Close the purge subsystem on shutdown. */ + void close(); + + /** @return total number of active (non-prepared) transactions */ + ulint any_active_transactions(); + + + /** + Registers read-write transaction. + + Transaction becomes visible to MVCC. + + There's a gap between m_max_trx_id increment and transaction becoming + visible through rw_trx_hash. While we're in this gap concurrent thread may + come and do MVCC snapshot. As a result concurrent read view will be able to + observe records owned by this transaction even before it was committed. + + m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has + to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively + means that all transactions up to m_max_trx_id are available through + rw_trx_hash. 
+ + We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so + that m_rw_trx_hash_version increment happens after transaction becomes + visible through rw_trx_hash. + */ + + void register_rw(trx_t *trx) + { + trx->id= get_new_trx_id_no_refresh(); + rw_trx_hash.insert(trx); + refresh_rw_trx_hash_version(); + } + + + /** + Deregisters read-write transaction. + + Transaction is removed from rw_trx_hash, which releases all implicit locks. + MVCC snapshot won't see this transaction anymore. + */ + + void deregister_rw(trx_t *trx) + { + rw_trx_hash.erase(trx); + } + + + bool is_registered(trx_t *caller_trx, trx_id_t id) + { + return rw_trx_hash.find(caller_trx, id); + } + + + trx_t *find(trx_t *caller_trx, trx_id_t id) + { + return rw_trx_hash.find(caller_trx, id, true); + } + + + /** + Registers view in MVCC. + + @param view view owned by the caller + */ + void register_view(ReadView *view) + { + mutex_enter(&mutex); + UT_LIST_ADD_FIRST(m_views, view); + mutex_exit(&mutex); + } + + + /** + Deregisters view in MVCC. + + @param view view owned by the caller + */ + void deregister_view(ReadView *view) + { + mutex_enter(&mutex); + UT_LIST_REMOVE(m_views, view); + mutex_exit(&mutex); + } + + + /** + Clones the oldest view and stores it in view. + + No need to call ReadView::close(). The caller owns the view that is passed + in. This function is called by purge thread to determine whether it should + purge the delete marked record or not. 
+ */ + void clone_oldest_view(); + + + /** @return the number of active views */ + size_t view_count() const + { + size_t count= 0; + + mutex_enter(&mutex); + for (const ReadView* view= UT_LIST_GET_FIRST(m_views); view; + view= UT_LIST_GET_NEXT(m_view_list, view)) + { + if (view->get_state() == READ_VIEW_STATE_OPEN) + ++count; + } + mutex_exit(&mutex); + return count; + } + + /** @return number of committed transactions waiting for purge */ + ulint history_size() const + { + return uint32(my_atomic_load32(&const_cast<trx_sys_t*>(this) + ->rseg_history_len)); + } + /** Add to the TRX_RSEG_HISTORY length (on database startup). */ + void history_add(int32 len) + { + my_atomic_add32(&rseg_history_len, len); + } + /** Register a committed transaction. */ + void history_insert() { history_add(1); } + /** Note that a committed transaction was purged. */ + void history_remove() { history_add(-1); } + +private: + static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element, + trx_id_t *id) + { + if (element->id < *id) + { + mutex_enter(&element->mutex); + /* We don't care about read-only transactions here. */ + if (element->trx && element->trx->rsegs.m_redo.rseg) + *id= element->id; + mutex_exit(&element->mutex); + } + return 0; + } + + + struct snapshot_ids_arg + { + snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {} + trx_ids_t *m_ids; + trx_id_t m_id; + trx_id_t m_no; + }; + + + static my_bool copy_one_id(rw_trx_hash_element_t *element, + snapshot_ids_arg *arg) + { + if (element->id < arg->m_id) + { + trx_id_t no= static_cast<trx_id_t>(my_atomic_load64_explicit( + reinterpret_cast<int64*>(&element->no), MY_MEMORY_ORDER_RELAXED)); + arg->m_ids->push_back(element->id); + if (no < arg->m_no) + arg->m_no= no; + } + return 0; + } + + + /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. 
*/ + trx_id_t get_rw_trx_hash_version() + { + return static_cast<trx_id_t> + (my_atomic_load64_explicit(reinterpret_cast<int64*> + (&m_rw_trx_hash_version), + MY_MEMORY_ORDER_ACQUIRE)); + } + + + /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */ + void refresh_rw_trx_hash_version() + { + my_atomic_add64_explicit(reinterpret_cast<int64*>(&m_rw_trx_hash_version), + 1, MY_MEMORY_ORDER_RELEASE); + } + + + /** + Allocates new transaction id without refreshing rw_trx_hash version. + + This method is extracted for exclusive use by register_rw() and + assign_new_trx_no() where new id must be allocated atomically with + payload of these methods from MVCC snapshot point of view. + + @sa get_new_trx_id() + @sa assign_new_trx_no() + + @return new transaction id + */ + + trx_id_t get_new_trx_id_no_refresh() + { + return static_cast<trx_id_t>(my_atomic_add64_explicit( + reinterpret_cast<int64*>(&m_max_trx_id), 1, MY_MEMORY_ORDER_RELAXED)); + } +}; + + +/** The transaction system */ +extern trx_sys_t trx_sys; #endif diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic deleted file mode 100644 index e8efc1525c4..00000000000 --- a/storage/innobase/include/trx0sys.ic +++ /dev/null @@ -1,448 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/trx0sys.ic -Transaction system - -Created 3/26/1996 Heikki Tuuri -*******************************************************/ - -#include "trx0trx.h" -#include "data0type.h" -#include "srv0srv.h" -#include "mtr0log.h" - -/* The typedef for rseg slot in the file copy */ -typedef byte trx_sysf_rseg_t; - -/* Rollback segment specification slot offsets */ -/*-------------------------------------------------------------*/ -#define TRX_SYS_RSEG_SPACE 0 /* space where the segment - header is placed; starting with - MySQL/InnoDB 5.1.7, this is - UNIV_UNDEFINED if the slot is unused */ -#define TRX_SYS_RSEG_PAGE_NO 4 /* page number where the segment - header is placed; this is FIL_NULL - if the slot is unused */ -/*-------------------------------------------------------------*/ -/* Size of a rollback segment specification slot */ -#define TRX_SYS_RSEG_SLOT_SIZE 8 - -/*****************************************************************//** -Writes the value of max_trx_id to the file based trx system header. */ -void -trx_sys_flush_max_trx_id(void); -/*==========================*/ - -/** Checks if a page address is the trx sys header page. -@param[in] page_id page id -@return true if trx sys header page */ -UNIV_INLINE -bool -trx_sys_hdr_page( - const page_id_t& page_id) -{ - return(page_id.space() == TRX_SYS_SPACE - && page_id.page_no() == TRX_SYS_PAGE_NO); -} - -/**********************************************************************//** -Gets a pointer to the transaction system header and x-latches its page. -@return pointer to system header, page x-latched. 
*/ -UNIV_INLINE -trx_sysf_t* -trx_sysf_get( -/*=========*/ - mtr_t* mtr) /*!< in: mtr */ -{ - buf_block_t* block = NULL; - trx_sysf_t* header = NULL; - - ut_ad(mtr); - - block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), - univ_page_size, RW_X_LATCH, mtr); - - if (block) { - buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); - - header = TRX_SYS + buf_block_get_frame(block); - } - - return(header); -} - -/*****************************************************************//** -Gets the space of the nth rollback segment slot in the trx system -file copy. -@return space id */ -UNIV_INLINE -ulint -trx_sysf_rseg_get_space( -/*====================*/ - trx_sysf_t* sys_header, /*!< in: trx sys header */ - ulint i, /*!< in: slot index == rseg id */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_ad(sys_header); - ut_ad(i < TRX_SYS_N_RSEGS); - - return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS - + i * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr)); -} - -/*****************************************************************//** -Gets the page number of the nth rollback segment slot in the trx system -header. -@return page number, FIL_NULL if slot unused */ -UNIV_INLINE -ulint -trx_sysf_rseg_get_page_no( -/*======================*/ - trx_sysf_t* sys_header, /*!< in: trx system header */ - ulint i, /*!< in: slot index == rseg id */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_ad(sys_header); - ut_ad(i < TRX_SYS_N_RSEGS); - - return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS - + i * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr)); -} - -/*****************************************************************//** -Sets the space id of the nth rollback segment slot in the trx system -file copy. 
*/ -UNIV_INLINE -void -trx_sysf_rseg_set_space( -/*====================*/ - trx_sysf_t* sys_header, /*!< in: trx sys file copy */ - ulint i, /*!< in: slot index == rseg id */ - ulint space, /*!< in: space id */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_ad(sys_header); - ut_ad(i < TRX_SYS_N_RSEGS); - - mlog_write_ulint(sys_header + TRX_SYS_RSEGS - + i * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_SPACE, - space, - MLOG_4BYTES, mtr); -} - -/*****************************************************************//** -Sets the page number of the nth rollback segment slot in the trx system -header. */ -UNIV_INLINE -void -trx_sysf_rseg_set_page_no( -/*======================*/ - trx_sysf_t* sys_header, /*!< in: trx sys header */ - ulint i, /*!< in: slot index == rseg id */ - ulint page_no, /*!< in: page number, FIL_NULL if the - slot is reset to unused */ - mtr_t* mtr) /*!< in: mtr */ -{ - ut_ad(sys_header); - ut_ad(i < TRX_SYS_N_RSEGS); - - mlog_write_ulint(sys_header + TRX_SYS_RSEGS - + i * TRX_SYS_RSEG_SLOT_SIZE - + TRX_SYS_RSEG_PAGE_NO, - page_no, - MLOG_4BYTES, mtr); -} - -/*****************************************************************//** -Writes a trx id to an index page. In case that the id size changes in -some future version, this function should be used instead of -mach_write_... */ -UNIV_INLINE -void -trx_write_trx_id( -/*=============*/ - byte* ptr, /*!< in: pointer to memory where written */ - trx_id_t id) /*!< in: id */ -{ -#if DATA_TRX_ID_LEN != 6 -# error "DATA_TRX_ID_LEN != 6" -#endif - ut_ad(id > 0); - mach_write_to_6(ptr, id); -} - -/****************************************************************//** -Looks for the trx handle with the given id in rw_trx_list. -The caller must be holding trx_sys->mutex. 
-@return the trx handle or NULL if not found; -the pointer must not be dereferenced unless lock_sys->mutex was -acquired before calling this function and is still being held */ -UNIV_INLINE -trx_t* -trx_get_rw_trx_by_id( -/*=================*/ - trx_id_t trx_id) /*!< in: trx id to search for */ -{ - ut_ad(trx_id > 0); - ut_ad(trx_sys_mutex_own()); - - if (trx_sys->rw_trx_set.empty()) { - return(NULL); - } - - TrxIdSet::iterator it; - - it = trx_sys->rw_trx_set.find(TrxTrack(trx_id)); - - return(it == trx_sys->rw_trx_set.end() ? NULL : it->m_trx); -} - -/****************************************************************//** -Returns the minimum trx id in trx list. This is the smallest id for which -the trx can possibly be active. (But, you must look at the trx->state -to find out if the minimum trx id transaction itself is active, or already -committed.). The caller must be holding the trx_sys_t::mutex in shared mode. -@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */ -UNIV_INLINE -trx_id_t -trx_rw_min_trx_id_low(void) -/*=======================*/ -{ - trx_id_t id; - - ut_ad(trx_sys_mutex_own()); - - const trx_t* trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list); - - if (trx == NULL) { - id = trx_sys->max_trx_id; - } else { - assert_trx_in_rw_list(trx); - id = trx->id; - } - - return(id); -} - -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG -/***********************************************************//** -Assert that a transaction has been recovered. -@return TRUE */ -UNIV_INLINE -ibool -trx_assert_recovered( -/*=================*/ - trx_id_t trx_id) /*!< in: transaction identifier */ -{ - const trx_t* trx; - - trx_sys_mutex_enter(); - - trx = trx_get_rw_trx_by_id(trx_id); - ut_a(trx->is_recovered); - - trx_sys_mutex_exit(); - - return(TRUE); -} -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - -/****************************************************************//** -Returns the minimum trx id in rw trx list. 
This is the smallest id for which -the rw trx can possibly be active. (But, you must look at the trx->state -to find out if the minimum trx id transaction itself is active, or already -committed.) -@return the minimum trx id, or trx_sys->max_trx_id if rw trx list is empty */ -UNIV_INLINE -trx_id_t -trx_rw_min_trx_id(void) -/*===================*/ -{ - trx_sys_mutex_enter(); - - trx_id_t id = trx_rw_min_trx_id_low(); - - trx_sys_mutex_exit(); - - return(id); -} - -/****************************************************************//** -Checks if a rw transaction with the given id is active. If the caller is -not holding lock_sys->mutex, the transaction may already have been committed. -@return transaction instance if active, or NULL */ -UNIV_INLINE -trx_t* -trx_rw_is_active_low( -/*=================*/ - trx_id_t trx_id, /*!< in: trx id of the transaction */ - ibool* corrupt) /*!< in: NULL or pointer to a flag - that will be set if corrupt */ -{ - trx_t* trx; - - ut_ad(trx_sys_mutex_own()); - - if (trx_id < trx_rw_min_trx_id_low()) { - - trx = NULL; - } else if (trx_id >= trx_sys->max_trx_id) { - - /* There must be corruption: we let the caller handle the - diagnostic prints in this case. */ - - trx = NULL; - if (corrupt != NULL) { - *corrupt = TRUE; - } - } else { - trx = trx_get_rw_trx_by_id(trx_id); - - if (trx != NULL - && trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) { - - trx = NULL; - } - } - - return(trx); -} - -/****************************************************************//** -Checks if a rw transaction with the given id is active. If the caller is -not holding lock_sys->mutex, the transaction may already have been -committed. 
-@return transaction instance if active, or NULL; */ -UNIV_INLINE -trx_t* -trx_rw_is_active( -/*=============*/ - trx_id_t trx_id, /*!< in: trx id of the transaction */ - ibool* corrupt, /*!< in: NULL or pointer to a flag - that will be set if corrupt */ - bool do_ref_count) /*!< in: if true then increment the - trx_t::n_ref_count */ -{ - trx_t* trx; - - trx_sys_mutex_enter(); - - trx = trx_rw_is_active_low(trx_id, corrupt); - - if (trx != 0) { - trx = trx_reference(trx, do_ref_count); - } - - trx_sys_mutex_exit(); - - return(trx); -} - -/*****************************************************************//** -Allocates a new transaction id. -@return new, allocated trx id */ -UNIV_INLINE -trx_id_t -trx_sys_get_new_trx_id() -/*====================*/ -{ -#ifndef WITH_WSREP - /* wsrep_fake_trx_id violates this assert */ - ut_ad(trx_sys_mutex_own()); -#endif /* WITH_WSREP */ - - /* VERY important: after the database is started, max_trx_id value is - divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if - will evaluate to TRUE when this function is first time called, - and the value for trx id will be written to disk-based header! - Thus trx id values will not overlap when the database is - repeatedly started! */ - - if (!(trx_sys->max_trx_id % TRX_SYS_TRX_ID_WRITE_MARGIN)) { - - trx_sys_flush_max_trx_id(); - } - - return(trx_sys->max_trx_id++); -} - -/*****************************************************************//** -Determines the maximum transaction id. -@return maximum currently allocated trx id; will be stale after the -next call to trx_sys_get_new_trx_id() */ -UNIV_INLINE -trx_id_t -trx_sys_get_max_trx_id(void) -/*========================*/ -{ - ut_ad(!trx_sys_mutex_own()); - -#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN - /* Avoid torn reads. */ - - trx_sys_mutex_enter(); - - trx_id_t max_trx_id = trx_sys->max_trx_id; - - trx_sys_mutex_exit(); - - return(max_trx_id); -#else - /* Perform a dirty read. 
Callers should be prepared for stale - values, and we know that the value fits in a machine word, so - that it will be read and written atomically. */ - return(trx_sys->max_trx_id); -#endif /* UNIV_WORD_SIZE < DATA_TRX_ID_LEN */ -} - -/*****************************************************************//** -Get the number of transaction in the system, independent of their state. -@return count of transactions in trx_sys_t::rw_trx_list */ -UNIV_INLINE -ulint -trx_sys_get_n_rw_trx(void) -/*======================*/ -{ - ulint n_trx; - - trx_sys_mutex_enter(); - - n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list); - - trx_sys_mutex_exit(); - - return(n_trx); -} - -/** -Add the transaction to the RW transaction set -@param trx transaction instance to add */ -UNIV_INLINE -void -trx_sys_rw_trx_add(trx_t* trx) -{ - ut_ad(trx->id != 0); - - trx_sys->rw_trx_set.insert(TrxTrack(trx->id, trx)); - ut_d(trx->in_rw_trx_list = true); -} diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 133f23081a0..685208853ee 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2017, MariaDB Corporation. +Copyright (c) 2015, 2018, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,25 +38,21 @@ Created 3/26/1996 Heikki Tuuri #include "lock0types.h" #include "log0log.h" -#include "usr0types.h" #include "que0types.h" #include "mem0mem.h" #include "trx0xa.h" #include "ut0vec.h" #include "fts0fts.h" #include "srv0srv.h" +#include "read0types.h" // Forward declaration struct mtr_t; // Forward declaration -class ReadView; - -// Forward declaration class FlushObserver; -/** Dummy session used currently in MySQL interface */ -extern sess_t* trx_dummy_sess; +struct rw_trx_hash_element_t; /** Set flush observer for the transaction @param[in/out] trx transaction struct @@ -112,12 +108,9 @@ trx_free_resurrected(trx_t* trx); void trx_free_for_background(trx_t* trx); -/********************************************************************//** -At shutdown, frees a transaction object that is in the PREPARED state. */ +/** At shutdown, frees a transaction object. */ void -trx_free_prepared( -/*==============*/ - trx_t* trx); /*!< in, own: trx object */ +trx_free_at_shutdown(trx_t *trx); /** Free a transaction object for MySQL. @param[in,out] trx transaction */ @@ -244,14 +237,6 @@ trx_commit_low( trx_t* trx, /*!< in/out: transaction */ mtr_t* mtr); /*!< in/out: mini-transaction (will be committed), or NULL if trx made no modifications */ -/****************************************************************//** -Cleans up a transaction at database startup. The cleanup is needed if -the transaction already got to the middle of a commit when the database -crashed, and we cannot roll it back. */ -void -trx_cleanup_at_db_startup( -/*======================*/ - trx_t* trx); /*!< in: transaction */ /**********************************************************************//** Does the transaction commit for MySQL. 
@return DB_SUCCESS or error number */ @@ -275,13 +260,13 @@ int trx_recover_for_mysql( /*==================*/ XID* xid_list, /*!< in/out: prepared transactions */ - ulint len); /*!< in: number of slots in xid_list */ + uint len); /*!< in: number of slots in xid_list */ /*******************************************************************//** This function is used to find one X/Open XA distributed transaction which is in the prepared state @return trx or NULL; on match, the trx->xid will be invalidated; note that the trx may have been committed, unless the caller is -holding lock_sys->mutex */ +holding lock_sys.mutex */ trx_t * trx_get_trx_by_xid( /*===============*/ @@ -299,31 +284,6 @@ void trx_mark_sql_stat_end( /*==================*/ trx_t* trx); /*!< in: trx handle */ -/********************************************************************//** -Assigns a read view for a consistent read query. All the consistent reads -within the same transaction will get the same read view, which is created -when this function is first called for a new started transaction. */ -ReadView* -trx_assign_read_view( -/*=================*/ - trx_t* trx); /*!< in: active transaction */ - -/****************************************************************//** -@return the transaction's read view or NULL if one not assigned. */ -UNIV_INLINE -ReadView* -trx_get_read_view( -/*==============*/ - trx_t* trx); - -/****************************************************************//** -@return the transaction's read view or NULL if one not assigned. */ -UNIV_INLINE -const ReadView* -trx_get_read_view( -/*==============*/ - const trx_t* trx); - /****************************************************************//** Prepares a transaction for commit/rollback. */ void @@ -347,7 +307,7 @@ trx_commit_step( /**********************************************************************//** Prints info about a transaction. -Caller must hold trx_sys->mutex. */ +Caller must hold trx_sys.mutex. 
*/ void trx_print_low( /*==========*/ @@ -367,7 +327,7 @@ trx_print_low( /**********************************************************************//** Prints info about a transaction. -The caller must hold lock_sys->mutex and trx_sys->mutex. +The caller must hold lock_sys.mutex and trx_sys.mutex. When possible, use trx_print() instead. */ void trx_print_latched( @@ -377,25 +337,9 @@ trx_print_latched( ulint max_query_len); /*!< in: max query length to print, or 0 to use the default max length */ -#ifdef WITH_WSREP -/**********************************************************************//** -Prints info about a transaction. -Transaction information may be retrieved without having trx_sys->mutex acquired -so it may not be completely accurate. The caller must own lock_sys->mutex -and the trx must have some locks to make sure that it does not escape -without locking lock_sys->mutex. */ -UNIV_INTERN -void -wsrep_trx_print_locking( - FILE* f, /*!< in: output stream */ - const trx_t* trx, /*!< in: transaction */ - ulint max_query_len) /*!< in: max query length to print, - or 0 to use the default max length */ - MY_ATTRIBUTE((nonnull)); -#endif /* WITH_WSREP */ /**********************************************************************//** Prints info about a transaction. -Acquires and releases lock_sys->mutex and trx_sys->mutex. */ +Acquires and releases lock_sys.mutex. */ void trx_print( /*======*/ @@ -425,9 +369,9 @@ trx_set_dict_operation( /**********************************************************************//** Determines if a transaction is in the given state. -The caller must hold trx_sys->mutex, or it must be the thread +The caller must hold trx_sys.mutex, or it must be the thread that is serving a running transaction. -A running RW transaction must be in trx_sys->rw_trx_list. +A running RW transaction must be in trx_sys.rw_trx_hash. 
@return TRUE if trx->state == state */ UNIV_INLINE bool @@ -446,7 +390,7 @@ trx_state_eq( # ifdef UNIV_DEBUG /**********************************************************************//** Asserts that a transaction has been started. -The caller must hold trx_sys->mutex. +The caller must hold trx_sys.mutex. @return TRUE if started */ ibool trx_assert_started( @@ -531,31 +475,6 @@ trx_set_rw_mode( trx_t* trx); /** -Increase the reference count. If the transaction is in state -TRX_STATE_COMMITTED_IN_MEMORY then the transaction is considered -committed and the reference count is not incremented. -@param trx Transaction that is being referenced -@param do_ref_count Increment the reference iff this is true -@return transaction instance if it is not committed */ -UNIV_INLINE -trx_t* -trx_reference( - trx_t* trx, - bool do_ref_count); - -/** -Release the transaction. Decrease the reference count. -@param trx Transaction that is being released */ -UNIV_INLINE -void -trx_release_reference( - trx_t* trx); - -/** -Check if the transaction is being referenced. */ -#define trx_is_referenced(t) ((t)->n_ref > 0) - -/** @param[in] requestor Transaction requesting the lock @param[in] holder Transaction holding the lock @return the transaction that will be rolled back, null don't care */ @@ -607,15 +526,6 @@ with an explicit check for the read-only status. 
((t)->read_only && trx_is_autocommit_non_locking((t))) /** -Assert that the transaction is in the trx_sys_t::rw_trx_list */ -#define assert_trx_in_rw_list(t) do { \ - ut_ad(!(t)->read_only); \ - ut_ad((t)->in_rw_trx_list \ - == !((t)->read_only || !(t)->rsegs.m_redo.rseg)); \ - check_trx_state(t); \ -} while (0) - -/** Check transaction state */ #define check_trx_state(t) do { \ ut_ad(!trx_is_autocommit_non_locking((t))); \ @@ -638,7 +548,7 @@ Check transaction state */ ut_ad(trx_state_eq((t), TRX_STATE_NOT_STARTED) \ || trx_state_eq((t), TRX_STATE_FORCED_ROLLBACK)); \ ut_ad(!trx->has_logged()); \ - ut_ad(!MVCC::is_view_active((t)->read_view)); \ + ut_ad(!(t)->read_view.is_open()); \ ut_ad((t)->lock.wait_thr == NULL); \ ut_ad(UT_LIST_GET_LEN((t)->lock.trx_locks) == 0); \ ut_ad((t)->dict_operation == TRX_DICT_OP_NONE); \ @@ -655,7 +565,7 @@ transaction pool. #ifdef UNIV_DEBUG /*******************************************************************//** Assert that an autocommit non-locking select cannot be in the -rw_trx_list and that it is a read-only transaction. +rw_trx_hash and that it is a read-only transaction. The tranasction must be in the mysql_trx_list. */ # define assert_trx_nonlocking_or_in_list(t) \ do { \ @@ -663,7 +573,6 @@ The tranasction must be in the mysql_trx_list. */ trx_state_t t_state = (t)->state; \ ut_ad((t)->read_only); \ ut_ad(!(t)->is_recovered); \ - ut_ad(!(t)->in_rw_trx_list); \ ut_ad((t)->in_mysql_trx_list); \ ut_ad(t_state == TRX_STATE_NOT_STARTED \ || t_state == TRX_STATE_FORCED_ROLLBACK \ @@ -675,7 +584,7 @@ The tranasction must be in the mysql_trx_list. */ #else /* UNIV_DEBUG */ /*******************************************************************//** Assert that an autocommit non-locking slect cannot be in the -rw_trx_list and that it is a read-only transaction. +rw_trx_hash and that it is a read-only transaction. The tranasction must be in the mysql_trx_list. 
*/ # define assert_trx_nonlocking_or_in_list(trx) ((void)0) #endif /* UNIV_DEBUG */ @@ -703,7 +612,7 @@ To query the state either of the mutexes is sufficient within the locking code and no mutex is required when the query thread is no longer waiting. */ /** The locks and state of an active transaction. Protected by -lock_sys->mutex, trx->mutex or both. */ +lock_sys.mutex, trx->mutex or both. */ struct trx_lock_t { ulint n_active_thrs; /*!< number of active query threads */ @@ -715,10 +624,10 @@ struct trx_lock_t { TRX_QUE_LOCK_WAIT, this points to the lock request, otherwise this is NULL; set to non-NULL when holding - both trx->mutex and lock_sys->mutex; + both trx->mutex and lock_sys.mutex; set to NULL when holding - lock_sys->mutex; readers should - hold lock_sys->mutex, except when + lock_sys.mutex; readers should + hold lock_sys.mutex, except when they are holding trx->mutex and wait_lock==NULL */ ib_uint64_t deadlock_mark; /*!< A mark field that is initialized @@ -732,13 +641,13 @@ struct trx_lock_t { resolution, it sets this to true. Protected by trx->mutex. */ time_t wait_started; /*!< lock wait started at this time, - protected only by lock_sys->mutex */ + protected only by lock_sys.mutex */ que_thr_t* wait_thr; /*!< query thread belonging to this trx that is in QUE_THR_LOCK_WAIT state. For threads suspended in a lock wait, this is protected by - lock_sys->mutex. Otherwise, this may + lock_sys.mutex. Otherwise, this may only be modified by the thread that is serving the running transaction. 
*/ @@ -751,12 +660,12 @@ struct trx_lock_t { ulint table_cached; /*!< Next free table lock in pool */ mem_heap_t* lock_heap; /*!< memory heap for trx_locks; - protected by lock_sys->mutex */ + protected by lock_sys.mutex */ trx_lock_list_t trx_locks; /*!< locks requested by the transaction; insertions are protected by trx->mutex - and lock_sys->mutex; removals are - protected by lock_sys->mutex */ + and lock_sys.mutex; removals are + protected by lock_sys.mutex */ lock_pool_t table_locks; /*!< All table locks requested by this transaction, including AUTOINC locks */ @@ -779,14 +688,73 @@ struct trx_lock_t { bool start_stmt; }; -/** Type used to store the list of tables that are modified by a given -transaction. We store pointers to the table objects in memory because +/** Logical first modification time of a table in a transaction */ +class trx_mod_table_time_t +{ + /** First modification of the table */ + undo_no_t first; + /** First modification of a system versioned column */ + undo_no_t first_versioned; + + /** Magic value signifying that a system versioned column of a + table was never modified in a transaction. 
*/ + static const undo_no_t UNVERSIONED = IB_ID_MAX; + +public: + /** Constructor + @param[in] rows number of modified rows so far */ + trx_mod_table_time_t(undo_no_t rows) + : first(rows), first_versioned(UNVERSIONED) {} + +#ifdef UNIV_DEBUG + /** Validation + @param[in] rows number of modified rows so far + @return whether the object is valid */ + bool valid(undo_no_t rows = UNVERSIONED) const + { + return first <= first_versioned && first <= rows; + } +#endif /* UNIV_DEBUG */ + /** @return if versioned columns were modified */ + bool is_versioned() const { return first_versioned != UNVERSIONED; } + + /** After writing an undo log record, set is_versioned() if needed + @param[in] rows number of modified rows so far */ + void set_versioned(undo_no_t rows) + { + ut_ad(!is_versioned()); + first_versioned = rows; + ut_ad(valid()); + } + + /** Invoked after partial rollback + @param[in] limit number of surviving modified rows + @return whether this should be erased from trx_t::mod_tables */ + bool rollback(undo_no_t limit) + { + ut_ad(valid()); + if (first >= limit) { + return true; + } + + if (first_versioned < limit && is_versioned()) { + first_versioned = UNVERSIONED; + } + + return false; + } +}; + +/** Collection of persistent tables and their first modification +in a transaction. +We store pointers to the table objects in memory because we know that a table object will not be destroyed while a transaction that modified it is running. */ -typedef std::set< - dict_table_t*, +typedef std::map< + dict_table_t*, trx_mod_table_time_t, std::less<dict_table_t*>, - ut_allocator<dict_table_t*> > trx_mod_tables_t; + ut_allocator<std::pair<dict_table_t* const, trx_mod_table_time_t> > > + trx_mod_tables_t; /** The transaction handle @@ -816,27 +784,26 @@ so without holding any mutex. The following are exceptions to this: * trx_rollback_resurrected() may access resurrected (connectionless) transactions while the system is already processing new user -transactions. 
The trx_sys->mutex prevents a race condition between it +transactions. The trx_sys.mutex prevents a race condition between it and lock_trx_release_locks() [invoked by trx_commit()]. * trx_print_low() may access transactions not associated with the current -thread. The caller must be holding trx_sys->mutex and lock_sys->mutex. +thread. The caller must be holding lock_sys.mutex. -* When a transaction handle is in the trx_sys->mysql_trx_list or -trx_sys->trx_list, some of its fields must not be modified without -holding trx_sys->mutex exclusively. +* When a transaction handle is in the trx_sys.mysql_trx_list or +trx_sys.trx_list, some of its fields must not be modified without +holding trx_sys.mutex exclusively. * The locking code (in particular, lock_deadlock_recursive() and lock_rec_convert_impl_to_expl()) will access transactions associated to other connections. The locks of transactions are protected by -lock_sys->mutex and sometimes by trx->mutex. */ +lock_sys.mutex and sometimes by trx->mutex. */ typedef enum { TRX_SERVER_ABORT = 0, TRX_WSREP_ABORT = 1 } trx_abort_t; - /** Represents an instance of rollback segment along with its state variables.*/ struct trx_undo_ptr_t { trx_rseg_t* rseg; /*!< rollback segment assigned to the @@ -887,10 +854,23 @@ struct TrxVersion { typedef std::list<TrxVersion, ut_allocator<TrxVersion> > hit_list_t; struct trx_t { +private: + /** + Count of references. + + We can't release the locks nor commit the transaction until this reference + is 0. We can change the state to TRX_STATE_COMMITTED_IN_MEMORY to signify + that it is no longer "active". + */ + + int32_t n_ref; + + +public: TrxMutex mutex; /*!< Mutex protecting the fields state and lock (except some fields of lock, which are protected by - lock_sys->mutex) */ + lock_sys.mutex) */ /* Note: in_depth was split from in_innodb for fixing a RO performance issue. 
Acquiring the trx_t::mutex for each row @@ -918,7 +898,7 @@ struct trx_t { transaction is moved to COMMITTED_IN_MEMORY state. Protected by trx_sys_t::mutex - when trx->in_rw_trx_list. Initially + when trx is in rw_trx_hash. Initially set to TRX_ID_MAX. */ /** State of the trx from the point of view of concurrency control @@ -946,6 +926,9 @@ struct trx_t { Recovered XA: * NOT_STARTED -> PREPARED -> COMMITTED -> (freed) + Recovered XA followed by XA ROLLBACK: + * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed) + XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT): * NOT_STARTED -> PREPARED -> (freed) @@ -956,11 +939,11 @@ struct trx_t { XA (2PC) transactions are always treated as non-autocommit. - Transitions to ACTIVE or NOT_STARTED occur when - !in_rw_trx_list (no trx_sys->mutex needed). + Transitions to ACTIVE or NOT_STARTED occur when transaction + is not in rw_trx_hash (no trx_sys.mutex needed). Autocommit non-locking read-only transactions move between states - without holding any mutex. They are !in_rw_trx_list. + without holding any mutex. They are not in rw_trx_hash. All transactions, unless they are determined to be ac-nl-ro, explicitly tagged as read-only or read-write, will first be put @@ -970,15 +953,15 @@ struct trx_t { list. During this switch we assign it a rollback segment. When a transaction is NOT_STARTED, it can be in_mysql_trx_list if - it is a user transaction. It cannot be in rw_trx_list. + it is a user transaction. It cannot be in rw_trx_hash. - ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list. - The transition ACTIVE->PREPARED is protected by trx_sys->mutex. + ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash. + The transition ACTIVE->PREPARED is protected by trx_sys.mutex. ACTIVE->COMMITTED is possible when the transaction is in - rw_trx_list. + rw_trx_hash. 
- Transitions to COMMITTED are protected by both lock_sys->mutex + Transitions to COMMITTED are protected by both lock_sys.mutex and trx->mutex. NOTE: Some of these state change constraints are an overkill, @@ -987,25 +970,16 @@ struct trx_t { trx_state_t state; - ReadView* read_view; /*!< consistent read view used in the + ReadView read_view; /*!< consistent read view used in the transaction, or NULL if not yet set */ - - UT_LIST_NODE_T(trx_t) - trx_list; /*!< list of transactions; - protected by trx_sys->mutex. */ - UT_LIST_NODE_T(trx_t) - no_list; /*!< Required during view creation - to check for the view limit for - transactions that are committing */ - trx_lock_t lock; /*!< Information about the transaction locks and state. Protected by - trx->mutex or lock_sys->mutex + trx->mutex or lock_sys.mutex or both */ bool is_recovered; /*!< 0=normal transaction, 1=recovered, must be rolled back, - protected by trx_sys->mutex when - trx->in_rw_trx_list holds */ + protected by trx_sys.mutex when + trx is in rw_trx_hash */ hit_list_t hit_list; /*!< List of transactions to kill, when a high priority transaction @@ -1114,20 +1088,13 @@ struct trx_t { statement uses, except those in consistent read */ /*------------------------------*/ -#ifdef UNIV_DEBUG - /** The following two fields are mutually exclusive. 
*/ - /* @{ */ - - bool in_rw_trx_list; /*!< true if in trx_sys->rw_trx_list */ - /* @} */ -#endif /* UNIV_DEBUG */ UT_LIST_NODE_T(trx_t) mysql_trx_list; /*!< list of transactions created for - MySQL; protected by trx_sys->mutex */ + MySQL; protected by trx_sys.mutex */ #ifdef UNIV_DEBUG bool in_mysql_trx_list; /*!< true if in - trx_sys->mysql_trx_list */ + trx_sys.mysql_trx_list */ #endif /* UNIV_DEBUG */ /*------------------------------*/ dberr_t error_state; /*!< 0 if no error, otherwise error @@ -1141,7 +1108,6 @@ struct trx_t { ulint error_key_num; /*!< if the index creation fails to a duplicate key error, a mysql key number of that index is stored here */ - sess_t* sess; /*!< session of the trx, NULL if none */ que_t* graph; /*!< query currently run in the session, or NULL if none; NOTE that the query belongs to the session, and it can @@ -1190,7 +1156,7 @@ struct trx_t { also in the lock list trx_locks. This vector needs to be freed explicitly when the trx instance is destroyed. - Protected by lock_sys->mutex. */ + Protected by lock_sys.mutex. */ /*------------------------------*/ bool read_only; /*!< true if transaction is flagged as a READ-ONLY transaction. @@ -1227,14 +1193,6 @@ struct trx_t { const char* start_file; /*!< Filename where it was started */ #endif /* UNIV_DEBUG */ - lint n_ref; /*!< Count of references, protected - by trx_t::mutex. We can't release the - locks nor commit the transaction until - this reference is 0. We can change - the state to COMMITTED_IN_MEMORY to - signify that it is no longer - "active". */ - /** Version of this instance. It is incremented each time the instance is re-used in trx_start_low(). 
It is used to track whether a transaction has been restarted since it was tagged @@ -1269,6 +1227,8 @@ struct trx_t { os_event_t wsrep_event; /* event waited for in srv_conc_slot */ #endif /* WITH_WSREP */ + rw_trx_hash_element_t *rw_trx_hash_element; + LF_PINS *rw_trx_hash_pins; ulint magic_n; /** @return whether any persistent undo log has been generated */ @@ -1301,6 +1261,33 @@ struct trx_t { return(assign_temp_rseg()); } + + bool is_referenced() + { + return my_atomic_load32_explicit(&n_ref, MY_MEMORY_ORDER_RELAXED) > 0; + } + + + void reference() + { +#ifdef UNIV_DEBUG + int32_t old_n_ref= +#endif + my_atomic_add32_explicit(&n_ref, 1, MY_MEMORY_ORDER_RELAXED); + ut_ad(old_n_ref >= 0); + } + + + void release_reference() + { +#ifdef UNIV_DEBUG + int32_t old_n_ref= +#endif + my_atomic_add32_explicit(&n_ref, -1, MY_MEMORY_ORDER_RELAXED); + ut_ad(old_n_ref > 0); + } + + private: /** Assign a rollback segment for modifying temporary tables. @return the assigned rollback segment */ diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic index 6fa00c5333f..6372a02db17 100644 --- a/storage/innobase/include/trx0trx.ic +++ b/storage/innobase/include/trx0trx.ic @@ -24,13 +24,11 @@ The transaction Created 3/26/1996 Heikki Tuuri *******************************************************/ -#include "read0read.h" - /**********************************************************************//** Determines if a transaction is in the given state. -The caller must hold trx_sys->mutex, or it must be the thread +The caller must hold trx_sys.mutex, or it must be the thread that is serving a running transaction. -A running RW transaction must be in trx_sys->rw_trx_list. +A running RW transaction must be in trx_sys.rw_trx_hash. 
@return TRUE if trx->state == state */ UNIV_INLINE bool @@ -72,8 +70,6 @@ trx_state_eq( || (relaxed && thd_get_error_number(trx->mysql_thd))); - ut_ad(!trx->in_rw_trx_list); - return(true); } ut_error; @@ -214,74 +210,6 @@ ok: } /** -Increase the reference count. If the transaction is in state -TRX_STATE_COMMITTED_IN_MEMORY then the transaction is considered -committed and the reference count is not incremented. -@param trx Transaction that is being referenced -@param do_ref_count Increment the reference iff this is true -@return transaction instance if it is not committed */ -UNIV_INLINE -trx_t* -trx_reference( - trx_t* trx, - bool do_ref_count) -{ - trx_mutex_enter(trx); - - if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) { - trx_mutex_exit(trx); - trx = NULL; - } else if (do_ref_count) { - ut_ad(trx->n_ref >= 0); - ++trx->n_ref; - trx_mutex_exit(trx); - } else { - trx_mutex_exit(trx); - } - - return(trx); -} - -/** -Release the transaction. Decrease the reference count. -@param trx Transaction that is being released */ -UNIV_INLINE -void -trx_release_reference( - trx_t* trx) -{ - trx_mutex_enter(trx); - - ut_ad(trx->n_ref > 0); - --trx->n_ref; - - trx_mutex_exit(trx); -} - - -/** -@param trx Get the active view for this transaction, if one exists -@return the transaction's read view or NULL if one not assigned. */ -UNIV_INLINE -ReadView* -trx_get_read_view( - trx_t* trx) -{ - return(!MVCC::is_view_active(trx->read_view) ? NULL : trx->read_view); -} - -/** -@param trx Get the active view for this transaction, if one exists -@return the transaction's read view or NULL if one not assigned. */ -UNIV_INLINE -const ReadView* -trx_get_read_view( - const trx_t* trx) -{ - return(!MVCC::is_view_active(trx->read_view) ? 
NULL : trx->read_view); -} - -/** @param[in] trx Transaction to check @return true if the transaction is a high priority transaction.*/ UNIV_INLINE diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h index 8092246c7fa..29139172c92 100644 --- a/storage/innobase/include/trx0types.h +++ b/storage/innobase/include/trx0types.h @@ -31,12 +31,9 @@ Created 3/26/1996 Heikki Tuuri #include "ut0mutex.h" #include "ut0new.h" -#include <set> #include <queue> #include <vector> -//#include <unordered_set> - /** printf(3) format used for printing DB_TRX_ID and other system fields */ #define TRX_ID_FMT IB_ID_FMT @@ -115,8 +112,6 @@ enum trx_dict_op_t { struct trx_t; /** The locks and state of an active transaction */ struct trx_lock_t; -/** Transaction system */ -struct trx_sys_t; /** Signal */ struct trx_sig_t; /** Rollback segment */ @@ -140,9 +135,6 @@ typedef ib_id_t roll_ptr_t; /** Undo number */ typedef ib_id_t undo_no_t; -/** Maximum transaction identifier */ -#define TRX_ID_MAX IB_ID_MAX - /** Transaction savepoint */ struct trx_savept_t{ undo_no_t least_undo_no; /*!< least undo number to undo */ @@ -150,8 +142,6 @@ struct trx_savept_t{ /** File objects */ /* @{ */ -/** Transaction system header */ -typedef byte trx_sysf_t; /** Rollback segment header */ typedef byte trx_rsegf_t; /** Undo segment header */ @@ -173,51 +163,4 @@ typedef ib_mutex_t PQMutex; typedef ib_mutex_t TrxSysMutex; typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t; - -/** Mapping read-write transactions from id to transaction instance, for -creating read views and during trx id lookup for MVCC and locking. 
*/ -struct TrxTrack { - explicit TrxTrack(trx_id_t id, trx_t* trx = NULL) - : - m_id(id), - m_trx(trx) - { - // Do nothing - } - - trx_id_t m_id; - trx_t* m_trx; -}; - -struct TrxTrackHash { - size_t operator()(const TrxTrack& key) const - { - return(size_t(key.m_id)); - } -}; - -/** -Comparator for TrxMap */ -struct TrxTrackHashCmp { - - bool operator() (const TrxTrack& lhs, const TrxTrack& rhs) const - { - return(lhs.m_id == rhs.m_id); - } -}; - -/** -Comparator for TrxMap */ -struct TrxTrackCmp { - - bool operator() (const TrxTrack& lhs, const TrxTrack& rhs) const - { - return(lhs.m_id < rhs.m_id); - } -}; - -//typedef std::unordered_set<TrxTrack, TrxTrackHash, TrxTrackHashCmp> TrxIdSet; -typedef std::set<TrxTrack, TrxTrackCmp, ut_allocator<TrxTrack> > - TrxIdSet; - #endif /* trx0types_h */ diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h index 51f8035d886..b9e5d72866b 100644 --- a/storage/innobase/include/trx0undo.h +++ b/storage/innobase/include/trx0undo.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -123,17 +123,6 @@ page_t* trx_undo_page_get_s_latched(const page_id_t& page_id, mtr_t* mtr); /******************************************************************//** -Returns the previous undo record on the page in the specified log, or -NULL if none exists. 
-@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_prev_rec( -/*=======================*/ - trx_undo_rec_t* rec, /*!< in: undo log record */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset);/*!< in: undo log header offset on page */ -/******************************************************************//** Returns the next undo log record on the page in the specified log, or NULL if none exists. @return pointer to record, NULL if none */ @@ -144,28 +133,6 @@ trx_undo_page_get_next_rec( trx_undo_rec_t* rec, /*!< in: undo log record */ ulint page_no,/*!< in: undo log header page number */ ulint offset);/*!< in: undo log header offset on page */ -/******************************************************************//** -Returns the last undo record on the page in the specified undo log, or -NULL if none exists. -@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_last_rec( -/*=======================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset); /*!< in: undo log header offset on page */ -/******************************************************************//** -Returns the first undo record on the page in the specified undo log, or -NULL if none exists. -@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_first_rec( -/*========================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset);/*!< in: undo log header offset on page */ /***********************************************************************//** Gets the previous record in an undo log. 
@return undo log record, the page s-latched, NULL if none */ @@ -243,27 +210,28 @@ trx_undo_truncate_start( ulint hdr_page_no, ulint hdr_offset, undo_no_t limit); -/********************************************************************//** -Initializes the undo log lists for a rollback segment memory copy. -This function is only called when the database is started or a new -rollback segment created. -@return the combined size of undo log segments in pages */ -ulint -trx_undo_lists_init( -/*================*/ - trx_rseg_t* rseg); /*!< in: rollback segment memory object */ +/** Assign an undo log for a persistent transaction. +A new undo log is created or a cached undo log reused. +@param[in,out] trx transaction +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr) + MY_ATTRIBUTE((nonnull)); /** Assign an undo log for a transaction. A new undo log is created or a cached undo log reused. @param[in,out] trx transaction @param[in] rseg rollback segment @param[out] undo the undo log -@retval DB_SUCCESS on success -@retval DB_TOO_MANY_CONCURRENT_TRXS -@retval DB_OUT_OF_FILE_SPACE -@retval DB_READ_ONLY -@retval DB_OUT_OF_MEMORY */ -dberr_t -trx_undo_assign_undo(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo) +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, + dberr_t* err, mtr_t* mtr) MY_ATTRIBUTE((nonnull, warn_unused_result)); /******************************************************************//** Sets the state of the undo log segment at a transaction finish. @@ -295,13 +263,9 @@ the data can be discarded. 
void trx_undo_commit_cleanup(trx_undo_t* undo, bool is_temp); -/********************************************************************//** -At shutdown, frees the undo logs of a PREPARED transaction. */ +/** At shutdown, frees the undo logs of a transaction. */ void -trx_undo_free_prepared( -/*===================*/ - trx_t* trx) /*!< in/out: PREPARED transaction */ - ATTRIBUTE_COLD __attribute__((nonnull)); +trx_undo_free_at_shutdown(trx_t *trx); /* Forward declaration. */ namespace undo { @@ -315,16 +279,32 @@ bool trx_undo_truncate_tablespace( undo::Truncate* undo_trunc); -/***********************************************************//** -Parses the redo log entry of an undo log page initialization. -@return end of log record or NULL */ +/** Parse MLOG_UNDO_INIT for crash-upgrade from MariaDB 10.2. +@param[in] ptr log record +@param[in] end_ptr end of log record buffer +@param[in,out] page page or NULL +@param[in,out] mtr mini-transaction +@return end of log record +@retval NULL if the log record is incomplete */ byte* trx_undo_parse_page_init( -/*=====================*/ - const byte* ptr, /*!< in: buffer */ - const byte* end_ptr,/*!< in: buffer end */ - page_t* page, /*!< in: page or NULL */ - mtr_t* mtr); /*!< in: mtr or NULL */ + const byte* ptr, + const byte* end_ptr, + page_t* page, + mtr_t* mtr); +/** Parse MLOG_UNDO_HDR_REUSE for crash-upgrade from MariaDB 10.2. +@param[in] ptr redo log record +@param[in] end_ptr end of log buffer +@param[in,out] page undo page or NULL +@param[in,out] mtr mini-transaction +@return end of log record or NULL */ +byte* +trx_undo_parse_page_header_reuse( + const byte* ptr, + const byte* end_ptr, + page_t* page, + mtr_t* mtr); + /** Parse the redo log entry of an undo log page header create. 
@param[in] ptr redo log record @param[in] end_ptr end of log buffer @@ -337,12 +317,15 @@ trx_undo_parse_page_header( const byte* end_ptr, page_t* page, mtr_t* mtr); -/************************************************************************ -Frees an undo log memory copy. */ -void -trx_undo_mem_free( -/*==============*/ - trx_undo_t* undo); /* in: the undo object to be freed */ +/** Read an undo log when starting up the database. +@param[in,out] rseg rollback segment +@param[in] id rollback segment slot +@param[in] page_no undo log segment page number +@param[in,out] max_trx_id the largest observed transaction ID +@return size of the undo log in pages */ +ulint +trx_undo_mem_create_at_db_start(trx_rseg_t* rseg, ulint id, ulint page_no, + trx_id_t& max_trx_id); #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic index 6e76ba205ae..407bc9ff484 100644 --- a/storage/innobase/include/trx0undo.ic +++ b/storage/innobase/include/trx0undo.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -184,89 +184,24 @@ trx_undo_page_get_s_latched(const page_id_t& page_id, mtr_t* mtr) return(buf_block_get_frame(block)); } -/******************************************************************//** -Returns the start offset of the undo log records of the specified undo -log on the page. 
-@return start offset */ -UNIV_INLINE -ulint -trx_undo_page_get_start( -/*====================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ -{ - ulint start; - - if (page_no == page_get_page_no(undo_page)) { - - start = mach_read_from_2(offset + undo_page - + TRX_UNDO_LOG_START); - } else { - start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE; - } - - return(start); -} - -/******************************************************************//** -Returns the end offset of the undo log records of the specified undo -log on the page. +/** Determine the end offset of undo log records of an undo log page. +@param[in] undo_page undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset @return end offset */ -UNIV_INLINE -ulint -trx_undo_page_get_end( -/*==================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ +inline +uint16_t +trx_undo_page_get_end(const page_t* undo_page, ulint page_no, ulint offset) { - trx_ulogf_t* log_hdr; - ulint end; - if (page_no == page_get_page_no(undo_page)) { - - log_hdr = undo_page + offset; - - end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); - - if (end == 0) { - end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_FREE); + if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + + offset + undo_page)) { + return end; } - } else { - end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR - + TRX_UNDO_PAGE_FREE); - } - - return(end); -} - -/******************************************************************//** -Returns the previous undo record on the page in the specified log, or -NULL if none exists. 
-@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_prev_rec( -/*=======================*/ - trx_undo_rec_t* rec, /*!< in: undo log record */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ -{ - page_t* undo_page; - ulint start; - - undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); - - start = trx_undo_page_get_start(undo_page, page_no, offset); - - if (start + undo_page == rec) { - - return(NULL); } - return(undo_page + mach_read_from_2(rec - 2)); + return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_page); } /******************************************************************//** @@ -298,55 +233,3 @@ trx_undo_page_get_next_rec( return(undo_page + next); } - -/******************************************************************//** -Returns the last undo record on the page in the specified undo log, or -NULL if none exists. -@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_last_rec( -/*=======================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ -{ - ulint start; - ulint end; - - start = trx_undo_page_get_start(undo_page, page_no, offset); - end = trx_undo_page_get_end(undo_page, page_no, offset); - - if (start == end) { - - return(NULL); - } - - return(undo_page + mach_read_from_2(undo_page + end - 2)); -} - -/******************************************************************//** -Returns the first undo record on the page in the specified undo log, or -NULL if none exists. 
-@return pointer to record, NULL if none */ -UNIV_INLINE -trx_undo_rec_t* -trx_undo_page_get_first_rec( -/*========================*/ - page_t* undo_page,/*!< in: undo log page */ - ulint page_no,/*!< in: undo log header page number */ - ulint offset) /*!< in: undo log header offset on page */ -{ - ulint start; - ulint end; - - start = trx_undo_page_get_start(undo_page, page_no, offset); - end = trx_undo_page_get_end(undo_page, page_no, offset); - - if (start == end) { - - return(NULL); - } - - return(undo_page + start); -} diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 80c0b5476b0..ba1256feac2 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. Copyright (c) 2008, Google Inc. 
Portions of this file contain modifications contributed and copyrighted by @@ -41,7 +41,7 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 5 #define INNODB_VERSION_MINOR 7 -#define INNODB_VERSION_BUGFIX 20 +#define INNODB_VERSION_BUGFIX 21 /* The following is the InnoDB version as shown in SELECT plugin_version FROM information_schema.plugins; @@ -647,14 +647,6 @@ typedef void* os_thread_ret_t; # define UNIV_MEM_ASSERT_W(addr, size) do {} while(0) # define UNIV_MEM_TRASH(addr, c, size) do {} while(0) #endif -#define UNIV_MEM_ASSERT_AND_FREE(addr, size) do { \ - UNIV_MEM_ASSERT_W(addr, size); \ - UNIV_MEM_FREE(addr, size); \ -} while (0) -#define UNIV_MEM_ASSERT_AND_ALLOC(addr, size) do { \ - UNIV_MEM_ASSERT_W(addr, size); \ - UNIV_MEM_ALLOC(addr, size); \ -} while (0) extern ulong srv_page_size_shift; extern ulong srv_page_size; diff --git a/storage/innobase/include/usr0sess.h b/storage/innobase/include/usr0sess.h deleted file mode 100644 index 8e9497a85c5..00000000000 --- a/storage/innobase/include/usr0sess.h +++ /dev/null @@ -1,69 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/usr0sess.h -Sessions - -Created 6/25/1996 Heikki Tuuri -*******************************************************/ - -#ifndef usr0sess_h -#define usr0sess_h - -#include "univ.i" -#include "ut0byte.h" -#include "trx0types.h" -#include "srv0srv.h" -#include "trx0types.h" -#include "usr0types.h" -#include "que0types.h" -#include "data0data.h" -#include "rem0rec.h" - -/*********************************************************************//** -Opens a session. -@return own: session object */ -sess_t* -sess_open(void); -/*============*/ -/*********************************************************************//** -Closes a session, freeing the memory occupied by it. */ -void -sess_close( -/*=======*/ - sess_t* sess); /* in, own: session object */ - -/* The session handle. This data structure is only used by purge and is -not really necessary. We should get rid of it. 
*/ -struct sess_t{ - ulint state; /*!< state of the session */ - trx_t* trx; /*!< transaction object permanently - assigned for the session: the - transaction instance designated by the - trx id changes, but the memory - structure is preserved */ -}; - -/* Session states */ -#define SESS_ACTIVE 1 -#define SESS_ERROR 2 /* session contains an error message - which has not yet been communicated - to the client */ -#endif diff --git a/storage/innobase/include/usr0types.h b/storage/innobase/include/usr0types.h deleted file mode 100644 index 6ba937cacc8..00000000000 --- a/storage/innobase/include/usr0types.h +++ /dev/null @@ -1,31 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/usr0types.h -Users and sessions global types - -Created 6/25/1996 Heikki Tuuri -*******************************************************/ - -#ifndef usr0types_h -#define usr0types_h - -struct sess_t; - -#endif diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h index 36b389b5bd2..32ad066f85a 100644 --- a/storage/innobase/include/ut0crc32.h +++ b/storage/innobase/include/ut0crc32.h @@ -47,14 +47,11 @@ typedef uint32_t (*ut_crc32_func_t)(const byte* ptr, ulint len); /** Pointer to CRC32 calculation function. */ extern ut_crc32_func_t ut_crc32; -/** Pointer to CRC32 calculation function, which uses big-endian byte order +/** CRC32 calculation function, which uses big-endian byte order when converting byte strings to integers internally. */ -extern ut_crc32_func_t ut_crc32_legacy_big_endian; - -/** Pointer to CRC32-byte-by-byte calculation function (byte order agnostic, -but very slow). 
*/ -extern ut_crc32_func_t ut_crc32_byte_by_byte; +extern uint32_t ut_crc32_legacy_big_endian(const byte* buf, ulint len); +/** Text description of CRC32 implementation */ extern const char* ut_crc32_implementation; #endif /* ut0crc32_h */ diff --git a/storage/innobase/include/ut0mutex.h b/storage/innobase/include/ut0mutex.h index bd3603ad4d0..dc387dadbdc 100644 --- a/storage/innobase/include/ut0mutex.h +++ b/storage/innobase/include/ut0mutex.h @@ -164,7 +164,7 @@ public: }; /** Defined in sync0sync.cc */ -extern MutexMonitor* mutex_monitor; +extern MutexMonitor mutex_monitor; /** Creates, or rather, initializes a mutex object in a specified memory diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h index 955e7b026c7..d61d3072d3d 100644 --- a/storage/innobase/include/ut0new.h +++ b/storage/innobase/include/ut0new.h @@ -129,6 +129,10 @@ InnoDB: #include <string.h> /* strlen(), strrchr(), strncmp() */ #include "my_global.h" /* needed for headers from mysql/psi/ */ +#if !defined(DBUG_OFF) && defined(HAVE_MADVISE) +#include <sys/mman.h> +#endif + /* JAN: TODO: missing 5.7 header */ #ifdef HAVE_MYSQL_MEMORY_H #include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */ @@ -172,7 +176,6 @@ extern PSI_memory_key mem_key_other; extern PSI_memory_key mem_key_row_log_buf; extern PSI_memory_key mem_key_row_merge_sort; extern PSI_memory_key mem_key_std; -extern PSI_memory_key mem_key_trx_sys_t_rw_trx_ids; extern PSI_memory_key mem_key_partitioning; /** Setup the internal objects needed for UT_NEW() to operate. 
@@ -235,6 +238,42 @@ struct ut_new_pfx_t { #endif }; +static void ut_allocate_trace_dontdump(void * ptr, + size_t bytes, + bool dontdump, + ut_new_pfx_t* pfx, + const char* file) +{ + ut_a(ptr != NULL); + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DONTDUMP) + if (dontdump && madvise(ptr, bytes, MADV_DONTDUMP)) { + ib::warn() << "Failed to set memory to DONTDUMP: " + << strerror(errno) + << " ptr " << ptr + << " size " << bytes; + } +#endif + if (pfx != NULL) { +#ifdef UNIV_PFS_MEMORY + allocate_trace(bytes, file, pfx); +#endif /* UNIV_PFS_MEMORY */ + pfx->m_size = bytes; + } +} + +static void ut_dodump(void* ptr, size_t m_size) +{ +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) + if (ptr && madvise(ptr, m_size, MADV_DODUMP)) { + ib::warn() << "Failed to set memory to DODUMP: " + << strerror(errno) + << " ptr " << ptr + << " size " << m_size; + } +#endif +} + /** Allocator class for allocating memory from inside std::* containers. @tparam T type of allocated object @tparam oom_fatal whether to commit suicide when running out of memory */ @@ -295,6 +334,7 @@ public: @param[in] file file name of the caller @param[in] set_to_zero if true, then the returned memory is initialized with 0x0 bytes. + @param[in] throw_on_error if true, raize exception if too big @return pointer to the allocated memory */ pointer allocate( @@ -567,6 +607,8 @@ public: /** Allocate a large chunk of memory that can hold 'n_elements' objects of type 'T' and trace the allocation. @param[in] n_elements number of elements + @param[in] dontdump if true, advise the OS is not to core + dump this memory. @param[out] pfx storage for the description of the allocated memory. 
The caller must provide space for this one and keep it until the memory is no longer needed and then pass it to @@ -575,7 +617,8 @@ public: pointer allocate_large( size_type n_elements, - ut_new_pfx_t* pfx) + ut_new_pfx_t* pfx, + bool dontdump = false) { if (n_elements == 0 || n_elements > max_size()) { return(NULL); @@ -586,13 +629,11 @@ public: pointer ptr = reinterpret_cast<pointer>( os_mem_alloc_large(&n_bytes)); -#ifdef UNIV_PFS_MEMORY - if (ptr != NULL) { - allocate_trace(n_bytes, NULL, pfx); + if (ptr == NULL) { + return NULL; } -#else - pfx->m_size = n_bytes; -#endif /* UNIV_PFS_MEMORY */ + + ut_allocate_trace_dontdump(ptr, n_bytes, dontdump, pfx, NULL); return(ptr); } @@ -601,17 +642,26 @@ public: deallocation. @param[in,out] ptr pointer to memory to free @param[in] pfx descriptor of the memory, as returned by - allocate_large(). */ + allocate_large(). + @param[in] dodump if true, advise the OS to include this + memory again if a core dump occurs. */ void deallocate_large( pointer ptr, - const ut_new_pfx_t* pfx) + const ut_new_pfx_t* pfx, + size_t size, + bool dodump = false) { + if (dodump) { + ut_dodump(ptr, size); + } #ifdef UNIV_PFS_MEMORY - deallocate_trace(pfx); + if (pfx) { + deallocate_trace(pfx); + } #endif /* UNIV_PFS_MEMORY */ - os_mem_free_large(ptr, pfx->m_size); + os_mem_free_large(ptr, size); } #ifdef UNIV_PFS_MEMORY @@ -843,6 +893,10 @@ ut_delete_array( ut_allocator<byte>(key).allocate( \ n_bytes, NULL, __FILE__, false, false)) +#define ut_malloc_dontdump(n_bytes) static_cast<void*>( \ + ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate_large( \ + n_bytes, true)) + #define ut_zalloc(n_bytes, key) static_cast<void*>( \ ut_allocator<byte>(key).allocate( \ n_bytes, NULL, __FILE__, true, false)) @@ -866,6 +920,10 @@ ut_delete_array( #define ut_free(ptr) ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \ reinterpret_cast<byte*>(ptr)) +#define ut_free_dodump(ptr, size) static_cast<void*>( \ + 
ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate_large( \ + ptr, NULL, size, true)) + #else /* UNIV_PFS_MEMORY */ /* Fallbacks when memory tracing is disabled at compile time. */ @@ -888,6 +946,14 @@ ut_delete_array( #define ut_malloc_nokey(n_bytes) ::malloc(n_bytes) +static inline void *ut_malloc_dontdump(size_t n_bytes) +{ + void *ptr = os_mem_alloc_large(&n_bytes); + + ut_allocate_trace_dontdump(ptr, n_bytes, true, NULL, NULL); + return ptr; +} + #define ut_zalloc_nokey(n_bytes) ::calloc(1, n_bytes) #define ut_zalloc_nokey_nofatal(n_bytes) ::calloc(1, n_bytes) @@ -896,6 +962,12 @@ ut_delete_array( #define ut_free(ptr) ::free(ptr) +static inline void ut_free_dodump(void *ptr, size_t size) +{ + ut_dodump(ptr, size); + os_mem_free_large(ptr, size); +} + #endif /* UNIV_PFS_MEMORY */ #endif /* ut0new_h */ diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h index 49ae3c81356..5baf8684d23 100644 --- a/storage/innobase/include/ut0rnd.h +++ b/storage/innobase/include/ut0rnd.h @@ -61,16 +61,6 @@ UNIV_INLINE ulint ut_rnd_gen_ulint(void); /*==================*/ -/********************************************************//** -Generates a random integer from a given interval. 
-@return the 'random' number */ -UNIV_INLINE -ulint -ut_rnd_interval( -/*============*/ - ulint low, /*!< in: low limit; can generate also this value */ - ulint high); /*!< in: high limit; can generate also this value */ - /*******************************************************//** The following function generates a hash value for a ulint integer to a hash table of size table_size, which should be a prime or some diff --git a/storage/innobase/include/ut0rnd.ic b/storage/innobase/include/ut0rnd.ic index 16dccb545d8..1e4915dd0f9 100644 --- a/storage/innobase/include/ut0rnd.ic +++ b/storage/innobase/include/ut0rnd.ic @@ -97,30 +97,6 @@ ut_rnd_gen_ulint(void) return(rnd); } -/********************************************************//** -Generates a random integer from a given interval. -@return the 'random' number */ -UNIV_INLINE -ulint -ut_rnd_interval( -/*============*/ - ulint low, /*!< in: low limit; can generate also this value */ - ulint high) /*!< in: high limit; can generate also this value */ -{ - ulint rnd; - - ut_ad(high >= low); - - if (low == high) { - - return(low); - } - - rnd = ut_rnd_gen_ulint(); - - return(low + (rnd % (high - low))); -} - /*******************************************************//** The following function generates a hash value for a ulint integer to a hash table of size table_size, which should be a prime diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index b8282b7d0de..1614d3ead6d 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -45,6 +45,7 @@ Created 1/20/1994 Heikki Tuuri #include <stdarg.h> #include <string> +#include <my_atomic.h> /** Index name prefix in fast index creation, as a string constant */ #define TEMP_INDEX_PREFIX_STR "\377" @@ -52,35 +53,6 @@ Created 1/20/1994 Heikki Tuuri /** Time stamp */ typedef time_t ib_time_t; -#ifdef HAVE_PAUSE_INSTRUCTION - /* According to the gcc info page, asm volatile means that the - instruction has important 
side-effects and must not be removed. - Also asm volatile may trigger a memory barrier (spilling all registers - to memory). */ -# ifdef __SUNPRO_CC -# define UT_RELAX_CPU() asm ("pause" ) -# else -# define UT_RELAX_CPU() __asm__ __volatile__ ("pause") -# endif /* __SUNPRO_CC */ - -#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION) -# define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop") -#elif defined _WIN32 - /* In the Win32 API, the x86 PAUSE instruction is executed by calling - the YieldProcessor macro defined in WinNT.h. It is a CPU architecture- - independent way by using YieldProcessor. */ -# define UT_RELAX_CPU() YieldProcessor() -#elif defined(__powerpc__) && defined __GLIBC__ -# include <sys/platform/ppc.h> -# define UT_RELAX_CPU() __ppc_get_timebase() -#else -# define UT_RELAX_CPU() do { \ - volatile int32 volatile_var; \ - int32 oldval= 0; \ - my_atomic_cas32(&volatile_var, &oldval, 1); \ - } while (0) -#endif - #if defined (__GNUC__) # define UT_COMPILER_BARRIER() __asm__ __volatile__ ("":::"memory") #elif defined (_MSC_VER) @@ -89,15 +61,6 @@ typedef time_t ib_time_t; # define UT_COMPILER_BARRIER() #endif -#if defined(HAVE_HMT_PRIORITY_INSTRUCTION) -# include <sys/platform/ppc.h> -# define UT_LOW_PRIORITY_CPU() __ppc_set_ppr_low() -# define UT_RESUME_PRIORITY_CPU() __ppc_set_ppr_med() -#else -# define UT_LOW_PRIORITY_CPU() ((void)0) -# define UT_RESUME_PRIORITY_CPU() ((void)0) -#endif - /*********************************************************************//** Delays execution for at most max_wait_us microseconds or returns earlier if cond becomes true. @@ -395,50 +358,6 @@ ut_copy_file( FILE* dest, /*!< in: output file */ FILE* src); /*!< in: input file to be appended to output */ -#ifdef _WIN32 -/**********************************************************************//** -A substitute for vsnprintf(3), formatted output conversion into -a limited buffer. 
Note: this function DOES NOT return the number of -characters that would have been printed if the buffer was unlimited because -VC's _vsnprintf() returns -1 in this case and we would need to call -_vscprintf() in addition to estimate that but we would need another copy -of "ap" for that and VC does not provide va_copy(). */ -void -ut_vsnprintf( -/*=========*/ - char* str, /*!< out: string */ - size_t size, /*!< in: str size */ - const char* fmt, /*!< in: format */ - va_list ap); /*!< in: format values */ - -/**********************************************************************//** -A substitute for snprintf(3), formatted output conversion into -a limited buffer. -@return number of characters that would have been printed if the size -were unlimited, not including the terminating '\0'. */ -int -ut_snprintf( -/*========*/ - char* str, /*!< out: string */ - size_t size, /*!< in: str size */ - const char* fmt, /*!< in: format */ - ...); /*!< in: format values */ -#else -/**********************************************************************//** -A wrapper for vsnprintf(3), formatted output conversion into -a limited buffer. Note: this function DOES NOT return the number of -characters that would have been printed if the buffer was unlimited because -VC's _vsnprintf() returns -1 in this case and we would need to call -_vscprintf() in addition to estimate that but we would need another copy -of "ap" for that and VC does not provide va_copy(). */ -# define ut_vsnprintf(buf, size, fmt, ap) \ - ((void) vsnprintf(buf, size, fmt, ap)) -/**********************************************************************//** -A wrapper for snprintf(3), formatted output conversion into -a limited buffer. */ -# define ut_snprintf snprintf -#endif /* _WIN32 */ - /*************************************************************//** Convert an error number to a human readable text message. The returned string is static and should not be freed or modified. |