diff options
Diffstat (limited to 'storage/innobase')
35 files changed, 1391 insertions, 1261 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index e9e04f1f1af..9b69fde0408 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -251,7 +251,7 @@ Gets the root node of a tree and x- or s-latches it. buf_block_t* btr_root_block_get( /*===============*/ - const dict_index_t* index, /*!< in: index tree */ + dict_index_t* index, /*!< in: index tree */ rw_lock_type_t mode, /*!< in: either RW_S_LATCH or RW_X_LATCH */ mtr_t* mtr, /*!< in: mtr */ @@ -263,18 +263,42 @@ btr_root_block_get( return nullptr; } - buf_block_t *block = btr_block_get(*index, index->page, mode, mtr, err); - if (block) + buf_block_t *block; +#ifndef BTR_CUR_ADAPT + static constexpr buf_block_t *guess= nullptr; +#else + buf_block_t *&guess= btr_search_get_info(index)->root_guess; + guess= +#endif + block= + buf_page_get_gen(page_id_t{index->table->space->id, index->page}, + index->table->space->zip_size(), mode, guess, BUF_GET, + mtr, err); + ut_ad(!block == (*err != DB_SUCCESS)); + + if (UNIV_LIKELY(block != nullptr)) { - if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, - *block, *index->table->space) || - !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, - *block, *index->table->space)) + if (!!page_is_comp(block->page.frame) != index->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index->id || + !fil_page_index_page_check(block->page.frame) || + index->is_spatial() != + (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) + { + *err= DB_PAGE_CORRUPTED; + block= nullptr; + } + else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *block, *index->table->space) || + !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *index->table->space)) { *err= DB_CORRUPTION; block= nullptr; } } + else if (*err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + return block; } @@ -285,7 +309,7 @@ static page_t* btr_root_get( /*=========*/ - const dict_index_t* index, /*!< in: index tree */ + dict_index_t* index, /*!< in: index tree */ mtr_t* mtr, /*!< in: mtr */ dberr_t* err) /*!< out: error code */ { @@ -497,9 +521,7 @@ btr_block_reget(mtr_t *mtr, const dict_index_t &index, return block; } -#if 0 /* MDEV-29385 FIXME: Acquire the page latch upfront. */ ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK)); -#endif return btr_block_get(index, id.page_no(), rw_latch, mtr, err); } @@ -687,9 +709,7 @@ btr_page_get_father_node_ptr_for_validate( const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no(); dict_index_t* index = btr_cur_get_index(cursor); ut_ad(!dict_index_is_spatial(index)); - - ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK - | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); ut_ad(dict_index_get_page(index) != page_no); const auto level = btr_page_get_level(btr_cur_get_page(cursor)); @@ -707,10 +727,6 @@ btr_page_get_father_node_ptr_for_validate( } const rec_t* node_ptr = btr_cur_get_rec(cursor); -#if 0 /* MDEV-29835 FIXME */ - ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive() - || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -#endif offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); @@ -2288,11 +2304,10 @@ btr_insert_on_non_leaf_level( } ut_ad(cursor.flag == BTR_CUR_BINARY); -#if 0 /* MDEV-29835 FIXME */ - ut_ad(!btr_cur_get_block(&cursor)->page.lock.not_recursive() + ut_ad(btr_cur_get_block(&cursor) + != mtr->at_savepoint(mtr->get_savepoint() - 1) || index->is_spatial() || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -#endif if (UNIV_LIKELY(err == DB_SUCCESS)) { err = btr_cur_optimistic_insert(flags, @@ -2400,10 +2415,8 @@ btr_attach_half_pages( prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!prev_block) { -# if 0 /* MDEV-29835 FIXME */ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -# endif prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH, mtr); } @@ -2414,10 +2427,8 @@ btr_attach_half_pages( next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!next_block) { -# if 0 /* MDEV-29835 FIXME */ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -# endif next_block = btr_block_get(*index, next_page_no, RW_X_LATCH, mtr); } @@ -2758,6 +2769,8 @@ btr_page_split_and_insert( ut_ad(dtuple_check_typed(tuple)); ut_ad(!cursor->index()->is_spatial()); + buf_pool.pages_split++; + if (!*heap) { *heap = mem_heap_create(1024); } @@ -3131,8 +3144,6 @@ insert_failed: } func_exit: - MONITOR_INC(MONITOR_INDEX_SPLIT); - ut_ad(page_validate(buf_block_get_frame(left_block), page_cursor->index)); ut_ad(page_validate(buf_block_get_frame(right_block), @@ -3167,9 +3178,7 @@ dberr_t btr_level_list_remove(const buf_block_t& block, #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!prev) { -# if 0 /* MDEV-29835 FIXME */ ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); -# endif prev= btr_block_get(index, id.page_no(), RW_X_LATCH, mtr, &err); if (UNIV_UNLIKELY(!prev)) return err; @@ -3184,9 +3193,7 @@ dberr_t btr_level_list_remove(const buf_block_t& block, #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!next) { -# if 0 /* MDEV-29835 FIXME */ ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); -# endif next= btr_block_get(index, id.page_no(), RW_X_LATCH, mtr, &err); if (UNIV_UNLIKELY(!next)) return err; @@ -4016,7 +4023,7 @@ btr_discard_page( if (UNIV_UNLIKELY(!merge_block)) { return err; } -#if 0 /* MDEV-29385 FIXME: Acquire the page latch upfront. */ +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */ ut_ad(!memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_NEXT, block->page.frame + FIL_PAGE_OFFSET, @@ -4042,7 +4049,7 @@ btr_discard_page( if (UNIV_UNLIKELY(!merge_block)) { return err; } -#if 0 /* MDEV-29385 FIXME: Acquire the page latch upfront. */ +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */ ut_ad(!memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_PREV, block->page.frame + FIL_PAGE_OFFSET, @@ -4613,8 +4620,7 @@ btr_validate_level( /*===============*/ dict_index_t* index, /*!< in: index tree */ const trx_t* trx, /*!< in: transaction or NULL */ - ulint level, /*!< in: level number */ - bool lockout)/*!< in: true if X-latch index is intended */ + ulint level) /*!< in: level number */ { buf_block_t* block; page_t* page; @@ -4633,18 +4639,10 @@ btr_validate_level( #ifdef UNIV_ZIP_DEBUG page_zip_des_t* page_zip; #endif /* UNIV_ZIP_DEBUG */ - ulint savepoint = 0; - uint32_t parent_page_no = FIL_NULL; - uint32_t parent_right_page_no = FIL_NULL; - bool rightmost_child = false; mtr.start(); - if (lockout) { - mtr_x_lock_index(index, &mtr); - } else { - mtr_sx_lock_index(index, &mtr); - } + mtr_x_lock_index(index, &mtr); dberr_t err; block = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); @@ -4739,11 +4737,7 @@ func_exit: mem_heap_empty(heap); offsets = offsets2 = NULL; - if (lockout) { - mtr_x_lock_index(index, &mtr); - } else { - mtr_sx_lock_index(index, &mtr); - } + mtr_x_lock_index(index, &mtr); page = block->page.frame; @@ -4787,7 +4781,6 @@ func_exit: if (right_page_no != FIL_NULL) { const rec_t* right_rec; - savepoint = mtr.get_savepoint(); right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, &mtr, &err); @@ -4890,11 +4883,6 @@ broken_links: father_page = btr_cur_get_page(&node_cur); node_ptr = btr_cur_get_rec(&node_cur); - parent_page_no = page_get_page_no(father_page); - parent_right_page_no = btr_page_get_next(father_page); - rightmost_child = page_rec_is_supremum( - page_rec_get_next(node_ptr)); - rec = page_rec_get_prev(page_get_supremum_rec(page)); if (rec) { btr_cur_position(index, rec, block, &node_cur); @@ -4976,35 +4964,6 @@ broken_links: } } else if (const rec_t* right_node_ptr = page_rec_get_next(node_ptr)) { - if (!lockout && rightmost_child) { - - /* To obey latch order of tree blocks, - we should release the right_block once to - obtain lock of the uncle block. */ - ut_ad(right_block - == mtr.at_savepoint(savepoint)); - mtr.rollback_to_savepoint(savepoint, - savepoint + 1); - - if (parent_right_page_no != FIL_NULL) { - btr_block_get(*index, - parent_right_page_no, - RW_SX_LATCH, &mtr); - } - - right_block = btr_block_get(*index, - right_page_no, - RW_SX_LATCH, - &mtr, &err); - if (!right_block) { - btr_validate_report1(index, level, - block); - fputs("InnoDB: broken FIL_PAGE_NEXT" - " link\n", stderr); - goto invalid_page; - } - } - btr_cur_position( index, page_get_infimum_rec(right_block->page.frame), @@ -5076,19 +5035,6 @@ node_ptr_fails: mtr.start(); - if (!lockout) { - if (rightmost_child) { - if (parent_right_page_no != FIL_NULL) { - btr_block_get(*index, - parent_right_page_no, - RW_SX_LATCH, &mtr); - } - } else if (parent_page_no != FIL_NULL) { - btr_block_get(*index, parent_page_no, - RW_SX_LATCH, &mtr); - } - } - block = btr_block_get(*index, right_page_no, RW_SX_LATCH, &mtr, &err); goto loop; @@ -5106,21 +5052,16 @@ btr_validate_index( dict_index_t* index, /*!< in: index */ const trx_t* trx) /*!< in: transaction or NULL */ { - const bool lockout= index->is_spatial(); - mtr_t mtr; mtr.start(); - if (lockout) - mtr_x_lock_index(index, &mtr); - else - mtr_sx_lock_index(index, &mtr); + mtr_x_lock_index(index, &mtr); dberr_t err; if (page_t *root= btr_root_get(index, &mtr, &err)) for (auto level= btr_page_get_level(root);; level--) { - if (dberr_t err_level= btr_validate_level(index, trx, level, lockout)) + if (dberr_t err_level= btr_validate_level(index, trx, level)) err= err_level; if (!level) break; diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 2941a765fa4..74db3fa3d8f 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -750,29 +750,34 @@ btr_cur_will_modify_tree( /** Detects whether the modifying record might need a opposite modification to the intention. -@param[in] page page -@param[in] lock_intention lock intention for the tree operation -@param[in] rec record (current node_ptr) +@param page page +@param lock_intention lock intention for the tree operation +@param node_ptr_max_size the maximum size of a node pointer +@param compress_limit BTR_CUR_PAGE_COMPRESS_LIMIT(index) +@param rec record (current node_ptr) @return true if tree modification is needed */ -static -bool -btr_cur_need_opposite_intention( - const page_t* page, - btr_intention_t lock_intention, - const rec_t* rec) +static bool btr_cur_need_opposite_intention(const page_t *page, + btr_intention_t lock_intention, + ulint node_ptr_max_size, + ulint compress_limit, + const rec_t *rec) { - switch (lock_intention) { - case BTR_INTENTION_DELETE: - return (page_has_prev(page) && page_rec_is_first(rec, page)) || - (page_has_next(page) && page_rec_is_last(rec, page)); - case BTR_INTENTION_INSERT: - return page_has_next(page) && page_rec_is_last(rec, page); - case BTR_INTENTION_BOTH: - return(false); - } - - MY_ASSERT_UNREACHABLE(); - return(false); + if (lock_intention != BTR_INTENTION_INSERT) + { + /* We compensate also for btr_cur_compress_recommendation() */ + if (!page_has_siblings(page) || + page_rec_is_first(rec, page) || page_rec_is_last(rec, page) || + page_get_data_size(page) < node_ptr_max_size + compress_limit) + return true; + if (lock_intention == BTR_INTENTION_DELETE) + return false; + } + else if (page_has_next(page) && page_rec_is_last(rec, page)) + return true; + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), return true); + const ulint max_size= page_get_max_insert_size_after_reorganize(page, 2); + return max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + node_ptr_max_size || + max_size < node_ptr_max_size * 2; } /** @@ -997,7 +1002,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, const ulint savepoint= mtr->get_savepoint(); - ulint node_ptr_max_size= 0; + ulint node_ptr_max_size= 0, compress_limit= 0; rw_lock_type_t rw_latch= RW_S_LATCH; switch (latch_mode) { @@ -1009,13 +1014,19 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK)); break; } - if (lock_intention == BTR_INTENTION_DELETE && buf_pool.n_pend_reads && - trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) - /* Most delete-intended operations are due to the purge of history. - Prioritize them when the history list is growing huge. */ - mtr_x_lock_index(index(), mtr); - else - mtr_sx_lock_index(index(), mtr); + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index()); + if (buf_pool.n_pend_reads && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + /* Most delete-intended operations are due to the purge of history. + Prioritize them when the history list is growing huge. */ + mtr_x_lock_index(index(), mtr); + break; + } + } + mtr_sx_lock_index(index(), mtr); break; #ifdef UNIV_DEBUG case BTR_CONT_MODIFY_TREE: @@ -1221,6 +1232,10 @@ release_tree: !btr_block_get(*index(), btr_page_get_next(block->page.frame), RW_X_LATCH, mtr, &err)) goto func_exit; + if (btr_cur_need_opposite_intention(block->page.frame, lock_intention, + node_ptr_max_size, compress_limit, + page_cur.rec)) + goto need_opposite_intention; } reached_latched_leaf: @@ -1274,6 +1289,7 @@ release_tree: break; case BTR_MODIFY_TREE: if (btr_cur_need_opposite_intention(block->page.frame, lock_intention, + node_ptr_max_size, compress_limit, page_cur.rec)) /* If the rec is the first or last in the page for pessimistic delete intention, it might cause node_ptr insert for the upper @@ -1417,6 +1433,17 @@ release_tree: goto search_loop; } +ATTRIBUTE_COLD void mtr_t::index_lock_upgrade() +{ + auto &slot= m_memo[get_savepoint() - 1]; + if (slot.type == MTR_MEMO_X_LOCK) + return; + ut_ad(slot.type == MTR_MEMO_SX_LOCK); + index_lock *lock= static_cast<index_lock*>(slot.object); + lock->u_x_upgrade(SRW_LOCK_CALL); + slot.type= MTR_MEMO_X_LOCK; +} + ATTRIBUTE_COLD dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, mtr_t *mtr) @@ -1434,8 +1461,7 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, ut_ad(block->page.id().page_no() == index()->page); block->page.fix(); mtr->rollback_to_savepoint(1); - ut_ad(mtr->memo_contains_flagged(&index()->lock, - MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)); + mtr->index_lock_upgrade(); const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)}; @@ -1665,7 +1691,6 @@ search_loop: dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode, mtr_t *mtr) { - btr_intention_t lock_intention; ulint n_blocks= 0; mem_heap_t *heap= nullptr; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; @@ -1677,7 +1702,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED); - lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + btr_intention_t lock_intention= btr_cur_get_and_clear_intention(&latch_mode); /* Store the position of the tree latch we push to mtr so that we know how to release it when we have latched the leaf node */ @@ -1685,7 +1710,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, auto savepoint= mtr->get_savepoint(); rw_lock_type_t upper_rw_latch= RW_X_LATCH; - ulint node_ptr_max_size= 0; + ulint node_ptr_max_size= 0, compress_limit= 0; if (latch_mode == BTR_MODIFY_TREE) { @@ -1694,12 +1719,18 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, and read IO bandwidth should be prioritized for them, when the history list is growing huge. */ savepoint++; - if (lock_intention == BTR_INTENTION_DELETE - && buf_pool.n_pend_reads - && trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) - mtr_x_lock_index(index, mtr); - else - mtr_sx_lock_index(index, mtr); + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index); + + if (buf_pool.n_pend_reads && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + mtr_x_lock_index(index, mtr); + goto index_locked; + } + } + mtr_sx_lock_index(index, mtr); } else { @@ -1720,6 +1751,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, } } +index_locked: ut_ad(savepoint == mtr->get_savepoint()); const rw_lock_type_t root_leaf_rw_latch= @@ -1792,15 +1824,28 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, !btr_block_get(*index, btr_page_get_next(block->page.frame), RW_X_LATCH, mtr, &err)) break; + + if (!index->lock.have_x() && + btr_cur_need_opposite_intention(block->page.frame, + lock_intention, + node_ptr_max_size, + compress_limit, page_cur.rec)) + goto need_opposite_intention; } else { if (rw_latch == RW_NO_LATCH) mtr->upgrade_buffer_fix(leaf_savepoint - 1, - rw_lock_type_t(latch_mode)); - /* Release index->lock if needed, and the non-leaf pages. */ - mtr->rollback_to_savepoint(savepoint - !latch_by_caller, - leaf_savepoint - 1); + rw_lock_type_t(latch_mode & + (RW_X_LATCH | RW_S_LATCH))); + if (latch_mode != BTR_CONT_MODIFY_TREE) + { + ut_ad(latch_mode == BTR_MODIFY_LEAF || + latch_mode == BTR_SEARCH_LEAF); + /* Release index->lock if needed, and the non-leaf pages. */ + mtr->rollback_to_savepoint(savepoint - !latch_by_caller, + leaf_savepoint - 1); + } } break; } @@ -1822,22 +1867,25 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, : !page_cur_move_to_prev(&page_cur)) goto corrupted; - const rec_t *node_ptr= page_cur.rec; - offsets= rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, + offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED, &heap); ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH); if (latch_mode != BTR_MODIFY_TREE); - else if (btr_cur_need_opposite_intention(block->page.frame, - lock_intention, node_ptr)) + else if (btr_cur_need_opposite_intention(block->page.frame, lock_intention, + node_ptr_max_size, compress_limit, + page_cur.rec)) { + need_opposite_intention: /* If the rec is the first or last in the page for pessimistic delete intention, it might cause node_ptr insert for the upper level. We should change the intention and retry. */ mtr->rollback_to_savepoint(savepoint); - lock_intention= BTR_INTENTION_BOTH; + mtr->index_lock_upgrade(); + /* X-latch all pages from now on */ + latch_mode= BTR_CONT_MODIFY_TREE; page= index->page; height= ULINT_UNDEFINED; n_blocks= 0; @@ -1846,7 +1894,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, else { if (!btr_cur_will_modify_tree(index, block->page.frame, - lock_intention, node_ptr, + lock_intention, page_cur.rec, node_ptr_max_size, zip_size, mtr)) { ut_ad(n_blocks); @@ -1876,7 +1924,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, } /* Go to the child node */ - page= btr_node_ptr_get_child_page_no(node_ptr, offsets); + page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets); n_blocks++; } @@ -2178,8 +2226,7 @@ convert_big_rec: return(DB_TOO_BIG_RECORD); } - LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), - goto fail); + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail); if (block->page.zip.data && leaf && (page_get_data_size(page) + rec_size @@ -2193,7 +2240,7 @@ fail: /* prefetch siblings of the leaf for the pessimistic operation, if the page is leaf. */ - if (page_is_leaf(page)) { + if (leaf) { btr_cur_prefetch_siblings(block, index); } fail_err: @@ -2262,7 +2309,7 @@ fail_err: #ifdef UNIV_DEBUG if (!(flags & BTR_CREATE_FLAG) - && index->is_primary() && page_is_leaf(page)) { + && leaf && index->is_primary()) { const dfield_t* trx_id = dtuple_get_nth_field( entry, dict_col_get_clust_pos( dict_table_get_sys_col(index->table, diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 0a4da87083a..4de8b4fd175 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -408,7 +408,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage, if (id.space() == SRV_TMP_SPACE_ID && innodb_encrypt_temporary_tables) { slot = buf_pool.io_buf_reserve(); - ut_a(slot); slot->allocate(); bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame); slot->release(); @@ -431,7 +430,6 @@ decompress: } slot = buf_pool.io_buf_reserve(); - ut_a(slot); slot->allocate(); decompress_with_slot: @@ -455,7 +453,6 @@ decrypt_failed: } slot = buf_pool.io_buf_reserve(); - ut_a(slot); slot->allocate(); /* decrypt using crypt_buf to dst_frame */ @@ -1287,6 +1284,41 @@ inline bool buf_pool_t::realloc(buf_block_t *block) return(true); /* free_list was enough */ } +void buf_pool_t::io_buf_t::create(ulint n_slots) +{ + this->n_slots= n_slots; + slots= static_cast<buf_tmp_buffer_t*> + (ut_malloc_nokey(n_slots * sizeof *slots)); + memset((void*) slots, 0, n_slots * sizeof *slots); +} + +void buf_pool_t::io_buf_t::close() +{ + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + { + aligned_free(s->crypt_buf); + aligned_free(s->comp_buf); + } + ut_free(slots); + slots= nullptr; + n_slots= 0; +} + +buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve() +{ + for (;;) + { + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_writes(); + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_reads(); + } +} + /** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status to the specified string. The format and the following parameters are the same as the ones used for printf(3). @@ -1353,21 +1385,25 @@ inline bool buf_pool_t::withdraw_blocks() block = next_block; } - mysql_mutex_unlock(&mutex); /* reserve free_list length */ if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { buf_flush_LRU( std::max<ulint>(withdraw_target - UT_LIST_GET_LEN(withdraw), - srv_LRU_scan_depth)); - buf_flush_wait_batch_end_acquiring_mutex(true); + srv_LRU_scan_depth), + true); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&buf_pool.mutex); } /* relocate blocks/buddies in withdrawn area */ ulint count2 = 0; - mysql_mutex_lock(&mutex); buf_pool_mutex_exit_forbid(); for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage; bpage; bpage = next_bpage) { @@ -2095,13 +2131,15 @@ lookup: return bpage; must_read_page: - if (dberr_t err= buf_read_page(page_id, zip_size, chain)) - { + switch (dberr_t err= buf_read_page(page_id, zip_size, chain)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + goto lookup; + default: ib::error() << "Reading compressed page " << page_id << " failed with error: " << err; return nullptr; } - goto lookup; } /********************************************************************//** @@ -2245,15 +2283,14 @@ buf_page_get_low( #ifdef UNIV_DEBUG switch (mode) { - case BUF_PEEK_IF_IN_POOL: + default: + ut_ad(mode == BUF_PEEK_IF_IN_POOL); + break; + case BUF_GET_POSSIBLY_FREED: case BUF_GET_IF_IN_POOL: /* The caller may pass a dummy page size, because it does not really matter. */ break; - default: - MY_ASSERT_UNREACHABLE(); - case BUF_GET_POSSIBLY_FREED: - break; case BUF_GET: ut_ad(!mtr->is_freeing_tree()); fil_space_t* s = fil_space_get(page_id.space()); @@ -2319,20 +2356,23 @@ loop: corrupted, or if an encrypted page with a valid checksum cannot be decypted. */ - if (dberr_t local_err = buf_read_page(page_id, zip_size, chain)) { - if (local_err != DB_CORRUPTION - && mode != BUF_GET_POSSIBLY_FREED + switch (dberr_t local_err = buf_read_page(page_id, zip_size, chain)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + buf_read_ahead_random(page_id, zip_size); + break; + default: + if (mode != BUF_GET_POSSIBLY_FREED && retries++ < BUF_PAGE_READ_MAX_RETRIES) { DBUG_EXECUTE_IF("intermittent_read_failure", retries = BUF_PAGE_READ_MAX_RETRIES;); - } else { - if (err) { - *err = local_err; - } - return nullptr; } - } else { - buf_read_ahead_random(page_id, zip_size); + /* fall through */ + case DB_PAGE_CORRUPTED: + if (err) { + *err = local_err; + } + return nullptr; } ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); @@ -2383,11 +2423,12 @@ ignore_block: return nullptr; } } else if (mode != BUF_PEEK_IF_IN_POOL) { - } else if (!mtr) { + } else if (!mtr) { ut_ad(!block->page.oldest_modification()); mysql_mutex_lock(&buf_pool.mutex); block->unfix(); +free_unfixed_block: if (!buf_LRU_free_page(&block->page, true)) { ut_ad(0); } @@ -2495,20 +2536,19 @@ wait_for_unfix: /* Decompress the page while not holding buf_pool.mutex. */ - auto ok = buf_zip_decompress(block, false); - block->page.read_unfix(state); - state = block->page.state(); - block->page.lock.x_unlock(); + const auto ok = buf_zip_decompress(block, false); --buf_pool.n_pend_unzip; - if (!ok) { - /* FIXME: Evict the corrupted - ROW_FORMAT=COMPRESSED page! */ - if (err) { *err = DB_PAGE_CORRUPTED; } - return nullptr; + mysql_mutex_lock(&buf_pool.mutex); + } + state = block->page.read_unfix(state); + block->page.lock.x_unlock(); + + if (!ok) { + goto free_unfixed_block; } } @@ -2606,35 +2646,37 @@ buf_page_get_gen( mtr_t* mtr, dberr_t* err) { - if (buf_block_t *block= recv_sys.recover(page_id)) + buf_block_t *block= recv_sys.recover(page_id); + if (UNIV_LIKELY(!block)) + return buf_page_get_low(page_id, zip_size, rw_latch, + guess, mode, mtr, err); + else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1))) { - if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1))) - { - corrupted: - if (err) - *err= DB_CORRUPTION; - return nullptr; - } - /* Recovery is a special case; we fix() before acquiring lock. */ - auto s= block->page.fix(); - ut_ad(s >= buf_page_t::FREED); - /* The block may be write-fixed at this point because we are not - holding a lock, but it must not be read-fixed. */ - ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); - if (s < buf_page_t::UNFIXED) - { - ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); - block->page.unfix(); - goto corrupted; - } + corrupted: if (err) - *err= DB_SUCCESS; - mtr->page_lock(block, rw_latch); - return block; + *err= DB_CORRUPTION; + return nullptr; + } + if (err) + *err= DB_SUCCESS; + /* Recovery is a special case; we fix() before acquiring lock. */ + auto s= block->page.fix(); + ut_ad(s >= buf_page_t::FREED); + /* The block may be write-fixed at this point because we are not + holding a lock, but it must not be read-fixed. */ + ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); + if (s < buf_page_t::UNFIXED) + { + ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); + mysql_mutex_lock(&buf_pool.mutex); + block->page.unfix(); + buf_LRU_free_page(&block->page, true); + mysql_mutex_unlock(&buf_pool.mutex); + goto corrupted; } - return buf_page_get_low(page_id, zip_size, rw_latch, - guess, mode, mtr, err); + mtr->page_lock(block, rw_latch); + return block; } /********************************************************************//** @@ -2922,12 +2964,12 @@ retry: buf_unzip_LRU_add_block(reinterpret_cast<buf_block_t*>(bpage), FALSE); } + buf_pool.stat.n_pages_created++; mysql_mutex_unlock(&buf_pool.mutex); mtr->memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX); bpage->set_accessed(); - buf_pool.stat.n_pages_created++; static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent"); memset_aligned<8>(bpage->frame + FIL_PAGE_PREV, 0xff, 8); @@ -3145,7 +3187,6 @@ dberr_t buf_page_t::read_complete(const fil_node_t &node) ut_d(auto n=) buf_pool.n_pend_reads--; ut_ad(n > 0); - buf_pool.stat.n_pages_read++; const byte *read_frame= zip.data ? zip.data : frame; ut_ad(read_frame); @@ -3286,9 +3327,6 @@ void buf_pool_invalidate() { mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(true); - buf_flush_wait_batch_end(false); - /* It is possible that a write batch that has been posted earlier is still not complete. For buffer pool invalidation to proceed we must ensure there is NO write activity happening. */ @@ -3439,8 +3477,8 @@ void buf_pool_t::print() << UT_LIST_GET_LEN(flush_list) << ", n pending decompressions=" << n_pend_unzip << ", n pending reads=" << n_pend_reads - << ", n pending flush LRU=" << n_flush_LRU_ - << " list=" << n_flush_list_ + << ", n pending flush LRU=" << n_flush() + << " list=" << buf_dblwr.pending_writes() << ", pages made young=" << stat.n_pages_made_young << ", not young=" << stat.n_pages_not_made_young << ", pages read=" << stat.n_pages_read @@ -3552,13 +3590,13 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info) pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list); pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); pool_info->n_pend_reads = buf_pool.n_pend_reads; - pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_; + pool_info->n_pending_flush_lru = buf_pool.n_flush(); - pool_info->n_pending_flush_list = buf_pool.n_flush_list_; + pool_info->n_pending_flush_list = buf_dblwr.pending_writes(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index cbf7885a271..2ea72eb4c5f 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -46,7 +46,17 @@ inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr) 0, RW_X_LATCH, mtr); } -/** Initialize the doublewrite buffer data structure. +void buf_dblwr_t::init() +{ + if (!active_slot) + { + active_slot= &slots[0]; + mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr); + pthread_cond_init(&cond, nullptr); + } +} + +/** Initialise the persistent storage of the doublewrite buffer. @param header doublewrite page header in the TRX_SYS page */ inline void buf_dblwr_t::init(const byte *header) { @@ -54,8 +64,6 @@ inline void buf_dblwr_t::init(const byte *header) ut_ad(!active_slot->reserved); ut_ad(!batch_running); - mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr); - pthread_cond_init(&cond, nullptr); block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1)); block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2)); @@ -74,7 +82,7 @@ inline void buf_dblwr_t::init(const byte *header) @return whether the operation succeeded */ bool buf_dblwr_t::create() { - if (is_initialised()) + if (is_created()) return true; mtr_t mtr; @@ -341,7 +349,7 @@ func_exit: void buf_dblwr_t::recover() { ut_ad(log_sys.last_checkpoint_lsn); - if (!is_initialised()) + if (!is_created()) return; uint32_t page_no_dblwr= 0; @@ -450,10 +458,9 @@ next_page: /** Free the doublewrite buffer. */ void buf_dblwr_t::close() { - if (!is_initialised()) + if (!active_slot) return; - /* Free the double write data structures. */ ut_ad(!active_slot->reserved); ut_ad(!active_slot->first_free); ut_ad(!batch_running); @@ -467,35 +474,41 @@ void buf_dblwr_t::close() mysql_mutex_destroy(&mutex); memset((void*) this, 0, sizeof *this); - active_slot= &slots[0]; } /** Update the doublewrite buffer on write completion. */ -void buf_dblwr_t::write_completed() +void buf_dblwr_t::write_completed(bool with_doublewrite) { ut_ad(this == &buf_dblwr); - ut_ad(srv_use_doublewrite_buf); - ut_ad(is_initialised()); ut_ad(!srv_read_only_mode); mysql_mutex_lock(&mutex); - ut_ad(batch_running); - slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; - ut_ad(flush_slot->reserved); - ut_ad(flush_slot->reserved <= flush_slot->first_free); + ut_ad(writes_pending); + if (!--writes_pending) + pthread_cond_broadcast(&write_cond); - if (!--flush_slot->reserved) + if (with_doublewrite) { - mysql_mutex_unlock(&mutex); - /* This will finish the batch. Sync data files to the disk. */ - fil_flush_file_spaces(); - mysql_mutex_lock(&mutex); + ut_ad(is_created()); + ut_ad(srv_use_doublewrite_buf); + ut_ad(batch_running); + slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_ad(flush_slot->reserved); + ut_ad(flush_slot->reserved <= flush_slot->first_free); + + if (!--flush_slot->reserved) + { + mysql_mutex_unlock(&mutex); + /* This will finish the batch. Sync data files to the disk. */ + fil_flush_file_spaces(); + mysql_mutex_lock(&mutex); - /* We can now reuse the doublewrite memory buffer: */ - flush_slot->first_free= 0; - batch_running= false; - pthread_cond_broadcast(&cond); + /* We can now reuse the doublewrite memory buffer: */ + flush_slot->first_free= 0; + batch_running= false; + pthread_cond_broadcast(&cond); + } } mysql_mutex_unlock(&mutex); @@ -640,7 +653,7 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) { ut_ad(this == &buf_dblwr); ut_ad(srv_use_doublewrite_buf); - ut_ad(is_initialised()); + ut_ad(is_created()); ut_ad(!srv_read_only_mode); ut_ad(!request.bpage); ut_ad(request.node == fil_system.sys_space->chain.start); @@ -706,7 +719,7 @@ posted, and also when we may have to wait for a page latch! Otherwise a deadlock of threads can occur. */ void buf_dblwr_t::flush_buffered_writes() { - if (!is_initialised() || !srv_use_doublewrite_buf) + if (!is_created() || !srv_use_doublewrite_buf) { fil_flush_file_spaces(); return; @@ -739,6 +752,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) const ulint buf_size= 2 * block_size(); mysql_mutex_lock(&mutex); + writes_pending++; for (;;) { diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 99b44c0d06e..569096377c0 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -44,15 +44,12 @@ Created 11/11/1995 Heikki Tuuri #include "snappy-c.h" /** Number of pages flushed via LRU. Protected by buf_pool.mutex. -Also included in buf_flush_page_count. */ +Also included in buf_pool.stat.n_pages_written. */ ulint buf_lru_flush_page_count; /** Number of pages freed without flushing. Protected by buf_pool.mutex. */ ulint buf_lru_freed_page_count; -/** Number of pages flushed. Protected by buf_pool.mutex. */ -ulint buf_flush_page_count; - /** Flag indicating if the page_cleaner is in active state. */ Atomic_relaxed<bool> buf_page_cleaner_is_active; @@ -112,8 +109,7 @@ static void buf_flush_validate_skip() } #endif /* UNIV_DEBUG */ -/** Wake up the page cleaner if needed */ -void buf_pool_t::page_cleaner_wakeup() +void buf_pool_t::page_cleaner_wakeup(bool for_LRU) { ut_d(buf_flush_validate_skip()); if (!page_cleaner_idle()) @@ -147,32 +143,26 @@ void buf_pool_t::page_cleaner_wakeup() - by allowing last_activity_count to updated when page-cleaner is made active and has work to do. This ensures that the last_activity signal is consumed by the page-cleaner before the next one is generated. */ - if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) || - (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) || + if (for_LRU || + (pct_lwm != 0.0 && (pct_lwm <= dirty_pct || + last_activity_count == srv_get_activity_count())) || srv_max_buf_pool_modified_pct <= dirty_pct) { - page_cleaner_is_idle= false; + page_cleaner_status-= PAGE_CLEANER_IDLE; pthread_cond_signal(&do_flush_list); } } -inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage) noexcept +/** Remove a block from flush_list. +@param bpage buffer pool page */ +void buf_pool_t::delete_from_flush_list(buf_page_t *bpage) noexcept { ut_ad(!fsp_is_system_temporary(bpage->id().space())); mysql_mutex_assert_owner(&flush_list_mutex); flush_hp.adjust(bpage); UT_LIST_REMOVE(flush_list, bpage); -} - -/** Remove a block from flush_list. -@param bpage buffer pool page -@param clear whether to invoke buf_page_t::clear_oldest_modification() */ -void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear) noexcept -{ - delete_from_flush_list_low(bpage); - stat.flush_list_bytes-= bpage->physical_size(); - if (clear) - bpage->clear_oldest_modification(); + flush_list_bytes-= bpage->physical_size(); + bpage->clear_oldest_modification(); #ifdef UNIV_DEBUG buf_flush_validate_skip(); #endif /* UNIV_DEBUG */ @@ -187,10 +177,10 @@ void buf_flush_remove_pages(uint32_t id) { const page_id_t first(id, 0), end(id + 1, 0); ut_ad(id); - mysql_mutex_lock(&buf_pool.mutex); for (;;) { + mysql_mutex_lock(&buf_pool.mutex); bool deferred= false; mysql_mutex_lock(&buf_pool.flush_list_mutex); @@ -213,18 +203,14 @@ void buf_flush_remove_pages(uint32_t id) bpage= prev; } + mysql_mutex_unlock(&buf_pool.mutex); mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (!deferred) break; - mysql_mutex_unlock(&buf_pool.mutex); - std::this_thread::yield(); - mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(false); + buf_dblwr.wait_for_page_writes(); } - - mysql_mutex_unlock(&buf_pool.mutex); } /*******************************************************************//** @@ -269,7 +255,7 @@ buf_flush_relocate_on_flush_list( bpage->clear_oldest_modification(); if (lsn == 1) { - buf_pool.stat.flush_list_bytes -= dpage->physical_size(); + buf_pool.flush_list_bytes -= dpage->physical_size(); dpage->list.prev = nullptr; dpage->list.next = nullptr; dpage->clear_oldest_modification(); @@ -309,6 +295,21 @@ inline void buf_page_t::write_complete(bool temporary) lock.u_unlock(true); } +inline void buf_pool_t::n_flush_inc() +{ + mysql_mutex_assert_owner(&flush_list_mutex); + page_cleaner_status+= LRU_FLUSH; +} + +inline void buf_pool_t::n_flush_dec() +{ + mysql_mutex_lock(&flush_list_mutex); + ut_ad(page_cleaner_status >= LRU_FLUSH); + if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH) + pthread_cond_broadcast(&done_flush_LRU); + mysql_mutex_unlock(&flush_list_mutex); +} + /** Complete write of a file page from buf_pool. @param request write request */ void buf_page_write_complete(const IORequest &request) @@ -324,13 +325,6 @@ void buf_page_write_complete(const IORequest &request) ut_ad(!buf_dblwr.is_inside(bpage->id())); ut_ad(request.node->space->id == bpage->id().space()); - if (state < buf_page_t::WRITE_FIX_REINIT && - request.node->space->use_doublewrite()) - { - ut_ad(request.node->space != fil_system.temp_space); - buf_dblwr.write_completed(); - } - if (request.slot) request.slot->release(); @@ -338,33 +332,31 @@ void buf_page_write_complete(const IORequest &request) buf_page_monitor(*bpage, false); DBUG_PRINT("ib_buf", ("write page %u:%u", bpage->id().space(), bpage->id().page_no())); - const bool temp= fsp_is_system_temporary(bpage->id().space()); - mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); - buf_pool.stat.n_pages_written++; - bpage->write_complete(temp); if (request.is_LRU()) { + const bool temp= bpage->oldest_modification() == 2; + if (!temp) + buf_dblwr.write_completed(state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()); + /* We must hold buf_pool.mutex while releasing the block, so that + no other thread can access it before we have freed it. */ + mysql_mutex_lock(&buf_pool.mutex); + bpage->write_complete(temp); buf_LRU_free_page(bpage, true); + mysql_mutex_unlock(&buf_pool.mutex); - ut_ad(buf_pool.n_flush_LRU_); - if (!--buf_pool.n_flush_LRU_) - { - pthread_cond_broadcast(&buf_pool.done_flush_LRU); - pthread_cond_signal(&buf_pool.done_free); - } + buf_pool.n_flush_dec(); } else { - ut_ad(!temp); - ut_ad(buf_pool.n_flush_list_); - if (!--buf_pool.n_flush_list_) - pthread_cond_broadcast(&buf_pool.done_flush_list); + buf_dblwr.write_completed(state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()); + bpage->write_complete(false); } - - mysql_mutex_unlock(&buf_pool.mutex); } /** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page. @@ -707,43 +699,41 @@ not_compressed: } /** Free a page whose underlying file page has been freed. */ -inline void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept +ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept { mysql_mutex_assert_owner(&mutex); - mysql_mutex_lock(&flush_list_mutex); ut_d(const lsn_t oldest_modification= bpage->oldest_modification();) if (fsp_is_system_temporary(bpage->id().space())) { ut_ad(bpage->frame); ut_ad(oldest_modification == 2); + bpage->clear_oldest_modification(); } else { + mysql_mutex_lock(&flush_list_mutex); ut_ad(oldest_modification > 2); - delete_from_flush_list(bpage, false); + delete_from_flush_list(bpage); + mysql_mutex_unlock(&flush_list_mutex); } - bpage->clear_oldest_modification(); - mysql_mutex_unlock(&flush_list_mutex); - bpage->lock.u_unlock(true); + bpage->lock.u_unlock(true); buf_LRU_free_page(bpage, true); } -/** Write a flushable page to a file. buf_pool.mutex must be held. -@param lru true=buf_pool.LRU; false=buf_pool.flush_list +/** Write a flushable page to a file or free a freeable block. +@param evict whether to evict the page on write completion @param space tablespace -@return whether the page was flushed and buf_pool.mutex was released */ -inline bool buf_page_t::flush(bool lru, fil_space_t *space) +@return whether a page write was initiated and buf_pool.mutex released */ +bool buf_page_t::flush(bool evict, fil_space_t *space) { + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); ut_ad(in_file()); ut_ad(in_LRU_list); ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == (space == fil_system.temp_space)); + ut_ad(evict || space != fil_system.temp_space); ut_ad(space->referenced()); - ut_ad(lru || space != fil_system.temp_space); - - if (!lock.u_lock_try(true)) - return false; const auto s= state(); ut_a(s >= FREED); @@ -751,44 +741,36 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) if (s < UNFIXED) { buf_pool.release_freed_page(this); - mysql_mutex_unlock(&buf_pool.mutex); - return true; - } - - if (s >= READ_FIX || oldest_modification() < 2) - { - lock.u_unlock(true); return false; } - mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); - - /* Apart from the U-lock, this block will also be protected by - is_write_fixed() and oldest_modification()>1. - Thus, it cannot be relocated or removed. */ - - DBUG_PRINT("ib_buf", ("%s %u page %u:%u", - lru ? "LRU" : "flush_list", - id().space(), id().page_no())); ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); ut_ad(f >= UNFIXED); ut_ad(f < READ_FIX); - ut_ad(space == fil_system.temp_space + ut_ad((space == fil_system.temp_space) ? oldest_modification() == 2 : oldest_modification() > 2); - if (lru) - { - ut_ad(buf_pool.n_flush_LRU_ < ULINT_UNDEFINED); - buf_pool.n_flush_LRU_++; - } - else + + /* Increment the I/O operation count used for selecting LRU policy. */ + buf_LRU_stat_inc_io(); + mysql_mutex_unlock(&buf_pool.mutex); + + IORequest::Type type= IORequest::WRITE_ASYNC; + if (UNIV_UNLIKELY(evict)) { - ut_ad(buf_pool.n_flush_list_ < ULINT_UNDEFINED); - buf_pool.n_flush_list_++; + type= IORequest::WRITE_LRU; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); } - buf_flush_page_count++; - mysql_mutex_unlock(&buf_pool.mutex); + /* Apart from the U-lock, this block will also be protected by + is_write_fixed() and oldest_modification()>1. + Thus, it cannot be relocated or removed. */ + + DBUG_PRINT("ib_buf", ("%s %u page %u:%u", + evict ? "LRU" : "flush_list", + id().space(), id().page_no())); buf_block_t *block= reinterpret_cast<buf_block_t*>(this); page_t *write_frame= zip.data; @@ -798,7 +780,6 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 size_t orig_size; #endif - IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC; buf_tmp_buffer_t *slot= nullptr; if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ @@ -842,7 +823,10 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) { switch (space->chain.start->punch_hole) { case 1: - type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH; + static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH == + IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, ""); + type= + IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC)); break; case 2: size= orig_size; @@ -863,24 +847,24 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) ut_ad(lsn >= oldest_modification()); log_write_up_to(lsn, true); } + if (UNIV_LIKELY(space->purpose != FIL_TYPE_TEMPORARY)) + buf_dblwr.add_unbuffered(); space->io(IORequest{type, this, slot}, physical_offset(), size, write_frame, this); } else buf_dblwr.add_to_batch(IORequest{this, slot, space->chain.start, type}, size); - - /* Increment the I/O operation count used for selecting LRU policy. */ - buf_LRU_stat_inc_io(); return true; } /** Check whether a page can be flushed from the buf_pool. @param id page identifier @param fold id.fold() -@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param evict true=buf_pool.LRU; false=buf_pool.flush_list @return whether the page can be flushed */ -static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) +static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, + bool evict) { mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(fold == id.fold()); @@ -892,23 +876,23 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) if (!bpage) return false; - /* We avoid flushing 'non-old' blocks in an LRU flush, because the + /* We avoid flushing 'non-old' blocks in an eviction flush, because the flushed blocks are soon freed */ - if (lru && !bpage->is_old()) + if (evict && !bpage->is_old()) return false; - return bpage->oldest_modification() > 1 && bpage->ready_for_flush(); + return bpage->oldest_modification() > 1 && !bpage->is_io_fixed(); } /** Check which neighbors of a page can be flushed from the buf_pool. @param space tablespace @param id page identifier of a dirty page @param contiguous whether to consider contiguous areas of pages -@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param evict true=buf_pool.LRU; false=buf_pool.flush_list @return last page number that can be flushed */ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, page_id_t &id, bool contiguous, - bool lru) + bool evict) { ut_ad(id.page_no() < space.size + (space.physical_size() == 2048 ? 1 @@ -941,7 +925,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, for (page_id_t i= id - 1;; --i) { fold--; - if (!buf_flush_check_neighbor(i, fold, lru)) + if (!buf_flush_check_neighbor(i, fold, evict)) { low= i + 1; break; @@ -957,7 +941,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, while (++i < high) { ++fold; - if (!buf_flush_check_neighbor(i, fold, lru)) + if (!buf_flush_check_neighbor(i, fold, evict)) break; } @@ -1024,28 +1008,37 @@ uint32_t fil_space_t::flush_freed(bool writable) and also write zeroes or punch the hole for the freed ranges of pages. @param space tablespace @param page_id page identifier +@param bpage buffer page @param contiguous whether to consider contiguous areas of pages -@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param evict true=buf_pool.LRU; false=buf_pool.flush_list @param n_flushed number of pages flushed so far in this batch @param n_to_flush maximum number of pages we are allowed to flush @return number of pages flushed */ static ulint buf_flush_try_neighbors(fil_space_t *space, const page_id_t page_id, - bool contiguous, bool lru, + buf_page_t *bpage, + bool contiguous, bool evict, ulint n_flushed, ulint n_to_flush) { ut_ad(space->id == page_id.space()); + ut_ad(bpage->id() == page_id); ulint count= 0; page_id_t id= page_id; - page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru); + page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict); ut_ad(page_id >= id); ut_ad(page_id < high); - for (ulint id_fold= id.fold(); id < high && !space->is_stopping(); - ++id, ++id_fold) + for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold) { + if (UNIV_UNLIKELY(space->is_stopping())) + { + if (bpage) + bpage->lock.u_unlock(true); + break; + } + if (count + n_flushed >= n_to_flush) { if (id > page_id) @@ -1059,25 +1052,38 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, const buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id_fold); mysql_mutex_lock(&buf_pool.mutex); - if (buf_page_t *bpage= buf_pool.page_hash.get(id, chain)) + if (buf_page_t *b= buf_pool.page_hash.get(id, chain)) { - ut_ad(bpage->in_file()); - /* We avoid flushing 'non-old' blocks in an LRU flush, - because the flushed blocks are soon freed */ - if (!lru || id == page_id || bpage->is_old()) + ut_ad(b->in_file()); + if (id == page_id) { - if (bpage->oldest_modification() > 1 && bpage->ready_for_flush() && - bpage->flush(lru, space)) + ut_ad(bpage == b); + bpage= nullptr; + ut_ad(b->oldest_modification() > 1); + flush: + if (b->flush(evict, space)) { ++count; continue; } } + /* We avoid flushing 'non-old' blocks in an eviction flush, + because the flushed blocks are soon freed */ + else if ((!evict || b->is_old()) && + b->oldest_modification() > 1 && b->lock.u_lock_try(true)) + { + if (b->oldest_modification() < 2) + b->lock.u_unlock(true); + else + goto flush; + } } mysql_mutex_unlock(&buf_pool.mutex); } + ut_ad(!bpage); + if (auto n= count - 1) { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, @@ -1093,12 +1099,8 @@ This utility moves the uncompressed frames of pages to the free list. Note that this function does not actually flush any data to disk. It just detaches the uncompressed frames from the compressed pages at the tail of the unzip_LRU and puts those freed frames in the free list. -Note that it is a best effort attempt and it is not guaranteed that -after a call to this function there will be 'max' blocks in the free -list. -@param[in] max desired number of blocks in the free_list @return number of blocks moved to the free list. */ -static ulint buf_free_from_unzip_LRU_list_batch(ulint max) +static ulint buf_free_from_unzip_LRU_list_batch() { ulint scanned = 0; ulint count = 0; @@ -1108,7 +1110,6 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max) buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); while (block - && count < max && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth && UT_LIST_GET_LEN(buf_pool.unzip_LRU) > UT_LIST_GET_LEN(buf_pool.LRU) / 10) { @@ -1155,34 +1156,30 @@ struct flush_counters_t ulint evicted; }; -/** Try to discard a dirty page. +/** Discard a dirty page, and release buf_pool.flush_list_mutex. @param bpage dirty page whose tablespace is not accessible */ static void buf_flush_discard_page(buf_page_t *bpage) { - mysql_mutex_assert_owner(&buf_pool.mutex); - mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); ut_ad(bpage->in_file()); ut_ad(bpage->oldest_modification()); - if (!bpage->lock.u_lock_try(false)) - return; - - mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_pool.delete_from_flush_list(bpage); mysql_mutex_unlock(&buf_pool.flush_list_mutex); ut_d(const auto state= bpage->state()); ut_ad(state == buf_page_t::FREED || state == buf_page_t::UNFIXED || state == buf_page_t::REINIT); - bpage->lock.u_unlock(); - + bpage->lock.u_unlock(true); buf_LRU_free_page(bpage, true); } -/** Flush dirty blocks from the end of the LRU list. -@param max maximum number of blocks to make available in buf_pool.free -@param n counts of flushed and evicted pages */ -static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) +/** Flush dirty blocks from the end buf_pool.LRU, +and move clean blocks to buf_pool.free. +@param max maximum number of blocks to flush +@param evict whether dirty pages are to be evicted after flushing them +@param n counts of flushed and evicted pages */ +static void buf_flush_LRU_list_batch(ulint max, bool evict, + flush_counters_t *n) { ulint scanned= 0; ulint free_limit= srv_LRU_scan_depth; @@ -1201,29 +1198,48 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); bpage && ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && - UT_LIST_GET_LEN(buf_pool.free) < free_limit && - n->flushed + n->evicted < max) || - recv_recovery_is_on()); ++scanned) + UT_LIST_GET_LEN(buf_pool.free) < free_limit) || + recv_recovery_is_on()); + ++scanned, bpage= buf_pool.lru_hp.get()) { buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); - const lsn_t oldest_modification= bpage->oldest_modification(); buf_pool.lru_hp.set(prev); - const auto state= bpage->state(); + auto state= bpage->state(); ut_ad(state >= buf_page_t::FREED); ut_ad(bpage->in_LRU_list); - if (oldest_modification <= 1) - { + switch (bpage->oldest_modification()) { + case 0: + evict: if (state != buf_page_t::FREED && (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state))) - goto must_skip; - if (buf_LRU_free_page(bpage, true)) - ++n->evicted; + continue; + buf_LRU_free_page(bpage, true); + ++n->evicted; + /* fall through */ + case 1: + continue; } - else if (state < buf_page_t::READ_FIX) + + if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) { - /* Block is ready for flush. Dispatch an IO request. The IO - helper thread will put it on free list in IO completion routine. */ + ut_ad(!bpage->is_io_fixed()); + bool do_evict= evict; + switch (bpage->oldest_modification()) { + case 1: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.delete_from_flush_list(bpage); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + /* fall through */ + case 0: + bpage->lock.u_unlock(true); + goto evict; + case 2: + /* LRU flushing will always evict pages of the temporary tablespace. */ + do_evict= true; + } + /* Block is ready for flush. Dispatch an IO request. + If do_evict, the page may be evicted by buf_page_write_complete(). */ const page_id_t page_id(bpage->id()); const uint32_t space_id= page_id.space(); if (!space || space->id != space_id) @@ -1240,7 +1256,6 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) mysql_mutex_lock(&buf_pool.mutex); if (p.second) buf_pool.stat.n_pages_written+= p.second; - goto retry; } else ut_ad(!space); @@ -1252,27 +1267,33 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) } if (!space) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_discard_page(bpage); + } else if (neighbors && space->is_rotational()) { mysql_mutex_unlock(&buf_pool.mutex); - n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1, - true, n->flushed, max); + n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, + neighbors == 1, + do_evict, n->flushed, max); reacquire_mutex: mysql_mutex_lock(&buf_pool.mutex); } - else if (bpage->flush(true, space)) + else if (n->flushed >= max && !recv_recovery_is_on()) + { + bpage->lock.u_unlock(true); + break; + } + else if (bpage->flush(do_evict, space)) { ++n->flushed; goto reacquire_mutex; } } else - must_skip: /* Can't evict or dispatch this block. Go to previous. */ ut_ad(buf_pool.lru_hp.is_hp(prev)); - retry: - bpage= buf_pool.lru_hp.get(); } buf_pool.lru_hp.set(nullptr); @@ -1289,26 +1310,21 @@ reacquire_mutex: /** Flush and move pages from LRU or unzip_LRU list to the free list. Whether LRU or unzip_LRU is used depends on the state of the system. -@param max maximum number of blocks to make available in buf_pool.free -@return number of flushed pages */ -static ulint buf_do_LRU_batch(ulint max) +@param max maximum number of blocks to flush +@param evict whether dirty pages are to be evicted after flushing them +@param n counts of flushed and evicted pages */ +static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n) { - const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU() - ? buf_free_from_unzip_LRU_list_batch(max) - : 0; - flush_counters_t n; - n.flushed= 0; - n.evicted= n_unzip_LRU_evicted; - buf_flush_LRU_list_batch(max, &n); - mysql_mutex_assert_owner(&buf_pool.mutex); - - if (const ulint evicted= n.evicted - n_unzip_LRU_evicted) - buf_lru_freed_page_count+= evicted; + if (buf_LRU_evict_from_unzip_LRU()) + buf_free_from_unzip_LRU_list_batch(); + n->evicted= 0; + n->flushed= 0; + buf_flush_LRU_list_batch(max, evict, n); - if (n.flushed) - buf_lru_flush_page_count+= n.flushed; - - return n.flushed; + mysql_mutex_assert_owner(&buf_pool.mutex); + buf_lru_freed_page_count+= n->evicted; + buf_lru_flush_page_count+= n->flushed; + buf_pool.stat.n_pages_written+= n->flushed; } /** This utility flushes dirty blocks from the end of the flush_list. @@ -1322,6 +1338,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) ulint scanned= 0; mysql_mutex_assert_owner(&buf_pool.mutex); + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN ? 0 : srv_flush_neighbors; @@ -1332,7 +1349,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) /* Start from the end of the list looking for a suitable block to be flushed. */ - mysql_mutex_lock(&buf_pool.flush_list_mutex); ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); @@ -1343,32 +1359,42 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) break; ut_ad(bpage->in_file()); - buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); - - if (oldest_modification == 1) { - buf_pool.delete_from_flush_list(bpage); - skip: - bpage= prev; - continue; - } + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); - ut_ad(oldest_modification > 2); + if (oldest_modification == 1) + { + clear: + buf_pool.delete_from_flush_list(bpage); + skip: + bpage= prev; + continue; + } - if (!bpage->ready_for_flush()) - goto skip; + ut_ad(oldest_modification > 2); - /* In order not to degenerate this scan to O(n*n) we attempt to - preserve the pointer position. Any thread that would remove 'prev' - from buf_pool.flush_list must adjust the hazard pointer. + if (!bpage->lock.u_lock_try(true)) + goto skip; - Note: A concurrent execution of buf_flush_list_space() may - terminate this scan prematurely. The buf_pool.n_flush_list() - should prevent multiple threads from executing - buf_do_flush_list_batch() concurrently, - but buf_flush_list_space() is ignoring that. */ - buf_pool.flush_hp.set(prev); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ut_ad(!bpage->is_io_fixed()); + + if (bpage->oldest_modification() == 1) + { + bpage->lock.u_unlock(true); + goto clear; + } + + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: A concurrent execution of buf_flush_list_space() may + terminate this scan prematurely. The buf_pool.flush_list_active + should prevent multiple threads from executing + buf_do_flush_list_batch() concurrently, + but buf_flush_list_space() is ignoring that. */ + buf_pool.flush_hp.set(prev); + } const page_id_t page_id(bpage->id()); const uint32_t space_id= page_id.space(); @@ -1376,8 +1402,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) { if (last_space_id != space_id) { - mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_pool.flush_hp.set(bpage); mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_unlock(&buf_pool.mutex); if (space) @@ -1386,18 +1410,8 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) space= p.first; last_space_id= space_id; mysql_mutex_lock(&buf_pool.mutex); - if (p.second) - buf_pool.stat.n_pages_written+= p.second; + buf_pool.stat.n_pages_written+= p.second; mysql_mutex_lock(&buf_pool.flush_list_mutex); - bpage= buf_pool.flush_hp.get(); - if (!bpage) - break; - if (bpage->id() != page_id) - continue; - buf_pool.flush_hp.set(UT_LIST_GET_PREV(list, bpage)); - if (bpage->oldest_modification() <= 1 || !bpage->ready_for_flush()) - goto next; - mysql_mutex_unlock(&buf_pool.flush_list_mutex); } else ut_ad(!space); @@ -1410,27 +1424,29 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) if (!space) buf_flush_discard_page(bpage); - else if (neighbors && space->is_rotational()) - { - mysql_mutex_unlock(&buf_pool.mutex); - count+= buf_flush_try_neighbors(space, page_id, neighbors == 1, - false, count, max_n); - reacquire_mutex: - mysql_mutex_lock(&buf_pool.mutex); - } - else if (bpage->flush(false, space)) + else { - ++count; - goto reacquire_mutex; + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (neighbors && space->is_rotational()) + { + mysql_mutex_unlock(&buf_pool.mutex); + count+= buf_flush_try_neighbors(space, page_id, bpage, neighbors == 1, + false, count, max_n); + reacquire_mutex: + mysql_mutex_lock(&buf_pool.mutex); + } + else if (bpage->flush(false, space)) + { + ++count; + goto reacquire_mutex; + } } mysql_mutex_lock(&buf_pool.flush_list_mutex); - next: bpage= buf_pool.flush_hp.get(); } buf_pool.flush_hp.set(nullptr); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (space) space->release(); @@ -1440,76 +1456,86 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, scanned); - if (count) - MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - count); - mysql_mutex_assert_owner(&buf_pool.mutex); return count; } -/** Wait until a flush batch ends. -@param lru true=buf_pool.LRU; false=buf_pool.flush_list */ -void buf_flush_wait_batch_end(bool lru) +/** Wait until a LRU flush batch ends. */ +void buf_flush_wait_LRU_batch_end() { - const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_; + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); - if (n_flush) + if (buf_pool.n_flush()) { - auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list; tpool::tpool_wait_begin(); thd_wait_begin(nullptr, THD_WAIT_DISKIO); do - my_cond_wait(cond, &buf_pool.mutex.m_mutex); - while (n_flush); + my_cond_wait(&buf_pool.done_flush_LRU, + &buf_pool.flush_list_mutex.m_mutex); + while (buf_pool.n_flush()); tpool::tpool_wait_end(); thd_wait_end(nullptr); - pthread_cond_broadcast(cond); } } /** Write out dirty blocks from buf_pool.flush_list. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. @param max_n wished maximum mumber of blocks flushed @param lsn buf_pool.get_oldest_modification(LSN_MAX) target @return the number of processed pages @retval 0 if a buf_pool.flush_list batch is already running */ -static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX) +static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED, + lsn_t lsn= LSN_MAX) { ut_ad(lsn); + mysql_mutex_assert_owner(&buf_pool.mutex); - if (buf_pool.n_flush_list()) - return 0; - - mysql_mutex_lock(&buf_pool.mutex); - const bool running= buf_pool.n_flush_list_ != 0; - /* FIXME: we are performing a dirty read of buf_pool.flush_list.count - while not holding buf_pool.flush_list_mutex */ - if (running || !UT_LIST_GET_LEN(buf_pool.flush_list)) + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_pool.flush_list_active()) { - if (!running) - pthread_cond_broadcast(&buf_pool.done_flush_list); - mysql_mutex_unlock(&buf_pool.mutex); +nothing_to_do: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); return 0; } - - buf_pool.n_flush_list_++; - const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn); - const ulint n_flushing= --buf_pool.n_flush_list_; - - buf_pool.try_LRU_scan= true; - - mysql_mutex_unlock(&buf_pool.mutex); - - if (!n_flushing) + if (!buf_pool.get_oldest_modification(0)) + { pthread_cond_broadcast(&buf_pool.done_flush_list); + goto nothing_to_do; + } + buf_pool.flush_list_set_active(); + const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn); + if (n_flushed) + buf_pool.stat.n_pages_written+= n_flushed; + buf_pool.flush_list_set_inactive(); + pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); - buf_dblwr.flush_buffered_writes(); + if (n_flushed) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + n_flushed); DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed)); return n_flushed; } +/** Write out dirty blocks from buf_pool.flush_list. +@param max_n wished maximum mumber of blocks flushed +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@return the number of processed pages +@retval 0 if a buf_pool.flush_list batch is already running */ +static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, + lsn_t lsn= LSN_MAX) +{ + mysql_mutex_lock(&buf_pool.mutex); + ulint n= buf_flush_list_holding_mutex(max_n, lsn); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + return n; +} + /** Try to flush all the dirty pages that belong to a given tablespace. @param space tablespace @param n_flushed number of pages written @@ -1521,6 +1547,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) bool may_have_skipped= false; ulint max_n_flush= srv_io_capacity; + ulint n_flush= 0; bool acquired= space->acquire(); { @@ -1537,11 +1564,17 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) ut_ad(bpage->in_file()); buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); - if (bpage->id().space() != space_id); - else if (bpage->oldest_modification() == 1) + if (bpage->oldest_modification() == 1) + clear: buf_pool.delete_from_flush_list(bpage); - else if (!bpage->ready_for_flush()) + else if (bpage->id().space() != space_id); + else if (!bpage->lock.u_lock_try(true)) may_have_skipped= true; + else if (bpage->oldest_modification() == 1) + { + bpage->lock.u_unlock(true); + goto clear; + } else { /* In order not to degenerate this scan to O(n*n) we attempt to @@ -1553,13 +1586,10 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) concurrently. This may terminate our iteration prematurely, leading us to return may_have_skipped=true. */ buf_pool.flush_hp.set(prev); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (!acquired) - { was_freed: buf_flush_discard_page(bpage); - } else { if (space->is_stopping()) @@ -1568,28 +1598,24 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) acquired= false; goto was_freed; } - if (!bpage->flush(false, space)) - { - may_have_skipped= true; - mysql_mutex_lock(&buf_pool.flush_list_mutex); - goto next_after_skip; - } - if (n_flushed) - ++*n_flushed; - if (!--max_n_flush) + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (bpage->flush(false, space)) { + ++n_flush; + if (!--max_n_flush) + { + mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + may_have_skipped= true; + goto done; + } mysql_mutex_lock(&buf_pool.mutex); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - may_have_skipped= true; - break; } - mysql_mutex_lock(&buf_pool.mutex); } mysql_mutex_lock(&buf_pool.flush_list_mutex); if (!buf_pool.flush_hp.is_hp(prev)) may_have_skipped= true; - next_after_skip: bpage= buf_pool.flush_hp.get(); continue; } @@ -1602,14 +1628,19 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) buf_flush_list_space(). We should always return true from buf_flush_list_space() if that should be the case; in buf_do_flush_list_batch() we will simply perform less work. */ - +done: buf_pool.flush_hp.set(nullptr); mysql_mutex_unlock(&buf_pool.flush_list_mutex); - buf_pool.try_LRU_scan= true; + buf_pool.stat.n_pages_written+= n_flush; + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); mysql_mutex_unlock(&buf_pool.mutex); + if (n_flushed) + *n_flushed= n_flush; + if (acquired) space->release(); @@ -1621,43 +1652,32 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) return may_have_skipped; } -/** Write out dirty blocks from buf_pool.LRU. +/** Write out dirty blocks from buf_pool.LRU, +and move clean blocks to buf_pool.free. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. @param max_n wished maximum mumber of blocks flushed -@return the number of processed pages +@param evict whether to evict pages after flushing +@return evict ? number of processed pages : number of pages written @retval 0 if a buf_pool.LRU batch is already running */ -ulint buf_flush_LRU(ulint max_n) +ulint buf_flush_LRU(ulint max_n, bool evict) { - if (buf_pool.n_flush_LRU()) - return 0; - - log_buffer_flush_to_disk(); - - mysql_mutex_lock(&buf_pool.mutex); - if (buf_pool.n_flush_LRU_) - { - mysql_mutex_unlock(&buf_pool.mutex); - return 0; - } - buf_pool.n_flush_LRU_++; - - ulint n_flushed= buf_do_LRU_batch(max_n); - - const ulint n_flushing= --buf_pool.n_flush_LRU_; + mysql_mutex_assert_owner(&buf_pool.mutex); - buf_pool.try_LRU_scan= true; + flush_counters_t n; + buf_do_LRU_batch(max_n, evict, &n); - mysql_mutex_unlock(&buf_pool.mutex); + ulint pages= n.flushed; - if (!n_flushing) + if (n.evicted) { - pthread_cond_broadcast(&buf_pool.done_flush_LRU); - pthread_cond_signal(&buf_pool.done_free); + if (evict) + pages+= n.evicted; + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); } - buf_dblwr.flush_buffered_writes(); - - DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed)); - return n_flushed; + return pages; } #ifdef HAVE_PMEM @@ -1931,9 +1951,14 @@ static void buf_flush_wait(lsn_t lsn) buf_flush_sync_lsn= lsn; buf_pool.page_cleaner_set_idle(false); pthread_cond_signal(&buf_pool.do_flush_list); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + if (buf_pool.get_oldest_modification(lsn) >= lsn) + break; } - my_cond_wait(&buf_pool.done_flush_list, - &buf_pool.flush_list_mutex.m_mutex); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_dblwr.wait_for_page_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); } } @@ -1953,6 +1978,9 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn) { MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + tpool::tpool_wait_begin(); + #if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */ if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)) { @@ -1960,26 +1988,23 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) { mysql_mutex_unlock(&buf_pool.flush_list_mutex); ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn); - buf_flush_wait_batch_end_acquiring_mutex(false); if (n_pages) { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, MONITOR_FLUSH_SYNC_COUNT, MONITOR_FLUSH_SYNC_PAGES, n_pages); } + buf_dblwr.wait_for_page_writes(); mysql_mutex_lock(&buf_pool.flush_list_mutex); } while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn); } else #endif - { - thd_wait_begin(nullptr, THD_WAIT_DISKIO); - tpool::tpool_wait_begin(); buf_flush_wait(sync_lsn); - tpool::tpool_wait_end(); - thd_wait_end(nullptr); - } + + tpool::tpool_wait_end(); + thd_wait_end(nullptr); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -2024,28 +2049,16 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) } } -/** Wait for pending flushes to complete. */ -void buf_flush_wait_batch_end_acquiring_mutex(bool lru) -{ - if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list()) - { - mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(lru); - mysql_mutex_unlock(&buf_pool.mutex); - } -} - /** Conduct checkpoint-related flushing for innodb_flush_sync=ON, and try to initiate checkpoints until the target is met. @param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) { ut_ad(!srv_read_only_mode); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); for (;;) { - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn)) { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, @@ -2086,6 +2099,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) /* wake up buf_flush_wait() */ pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); lsn= std::max(lsn, target); @@ -2136,8 +2150,9 @@ af_get_pct_for_lsn( / 7.5)); } -/** This function is called approximately once every second by the -page_cleaner thread if innodb_adaptive_flushing=ON. +/** This function is called approximately once every second by +buf_flush_page_cleaner() if innodb_max_dirty_pages_pct_lwm>0 +and innodb_adaptive_flushing=ON. Based on various factors it decides if there is a need to do flushing. @return number of pages recommended to be flushed @param last_pages_in number of pages flushed in previous batch @@ -2175,52 +2190,43 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks); } +func_exit: + page_cleaner.flush_pass++; return n_pages; } sum_pages += last_pages_in; - double time_elapsed = difftime(curr_time, prev_time); + const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1); - /* We update our variables every srv_flushing_avg_loops + /* We update our variables every innodb_flushing_avg_loops iterations to smooth out transition in workload. */ if (++n_iterations >= srv_flushing_avg_loops - || time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) { + || time_elapsed >= srv_flushing_avg_loops) { - if (time_elapsed < 1) { - time_elapsed = 1; - } - - avg_page_rate = static_cast<ulint>( - ((static_cast<double>(sum_pages) - / time_elapsed) - + static_cast<double>(avg_page_rate)) / 2); + avg_page_rate = (sum_pages / time_elapsed + avg_page_rate) / 2; /* How much LSN we have generated since last call. */ - lsn_rate = static_cast<lsn_t>( - static_cast<double>(cur_lsn - prev_lsn) - / time_elapsed); + lsn_rate = (cur_lsn - prev_lsn) / time_elapsed; lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; - ulint flush_tm = page_cleaner.flush_time; - ulint flush_pass = page_cleaner.flush_pass; - - page_cleaner.flush_time = 0; - page_cleaner.flush_pass = 0; - - if (flush_pass) { - flush_tm /= flush_pass; + if (page_cleaner.flush_pass) { + page_cleaner.flush_time /= page_cleaner.flush_pass; } - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm); - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass); - prev_lsn = cur_lsn; prev_time = curr_time; - n_iterations = 0; + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, + page_cleaner.flush_time); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, + page_cleaner.flush_pass); + + page_cleaner.flush_time = 0; + page_cleaner.flush_pass = 0; + n_iterations = 0; sum_pages = 0; } @@ -2270,7 +2276,7 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty); MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); - return(n_pages); + goto func_exit; } #if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ @@ -2294,8 +2300,6 @@ static void buf_flush_page_cleaner() timespec abstime; set_timespec(abstime, 1); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - lsn_t lsn_limit; ulint last_activity_count= srv_get_activity_count(); @@ -2303,44 +2307,34 @@ static void buf_flush_page_cleaner() { lsn_limit= buf_flush_sync_lsn; - if (UNIV_UNLIKELY(lsn_limit != 0)) + if (UNIV_UNLIKELY(lsn_limit != 0) && UNIV_LIKELY(srv_flush_sync)) { -furious_flush: - if (UNIV_LIKELY(srv_flush_sync)) - { - buf_flush_sync_for_checkpoint(lsn_limit); - last_pages= 0; - set_timespec(abstime, 1); - continue; - } + furious_flush: + buf_flush_sync_for_checkpoint(lsn_limit); + last_pages= 0; + set_timespec(abstime, 1); + continue; } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_pool.ran_out()) + goto no_wait; else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) break; - /* If buf pager cleaner is idle and there is no work - (either dirty pages are all flushed or adaptive flushing - is not enabled) then opt for non-timed wait */ if (buf_pool.page_cleaner_idle() && (!UT_LIST_GET_LEN(buf_pool.flush_list) || srv_max_dirty_pages_pct_lwm == 0.0)) - my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex); + /* We are idle; wait for buf_pool.page_cleaner_wakeup() */ + my_cond_wait(&buf_pool.do_flush_list, + &buf_pool.flush_list_mutex.m_mutex); else my_cond_timedwait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex, &abstime); - + no_wait: set_timespec(abstime, 1); - lsn_t soft_lsn_limit= buf_flush_async_lsn; lsn_limit= buf_flush_sync_lsn; - - if (UNIV_UNLIKELY(lsn_limit != 0)) - { - if (UNIV_LIKELY(srv_flush_sync)) - goto furious_flush; - } - else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) - break; - const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0); if (!oldest_lsn) @@ -2351,23 +2345,78 @@ furious_flush: /* wake up buf_flush_wait() */ pthread_cond_broadcast(&buf_pool.done_flush_list); } -unemployed: + unemployed: buf_flush_async_lsn= 0; + set_idle: buf_pool.page_cleaner_set_idle(true); + if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) + break; + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + end_of_batch: + buf_dblwr.flush_buffered_writes(); - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;); - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;); + do + { + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;); + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;); + if (!recv_recovery_is_on() && + !srv_startup_is_before_trx_rollback_phase && + srv_operation == SRV_OPERATION_NORMAL) + log_checkpoint(); + } + while (false); + + if (!buf_pool.ran_out()) + continue; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + + lsn_t soft_lsn_limit= buf_flush_async_lsn; + + if (UNIV_UNLIKELY(lsn_limit != 0)) + { + if (srv_flush_sync) + goto do_furious_flush; + if (oldest_lsn >= lsn_limit) + { + buf_flush_sync_lsn= 0; + pthread_cond_broadcast(&buf_pool.done_flush_list); + } + else if (lsn_limit > soft_lsn_limit) + soft_lsn_limit= lsn_limit; + } + + bool idle_flush= false; + ulint n_flushed= 0, n; + + if (UNIV_UNLIKELY(soft_lsn_limit != 0)) + { + if (oldest_lsn >= soft_lsn_limit) + buf_flush_async_lsn= soft_lsn_limit= 0; + } + else if (buf_pool.ran_out()) + { + buf_pool.page_cleaner_set_idle(false); mysql_mutex_unlock(&buf_pool.flush_list_mutex); + n= srv_max_io_capacity; + mysql_mutex_lock(&buf_pool.mutex); + LRU_flush: + n= buf_flush_LRU(n, false); + mysql_mutex_unlock(&buf_pool.mutex); + last_pages+= n; - if (!recv_recovery_is_on() && - !srv_startup_is_before_trx_rollback_phase && - srv_operation == SRV_OPERATION_NORMAL) - log_checkpoint(); + if (!idle_flush) + goto end_of_batch; + /* when idle flushing kicks in page_cleaner is marked active. + reset it back to idle since the it was made active as part of + idle flushing stage. */ mysql_mutex_lock(&buf_pool.flush_list_mutex); - continue; + goto set_idle; } + else if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) + break; const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list); ut_ad(dirty_blocks); @@ -2376,94 +2425,71 @@ unemployed: guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */ const double dirty_pct= double(dirty_blocks) * 100.0 / double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); - - bool idle_flush= false; - - if (lsn_limit || soft_lsn_limit); - else if (af_needed_for_redo(oldest_lsn)); - else if (srv_max_dirty_pages_pct_lwm != 0.0) + if (srv_max_dirty_pages_pct_lwm != 0.0) { const ulint activity_count= srv_get_activity_count(); if (activity_count != last_activity_count) + { last_activity_count= activity_count; + goto maybe_unemployed; + } else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0) { - /* reaching here means 3 things: - - last_activity_count == activity_count: suggesting server is idle - (no trx_t::commit activity) - - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm) - - there are no pending reads but there are dirty pages to flush */ - idle_flush= true; + /* reaching here means 3 things: + - last_activity_count == activity_count: suggesting server is idle + (no trx_t::commit() activity) + - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm) + - there are no pending reads but there are dirty pages to flush */ buf_pool.update_last_activity_count(activity_count); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + idle_flush= true; + goto idle_flush; } - - if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm) - goto unemployed; + else + maybe_unemployed: + if (dirty_pct < srv_max_dirty_pages_pct_lwm) + goto possibly_unemployed; } else if (dirty_pct < srv_max_buf_pool_modified_pct) - goto unemployed; - - if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit) - lsn_limit= buf_flush_sync_lsn= 0; - if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit) - soft_lsn_limit= buf_flush_async_lsn= 0; + possibly_unemployed: + if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn)) + goto unemployed; buf_pool.page_cleaner_set_idle(false); mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (!lsn_limit) - lsn_limit= soft_lsn_limit; - - ulint n_flushed; - - if (UNIV_UNLIKELY(lsn_limit != 0)) + if (UNIV_UNLIKELY(soft_lsn_limit != 0)) { - n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit); - /* wake up buf_flush_wait() */ - pthread_cond_broadcast(&buf_pool.done_flush_list); - goto try_checkpoint; + n= srv_max_io_capacity; + goto background_flush; } - else if (idle_flush || !srv_adaptive_flushing) + + if (!srv_adaptive_flushing) { - n_flushed= buf_flush_list(srv_io_capacity); -try_checkpoint: - if (n_flushed) - { - MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, - MONITOR_FLUSH_BACKGROUND_COUNT, - MONITOR_FLUSH_BACKGROUND_PAGES, - n_flushed); -do_checkpoint: - /* The periodic log_checkpoint() call here makes it harder to - reproduce bugs in crash recovery or mariabackup --prepare, or - in code that writes the redo log records. Omitting the call - here should not affect correctness, because log_free_check() - should still be invoking checkpoints when needed. */ - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;); - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto next;); - - if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL) - log_checkpoint(); - } + idle_flush: + n= srv_io_capacity; + soft_lsn_limit= LSN_MAX; + background_flush: + mysql_mutex_lock(&buf_pool.mutex); + n_flushed= buf_flush_list_holding_mutex(n, soft_lsn_limit); + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed); } - else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages, - oldest_lsn, - dirty_blocks, - dirty_pct)) + else if ((n= page_cleaner_flush_pages_recommendation(last_pages, + oldest_lsn, + dirty_blocks, + dirty_pct)) != 0) { - page_cleaner.flush_pass++; const ulint tm= ut_time_ms(); - last_pages= n_flushed= buf_flush_list(n); + mysql_mutex_lock(&buf_pool.mutex); + last_pages= n_flushed= buf_flush_list_holding_mutex(n); page_cleaner.flush_time+= ut_time_ms() - tm; - - if (n_flushed) - { - MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, - MONITOR_FLUSH_ADAPTIVE_COUNT, - MONITOR_FLUSH_ADAPTIVE_PAGES, - n_flushed); - goto do_checkpoint; - } + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_flushed); } else if (buf_flush_async_lsn <= oldest_lsn) { @@ -2471,30 +2497,29 @@ do_checkpoint: goto unemployed; } -#ifndef DBUG_OFF -next: -#endif /* !DBUG_OFF */ - mysql_mutex_lock(&buf_pool.flush_list_mutex); - - /* when idle flushing kicks in page_cleaner is marked active. - reset it back to idle since the it was made active as part of - idle flushing stage. */ - if (idle_flush) - buf_pool.page_cleaner_set_idle(true); + n= n >= n_flushed ? n - n_flushed : 0; + goto LRU_flush; } mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (srv_fast_shutdown != 2) { - buf_flush_wait_batch_end_acquiring_mutex(true); - buf_flush_wait_batch_end_acquiring_mutex(false); + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_dblwr.wait_for_page_writes(); } mysql_mutex_lock(&buf_pool.flush_list_mutex); lsn_limit= buf_flush_sync_lsn; if (UNIV_UNLIKELY(lsn_limit != 0)) + { + do_furious_flush: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); goto furious_flush; + } buf_page_cleaner_is_active= false; pthread_cond_broadcast(&buf_pool.done_flush_list); mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -2519,17 +2544,6 @@ ATTRIBUTE_COLD void buf_flush_page_cleaner_init() std::thread(buf_flush_page_cleaner).detach(); } -#if defined(HAVE_SYSTEMD) && !defined(EMBEDDED_LIBRARY) -/** @return the number of dirty pages in the buffer pool */ -static ulint buf_flush_list_length() -{ - mysql_mutex_lock(&buf_pool.flush_list_mutex); - const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - return len; -} -#endif - /** Flush the buffer pool on shutdown. */ ATTRIBUTE_COLD void buf_flush_buffer_pool() { @@ -2545,20 +2559,19 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool() { mysql_mutex_unlock(&buf_pool.flush_list_mutex); buf_flush_list(srv_max_io_capacity); - if (buf_pool.n_flush_list()) + if (const size_t pending= buf_dblwr.pending_writes()) { timespec abstime; service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, - "Waiting to flush " ULINTPF " pages", - buf_flush_list_length()); + "Waiting to write %zu pages", pending); set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); - mysql_mutex_lock(&buf_pool.mutex); - while (buf_pool.n_flush_list_) - my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex, - &abstime); - mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.wait_for_page_writes(abstime); } + mysql_mutex_lock(&buf_pool.flush_list_mutex); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush " ULINTPF " pages", + UT_LIST_GET_LEN(buf_pool.flush_list)); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -2603,6 +2616,7 @@ void buf_flush_sync() if (lsn == log_sys.get_lsn()) break; } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); tpool::tpool_wait_end(); thd_wait_end(nullptr); diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 6c994b8c9bd..e4e20e8335f 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -136,7 +136,6 @@ static void buf_LRU_block_free_hashed_page(buf_block_t *block) @param[in] bpage control block */ static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage) { - /* FIXME: use atomics, not mutex */ mysql_mutex_assert_owner(&buf_pool.mutex); buf_pool.stat.LRU_bytes += bpage->physical_size(); @@ -401,8 +400,10 @@ buf_block_t* buf_LRU_get_free_block(buf_LRU_get get) DBUG_EXECUTE_IF("recv_ran_out_of_buffer", if (recv_recovery_is_on() && recv_sys.apply_log_recs) { + mysql_mutex_lock(&buf_pool.mutex); goto flush_lru; }); +get_mutex: mysql_mutex_lock(&buf_pool.mutex); got_mutex: buf_LRU_check_size_of_non_data_objects(); @@ -451,20 +452,32 @@ got_block: if ((block = buf_LRU_get_free_only()) != nullptr) { goto got_block; } - if (!buf_pool.n_flush_LRU_) { - break; + mysql_mutex_unlock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const auto n_flush = buf_pool.n_flush(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&buf_pool.mutex); + if (!n_flush) { + goto not_found; + } + if (!buf_pool.try_LRU_scan) { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.page_cleaner_wakeup(true); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + my_cond_wait(&buf_pool.done_free, + &buf_pool.mutex.m_mutex); } - my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex); } -#ifndef DBUG_OFF not_found: -#endif - mysql_mutex_unlock(&buf_pool.mutex); + if (n_iterations > 1) { + MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS ); + } - if (n_iterations > 20 && !buf_lru_free_blocks_error_printed + if (n_iterations == 21 && !buf_lru_free_blocks_error_printed && srv_buf_pool_old_size == srv_buf_pool_size) { - + buf_lru_free_blocks_error_printed = true; + mysql_mutex_unlock(&buf_pool.mutex); ib::warn() << "Difficult to find free blocks in the buffer pool" " (" << n_iterations << " search iterations)! " << flush_failures << " failed attempts to" @@ -476,12 +489,7 @@ not_found: << os_n_file_writes << " OS file writes, " << os_n_fsyncs << " OS fsyncs."; - - buf_lru_free_blocks_error_printed = true; - } - - if (n_iterations > 1) { - MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS ); + mysql_mutex_lock(&buf_pool.mutex); } /* No free block was found: try to flush the LRU list. @@ -495,15 +503,16 @@ not_found: #ifndef DBUG_OFF flush_lru: #endif - if (!buf_flush_LRU(innodb_lru_flush_size)) { + if (!buf_flush_LRU(innodb_lru_flush_size, true)) { MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); ++flush_failures; } n_iterations++; - mysql_mutex_lock(&buf_pool.mutex); buf_pool.stat.LRU_waits++; - goto got_mutex; + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + goto get_mutex; } /** Move the LRU_old pointer so that the length of the old blocks list @@ -812,50 +821,57 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) /* We cannot use transactional_lock_guard here, because buf_buddy_relocate() in buf_buddy_free() could get stuck. */ hash_lock.lock(); - lsn_t oldest_modification = bpage->oldest_modification_acquire(); + const lsn_t oldest_modification = bpage->oldest_modification_acquire(); if (UNIV_UNLIKELY(!bpage->can_relocate())) { /* Do not free buffer fixed and I/O-fixed blocks. */ goto func_exit; } - if (oldest_modification == 1) { + switch (oldest_modification) { + case 2: + ut_ad(id.space() == SRV_TMP_SPACE_ID); + ut_ad(!bpage->zip.data); + if (!bpage->is_freed()) { + goto func_exit; + } + bpage->clear_oldest_modification(); + break; + case 1: mysql_mutex_lock(&buf_pool.flush_list_mutex); - oldest_modification = bpage->oldest_modification(); - if (oldest_modification) { - ut_ad(oldest_modification == 1); + if (const lsn_t om = bpage->oldest_modification()) { + ut_ad(om == 1); buf_pool.delete_from_flush_list(bpage); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); ut_ad(!bpage->oldest_modification()); - oldest_modification = 0; - } - - if (zip || !bpage->zip.data) { - /* This would completely free the block. */ - /* Do not completely free dirty blocks. */ - - if (oldest_modification) { - goto func_exit; + /* fall through */ + case 0: + if (zip || !bpage->zip.data || !bpage->frame) { + break; } - } else if (oldest_modification && !bpage->frame) { -func_exit: - hash_lock.unlock(); - return(false); - - } else if (bpage->frame) { +relocate_compressed: b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b)); ut_a(b); mysql_mutex_lock(&buf_pool.flush_list_mutex); new (b) buf_page_t(*bpage); b->frame = nullptr; b->set_state(buf_page_t::UNFIXED + 1); + break; + default: + if (zip || !bpage->zip.data || !bpage->frame) { + /* This would completely free the block. */ + /* Do not completely free dirty blocks. */ +func_exit: + hash_lock.unlock(); + return(false); + } + goto relocate_compressed; } mysql_mutex_assert_owner(&buf_pool.mutex); - DBUG_PRINT("ib_buf", ("free page %u:%u", - id.space(), id.page_no())); + DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no())); ut_ad(bpage->can_relocate()); @@ -1026,7 +1042,8 @@ buf_LRU_block_free_non_file_page( } else { UT_LIST_ADD_FIRST(buf_pool.free, &block->page); ut_d(block->page.in_free_list = true); - pthread_cond_signal(&buf_pool.done_free); + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); } block->page.set_os_unused(); diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 3906ff65dfb..bbd905365ed 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -186,6 +186,7 @@ page_exists: buf_LRU_add_block(bpage, true/* to old blocks */); } + buf_pool.stat.n_pages_read++; mysql_mutex_unlock(&buf_pool.mutex); buf_pool.n_pend_reads++; return bpage; @@ -205,35 +206,31 @@ flag is cleared and the x-lock released by an i/o-handler thread. @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0, bitwise-ORed with 1 in recovery @param[in,out] chain buf_pool.page_hash cell for page_id -@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED - if we are trying - to read from a non-existent tablespace @param[in,out] space tablespace @param[in,out] block preallocated buffer block @param[in] sync true if synchronous aio is desired -@return whether a read request was enqueued */ +@return error code +@retval DB_SUCCESS if the page was read +@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */ static -bool +dberr_t buf_read_page_low( const page_id_t page_id, ulint zip_size, buf_pool_t::hash_chain& chain, - dberr_t* err, fil_space_t* space, buf_block_t*& block, bool sync = false) { buf_page_t* bpage; - *err = DB_SUCCESS; - ut_ad(!buf_dblwr.is_inside(page_id)); bpage = buf_page_init_for_read(page_id, zip_size, chain, block); if (!bpage) { space->release(); - return false; + return DB_SUCCESS_LOCKED_REC; } ut_ad(bpage->in_file()); @@ -253,7 +250,6 @@ buf_read_page_low( ? IORequest::READ_SYNC : IORequest::READ_ASYNC), page_id.page_no() * len, len, dst, bpage); - *err = fio.err; if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { ut_d(auto n=) buf_pool.n_pend_reads--; @@ -262,14 +258,14 @@ buf_read_page_low( } else if (sync) { thd_wait_end(NULL); /* The i/o was already completed in space->io() */ - *err = bpage->read_complete(*fio.node); + fio.err = bpage->read_complete(*fio.node); space->release(); - if (*err == DB_FAIL) { - *err = DB_PAGE_CORRUPTED; + if (fio.err == DB_FAIL) { + fio.err = DB_PAGE_CORRUPTED; } } - return true; + return fio.err; } /** Acquire a buffer block. */ @@ -353,9 +349,8 @@ read_ahead: if (space->is_stopping()) break; buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold()); - dberr_t err; space->reacquire(); - if (buf_read_page_low(i, zip_size, chain, &err, space, block)) + if (buf_read_page_low(i, zip_size, chain, space, block) == DB_SUCCESS) { count++; ut_ad(!block); @@ -365,18 +360,20 @@ read_ahead: } if (count) + { DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", count, space->chain.start->name, low.page_no())); - space->release(); + mysql_mutex_lock(&buf_pool.mutex); + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + buf_pool.stat.n_ra_pages_read_rnd+= count; + mysql_mutex_unlock(&buf_pool.mutex); + } - /* Read ahead is considered one I/O operation for the purpose of - LRU policy decision. */ - buf_LRU_stat_inc_io(); + space->release(); buf_read_release(block); - - buf_pool.stat.n_ra_pages_read_rnd+= count; - srv_stats.buf_pool_reads.add(count); return count; } @@ -388,6 +385,7 @@ released by the i/o-handler thread. @param zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param chain buf_pool.page_hash cell for page_id @retval DB_SUCCESS if the page was read and is not corrupted, +@retval DB_SUCCESS_LOCKED_REC if the page was not read @retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. @@ -403,22 +401,19 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size, return DB_TABLESPACE_DELETED; } - buf_block_t *block= zip_size - ? nullptr - : buf_LRU_get_free_block(have_no_mutex); - /* Our caller should already have ensured that the page does not exist in buf_pool.page_hash. */ - dberr_t err; - if (buf_read_page_low(page_id, zip_size, chain, &err, space, block, true)) + buf_block_t *block= nullptr; + if (UNIV_LIKELY(!zip_size)) { - ut_ad(!block); - srv_stats.buf_pool_reads.add(1); + mysql_mutex_lock(&buf_pool.mutex); + buf_LRU_stat_inc_io(); + block= buf_LRU_get_free_block(have_mutex); + mysql_mutex_unlock(&buf_pool.mutex); } - else - buf_read_release(block); - buf_LRU_stat_inc_io(); + dberr_t err= buf_read_page_low(page_id, zip_size, chain, space, block, true); + buf_read_release(block); return err; } @@ -444,12 +439,9 @@ void buf_read_page_background(fil_space_t *space, const page_id_t page_id, if (!zip_size && !(block= buf_read_acquire())) goto skip; - dberr_t err; - if (buf_read_page_low(page_id, zip_size, chain, &err, space, block)) - { + if (buf_read_page_low(page_id, zip_size, chain, space, block) == + DB_SUCCESS) ut_ad(!block); - srv_stats.buf_pool_reads.add(1); - } else buf_read_release(block); @@ -601,9 +593,9 @@ failed: if (space->is_stopping()) break; buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(new_low.fold()); - dberr_t err; space->reacquire(); - if (buf_read_page_low(new_low, zip_size, chain, &err, space, block)) + if (buf_read_page_low(new_low, zip_size, chain, space, block) == + DB_SUCCESS) { count++; ut_ad(!block); @@ -613,17 +605,20 @@ failed: } if (count) + { DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", count, space->chain.start->name, new_low.page_no())); - space->release(); + mysql_mutex_lock(&buf_pool.mutex); + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + buf_pool.stat.n_ra_pages_read+= count; + mysql_mutex_unlock(&buf_pool.mutex); + } - /* Read ahead is considered one I/O operation for the purpose of - LRU policy decision. */ - buf_LRU_stat_inc_io(); + space->release(); buf_read_release(block); - - buf_pool.stat.n_ra_pages_read+= count; return count; } @@ -679,20 +674,22 @@ void buf_read_recv_pages(uint32_t space_id, st_::span<uint32_t> page_nos) buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get(cur_page_id.fold()); - dberr_t err; space->reacquire(); - if (buf_read_page_low(cur_page_id, zip_size, chain, &err, space, - block)) { + switch (buf_read_page_low(cur_page_id, zip_size, chain, space, + block)) { + case DB_SUCCESS: ut_ad(!block); block = buf_LRU_get_free_block(have_no_mutex); - } - - if (err != DB_SUCCESS) { + break; + case DB_SUCCESS_LOCKED_REC: + break; + default: sql_print_error("InnoDB: Recovery failed to read page " UINT32PF " from %s", cur_page_id.page_no(), space->chain.start->name); } + ut_ad(block); } DBUG_PRINT("ib_buf", ("recovery read (%zu pages) for %s", diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 86792c2680f..c1fd916be55 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -118,6 +118,9 @@ bool fil_space_t::try_to_close(bool print_info) } node->close(); + + fil_system.move_closed_last_to_space_list(node->space); + return true; } @@ -392,13 +395,7 @@ static bool fil_node_open_file_low(fil_node_t *node) ut_ad(node->is_open()); - if (UNIV_LIKELY(!fil_system.freeze_space_list)) - { - /* Move the file last in fil_system.space_list, so that - fil_space_t::try_to_close() should close it as a last resort. */ - fil_system.space_list.erase(space_list_t::iterator(node->space)); - fil_system.space_list.push_back(*node->space); - } + fil_system.move_opened_last_to_space_list(node->space); fil_system.n_open++; return true; @@ -797,7 +794,17 @@ pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle) space->is_in_default_encrypt= false; default_encrypt_tables.remove(*space); } - space_list.erase(space_list_t::iterator(space)); + + { + space_list_t::iterator s= space_list_t::iterator(space); + if (space_list_last_opened == space) + { + space_list_t::iterator prev= s; + space_list_last_opened= &*--prev; + } + space_list.erase(s); + } + if (space == sys_space) sys_space= nullptr; else if (space == temp_space) @@ -916,12 +923,14 @@ bool fil_space_free(uint32_t id, bool x_latched) @param purpose tablespace purpose @param crypt_data encryption information @param mode encryption mode +@param opened true if space files are opened @return pointer to created tablespace, to be filled in with add() @retval nullptr on failure (such as when the same tablespace exists) */ fil_space_t *fil_space_t::create(uint32_t id, uint32_t flags, fil_type_t purpose, fil_space_crypt_t *crypt_data, - fil_encryption_t mode) + fil_encryption_t mode, + bool opened) { fil_space_t* space; @@ -974,7 +983,10 @@ fil_space_t *fil_space_t::create(uint32_t id, uint32_t flags, HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space); - fil_system.space_list.push_back(*space); + if (opened) + fil_system.add_opened_last_to_space_list(space); + else + fil_system.space_list.push_back(*space); switch (id) { case 0: @@ -1295,6 +1307,15 @@ void fil_system_t::close() #endif /* __linux__ */ } +void fil_system_t::add_opened_last_to_space_list(fil_space_t *space) +{ + if (UNIV_LIKELY(space_list_last_opened != nullptr)) + space_list.insert(space_list_t::iterator(space_list_last_opened), *space); + else + space_list.push_back(*space); + space_list_last_opened= space; +} + /** Extend all open data files to the recovered size */ ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size() { @@ -2028,7 +2049,7 @@ err_exit: if (fil_space_t* space = fil_space_t::create(space_id, flags, FIL_TYPE_TABLESPACE, - crypt_data, mode)) { + crypt_data, mode, true)) { fil_node_t* node = space->add(path, file, size, false, true); IF_WIN(node->find_metadata(), node->find_metadata(file, true)); mtr.start(); diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc index 28e8359cab4..60218a132c9 100644 --- a/storage/innobase/gis/gis0rtree.cc +++ b/storage/innobase/gis/gis0rtree.cc @@ -890,6 +890,8 @@ rtr_page_split_and_insert( int first_rec_group = 1; IF_DBUG(bool iterated = false,); + buf_pool.pages_split++; + if (!*heap) { *heap = mem_heap_create(1024); } @@ -1197,8 +1199,6 @@ after_insert: ut_ad(!rec || rec_offs_validate(rec, cursor->index(), *offsets)); #endif - MONITOR_INC(MONITOR_INDEX_SPLIT); - return(rec); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index f79f709dedd..d18f85f4ac5 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -894,43 +894,37 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_buffer_pool_resize_status, SHOW_CHAR}, {"buffer_pool_load_incomplete", &export_vars.innodb_buffer_pool_load_incomplete, SHOW_BOOL}, - {"buffer_pool_pages_data", - &export_vars.innodb_buffer_pool_pages_data, SHOW_SIZE_T}, + {"buffer_pool_pages_data", &UT_LIST_GET_LEN(buf_pool.LRU), SHOW_SIZE_T}, {"buffer_pool_bytes_data", &export_vars.innodb_buffer_pool_bytes_data, SHOW_SIZE_T}, {"buffer_pool_pages_dirty", - &export_vars.innodb_buffer_pool_pages_dirty, SHOW_SIZE_T}, - {"buffer_pool_bytes_dirty", - &export_vars.innodb_buffer_pool_bytes_dirty, SHOW_SIZE_T}, - {"buffer_pool_pages_flushed", &buf_flush_page_count, SHOW_SIZE_T}, - {"buffer_pool_pages_free", - &export_vars.innodb_buffer_pool_pages_free, SHOW_SIZE_T}, + &UT_LIST_GET_LEN(buf_pool.flush_list), SHOW_SIZE_T}, + {"buffer_pool_bytes_dirty", &buf_pool.flush_list_bytes, SHOW_SIZE_T}, + {"buffer_pool_pages_flushed", &buf_pool.stat.n_pages_written, SHOW_SIZE_T}, + {"buffer_pool_pages_free", &UT_LIST_GET_LEN(buf_pool.free), SHOW_SIZE_T}, #ifdef UNIV_DEBUG {"buffer_pool_pages_latched", &export_vars.innodb_buffer_pool_pages_latched, SHOW_SIZE_T}, #endif /* UNIV_DEBUG */ {"buffer_pool_pages_made_not_young", - &export_vars.innodb_buffer_pool_pages_made_not_young, SHOW_SIZE_T}, + &buf_pool.stat.n_pages_not_made_young, SHOW_SIZE_T}, {"buffer_pool_pages_made_young", - &export_vars.innodb_buffer_pool_pages_made_young, SHOW_SIZE_T}, + &buf_pool.stat.n_pages_made_young, SHOW_SIZE_T}, {"buffer_pool_pages_misc", &export_vars.innodb_buffer_pool_pages_misc, SHOW_SIZE_T}, - {"buffer_pool_pages_old", - &export_vars.innodb_buffer_pool_pages_old, SHOW_SIZE_T}, + {"buffer_pool_pages_old", &buf_pool.LRU_old_len, SHOW_SIZE_T}, {"buffer_pool_pages_total", &export_vars.innodb_buffer_pool_pages_total, SHOW_SIZE_T}, {"buffer_pool_pages_LRU_flushed", &buf_lru_flush_page_count, SHOW_SIZE_T}, {"buffer_pool_pages_LRU_freed", &buf_lru_freed_page_count, SHOW_SIZE_T}, + {"buffer_pool_pages_split", &buf_pool.pages_split, SHOW_SIZE_T}, {"buffer_pool_read_ahead_rnd", - &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_SIZE_T}, - {"buffer_pool_read_ahead", - &export_vars.innodb_buffer_pool_read_ahead, SHOW_SIZE_T}, + &buf_pool.stat.n_ra_pages_read_rnd, SHOW_SIZE_T}, + {"buffer_pool_read_ahead", &buf_pool.stat.n_ra_pages_read, SHOW_SIZE_T}, {"buffer_pool_read_ahead_evicted", - &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_SIZE_T}, - {"buffer_pool_read_requests", - &export_vars.innodb_buffer_pool_read_requests, SHOW_SIZE_T}, - {"buffer_pool_reads", - &export_vars.innodb_buffer_pool_reads, SHOW_SIZE_T}, + &buf_pool.stat.n_ra_pages_evicted, SHOW_SIZE_T}, + {"buffer_pool_read_requests", &buf_pool.stat.n_page_gets, SHOW_SIZE_T}, + {"buffer_pool_reads", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T}, {"buffer_pool_write_requests", &buf_pool.flush_list_requests, SHOW_SIZE_T}, {"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T}, @@ -4431,6 +4425,25 @@ innobase_commit_ordered( DBUG_VOID_RETURN; } +/** Mark the end of a statement. +@param trx transaction +@return whether an error occurred */ +static bool end_of_statement(trx_t *trx) +{ + trx_mark_sql_stat_end(trx); + if (UNIV_LIKELY(trx->error_state == DB_SUCCESS)) + return false; + + trx_savept_t savept; + savept.least_undo_no= 0; + trx->rollback(&savept); + /* MariaDB will roll back the entire transaction. */ + trx->bulk_insert= false; + trx->last_sql_stat_start.least_undo_no= 0; + trx->savepoints_discard(); + return true; +} + /*****************************************************************//** Commits a transaction in an InnoDB database or marks an SQL statement ended. @@ -4507,10 +4520,7 @@ innobase_commit( /* Store the current undo_no of the transaction so that we know where to roll back if we have to roll back the next SQL statement */ - - trx_mark_sql_stat_end(trx); - if (UNIV_UNLIKELY(trx->error_state != DB_SUCCESS)) { - trx_rollback_for_mysql(trx); + if (UNIV_UNLIKELY(end_of_statement(trx))) { DBUG_RETURN(1); } } @@ -15148,16 +15158,26 @@ ha_innobase::check( } if ((check_opt->flags & T_QUICK) || index->is_corrupted()) { - } else if (btr_validate_index(index, m_prebuilt->trx) - != DB_SUCCESS) { - is_ok = false; - push_warning_printf(thd, - Sql_condition::WARN_LEVEL_WARN, - ER_NOT_KEYFILE, - "InnoDB: The B-tree of" - " index %s is corrupted.", - index->name()); - continue; + } else if (trx_id_t bulk_trx_id = + m_prebuilt->table->bulk_trx_id) { + if (!m_prebuilt->trx->read_view.changes_visible( + bulk_trx_id)) { + is_ok = true; + goto func_exit; + } + + if (btr_validate_index(index, m_prebuilt->trx) + != DB_SUCCESS) { + is_ok = false; + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index->name()); + continue; + } } /* Instead of invoking change_active_index(), set up @@ -15280,6 +15300,7 @@ ha_innobase::check( } # endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ #endif /* BTR_CUR_HASH_ADAPT */ +func_exit: m_prebuilt->trx->op_info = ""; DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT); @@ -16945,10 +16966,7 @@ innobase_xa_prepare( /* Store the current undo_no of the transaction so that we know where to roll back if we have to roll back the next SQL statement */ - - trx_mark_sql_stat_end(trx); - if (UNIV_UNLIKELY(trx->error_state != DB_SUCCESS)) { - trx_rollback_for_mysql(trx); + if (UNIV_UNLIKELY(end_of_statement(trx))) { return 1; } } diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index d6b05b3a3ec..cfa5ed922da 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -6119,6 +6119,7 @@ func_exit: id, MTR_MEMO_PAGE_SX_FIX); if (UNIV_UNLIKELY(!root)) { + err = DB_CORRUPTION; goto func_exit; } diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index 2a26f4f5ac2..589182b73ba 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -6130,8 +6130,13 @@ static int i_s_sys_tablespaces_fill(THD *thd, const fil_space_t &s, TABLE *t) OK(f->store(name.data(), name.size(), system_charset_info)); f->set_notnull(); } - else - f->set_notnull(); + else if (srv_is_undo_tablespace(s.id)) + { + char name[15]; + snprintf(name, sizeof name, "innodb_undo%03u", + (s.id - srv_undo_space_id_start + 1)); + OK(f->store(name, strlen(name), system_charset_info)); + } else f->set_notnull(); } fields[SYS_TABLESPACES_NAME]->set_null(); diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 84ad4f3fcab..bfcc559cf5f 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -441,7 +441,7 @@ Gets the root node of a tree and x- or s-latches it. buf_block_t* btr_root_block_get( /*===============*/ - const dict_index_t* index, /*!< in: index tree */ + dict_index_t* index, /*!< in: index tree */ rw_lock_type_t mode, /*!< in: either RW_S_LATCH or RW_X_LATCH */ mtr_t* mtr, /*!< in: mtr */ diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h index a80765e5daa..966247ffa00 100644 --- a/storage/innobase/include/btr0types.h +++ b/storage/innobase/include/btr0types.h @@ -87,6 +87,9 @@ enum btr_latch_mode { dict_index_t::lock is being held in non-exclusive mode. */ BTR_MODIFY_LEAF_ALREADY_LATCHED = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED, + /** Attempt to modify records in an x-latched tree. */ + BTR_MODIFY_TREE_ALREADY_LATCHED = BTR_MODIFY_TREE + | BTR_ALREADY_S_LATCHED, /** U-latch root and X-latch a leaf page, assuming that dict_index_t::lock is being held in U mode. */ BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 59aa1ac08ad..420d4a388e8 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -691,13 +691,14 @@ public: ut_ad(s < prev_state + UNFIXED); } - void read_unfix(uint32_t s) + uint32_t read_unfix(uint32_t s) { ut_ad(lock.is_write_locked()); ut_ad(s == UNFIXED + 1 || s == REINIT + 1); - ut_d(auto old_state=) zip.fix.fetch_add(s - READ_FIX); + uint32_t old_state= zip.fix.fetch_add(s - READ_FIX); ut_ad(old_state >= READ_FIX); ut_ad(old_state < WRITE_FIX); + return old_state + (s - READ_FIX); } void set_freed(uint32_t prev_state, uint32_t count= 0) @@ -748,11 +749,11 @@ public: it from buf_pool.flush_list */ inline void write_complete(bool temporary); - /** Write a flushable page to a file. buf_pool.mutex must be held. - @param lru true=buf_pool.LRU; false=buf_pool.flush_list + /** Write a flushable page to a file or free a freeable block. + @param evict whether to evict the page on write completion @param space tablespace - @return whether the page was flushed and buf_pool.mutex was released */ - inline bool flush(bool lru, fil_space_t *space); + @return whether a page write was initiated and buf_pool.mutex released */ + bool flush(bool evict, fil_space_t *space); /** Notify that a page in a temporary tablespace has been modified. */ void set_temp_modified() @@ -822,8 +823,6 @@ public: /** @return whether the block is mapped to a data file */ bool in_file() const { return state() >= FREED; } - /** @return whether the block is modified and ready for flushing */ - inline bool ready_for_flush() const; /** @return whether the block can be relocated in memory. The block can be dirty, but it must not be I/O-fixed or bufferfixed. */ inline bool can_relocate() const; @@ -996,10 +995,10 @@ Compute the hash fold value for blocks in buf_pool.zip_hash. */ #define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) /* @} */ -/** A "Hazard Pointer" class used to iterate over page lists -inside the buffer pool. A hazard pointer is a buf_page_t pointer +/** A "Hazard Pointer" class used to iterate over buf_pool.LRU or +buf_pool.flush_list. A hazard pointer is a buf_page_t pointer which we intend to iterate over next and we want it remain valid -even after we release the buffer pool mutex. */ +even after we release the mutex that protects the list. */ class HazardPointer { public: @@ -1114,7 +1113,8 @@ struct buf_buddy_free_t { /*!< Node of zip_free list */ }; -/** @brief The buffer pool statistics structure. */ +/** @brief The buffer pool statistics structure; +protected by buf_pool.mutex unless otherwise noted. */ struct buf_pool_stat_t{ /** Initialize the counters */ void init() { memset((void*) this, 0, sizeof *this); } @@ -1123,9 +1123,8 @@ struct buf_pool_stat_t{ /*!< number of page gets performed; also successful searches through the adaptive hash index are - counted as page gets; this field - is NOT protected by the buffer - pool mutex */ + counted as page gets; + NOT protected by buf_pool.mutex */ ulint n_pages_read; /*!< number read operations */ ulint n_pages_written;/*!< number write operations */ ulint n_pages_created;/*!< number of pages created @@ -1143,10 +1142,9 @@ struct buf_pool_stat_t{ young because the first access was not long enough ago, in buf_page_peek_if_too_old() */ - /** number of waits for eviction; writes protected by buf_pool.mutex */ + /** number of waits for eviction */ ulint LRU_waits; ulint LRU_bytes; /*!< LRU size in bytes */ - ulint flush_list_bytes;/*!< flush_list size in bytes */ }; /** Statistics of buddy blocks of a given size. */ @@ -1399,6 +1397,11 @@ public: n_chunks_new / 4 * chunks->size; } + /** @return whether the buffer pool has run out */ + TPOOL_SUPPRESS_TSAN + bool ran_out() const + { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); } + /** @return whether the buffer pool is shrinking */ inline bool is_shrinking() const { @@ -1436,17 +1439,10 @@ public: /** Buffer pool mutex */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; - /** Number of pending LRU flush; protected by mutex. */ - ulint n_flush_LRU_; - /** broadcast when n_flush_LRU reaches 0; protected by mutex */ - pthread_cond_t done_flush_LRU; - /** Number of pending flush_list flush; protected by mutex */ - ulint n_flush_list_; - /** broadcast when n_flush_list reaches 0; protected by mutex */ - pthread_cond_t done_flush_list; - - TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; } - TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; } + /** current statistics; protected by mutex */ + buf_pool_stat_t stat; + /** old statistics; protected by mutex */ + buf_pool_stat_t old_stat; /** @name General fields */ /* @{ */ @@ -1607,11 +1603,12 @@ public: buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1]; /*!< Statistics of buddy system, indexed by block size */ - buf_pool_stat_t stat; /*!< current statistics */ - buf_pool_stat_t old_stat; /*!< old statistics */ /* @} */ + /** number of index page splits */ + Atomic_counter<ulint> pages_split; + /** @name Page flushing algorithm fields */ /* @{ */ @@ -1620,7 +1617,10 @@ public: alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex; /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */ FlushHp flush_hp; - /** modified blocks (a subset of LRU) */ + /** flush_list size in bytes; protected by flush_list_mutex */ + ulint flush_list_bytes; + /** possibly modified persistent pages (a subset of LRU); + buf_dblwr.pending_writes() is approximately COUNT(is_write_fixed()) */ UT_LIST_BASE_NODE_T(buf_page_t) flush_list; /** number of blocks ever added to flush_list; sometimes protected by flush_list_mutex */ @@ -1629,28 +1629,70 @@ public: TPOOL_SUPPRESS_TSAN void add_flush_list_requests(size_t size) { ut_ad(size); flush_list_requests+= size; } private: - /** whether the page cleaner needs wakeup from indefinite sleep */ - bool page_cleaner_is_idle; + static constexpr unsigned PAGE_CLEANER_IDLE= 1; + static constexpr unsigned FLUSH_LIST_ACTIVE= 2; + static constexpr unsigned LRU_FLUSH= 4; + + /** Number of pending LRU flush * LRU_FLUSH + + PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */ + unsigned page_cleaner_status; /** track server activity count for signaling idle flushing */ ulint last_activity_count; public: /** signalled to wake up the page_cleaner; protected by flush_list_mutex */ pthread_cond_t do_flush_list; + /** broadcast when !n_flush(); protected by flush_list_mutex */ + pthread_cond_t done_flush_LRU; + /** broadcast when a batch completes; protected by flush_list_mutex */ + pthread_cond_t done_flush_list; + + /** @return number of pending LRU flush */ + unsigned n_flush() const + { + mysql_mutex_assert_owner(&flush_list_mutex); + return page_cleaner_status / LRU_FLUSH; + } + + /** Increment the number of pending LRU flush */ + inline void n_flush_inc(); + + /** Decrement the number of pending LRU flush */ + inline void n_flush_dec(); + + /** @return whether flush_list flushing is active */ + bool flush_list_active() const + { + mysql_mutex_assert_owner(&flush_list_mutex); + return page_cleaner_status & FLUSH_LIST_ACTIVE; + } + + void flush_list_set_active() + { + ut_ad(!flush_list_active()); + page_cleaner_status+= FLUSH_LIST_ACTIVE; + } + void flush_list_set_inactive() + { + ut_ad(flush_list_active()); + page_cleaner_status-= FLUSH_LIST_ACTIVE; + } /** @return whether the page cleaner must sleep due to being idle */ bool page_cleaner_idle() const noexcept { mysql_mutex_assert_owner(&flush_list_mutex); - return page_cleaner_is_idle; + return page_cleaner_status & PAGE_CLEANER_IDLE; } - /** Wake up the page cleaner if needed */ - void page_cleaner_wakeup(); + /** Wake up the page cleaner if needed. + @param for_LRU whether to wake up for LRU eviction */ + void page_cleaner_wakeup(bool for_LRU= false); /** Register whether an explicit wakeup of the page cleaner is needed */ void page_cleaner_set_idle(bool deep_sleep) { mysql_mutex_assert_owner(&flush_list_mutex); - page_cleaner_is_idle= deep_sleep; + page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) | + (PAGE_CLEANER_IDLE * deep_sleep); } /** Update server last activity count */ @@ -1660,9 +1702,6 @@ public: last_activity_count= activity_count; } - // n_flush_LRU() + n_flush_list() - // is approximately COUNT(is_write_fixed()) in flush_list - unsigned freed_page_clock;/*!< a sequence number used to count the number of buffer blocks removed from the end of @@ -1672,16 +1711,10 @@ public: to read this for heuristic purposes without holding any mutex or latch */ - bool try_LRU_scan; /*!< Cleared when an LRU - scan for free block fails. This - flag is used to avoid repeated - scans of LRU list when we know - that there is no free block - available in the scan depth for - eviction. Set whenever - we flush a batch from the - buffer pool. Protected by the - buf_pool.mutex */ + /** Cleared when buf_LRU_get_free_block() fails. + Set whenever the free list grows, along with a broadcast of done_free. + Protected by buf_pool.mutex. */ + Atomic_relaxed<bool> try_LRU_scan; /* @} */ /** @name LRU replacement algorithm fields */ @@ -1690,7 +1723,8 @@ public: UT_LIST_BASE_NODE_T(buf_page_t) free; /*!< base node of the free block list */ - /** signaled each time when the free list grows; protected by mutex */ + /** broadcast each time when the free list grows or try_LRU_scan is set; + protected by mutex */ pthread_cond_t done_free; UT_LIST_BASE_NODE_T(buf_page_t) withdraw; @@ -1747,29 +1781,16 @@ public: { if (n_pend_reads) return true; - mysql_mutex_lock(&mutex); - const bool any_pending{n_flush_LRU_ || n_flush_list_}; - mysql_mutex_unlock(&mutex); + mysql_mutex_lock(&flush_list_mutex); + const bool any_pending= page_cleaner_status > PAGE_CLEANER_IDLE || + buf_dblwr.pending_writes(); + mysql_mutex_unlock(&flush_list_mutex); return any_pending; } - /** @return total amount of pending I/O */ - ulint io_pending() const - { - return n_pend_reads + n_flush_LRU() + n_flush_list(); - } -private: - /** Remove a block from the flush list. */ - inline void delete_from_flush_list_low(buf_page_t *bpage) noexcept; - /** Remove a block from flush_list. - @param bpage buffer pool page - @param clear whether to invoke buf_page_t::clear_oldest_modification() */ - void delete_from_flush_list(buf_page_t *bpage, bool clear) noexcept; -public: /** Remove a block from flush_list. @param bpage buffer pool page */ - void delete_from_flush_list(buf_page_t *bpage) noexcept - { delete_from_flush_list(bpage, true); } + void delete_from_flush_list(buf_page_t *bpage) noexcept; /** Prepare to insert a modified blcok into flush_list. @param lsn start LSN of the mini-transaction @@ -1784,7 +1805,7 @@ public: lsn_t lsn) noexcept; /** Free a page whose underlying file page has been freed. */ - inline void release_freed_page(buf_page_t *bpage) noexcept; + ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage) noexcept; private: /** Temporary memory for page_compressed and encrypted I/O */ @@ -1795,34 +1816,12 @@ private: /** array of slots */ buf_tmp_buffer_t *slots; - void create(ulint n_slots) - { - this->n_slots= n_slots; - slots= static_cast<buf_tmp_buffer_t*> - (ut_malloc_nokey(n_slots * sizeof *slots)); - memset((void*) slots, 0, n_slots * sizeof *slots); - } + void create(ulint n_slots); - void close() - { - for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) - { - aligned_free(s->crypt_buf); - aligned_free(s->comp_buf); - } - ut_free(slots); - slots= nullptr; - n_slots= 0; - } + void close(); /** Reserve a buffer */ - buf_tmp_buffer_t *reserve() - { - for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) - if (s->acquire()) - return s; - return nullptr; - } + buf_tmp_buffer_t *reserve(); } io_buf; /** whether resize() is in the critical path */ @@ -1911,7 +1910,10 @@ inline void buf_page_t::set_oldest_modification(lsn_t lsn) /** Clear oldest_modification after removing from buf_pool.flush_list */ inline void buf_page_t::clear_oldest_modification() { - mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); +#ifdef SAFE_MUTEX + if (oldest_modification() != 2) + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); +#endif /* SAFE_MUTEX */ ut_d(const auto s= state()); ut_ad(s >= REMOVE_HASH); ut_ad(oldest_modification()); @@ -1923,17 +1925,6 @@ inline void buf_page_t::clear_oldest_modification() oldest_modification_.store(0, std::memory_order_release); } -/** @return whether the block is modified and ready for flushing */ -inline bool buf_page_t::ready_for_flush() const -{ - mysql_mutex_assert_owner(&buf_pool.mutex); - ut_ad(in_LRU_list); - const auto s= state(); - ut_a(s >= FREED); - ut_ad(!fsp_is_system_temporary(id().space()) || oldest_modification() == 2); - return s < READ_FIX; -} - /** @return whether the block can be relocated in memory. The block can be dirty, but it must not be I/O-fixed or bufferfixed. */ inline bool buf_page_t::can_relocate() const diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index fb9df55504c..d9c9239c0b4 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -54,9 +54,9 @@ class buf_dblwr_t }; /** the page number of the first doublewrite block (block_size() pages) */ - page_id_t block1= page_id_t(0, 0); + page_id_t block1{0, 0}; /** the page number of the second doublewrite block (block_size() pages) */ - page_id_t block2= page_id_t(0, 0); + page_id_t block2{0, 0}; /** mutex protecting the data members below */ mysql_mutex_t mutex; @@ -72,11 +72,15 @@ class buf_dblwr_t ulint writes_completed; /** number of pages written by flush_buffered_writes_completed() */ ulint pages_written; + /** condition variable for !writes_pending */ + pthread_cond_t write_cond; + /** number of pending page writes */ + size_t writes_pending; slot slots[2]; - slot *active_slot= &slots[0]; + slot *active_slot; - /** Initialize the doublewrite buffer data structure. + /** Initialise the persistent storage of the doublewrite buffer. @param header doublewrite page header in the TRX_SYS page */ inline void init(const byte *header); @@ -84,6 +88,8 @@ class buf_dblwr_t bool flush_buffered_writes(const ulint size); public: + /** Initialise the doublewrite buffer data structures. */ + void init(); /** Create or restore the doublewrite buffer in the TRX_SYS page. @return whether the operation succeeded */ bool create(); @@ -118,7 +124,7 @@ public: void recover(); /** Update the doublewrite buffer on data page write completion. */ - void write_completed(); + void write_completed(bool with_doublewrite); /** Flush possible buffered writes to persistent storage. It is very important to call this function after a batch of writes has been posted, and also when we may have to wait for a page latch! @@ -137,14 +143,14 @@ public: @param size payload size in bytes */ void add_to_batch(const IORequest &request, size_t size); - /** Determine whether the doublewrite buffer is initialized */ - bool is_initialised() const + /** Determine whether the doublewrite buffer has been created */ + bool is_created() const { return UNIV_LIKELY(block1 != page_id_t(0, 0)); } /** @return whether a page identifier is part of the doublewrite buffer */ bool is_inside(const page_id_t id) const { - if (!is_initialised()) + if (!is_created()) return false; ut_ad(block1 < block2); if (id < block1) @@ -156,13 +162,44 @@ public: /** Wait for flush_buffered_writes() to be fully completed */ void wait_flush_buffered_writes() { - if (is_initialised()) - { - mysql_mutex_lock(&mutex); - while (batch_running) - my_cond_wait(&cond, &mutex.m_mutex); - mysql_mutex_unlock(&mutex); - } + mysql_mutex_lock(&mutex); + while (batch_running) + my_cond_wait(&cond, &mutex.m_mutex); + mysql_mutex_unlock(&mutex); + } + + /** Register an unbuffered page write */ + void add_unbuffered() + { + mysql_mutex_lock(&mutex); + writes_pending++; + mysql_mutex_unlock(&mutex); + } + + size_t pending_writes() + { + mysql_mutex_lock(&mutex); + const size_t pending{writes_pending}; + mysql_mutex_unlock(&mutex); + return pending; + } + + /** Wait for writes_pending to reach 0 */ + void wait_for_page_writes() + { + mysql_mutex_lock(&mutex); + while (writes_pending) + my_cond_wait(&write_cond, &mutex.m_mutex); + mysql_mutex_unlock(&mutex); + } + + /** Wait for writes_pending to reach 0 */ + void wait_for_page_writes(const timespec &abstime) + { + mysql_mutex_lock(&mutex); + while (writes_pending) + my_cond_timedwait(&write_cond, &mutex.m_mutex, &abstime); + mysql_mutex_unlock(&mutex); } }; diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index af38f61b13b..31fe4446681 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -30,10 +30,8 @@ Created 11/5/1995 Heikki Tuuri #include "log0log.h" #include "buf0buf.h" -/** Number of pages flushed. Protected by buf_pool.mutex. */ -extern ulint buf_flush_page_count; /** Number of pages flushed via LRU. Protected by buf_pool.mutex. -Also included in buf_flush_page_count. */ +Also included in buf_pool.stat.n_pages_written. */ extern ulint buf_lru_flush_page_count; /** Number of pages freed without flushing. Protected by buf_pool.mutex. */ extern ulint buf_lru_freed_page_count; @@ -86,15 +84,18 @@ buf_flush_init_for_writing( bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr) MY_ATTRIBUTE((warn_unused_result)); -/** Write out dirty blocks from buf_pool.LRU. +/** Write out dirty blocks from buf_pool.LRU, +and move clean blocks to buf_pool.free. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. @param max_n wished maximum mumber of blocks flushed -@return the number of processed pages +@param evict whether to evict pages after flushing +@return evict ? number of processed pages : number of pages written @retval 0 if a buf_pool.LRU batch is already running */ -ulint buf_flush_LRU(ulint max_n); +ulint buf_flush_LRU(ulint max_n, bool evict); -/** Wait until a flush batch ends. -@param lru true=buf_pool.LRU; false=buf_pool.flush_list */ -void buf_flush_wait_batch_end(bool lru); +/** Wait until a LRU flush batch ends. */ +void buf_flush_wait_LRU_batch_end(); /** Wait until all persistent pages are flushed up to a limit. @param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn); @@ -106,9 +107,6 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious); /** Initialize page_cleaner. */ ATTRIBUTE_COLD void buf_flush_page_cleaner_init(); -/** Wait for pending flushes to complete. */ -void buf_flush_wait_batch_end_acquiring_mutex(bool lru); - /** Flush the buffer pool on shutdown. */ ATTRIBUTE_COLD void buf_flush_buffer_pool(); diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h index 65ae33cc188..ebf0f60ffe5 100644 --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -36,6 +36,7 @@ released by the i/o-handler thread. @param zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param chain buf_pool.page_hash cell for page_id @retval DB_SUCCESS if the page was read and is not corrupted, +@retval DB_SUCCESS_LOCKED_REC if the page was not read @retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 65ca704b6d8..f53279ecb88 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -871,11 +871,13 @@ public: @param purpose tablespace purpose @param crypt_data encryption information @param mode encryption mode + @param opened true if space files are opened @return pointer to created tablespace, to be filled in with add() @retval nullptr on failure (such as when the same tablespace exists) */ static fil_space_t *create(uint32_t id, uint32_t flags, fil_type_t purpose, fil_space_crypt_t *crypt_data, - fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT); + fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT, + bool opened= false); MY_ATTRIBUTE((warn_unused_result)) /** Acquire a tablespace reference. @@ -1080,7 +1082,7 @@ private: inline bool fil_space_t::use_doublewrite() const { return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf && - buf_dblwr.is_initialised(); + buf_dblwr.is_created(); } inline void fil_space_t::set_imported() @@ -1357,6 +1359,11 @@ struct fil_system_t private: bool m_initialised; + + /** Points to the last opened space in space_list. Protected with + fil_system.mutex. */ + fil_space_t *space_list_last_opened= nullptr; + #ifdef __linux__ /** available block devices that reside on non-rotational storage */ std::vector<dev_t> ssd; @@ -1412,7 +1419,8 @@ public: /** nonzero if fil_node_open_file_low() should avoid moving the tablespace to the end of space_list, for FIFO policy of try_to_close() */ ulint freeze_space_list; - /** list of all tablespaces */ + /** List of all file spaces, opened spaces should be at the top of the list + to optimize try_to_close() execution. Protected with fil_system.mutex. */ ilist<fil_space_t, space_list_tag_t> space_list; /** list of all tablespaces for which a FILE_MODIFY record has been written since the latest redo log checkpoint. @@ -1427,6 +1435,49 @@ public: potential space_id reuse */ bool space_id_reuse_warned; + /** Add the file to the end of opened spaces list in + fil_system.space_list, so that fil_space_t::try_to_close() should close + it as a last resort. + @param space space to add */ + void add_opened_last_to_space_list(fil_space_t *space); + + /** Move the file to the end of opened spaces list in + fil_system.space_list, so that fil_space_t::try_to_close() should close + it as a last resort. + @param space space to move */ + inline void move_opened_last_to_space_list(fil_space_t *space) + { + /* In the case when several files of the same space are added in a + row, there is no need to remove and add a space to the same position + in space_list. It can be for system or temporary tablespaces. */ + if (freeze_space_list || space_list_last_opened == space) + return; + + space_list.erase(space_list_t::iterator(space)); + add_opened_last_to_space_list(space); + } + + /** Move closed file last in fil_system.space_list, so that + fil_space_t::try_to_close() iterates opened files first in FIFO order, + i.e. first opened, first closed. + @param space space to move */ + void move_closed_last_to_space_list(fil_space_t *space) + { + if (UNIV_UNLIKELY(freeze_space_list)) + return; + + space_list_t::iterator s= space_list_t::iterator(space); + + if (space_list_last_opened == space) + { + space_list_t::iterator prev= s; + space_list_last_opened= &*--prev; + } + + space_list.erase(s); + space_list.push_back(*space); + } + /** Return the next tablespace from default_encrypt_tables list. @param space previous tablespace (nullptr to start from the start) @param recheck whether the removal condition needs to be rechecked after diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index cdb159a5b26..5576560dca8 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -333,6 +333,9 @@ public: /** Upgrade U locks on a block to X */ void page_lock_upgrade(const buf_block_t &block); + /** Upgrade index U lock to X */ + ATTRIBUTE_COLD void index_lock_upgrade(); + /** Check if we are holding tablespace latch @param space tablespace to search for @return whether space.latch is being held */ diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index dea420d0528..52e5a724efd 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -85,11 +85,6 @@ struct srv_stats_t /** Count the amount of data written in total (in bytes) */ ulint_ctr_1_t data_written; - - /** Number of buffer pool reads that led to the reading of - a disk page */ - ulint_ctr_1_t buf_pool_reads; - /** Number of bytes saved by page compression */ ulint_ctr_n_t page_compression_saved; /* Number of pages compressed with page compression */ @@ -607,23 +602,11 @@ struct export_var_t{ char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */ my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */ - ulint innodb_buffer_pool_pages_data; /*!< Data pages */ ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */ - ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */ - ulint innodb_buffer_pool_bytes_dirty; /*!< File bytes modified */ ulint innodb_buffer_pool_pages_misc; /*!< Miscellanous pages */ - ulint innodb_buffer_pool_pages_free; /*!< Free pages */ #ifdef UNIV_DEBUG ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */ #endif /* UNIV_DEBUG */ - ulint innodb_buffer_pool_pages_made_not_young; - ulint innodb_buffer_pool_pages_made_young; - ulint innodb_buffer_pool_pages_old; - ulint innodb_buffer_pool_read_requests; /*!< buf_pool.stat.n_page_gets */ - ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */ - ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */ - ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */ - ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/ ulint innodb_checkpoint_age; ulint innodb_checkpoint_max_age; ulint innodb_data_pending_reads; /*!< Pending reads */ diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 2ed9566c215..bbfed2490e9 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -926,14 +926,19 @@ public: /** Determine if the specified transaction or any older one might be active. - @param caller_trx used to get/set pins + @param trx current transaction @param id transaction identifier @return whether any transaction not newer than id might be active */ - bool find_same_or_older(trx_t *caller_trx, trx_id_t id) + bool find_same_or_older(trx_t *trx, trx_id_t id) { - return rw_trx_hash.iterate(caller_trx, find_same_or_older_callback, &id); + if (trx->max_inactive_id >= id) + return false; + bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id); + if (!found) + trx->max_inactive_id= id; + return found; } diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 2799750ee01..81eb5471a7b 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -642,6 +642,10 @@ public: Cleared in commit_in_memory() after commit_state(), trx_sys_t::deregister_rw(), release_locks(). */ trx_id_t id; + /** The largest encountered transaction identifier for which no + transaction was observed to be active. This is a cache to speed up + trx_sys_t::find_same_or_older(). */ + trx_id_t max_inactive_id; private: /** mutex protecting state and some of lock diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 2b30b9b1a03..3c7c3d348af 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -1064,13 +1064,16 @@ lock_sec_rec_some_has_impl( const trx_id_t max_trx_id= page_get_max_trx_id(page_align(rec)); - if ((caller_trx->id > max_trx_id && - !trx_sys.find_same_or_older(caller_trx, max_trx_id)) || + /* Note: It is possible to have caller_trx->id == 0 in a locking read + if caller_trx has not modified any persistent tables. */ + if (!trx_sys.find_same_or_older(caller_trx, max_trx_id) || !lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) return nullptr; - /* In this case it is possible that some transaction has an implicit - x-lock. We have to look in the clustered index. */ + /* We checked above that some active (or XA PREPARE) transaction exists + that is older than PAGE_MAX_TRX_ID. That is, some transaction may be + holding an implicit lock on the record. We have to look up the + clustered index record to find if it is (or was) the case. */ return row_vers_impl_x_locked(caller_trx, rec, index, offsets); } @@ -5157,20 +5160,24 @@ has an implicit lock on the record. The transaction instance must have a reference count > 0 so that it can't be committed and freed before this function has completed. */ static -void +bool lock_rec_convert_impl_to_expl_for_trx( /*==================================*/ + trx_t* trx, /*!< in/out: active transaction */ const page_id_t id, /*!< in: page identifier */ const rec_t* rec, /*!< in: user record on page */ - dict_index_t* index, /*!< in: index of record */ - trx_t* trx, /*!< in/out: active transaction */ - ulint heap_no)/*!< in: rec heap number to lock */ + dict_index_t* index) /*!< in: index of record */ { + if (!trx) + return false; + ut_ad(trx->is_referenced()); ut_ad(page_rec_is_leaf(rec)); ut_ad(!rec_is_metadata(rec, *index)); DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx"); + ulint heap_no= page_rec_get_heap_no(rec); + { LockGuard g{lock_sys.rec_hash, id}; trx->mutex_lock(); @@ -5187,6 +5194,7 @@ lock_rec_convert_impl_to_expl_for_trx( trx->release_reference(); DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx"); + return false; } @@ -5260,7 +5268,6 @@ static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx, } #endif /* UNIV_DEBUG */ - /** If an implicit x-lock exists on a record, convert it to an explicit one. Often, this is called by a transaction that is about to enter a lock wait @@ -5272,12 +5279,14 @@ This may also be called by the same transaction that is already holding an implicit exclusive lock on the record. In this case, no explicit lock should be created. +@tparam is_primary whether the index is the primary key @param[in,out] caller_trx current transaction @param[in] id index tree leaf page identifier @param[in] rec record on the leaf page @param[in] index the index of the record @param[in] offsets rec_get_offsets(rec,index) @return whether caller_trx already holds an exclusive lock on rec */ +template<bool is_primary> static bool lock_rec_convert_impl_to_expl( @@ -5295,8 +5304,9 @@ lock_rec_convert_impl_to_expl( ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); ut_ad(page_rec_is_leaf(rec)); ut_ad(!rec_is_metadata(rec, *index)); + ut_ad(index->is_primary() == is_primary); - if (dict_index_is_clust(index)) { + if (is_primary) { trx_id_t trx_id; trx_id = lock_clust_rec_some_has_impl(rec, index, offsets); @@ -5322,20 +5332,7 @@ lock_rec_convert_impl_to_expl( ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec, id)); } - if (trx) { - ulint heap_no = page_rec_get_heap_no(rec); - - ut_ad(trx->is_referenced()); - - /* If the transaction is still active and has no - explicit x-lock set on the record, set one for it. - trx cannot be committed until the ref count is zero. */ - - lock_rec_convert_impl_to_expl_for_trx( - id, rec, index, trx, heap_no); - } - - return false; + return lock_rec_convert_impl_to_expl_for_trx(trx, id, rec, index); } /*********************************************************************//** @@ -5374,8 +5371,9 @@ lock_clust_rec_modify_check_and_lock( /* If a transaction has no explicit x-lock set on the record, set one for it */ - if (lock_rec_convert_impl_to_expl(thr_get_trx(thr), block->page.id(), - rec, index, offsets)) { + if (lock_rec_convert_impl_to_expl<true>(thr_get_trx(thr), + block->page.id(), + rec, index, offsets)) { /* We already hold an implicit exclusive lock. */ return DB_SUCCESS; } @@ -5532,15 +5530,17 @@ lock_sec_rec_read_check_and_lock( return(DB_SUCCESS); } - const page_id_t id{block->page.id()}; - ut_ad(!rec_is_metadata(rec, *index)); trx_t *trx = thr_get_trx(thr); + + if (lock_table_has(trx, index->table, mode)) { + return DB_SUCCESS; + } + if (!page_rec_is_supremum(rec) - && !lock_table_has(trx, index->table, LOCK_X) - && lock_rec_convert_impl_to_expl(thr_get_trx(thr), id, rec, - index, offsets) + && lock_rec_convert_impl_to_expl<false>( + trx, block->page.id(), rec, index, offsets) && gap_mode == LOCK_REC_NOT_GAP) { /* We already hold an implicit exclusive lock. */ return DB_SUCCESS; @@ -5565,7 +5565,8 @@ lock_sec_rec_read_check_and_lock( if (trx->wsrep == 3) trx->wsrep = 1; #endif /* WITH_WSREP */ - ut_ad(lock_rec_queue_validate(false, id, rec, index, offsets)); + ut_ad(lock_rec_queue_validate(false, block->page.id(), + rec, index, offsets)); return(err); } @@ -5622,7 +5623,8 @@ lock_clust_rec_read_check_and_lock( trx_t *trx = thr_get_trx(thr); if (!lock_table_has(trx, index->table, LOCK_X) && heap_no != PAGE_HEAP_NO_SUPREMUM - && lock_rec_convert_impl_to_expl(trx, id, rec, index, offsets) + && lock_rec_convert_impl_to_expl<true>(trx, id, + rec, index, offsets) && gap_mode == LOCK_REC_NOT_GAP) { /* We already hold an implicit exclusive lock. */ return DB_SUCCESS; diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index e144b9fea38..02c6649bc33 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -1162,14 +1162,6 @@ wait_suspend_loop: if (!buf_pool.is_initialised()) { ut_ad(!srv_was_started); - } else if (ulint pending_io = buf_pool.io_pending()) { - if (srv_print_verbose_log && count > 600) { - ib::info() << "Waiting for " << pending_io << " buffer" - " page I/Os to complete"; - count = 0; - } - - goto loop; } else { buf_flush_buffer_pool(); } diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 54e2e1a42f4..3443369af6c 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -3004,7 +3004,7 @@ set_start_lsn: /* The following is adapted from buf_pool_t::insert_into_flush_list() */ mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_pool.stat.flush_list_bytes+= block->physical_size(); + buf_pool.flush_list_bytes+= block->physical_size(); block->page.set_oldest_modification(start_lsn); UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page); buf_pool.page_cleaner_wakeup(); diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index d642c7266bb..6d31a55e8ed 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -140,9 +140,9 @@ inline void buf_pool_t::insert_into_flush_list(buf_page_t *prev, UT_LIST_REMOVE(flush_list, &block->page); } else - stat.flush_list_bytes+= block->physical_size(); + flush_list_bytes+= block->physical_size(); - ut_ad(stat.flush_list_bytes <= curr_pool_size); + ut_ad(flush_list_bytes <= curr_pool_size); if (prev) UT_LIST_INSERT_AFTER(flush_list, prev, &block->page); diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc index a84cba01ef0..f489669b408 100644 --- a/storage/innobase/rem/rem0rec.cc +++ b/storage/innobase/rem/rem0rec.cc @@ -217,14 +217,12 @@ rec_get_n_extern_new( stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxxx xxxxxxxx */ - if (len & 0x40) { - n_extern++; - } - lens--; + if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col)) { + /* 1exxxxxxx xxxxxxxx */ + if (len & 0x40) { + n_extern++; } + lens--; } } } while (++i < n); @@ -244,6 +242,10 @@ enum rec_leaf_format { REC_LEAF_INSTANT }; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 11 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 to 10 need this */ +#endif /** Determine the offset to each field in a leaf-page record in ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED. This is a special case of rec_init_offsets() and rec_get_offsets_func(). @@ -361,8 +363,7 @@ start: do { if (mblob) { if (i == index->first_user_field()) { - offs = static_cast<rec_offs>(offs - + FIELD_REF_SIZE); + offs += FIELD_REF_SIZE; len = combine(offs, STORED_OFFPAGE); any |= REC_OFFS_EXTERNAL; field--; @@ -433,27 +434,23 @@ start: stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if ((len & 0x80) && DATA_BIG_COL(col)) { + if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col)) { /* 1exxxxxxx xxxxxxxx */ - len = static_cast<rec_offs>(len << 8 - | *lens--); - offs = static_cast<rec_offs>(offs - + get_value(len)); - if (UNIV_UNLIKELY(len & 0x4000)) { - ut_ad(index->is_primary()); - any |= REC_OFFS_EXTERNAL; - len = combine(offs, STORED_OFFPAGE); - } else { - len = offs; - } - + len <<= 8; + len |= *lens--; + static_assert(STORED_OFFPAGE == 0x4000, ""); + static_assert(REC_OFFS_EXTERNAL == 0x4000, ""); + const rec_offs ext = len & REC_OFFS_EXTERNAL; + offs += get_value(len); + len = offs | ext; + any |= ext; + ut_ad(!ext || index->is_primary()); continue; } - len = offs = static_cast<rec_offs>(offs + len); + len = offs += len; } else { - len = offs = static_cast<rec_offs>(offs - + field->fixed_len); + len = offs += field->fixed_len; } } while (field++, rec_offs_base(offsets)[++i] = len, i < rec_offs_n_fields(offsets)); @@ -679,8 +676,7 @@ rec_init_offsets( do { rec_offs len; if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - len = offs = static_cast<rec_offs>( - offs + REC_NODE_PTR_SIZE); + len = offs += REC_NODE_PTR_SIZE; goto resolved; } @@ -720,29 +716,25 @@ rec_init_offsets( encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxxx xxxxxxxx */ - len = static_cast<rec_offs>( - len << 8 | *lens--); - - /* B-tree node pointers - must not contain externally - stored columns. Thus - the "e" flag must be 0. */ - ut_a(!(len & 0x4000)); - offs = static_cast<rec_offs>( - offs + get_value(len)); - len = offs; - - goto resolved; - } + if (UNIV_UNLIKELY(len & 0x80) + && DATA_BIG_COL(col)) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens--; + + /* B-tree node pointers + must not contain externally + stored columns. Thus + the "e" flag must be 0. */ + ut_a(!(len & 0x4000)); + offs += len & 0x3fff; + len = offs; + goto resolved; } - len = offs = static_cast<rec_offs>(offs + len); + len = offs += len; } else { - len = offs = static_cast<rec_offs>( - offs + field->fixed_len); + len = offs += field->fixed_len; } resolved: rec_offs_base(offsets)[i + 1] = len; @@ -759,35 +751,30 @@ resolved: rec_offs any; if (rec_get_1byte_offs_flag(rec)) { - offs = static_cast<rec_offs>(offs + n_fields); + offs += static_cast<rec_offs>(n_fields); any = offs; /* Determine offsets to fields */ do { offs = rec_1_get_field_end_info(rec, i); if (offs & REC_1BYTE_SQL_NULL_MASK) { - offs &= static_cast<rec_offs>( - ~REC_1BYTE_SQL_NULL_MASK); - set_type(offs, SQL_NULL); + offs ^= REC_1BYTE_SQL_NULL_MASK + | SQL_NULL; } rec_offs_base(offsets)[1 + i] = offs; } while (++i < n); } else { - offs = static_cast<rec_offs>(offs + 2 * n_fields); + offs += static_cast<rec_offs>(2 * n_fields); any = offs; /* Determine offsets to fields */ do { offs = rec_2_get_field_end_info(rec, i); - if (offs & REC_2BYTE_SQL_NULL_MASK) { - offs &= static_cast<rec_offs>( - ~REC_2BYTE_SQL_NULL_MASK); - set_type(offs, SQL_NULL); - } - if (offs & REC_2BYTE_EXTERN_MASK) { - offs &= static_cast<rec_offs>( - ~REC_2BYTE_EXTERN_MASK); - set_type(offs, STORED_OFFPAGE); - any |= REC_OFFS_EXTERNAL; - } + static_assert(REC_2BYTE_SQL_NULL_MASK + == SQL_NULL, ""); + static_assert(REC_2BYTE_EXTERN_MASK + == STORED_OFFPAGE, ""); + static_assert(REC_OFFS_EXTERNAL + == STORED_OFFPAGE, ""); + any |= (offs & REC_OFFS_EXTERNAL); rec_offs_base(offsets)[1 + i] = offs; } while (++i < n); } @@ -996,8 +983,7 @@ rec_get_offsets_reverse( do { rec_offs len; if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - len = offs = static_cast<rec_offs>( - offs + REC_NODE_PTR_SIZE); + len = offs += REC_NODE_PTR_SIZE; goto resolved; } @@ -1034,30 +1020,23 @@ rec_get_offsets_reverse( stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxxx xxxxxxxx */ - len = static_cast<rec_offs>( - len << 8 | *lens++); - - offs = static_cast<rec_offs>( - offs + get_value(len)); - if (UNIV_UNLIKELY(len & 0x4000)) { - any_ext = REC_OFFS_EXTERNAL; - len = combine(offs, - STORED_OFFPAGE); - } else { - len = offs; - } - - goto resolved; - } + if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col)) { + /* 1exxxxxxx xxxxxxxx */ + len &= 0x7f; + len <<= 8; + len |= *lens++; + static_assert(STORED_OFFPAGE == 0x4000, ""); + static_assert(REC_OFFS_EXTERNAL == 0x4000, ""); + rec_offs ext = len & REC_OFFS_EXTERNAL; + offs += get_value(len); + len = offs | ext; + any_ext |= ext; + goto resolved; } - len = offs = static_cast<rec_offs>(offs + len); + len = offs += len; } else { - len = offs = static_cast<rec_offs>(offs - + field->fixed_len); + len = offs += field->fixed_len; } resolved: rec_offs_base(offsets)[i + 1] = len; @@ -1097,7 +1076,7 @@ rec_get_nth_field_offs_old( return(os); } - next_os = next_os & ~REC_1BYTE_SQL_NULL_MASK; + next_os &= ~REC_1BYTE_SQL_NULL_MASK; } else { os = rec_2_get_field_start_offs(rec, n); @@ -1109,8 +1088,7 @@ rec_get_nth_field_offs_old( return(os); } - next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK - | REC_2BYTE_EXTERN_MASK); + next_os &= ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK); } *len = next_os - os; @@ -1263,7 +1241,8 @@ rec_get_converted_size_comp_prefix_low( } else if (dfield_is_ext(dfield)) { ut_ad(DATA_BIG_COL(field->col)); extra_size += 2; - } else if (len < 128 || !DATA_BIG_COL(field->col)) { + } else if (UNIV_LIKELY(len < 128) + || !DATA_BIG_COL(field->col)) { extra_size++; } else { /* For variable-length columns, we look up the @@ -1614,14 +1593,7 @@ start: /* set the null flag if necessary */ if (dfield_is_null(field)) { -#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ -#endif *nulls |= static_cast<byte>(null_mask); -#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 -# pragma GCC diagnostic pop -#endif null_mask <<= 1; continue; } @@ -1730,6 +1702,9 @@ rec_convert_dtuple_to_rec_new( REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); return buf; } +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 11 +# pragma GCC diagnostic pop /* ignored "-Wconversion" */ +#endif /*********************************************************//** Builds a physical record out of a data tuple and @@ -2092,14 +2067,12 @@ rec_copy_prefix_to_buf( stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the column is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxx */ - len &= 0x3f; - len <<= 8; - len |= *lens--; - UNIV_PREFETCH_R(lens); - } + if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col)) { + /* 1exxxxxx */ + len &= 0x3f; + len <<= 8; + len |= *lens--; + UNIV_PREFETCH_R(lens); } prefix_len += len; } diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index a20991cb819..24fb6eb39ce 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -2252,7 +2252,7 @@ row_ins_duplicate_online(ulint n_uniq, const dtuple_t *entry, ulint trx_id_len; - if (fields == n_uniq + if (fields == n_uniq + 2 && memcmp(rec_get_nth_field(rec, offsets, n_uniq, &trx_id_len), reset_trx_id, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { ut_ad(trx_id_len == DATA_TRX_ID_LEN); diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index 5f1b8be701e..010b347c003 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -3042,6 +3042,9 @@ row_log_apply_op_low( mtr_start(&mtr); index->set_modified(mtr); cursor.page_cur.index = index; + if (has_index_lock) { + mtr_x_lock_index(index, &mtr); + } /* We perform the pessimistic variant of the operations if we already hold index->lock exclusively. First, search the @@ -3049,7 +3052,8 @@ row_log_apply_op_low( depending on when the row in the clustered index was scanned. */ *error = cursor.search_leaf(entry, PAGE_CUR_LE, has_index_lock - ? BTR_MODIFY_TREE : BTR_MODIFY_LEAF, &mtr); + ? BTR_MODIFY_TREE_ALREADY_LATCHED + : BTR_MODIFY_LEAF, &mtr); if (UNIV_UNLIKELY(*error != DB_SUCCESS)) { goto func_exit; } diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index f277ec4ec95..987d3d185d9 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -832,7 +832,7 @@ static monitor_info_t innodb_counter_info[] = MONITOR_DEFAULT_START, MONITOR_MODULE_INDEX}, {"index_page_splits", "index", "Number of index page splits", - MONITOR_NONE, + MONITOR_EXISTING, MONITOR_DEFAULT_START, MONITOR_INDEX_SPLIT}, {"index_page_merge_attempts", "index", @@ -1240,10 +1240,12 @@ srv_mon_process_existing_counter( /* Get the value from corresponding global variable */ switch (monitor_id) { - /* export_vars.innodb_buffer_pool_reads. Num Reads from - disk (page not in buffer) */ + case MONITOR_INDEX_SPLIT: + value = buf_pool.pages_split; + break; + case MONITOR_OVLD_BUF_POOL_READS: - value = srv_stats.buf_pool_reads; + value = buf_pool.stat.n_pages_read; break; /* innodb_buffer_pool_read_requests, the number of logical @@ -1304,7 +1306,7 @@ srv_mon_process_existing_counter( /* innodb_buffer_pool_bytes_dirty */ case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY: - value = buf_pool.stat.flush_list_bytes; + value = buf_pool.flush_list_bytes; break; /* innodb_buffer_pool_pages_free */ diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index efa018e337a..d8babd40468 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -644,6 +644,7 @@ void srv_boot() if (transactional_lock_enabled()) sql_print_information("InnoDB: Using transactional memory"); #endif + buf_dblwr.init(); srv_thread_pool_init(); trx_pool_init(); srv_init(); @@ -896,56 +897,19 @@ srv_export_innodb_status(void) export_vars.innodb_data_writes = os_n_file_writes; - ulint dblwr = 0; - - if (buf_dblwr.is_initialised()) { - buf_dblwr.lock(); - dblwr = buf_dblwr.submitted(); - export_vars.innodb_dblwr_pages_written = buf_dblwr.written(); - export_vars.innodb_dblwr_writes = buf_dblwr.batches(); - buf_dblwr.unlock(); - } + buf_dblwr.lock(); + ulint dblwr = buf_dblwr.submitted(); + export_vars.innodb_dblwr_pages_written = buf_dblwr.written(); + export_vars.innodb_dblwr_writes = buf_dblwr.batches(); + buf_dblwr.unlock(); export_vars.innodb_data_written = srv_stats.data_written + dblwr; - export_vars.innodb_buffer_pool_read_requests - = buf_pool.stat.n_page_gets; - - export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads; - - export_vars.innodb_buffer_pool_read_ahead_rnd = - buf_pool.stat.n_ra_pages_read_rnd; - - export_vars.innodb_buffer_pool_read_ahead = - buf_pool.stat.n_ra_pages_read; - - export_vars.innodb_buffer_pool_read_ahead_evicted = - buf_pool.stat.n_ra_pages_evicted; - - export_vars.innodb_buffer_pool_pages_data = - UT_LIST_GET_LEN(buf_pool.LRU); - export_vars.innodb_buffer_pool_bytes_data = buf_pool.stat.LRU_bytes + (UT_LIST_GET_LEN(buf_pool.unzip_LRU) << srv_page_size_shift); - export_vars.innodb_buffer_pool_pages_dirty = - UT_LIST_GET_LEN(buf_pool.flush_list); - - export_vars.innodb_buffer_pool_pages_made_young - = buf_pool.stat.n_pages_made_young; - export_vars.innodb_buffer_pool_pages_made_not_young - = buf_pool.stat.n_pages_not_made_young; - - export_vars.innodb_buffer_pool_pages_old = buf_pool.LRU_old_len; - - export_vars.innodb_buffer_pool_bytes_dirty = - buf_pool.stat.flush_list_bytes; - - export_vars.innodb_buffer_pool_pages_free = - UT_LIST_GET_LEN(buf_pool.free); - #ifdef UNIV_DEBUG export_vars.innodb_buffer_pool_pages_latched = buf_get_latched_pages_number(); diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 05d17f9b883..5266450ce10 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -689,7 +689,8 @@ err_exit: fil_set_max_space_id_if_bigger(space_id); fil_space_t *space= fil_space_t::create(space_id, fsp_flags, - FIL_TYPE_TABLESPACE, NULL); + FIL_TYPE_TABLESPACE, nullptr, + FIL_ENCRYPTION_DEFAULT, true); ut_a(fil_validate()); ut_a(space); @@ -1034,9 +1035,7 @@ ATTRIBUTE_COLD static lsn_t srv_prepare_to_delete_redo_log_file() { DBUG_ENTER("srv_prepare_to_delete_redo_log_file"); - /* Disable checkpoints in the page cleaner. */ - ut_ad(!recv_sys.recovery_on); - recv_sys.recovery_on= true; + ut_ad(recv_sys.recovery_on); /* Clean the buffer pool. */ buf_flush_sync(); @@ -1666,8 +1665,6 @@ dberr_t srv_start(bool create_new_db) } } - recv_sys.debug_free(); - if (srv_operation != SRV_OPERATION_NORMAL) { ut_ad(srv_operation == SRV_OPERATION_RESTORE_EXPORT || srv_operation == SRV_OPERATION_RESTORE); @@ -1682,6 +1679,8 @@ dberr_t srv_start(bool create_new_db) if (err != DB_SUCCESS) { return(srv_init_abort(err)); } + + recv_sys.debug_free(); } ut_ad(err == DB_SUCCESS); @@ -1986,7 +1985,7 @@ void innodb_shutdown() ut_ad(dict_sys.is_initialised() || !srv_was_started); ut_ad(trx_sys.is_initialised() || !srv_was_started); - ut_ad(buf_dblwr.is_initialised() || !srv_was_started + ut_ad(buf_dblwr.is_created() || !srv_was_started || srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); ut_ad(lock_sys.is_initialised() || !srv_was_started); diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc index 45dc78b4440..99cf1364192 100644 --- a/storage/innobase/trx/trx0roll.cc +++ b/storage/innobase/trx/trx0roll.cc @@ -557,9 +557,13 @@ trx_release_savepoint_for_mysql( if (savep != NULL) { trx_roll_savepoint_free(trx, savep); + return DB_SUCCESS; + } else if (trx->last_sql_stat_start.least_undo_no == 0) { + /* Bulk insert could have discarded savepoints */ + return DB_SUCCESS; } - return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT); + return DB_NO_SAVEPOINT; } /*******************************************************************//** diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index ce7a252aa01..e88f7824ba6 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -404,6 +404,7 @@ void trx_t::free() sizeof skip_lock_inheritance_and_n_ref); /* do not poison mutex */ MEM_NOACCESS(&id, sizeof id); + MEM_NOACCESS(&max_inactive_id, sizeof id); MEM_NOACCESS(&state, sizeof state); MEM_NOACCESS(&is_recovered, sizeof is_recovered); #ifdef WITH_WSREP |