diff options
author | Marko Mäkelä <marko.makela@mariadb.com> | 2023-01-23 14:52:49 +0200 |
---|---|---|
committer | Marko Mäkelä <marko.makela@mariadb.com> | 2023-01-23 14:52:49 +0200 |
commit | e41fb3697c9f10b3db2e35b6a5704a03b1041c7e (patch) | |
tree | 4c36737d6a6edddee0ca59044e30d2a15ea6d014 /storage/innobase/btr | |
parent | 851c56771e11d50648430bb47644966996b9aa82 (diff) | |
download | mariadb-git-e41fb3697c9f10b3db2e35b6a5704a03b1041c7e.tar.gz |
Revert "MDEV-30400 Assertion height == btr_page_get_level(...) on INSERT"
This reverts commit f9cac8d2cbf82d4d616905fb3dfab34a9901179d
which was accidentally pushed prematurely.
Diffstat (limited to 'storage/innobase/btr')
-rw-r--r-- | storage/innobase/btr/btr0btr.cc | 498 | ||||
-rw-r--r-- | storage/innobase/btr/btr0cur.cc | 2254 | ||||
-rw-r--r-- | storage/innobase/btr/btr0defragment.cc | 68 | ||||
-rw-r--r-- | storage/innobase/btr/btr0pcur.cc | 109 | ||||
-rw-r--r-- | storage/innobase/btr/btr0sea.cc | 22 |
5 files changed, 1635 insertions, 1316 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index ef44ed5d9d6..0bb16dba374 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -2,7 +2,7 @@ Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2014, 2023, MariaDB Corporation. +Copyright (c) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -460,53 +460,6 @@ btr_page_create( } } -buf_block_t * -mtr_t::get_already_latched(const page_id_t id, mtr_memo_type_t type) const -{ - ut_ad(is_active()); - ut_ad(type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX || - type == MTR_MEMO_PAGE_S_FIX); - for (ulint i= 0; i < m_memo.size(); i++) - { - const mtr_memo_slot_t &slot= m_memo[i]; - const auto slot_type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY); - if (slot_type == MTR_MEMO_PAGE_X_FIX || slot_type == type) - { - buf_block_t *block= static_cast<buf_block_t*>(slot.object); - if (block->page.id() == id) - return block; - } - } - return nullptr; -} - -/** Fetch an index root page that was already latched in the -mini-transaction. */ -static buf_block_t *btr_get_latched_root(const dict_index_t &index, mtr_t *mtr) -{ - return mtr->get_already_latched(page_id_t{index.table->space_id, index.page}, - MTR_MEMO_PAGE_SX_FIX); -} - -/** Fet an index page that should have been already latched in the -mini-transaction. */ -static buf_block_t * -btr_block_reget(mtr_t *mtr, const dict_index_t &index, - const page_id_t id, rw_lock_type_t rw_latch, - dberr_t *err) -{ - if (buf_block_t *block= - mtr->get_already_latched(id, mtr_memo_type_t(rw_latch))) - { - *err= DB_SUCCESS; - return block; - } - - /* MDEV-29385 FIXME: Acquire the page latch upfront. */ - ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK)); - return btr_block_get(index, id.page_no(), rw_latch, true, mtr, err); -} - /**************************************************************//** Allocates a new file page to be used in an ibuf tree. Takes the page from the free list of the tree, which must contain pages! @@ -519,16 +472,18 @@ btr_page_alloc_for_ibuf( mtr_t* mtr, /*!< in: mtr */ dberr_t* err) /*!< out: error code */ { - buf_block_t *root= btr_get_latched_root(*index, mtr); + buf_block_t *root= btr_root_block_get(index, RW_SX_LATCH, mtr, err); if (UNIV_UNLIKELY(!root)) return root; + buf_block_t *new_block= - buf_page_get_gen(page_id_t(IBUF_SPACE_ID, + buf_page_get_gen(page_id_t(index->table->space_id, mach_read_from_4(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST + FLST_FIRST + FIL_ADDR_PAGE + root->page.frame)), - 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err); + index->table->space->zip_size(), RW_X_LATCH, nullptr, + BUF_GET, mtr, err); if (new_block) *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); @@ -568,11 +523,11 @@ btr_page_alloc_low( #ifdef BTR_CUR_HASH_ADAPT ut_ad(!root->index || !root->index->freed()); #endif - mtr->rollback_to_savepoint(savepoint); + mtr->release_block_at_savepoint(savepoint, root); } else { - mtr->lock_register(savepoint, MTR_MEMO_PAGE_SX_FIX); + mtr->u_lock_register(savepoint); root->page.lock.u_lock(); #ifdef BTR_CUR_HASH_ADAPT btr_search_drop_page_hash_index(root, true); @@ -624,12 +579,15 @@ btr_page_free_for_ibuf( mtr_t* mtr) /*!< in: mtr */ { ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); - buf_block_t *root= btr_get_latched_root(*index, mtr); - dberr_t err= - flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + + dberr_t err; + if (buf_block_t *root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err)) + { + err= flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); - ut_d(if (err == DB_SUCCESS) - flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + ut_d(if (err == DB_SUCCESS) + flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + } return err; } @@ -679,11 +637,11 @@ dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, #ifdef BTR_CUR_HASH_ADAPT ut_ad(!root->index || !root->index->freed()); #endif - mtr->rollback_to_savepoint(savepoint); + mtr->release_block_at_savepoint(savepoint, root); } else { - mtr->lock_register(savepoint, MTR_MEMO_PAGE_SX_FIX); + mtr->u_lock_register(savepoint); root->page.lock.u_lock(); #ifdef BTR_CUR_HASH_ADAPT btr_search_drop_page_hash_index(root, true); @@ -754,27 +712,35 @@ btr_node_ptr_get_child( mtr, err); } -MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) +MY_ATTRIBUTE((nonnull(2,3,5), warn_unused_result)) /************************************************************//** Returns the upper level node pointer to a page. It is assumed that mtr holds an sx-latch on the tree. @return rec_get_offsets() of the node pointer record */ static rec_offs* -btr_page_get_father_node_ptr_for_validate( +btr_page_get_father_node_ptr_func( +/*==============================*/ rec_offs* offsets,/*!< in: work area for the return value */ mem_heap_t* heap, /*!< in: memory heap to use */ btr_cur_t* cursor, /*!< in: cursor pointing to user record, out: cursor on node pointer record, its page x-latched */ + btr_latch_mode latch_mode,/*!< in: BTR_CONT_MODIFY_TREE + or BTR_CONT_SEARCH_TREE */ mtr_t* mtr) /*!< in: mtr */ { + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE); + const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no(); dict_index_t* index = btr_cur_get_index(cursor); ut_ad(!dict_index_is_spatial(index)); - ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK - | MTR_MEMO_SX_LOCK)); + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(dict_index_get_page(index) != page_no); const auto level = btr_page_get_level(btr_cur_get_page(cursor)); @@ -786,16 +752,12 @@ btr_page_get_father_node_ptr_for_validate( dict_index_build_node_ptr(index, user_rec, 0, heap, level), - RW_S_LATCH, + PAGE_CUR_LE, latch_mode, cursor, mtr) != DB_SUCCESS) { return nullptr; } const rec_t* node_ptr = btr_cur_get_rec(cursor); -#if 0 /* MDEV-29835 FIXME */ - ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive() - || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -#endif offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); @@ -807,64 +769,13 @@ btr_page_get_father_node_ptr_for_validate( return(offsets); } -MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) -/************************************************************//** -Returns the upper level node pointer to a page. It is assumed that -it has already been latched. -@return rec_get_offsets() of the node pointer record */ -static -rec_offs* -btr_page_get_parent( - rec_offs* offsets,/*!< in: work area for the return value */ - mem_heap_t* heap, /*!< in: memory heap to use */ - btr_cur_t* cursor, /*!< in: cursor pointing to user record, - out: cursor on node pointer record, - its page x-latched */ - mtr_t* mtr) /*!< in: mtr */ -{ - const uint32_t page_no= cursor->block()->page.id().page_no(); - const dict_index_t *index= cursor->index(); - ut_ad(!index->is_spatial()); - ut_ad(index->page != page_no); - - uint32_t p= index->page; - const dtuple_t *tuple= - dict_index_build_node_ptr(index, btr_cur_get_rec(cursor), 0, heap, - btr_page_get_level(btr_cur_get_page(cursor))); - - ulint i; - for (i= 0; i < mtr->get_savepoint(); i++) - if (buf_block_t *block= mtr->block_at_savepoint(i)) - if (block->page.id().page_no() == p) - { - ut_ad(block->page.lock.have_u_or_x() || - (!block->page.lock.have_s() && index->lock.have_x())); - ulint up_match= 0, low_match= 0; - cursor->page_cur.block= block; - if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match, - &low_match, &cursor->page_cur, - nullptr)) - return nullptr; - offsets= rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, - ULINT_UNDEFINED, &heap); - p= btr_node_ptr_get_child_page_no(cursor->page_cur.rec, offsets); - if (p != page_no) - { - i= 0; // MDEV-29835 FIXME: require all pages to be latched in order! - continue; - } - ut_ad(block->page.lock.have_u_or_x()); - if (block->page.lock.have_u_not_x()) - { - ut_ad(block->page.id().page_no() == index->page); - block->page.lock.u_x_upgrade(); - mtr->page_lock_upgrade(*block); - } - return offsets; - } +#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \ + btr_page_get_father_node_ptr_func( \ + of,heap,cur,BTR_CONT_MODIFY_TREE,mtr) - return nullptr; -} +#define btr_page_get_father_node_ptr_for_validate(of,heap,cur,mtr) \ + btr_page_get_father_node_ptr_func( \ + of,heap,cur,BTR_CONT_SEARCH_TREE,mtr) /************************************************************//** Returns the upper level node pointer to a page. It is assumed that mtr holds @@ -885,7 +796,7 @@ btr_page_get_father_block( if (UNIV_UNLIKELY(!rec)) return nullptr; cursor->page_cur.rec= rec; - return btr_page_get_parent(offsets, heap, cursor, mtr); + return btr_page_get_father_node_ptr(offsets, heap, cursor, mtr); } /** Seek to the parent page of a B-tree page. @@ -900,7 +811,7 @@ bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor) return false; cursor->page_cur.rec= rec; mem_heap_t *heap= mem_heap_create(100); - const bool got= btr_page_get_parent(nullptr, heap, cursor, mtr); + const bool got= btr_page_get_father_node_ptr(nullptr, heap, cursor, mtr); mem_heap_free(heap); return got; } @@ -1807,43 +1718,48 @@ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr) /** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE. @param[in] index clustered index with instant ALTER TABLE @param[in] all whether to reset FIL_PAGE_TYPE as well -@param[in,out] mtr mini-transaction */ +@param[in,out] mtr mini-transaction +@return error code */ ATTRIBUTE_COLD -void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) +dberr_t btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) { ut_ad(!index.table->is_temporary()); ut_ad(index.is_primary()); - buf_block_t *root= btr_get_latched_root(index, mtr); - byte *page_type= root->page.frame + FIL_PAGE_TYPE; - if (all) - { - ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || - mach_read_from_2(page_type) == FIL_PAGE_INDEX); - mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX); - byte *instant= PAGE_INSTANT + PAGE_HEADER + root->page.frame; - mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant, - page_ptr_get_direction(instant + 1)); - } - else - ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); - static const byte supremuminfimum[8 + 8] = "supremuminfimum"; - uint16_t infimum, supremum; - if (page_is_comp(root->page.frame)) - { - infimum= PAGE_NEW_INFIMUM; - supremum= PAGE_NEW_SUPREMUM; - } - else + dberr_t err; + if (buf_block_t *root= btr_root_block_get(&index, RW_SX_LATCH, mtr, &err)) { - infimum= PAGE_OLD_INFIMUM; - supremum= PAGE_OLD_SUPREMUM; + byte *page_type= root->page.frame + FIL_PAGE_TYPE; + if (all) + { + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || + mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX); + byte *instant= PAGE_INSTANT + PAGE_HEADER + root->page.frame; + mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant, + page_ptr_get_direction(instant + 1)); + } + else + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); + static const byte supremuminfimum[8 + 8] = "supremuminfimum"; + uint16_t infimum, supremum; + if (page_is_comp(root->page.frame)) + { + infimum= PAGE_NEW_INFIMUM; + supremum= PAGE_NEW_SUPREMUM; + } + else + { + infimum= PAGE_OLD_INFIMUM; + supremum= PAGE_OLD_SUPREMUM; + } + ut_ad(!memcmp(&root->page.frame[infimum], supremuminfimum + 8, 8) == + !memcmp(&root->page.frame[supremum], supremuminfimum, 8)); + mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[infimum], + supremuminfimum + 8, 8); + mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[supremum], + supremuminfimum, 8); } - ut_ad(!memcmp(&root->page.frame[infimum], supremuminfimum + 8, 8) == - !memcmp(&root->page.frame[supremum], supremuminfimum, 8)); - mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[infimum], - supremuminfimum + 8, 8); - mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[supremum], - supremuminfimum, 8); + return err; } /*************************************************************//** @@ -1940,6 +1856,11 @@ btr_root_raise_and_insert( } /* Copy the records from root to the new page one by one. */ + dberr_t e; + if (!err) { + err = &e; + } + if (0 #ifdef UNIV_ZIP_COPY || new_page_zip @@ -2083,15 +2004,21 @@ btr_root_raise_and_insert( page_cursor->block = new_block; page_cursor->index = index; - ut_ad(dtuple_check_typed(tuple)); - /* Reposition the cursor to the child node */ - ulint low_match = 0, up_match = 0; + if (tuple) { + ut_ad(dtuple_check_typed(tuple)); + /* Reposition the cursor to the child node */ + ulint low_match = 0, up_match = 0; - if (page_cur_search_with_match(tuple, PAGE_CUR_LE, - &up_match, &low_match, - page_cursor, nullptr)) { - *err = DB_CORRUPTION; - return nullptr; + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, + &up_match, &low_match, + page_cursor, nullptr)) { + if (err) { + *err = DB_CORRUPTION; + } + return nullptr; + } + } else { + page_cursor->rec = page_get_infimum_rec(new_block->page.frame); } /* Split the child and insert tuple */ @@ -2310,7 +2237,6 @@ func_exit: return(rec); } -#ifdef UNIV_DEBUG /*************************************************************//** Returns TRUE if the insert fits on the appropriate half-page with the chosen split_rec. @@ -2408,7 +2334,6 @@ got_rec: return(false); } -#endif /*******************************************************//** Inserts a data tuple to a tree on a non-leaf level. It is assumed @@ -2431,34 +2356,25 @@ btr_insert_on_non_leaf_level( rtr_info_t rtr_info; ut_ad(level > 0); - - flags |= BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG - | BTR_NO_UNDO_LOG_FLAG; - cursor.page_cur.index = index; - - dberr_t err; + auto mode = PAGE_CUR_LE; if (index->is_spatial()) { + mode = PAGE_CUR_RTREE_INSERT; /* For spatial index, initialize structures to track its parents etc. */ rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); rtr_info_update_btr(&cursor, &rtr_info); - err = rtr_search_to_nth_level(level, tuple, - PAGE_CUR_RTREE_INSERT, - BTR_CONT_MODIFY_TREE, - &cursor, mtr); - } else { - err = btr_cur_search_to_nth_level(level, tuple, RW_X_LATCH, - &cursor, mtr); } + flags |= BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG; + cursor.page_cur.index = index; + + dberr_t err = btr_cur_search_to_nth_level(level, tuple, mode, + BTR_CONT_MODIFY_TREE, + &cursor, mtr); ut_ad(cursor.flag == BTR_CUR_BINARY); -#if 0 /* MDEV-29835 FIXME */ - ut_ad(!btr_cur_get_block(&cursor)->page.lock.not_recursive() - || index->is_spatial() - || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -#endif if (UNIV_LIKELY(err == DB_SUCCESS)) { err = btr_cur_optimistic_insert(flags, @@ -2554,7 +2470,6 @@ btr_attach_half_pages( /* Get the level of the split pages */ const ulint level = btr_page_get_level(block->page.frame); ut_ad(level == btr_page_get_level(new_block->page.frame)); - page_id_t id{block->page.id()}; /* Get the previous and next pages of page */ const uint32_t prev_page_no = btr_page_get_prev(block->page.frame); @@ -2562,32 +2477,12 @@ btr_attach_half_pages( /* for consistency, both blocks should be locked, before change */ if (prev_page_no != FIL_NULL && direction == FSP_DOWN) { - id.set_page_no(prev_page_no); - prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); -#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ - if (!prev_block) { -# if 0 /* MDEV-29835 FIXME */ - ut_ad(mtr->memo_contains(index->lock, - MTR_MEMO_X_LOCK)); -# endif - prev_block = btr_block_get(*index, prev_page_no, - RW_X_LATCH, !level, mtr); - } -#endif + prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH, + !level, mtr); } if (next_page_no != FIL_NULL && direction != FSP_DOWN) { - id.set_page_no(next_page_no); - next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); -#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ - if (!next_block) { -# if 0 /* MDEV-29835 FIXME */ - ut_ad(mtr->memo_contains(index->lock, - MTR_MEMO_X_LOCK)); -# endif - next_block = btr_block_get(*index, next_page_no, - RW_X_LATCH, !level, mtr); - } -#endif + next_block = btr_block_get(*index, next_page_no, RW_X_LATCH, + !level, mtr); } /* Build the node pointer (= node key and page address) for the upper @@ -3123,7 +3018,6 @@ insert_empty: return nullptr; } -#ifdef UNIV_DEBUG /* If the split is made on the leaf level and the insert will fit on the appropriate half-page, we may release the tree x-latch. We can then move the records after releasing the tree latch, @@ -3131,21 +3025,21 @@ insert_empty: const bool insert_will_fit = !new_page_zip && btr_page_insert_fits(cursor, split_rec, offsets, tuple, n_ext, heap); -#endif if (!split_rec && !insert_left) { UT_DELETE_ARRAY(buf); buf = NULL; } -#if 0 // FIXME: this used to be a no-op, and may cause trouble if enabled - if (insert_will_fit + if (!srv_read_only_mode + && insert_will_fit && page_is_leaf(page) && !dict_index_is_online_ddl(cursor->index())) { +#if 0 // FIXME: this used to be a no-op, and may cause trouble if enabled mtr->release(cursor->index()->lock); +#endif /* NOTE: We cannot release root block latch here, because it has segment header and already modified in most of cases.*/ } -#endif /* 5. Move then the records to the new page */ if (direction == FSP_DOWN) { @@ -3377,58 +3271,52 @@ func_exit: dberr_t btr_level_list_remove(const buf_block_t& block, const dict_index_t& index, mtr_t* mtr) { - ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX)); - ut_ad(block.zip_size() == index.table->space->zip_size()); - ut_ad(index.table->space->id == block.page.id().space()); - /* Get the previous and next page numbers of page */ - const uint32_t prev_page_no= btr_page_get_prev(block.page.frame); - const uint32_t next_page_no= btr_page_get_next(block.page.frame); - page_id_t id{block.page.id()}; - buf_block_t *prev= nullptr, *next; - dberr_t err; + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(block.zip_size() == index.table->space->zip_size()); + ut_ad(index.table->space->id == block.page.id().space()); + /* Get the previous and next page numbers of page */ - /* Update page links of the level */ - if (prev_page_no != FIL_NULL) - { - id.set_page_no(prev_page_no); - prev= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); -#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ - if (!prev) - { -# if 0 /* MDEV-29835 FIXME */ - ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); -# endif - prev= btr_block_get(index, id.page_no(), RW_X_LATCH, - page_is_leaf(block.page.frame), mtr, &err); - if (UNIV_UNLIKELY(!prev)) - return err; - } -#endif - } + const page_t* page = block.page.frame; + const uint32_t prev_page_no = btr_page_get_prev(page); + const uint32_t next_page_no = btr_page_get_next(page); - if (next_page_no != FIL_NULL) - { - id.set_page_no(next_page_no); - next= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); -#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ - if (!next) - { -# if 0 /* MDEV-29835 FIXME */ - ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); -# endif - next= btr_block_get(index, id.page_no(), RW_X_LATCH, - page_is_leaf(block.page.frame), mtr, &err); - if (UNIV_UNLIKELY(!next)) - return err; - } -#endif - btr_page_set_prev(next, prev_page_no, mtr); - } + /* Update page links of the level */ + dberr_t err; - if (prev) - btr_page_set_next(prev, next_page_no, mtr); + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block = btr_block_get( + index, prev_page_no, RW_X_LATCH, page_is_leaf(page), + mtr, &err); + if (UNIV_UNLIKELY(!prev_block)) { + return err; + } + if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_block->page.frame + + FIL_PAGE_NEXT, + page + FIL_PAGE_OFFSET, + 4))) { + return DB_CORRUPTION; + } + btr_page_set_next(prev_block, next_page_no, mtr); + } - return DB_SUCCESS; + if (next_page_no != FIL_NULL) { + buf_block_t* next_block = btr_block_get( + index, next_page_no, RW_X_LATCH, page_is_leaf(page), + mtr, &err); + + if (UNIV_UNLIKELY(!next_block)) { + return err; + } + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame + + FIL_PAGE_PREV, + page + FIL_PAGE_OFFSET, + 4))) { + return DB_CORRUPTION; + } + btr_page_set_prev(next_block, prev_page_no, mtr); + } + + return DB_SUCCESS; } /*************************************************************//** @@ -4278,30 +4166,23 @@ btr_discard_page( const uint32_t left_page_no = btr_page_get_prev(block->page.frame); const uint32_t right_page_no = btr_page_get_next(block->page.frame); - page_id_t merge_page_id{block->page.id()}; ut_d(bool parent_is_different = false); - dberr_t err; if (left_page_no != FIL_NULL) { - merge_page_id.set_page_no(left_page_no); - merge_block = btr_block_reget(mtr, *index, merge_page_id, - RW_X_LATCH, &err); + dberr_t err; + merge_block = btr_block_get(*index, left_page_no, RW_X_LATCH, + true, mtr, &err); if (UNIV_UNLIKELY(!merge_block)) { return err; } -#if 0 /* MDEV-29385 FIXME: Acquire the page latch upfront. */ - ut_ad(!memcmp_aligned<4>(merge_block->page.frame - + FIL_PAGE_NEXT, - block->page.frame + FIL_PAGE_OFFSET, - 4)); -#else + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_NEXT, block->page.frame + FIL_PAGE_OFFSET, 4))) { return DB_CORRUPTION; } -#endif + ut_d(parent_is_different = (page_rec_get_next( page_get_infimum_rec( @@ -4309,25 +4190,19 @@ btr_discard_page( &parent_cursor))) == btr_cur_get_rec(&parent_cursor))); } else if (right_page_no != FIL_NULL) { - merge_page_id.set_page_no(right_page_no); - merge_block = btr_block_reget(mtr, *index, merge_page_id, - RW_X_LATCH, &err); + dberr_t err; + merge_block = btr_block_get(*index, right_page_no, RW_X_LATCH, + true, mtr, &err); if (UNIV_UNLIKELY(!merge_block)) { return err; } -#if 0 /* MDEV-29385 FIXME: Acquire the page latch upfront. */ - ut_ad(!memcmp_aligned<4>(merge_block->page.frame - + FIL_PAGE_PREV, - block->page.frame + FIL_PAGE_OFFSET, - 4)); -#else if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_PREV, block->page.frame + FIL_PAGE_OFFSET, 4))) { return DB_CORRUPTION; } -#endif + ut_d(parent_is_different = page_rec_is_supremum( page_rec_get_next(btr_cur_get_rec(&parent_cursor)))); if (page_is_leaf(merge_block->page.frame)) { @@ -4369,10 +4244,13 @@ btr_discard_page( } #ifdef UNIV_ZIP_DEBUG - if (page_zip_des_t* merge_page_zip - = buf_block_get_page_zip(merge_block)); - ut_a(page_zip_validate(merge_page_zip, - merge_block->page.frame, index)); + { + page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block); + ut_a(!merge_page_zip + || page_zip_validate(merge_page_zip, + merge_block->page.frame, index)); + } #endif /* UNIV_ZIP_DEBUG */ if (index->has_locking()) { @@ -4391,7 +4269,7 @@ btr_discard_page( } /* Free the file page */ - err = btr_page_free(index, block, mtr); + dberr_t err = btr_page_free(index, block, mtr); if (err == DB_SUCCESS) { /* btr_check_node_ptr() needs parent block latched. @@ -4584,8 +4462,6 @@ btr_check_node_ptr( offsets = btr_page_get_father_block(NULL, heap, mtr, &cursor); } - ut_ad(offsets); - if (page_is_leaf(page)) { goto func_exit; @@ -4917,16 +4793,19 @@ btr_validate_level( page_zip_des_t* page_zip; #endif /* UNIV_ZIP_DEBUG */ ulint savepoint = 0; + ulint savepoint2 = 0; uint32_t parent_page_no = FIL_NULL; uint32_t parent_right_page_no = FIL_NULL; bool rightmost_child = false; mtr.start(); - if (lockout) { - mtr_x_lock_index(index, &mtr); - } else { - mtr_sx_lock_index(index, &mtr); + if (!srv_read_only_mode) { + if (lockout) { + mtr_x_lock_index(index, &mtr); + } else { + mtr_sx_lock_index(index, &mtr); + } } dberr_t err; @@ -4974,6 +4853,7 @@ corrupted: offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); + savepoint2 = mtr_set_savepoint(&mtr); block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr, &err); if (!block) { @@ -4994,8 +4874,10 @@ corrupted: /* To obey latch order of tree blocks, we should release the right_block once to obtain lock of the uncle block. */ - mtr.release_last_page(); + mtr_release_block_at_savepoint( + &mtr, savepoint2, block); + savepoint2 = mtr_set_savepoint(&mtr); block = btr_block_get(*index, left_page_no, RW_SX_LATCH, false, &mtr, &err); @@ -5023,10 +4905,12 @@ func_exit: mem_heap_empty(heap); offsets = offsets2 = NULL; - if (lockout) { - mtr_x_lock_index(index, &mtr); - } else { - mtr_sx_lock_index(index, &mtr); + if (!srv_read_only_mode) { + if (lockout) { + mtr_x_lock_index(index, &mtr); + } else { + mtr_sx_lock_index(index, &mtr); + } } page = block->page.frame; @@ -5071,7 +4955,7 @@ func_exit: if (right_page_no != FIL_NULL) { const rec_t* right_rec; - savepoint = mtr.get_savepoint(); + savepoint = mtr_set_savepoint(&mtr); right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, !level, &mtr, &err); @@ -5266,10 +5150,8 @@ broken_links: /* To obey latch order of tree blocks, we should release the right_block once to obtain lock of the uncle block. */ - ut_ad(right_block - == mtr.at_savepoint(savepoint)); - mtr.rollback_to_savepoint(savepoint, - savepoint + 1); + mtr_release_block_at_savepoint( + &mtr, savepoint, right_block); if (parent_right_page_no != FIL_NULL) { btr_block_get(*index, diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index b3bfb74bb8b..ac06d9b1568 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -3,7 +3,7 @@ Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2015, 2023, MariaDB Corporation. +Copyright (c) 2015, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -103,14 +103,14 @@ throughput clearly from about 100000. */ #define BTR_CUR_FINE_HISTORY_LENGTH 100000 #ifdef BTR_CUR_HASH_ADAPT -/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */ +/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */ ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_non_sea; /** Old value of btr_cur_n_non_sea. Copied by srv_refresh_innodb_monitor_stats(). Referenced by srv_printf_innodb_monitor(). */ ulint btr_cur_n_non_sea_old; /** Number of successful adaptive hash index lookups in -btr_cur_t::search_leaf(). */ +btr_cur_search_to_nth_level(). */ ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_sea; /** Old value of btr_cur_n_sea. Copied by srv_refresh_innodb_monitor_stats(). Referenced by @@ -188,106 +188,164 @@ btr_rec_free_externally_stored_fields( /*==================== B-TREE SEARCH =========================*/ /** Latches the leaf page or pages requested. -@param[in] block_savepoint leaf page where the search converged +@param[in] block leaf page where the search converged @param[in] latch_mode BTR_SEARCH_LEAF, ... @param[in] cursor cursor -@param[in] mtr mini-transaction */ +@param[in] mtr mini-transaction +@param[out] latch_leaves latched blocks and savepoints */ void btr_cur_latch_leaves( - ulint block_savepoint, + buf_block_t* block, btr_latch_mode latch_mode, btr_cur_t* cursor, - mtr_t* mtr) + mtr_t* mtr, + btr_latch_leaves_t* latch_leaves) { compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH)); compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH)); compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH)); - - buf_block_t* block = mtr->at_savepoint(block_savepoint); - ut_ad(block->page.id().space() == cursor->index()->table->space->id); ut_ad(block->page.in_file()); - ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, - MTR_MEMO_S_LOCK - | MTR_MEMO_X_LOCK - | MTR_MEMO_SX_LOCK)); + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_S_LOCK + | MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + auto rtr_info = cursor->rtr_info; + if (UNIV_LIKELY_NULL(rtr_info) && !cursor->index()->is_spatial()) { + rtr_info = nullptr; + } + const rw_lock_type_t mode = rw_lock_type_t( latch_mode & (RW_X_LATCH | RW_S_LATCH)); static_assert(ulint{RW_S_LATCH} == ulint{BTR_SEARCH_LEAF}, ""); static_assert(ulint{RW_X_LATCH} == ulint{BTR_MODIFY_LEAF}, ""); + static_assert(BTR_SEARCH_LEAF & BTR_SEARCH_TREE, ""); switch (latch_mode) { + default: + break; uint32_t left_page_no; uint32_t right_page_no; - default: - ut_ad(latch_mode == BTR_CONT_MODIFY_TREE); - ut_ad(cursor->index()->is_spatial()); - break; + ulint save; case BTR_SEARCH_LEAF: - s_latch_block: - block->page.lock.s_lock(); -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif - mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_S_FIX); - break; + case BTR_MODIFY_LEAF: + case BTR_SEARCH_TREE: + if (UNIV_LIKELY_NULL(rtr_info)) { + rtr_info->tree_savepoints[RTR_MAX_LEVELS] + = mtr->get_savepoint(); + } +latch_block: + if (latch_leaves) { + latch_leaves->savepoints[1] = mtr->get_savepoint(); + latch_leaves->blocks[1] = block; + } + block->page.fix(); + mtr->page_lock(block, mode); + if (UNIV_LIKELY_NULL(rtr_info)) { + rtr_info->tree_blocks[RTR_MAX_LEVELS] = block; + } + return; case BTR_MODIFY_TREE: /* It is exclusive for other operations which calls btr_page_set_prev() */ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + save = mtr->get_savepoint(); /* x-latch also siblings from left to right */ left_page_no = btr_page_get_prev(block->page.frame); if (left_page_no != FIL_NULL) { - btr_block_get(*cursor->index(), left_page_no, RW_X_LATCH, - true, mtr); + buf_block_t *b = btr_block_get( + *cursor->index(), left_page_no, RW_X_LATCH, + true, mtr); + + if (latch_leaves) { + latch_leaves->savepoints[0] = save; + latch_leaves->blocks[0] = b; + } + + if (UNIV_LIKELY_NULL(rtr_info)) { + rtr_info->tree_savepoints[RTR_MAX_LEVELS] + = save; + rtr_info->tree_blocks[RTR_MAX_LEVELS] = b; + } + + save = mtr->get_savepoint(); } - mtr->x_latch_at_savepoint(block_savepoint, block); + if (latch_leaves) { + latch_leaves->savepoints[1] = mtr->get_savepoint(); + latch_leaves->blocks[1] = block; + } + + block->page.fix(); + block->page.lock.x_lock(); + + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); #ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); + ut_ad(!btr_search_check_marked_free_index(block)); #endif + if (UNIV_LIKELY_NULL(rtr_info)) { + rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1] = save; + rtr_info->tree_blocks[RTR_MAX_LEVELS + 1] = block; + } + right_page_no = btr_page_get_next(block->page.frame); if (right_page_no != FIL_NULL) { - btr_block_get(*cursor->index(), right_page_no, - RW_X_LATCH, true, mtr); + save = mtr->get_savepoint(); + + buf_block_t* b = btr_block_get( + *cursor->index(), right_page_no, RW_X_LATCH, + true, mtr); + if (latch_leaves) { + latch_leaves->savepoints[2] = save; + latch_leaves->blocks[2] = b; + } + + if (UNIV_LIKELY_NULL(rtr_info)) { + rtr_info->tree_savepoints[RTR_MAX_LEVELS + 2] + = save; + rtr_info->tree_blocks[RTR_MAX_LEVELS + 2] = b; + } } - break; + + return; case BTR_SEARCH_PREV: case BTR_MODIFY_PREV: - static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, ""); + ut_ad(!rtr_info); static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, ""); - ut_ad(cursor->index()->is_ibuf() - || mtr->memo_contains_flagged(&cursor->index()->lock, - MTR_MEMO_X_LOCK - | MTR_MEMO_SX_LOCK)); + static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, ""); + static_assert((BTR_SEARCH_PREV ^ BTR_MODIFY_PREV) + == (RW_S_LATCH ^ RW_X_LATCH), ""); + /* Because we are holding index->lock, no page splits or merges may run concurrently, and we may read FIL_PAGE_PREV from a buffer-fixed, unlatched page. */ left_page_no = btr_page_get_prev(block->page.frame); if (left_page_no != FIL_NULL) { + save = mtr->get_savepoint(); cursor->left_block = btr_block_get( *cursor->index(), left_page_no, mode, true, mtr); + if (latch_leaves) { + latch_leaves->savepoints[0] = save; + latch_leaves->blocks[0] = cursor->left_block; + } } - if (latch_mode == BTR_SEARCH_PREV) { - goto s_latch_block; - } - - /* fall through */ - case BTR_MODIFY_LEAF: - mtr->x_latch_at_savepoint(block_savepoint, block); -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif + goto latch_block; + case BTR_CONT_MODIFY_TREE: + ut_ad(cursor->index()->is_spatial()); + return; } + + MY_ASSERT_UNREACHABLE(); } /** Load the instant ALTER TABLE metadata from the clustered index @@ -671,6 +729,98 @@ bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page) return index->n_core_null_bytes > 128; } +/** Optimistically latches the leaf page or pages requested. +@param[in] block guessed buffer block +@param[in] modify_clock modify clock value +@param[in,out] latch_mode BTR_SEARCH_LEAF, ... +@param[in,out] cursor cursor +@param[in] mtr mini-transaction +@return true if success */ +TRANSACTIONAL_TARGET +bool +btr_cur_optimistic_latch_leaves( + buf_block_t* block, + ib_uint64_t modify_clock, + btr_latch_mode* latch_mode, + btr_cur_t* cursor, + mtr_t* mtr) +{ + ut_ad(block->page.buf_fix_count()); + ut_ad(block->page.in_file()); + ut_ad(block->page.frame); + + switch (*latch_mode) { + default: + MY_ASSERT_UNREACHABLE(); + return(false); + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + return(buf_page_optimistic_get(*latch_mode, block, + modify_clock, mtr)); + case BTR_SEARCH_PREV: /* btr_pcur_move_backward_from_page() */ + case BTR_MODIFY_PREV: /* Ditto, or ibuf_insert() */ + uint32_t curr_page_no, left_page_no; + { + transactional_shared_lock_guard<block_lock> g{ + block->page.lock}; + if (block->modify_clock != modify_clock) { + return false; + } + curr_page_no = block->page.id().page_no(); + left_page_no = btr_page_get_prev(block->page.frame); + } + + static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, ""); + static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, ""); + static_assert((BTR_SEARCH_PREV ^ BTR_MODIFY_PREV) + == (RW_S_LATCH ^ RW_X_LATCH), ""); + + const rw_lock_type_t mode = rw_lock_type_t( + *latch_mode & (RW_X_LATCH | RW_S_LATCH)); + + if (left_page_no != FIL_NULL) { + cursor->left_block = buf_page_get_gen( + page_id_t(cursor->index()->table->space_id, + left_page_no), + cursor->index()->table->space->zip_size(), + mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr); + + if (cursor->left_block + && btr_page_get_next( + cursor->left_block->page.frame) + != curr_page_no) { +release_left_block: + mtr->release_last_page(); + return false; + } + } else { + cursor->left_block = nullptr; + } + + if (buf_page_optimistic_get(mode, block, modify_clock, mtr)) { + if (btr_page_get_prev(block->page.frame) + == left_page_no) { + /* block was already buffer-fixed while + entering the function and + buf_page_optimistic_get() buffer-fixes + it again. */ + ut_ad(2 <= block->page.buf_fix_count()); + *latch_mode = btr_latch_mode(mode); + return(true); + } + + mtr->release_last_page(); + } + + ut_ad(block->page.buf_fix_count()); + if (cursor->left_block) { + goto release_left_block; + } + } + + return false; +} + /** Gets intention in btr_intention_t from latch_mode, and cleares the intention at the latch_mode. @@ -698,6 +848,38 @@ btr_intention_t btr_cur_get_and_clear_intention(btr_latch_mode *latch_mode) return(intention); } +/** +Gets the desired latch type for the root leaf (root page is root leaf) +at the latch mode. +@param latch_mode in: BTR_SEARCH_LEAF, ... +@return latch type */ +static +rw_lock_type_t +btr_cur_latch_for_root_leaf( + ulint latch_mode) +{ + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_SEARCH_TREE: + case BTR_SEARCH_PREV: + return(RW_S_LATCH); + case BTR_MODIFY_LEAF: + case BTR_MODIFY_TREE: + case BTR_MODIFY_PREV: + return(RW_X_LATCH); + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + /* A root page should be latched already, + and don't need to be latched here. + fall through (RW_NO_LATCH) */ + case BTR_NO_LATCHES: + return(RW_NO_LATCH); + } + + MY_ASSERT_UNREACHABLE(); + return(RW_NO_LATCH); /* avoid compiler warnings */ +} + /** @return whether the distance between two records is at most the specified value */ static bool @@ -1015,841 +1197,1223 @@ static ulint btr_node_ptr_max_size(const dict_index_t* index) return rec_max_size; } -/** @return a B-tree search mode suitable for non-leaf pages -@param mode leaf page search mode */ -static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode) -{ - if (mode > PAGE_CUR_GE) - { - ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); - return mode; - } - if (mode == PAGE_CUR_GE) - return PAGE_CUR_L; - ut_ad(mode == PAGE_CUR_G); - return PAGE_CUR_LE; -} +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +Note that if mode is PAGE_CUR_LE, which is used in inserts, then +cursor->up_match and cursor->low_match both will have sensible values. +If mode is PAGE_CUR_GE, then up_match will a have a sensible value. -dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, - btr_latch_mode latch_mode, mtr_t *mtr) +If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. +@param level the tree level of search +@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that + it cannot get compared to the node ptr page number field! +@param mode PAGE_CUR_L, ...; NOTE that if the search is made using a + unique prefix of a record, mode should be PAGE_CUR_LE, not + PAGE_CUR_GE, as the latter may end up on the previous page of + the record! Inserts should always be made using PAGE_CUR_LE + to search the position! +@param latch_mode BTR_SEARCH_LEAF, ..., ORed with at most one of BTR_INSERT, + BTR_DELETE_MARK, or BTR_DELETE; + cursor->left_block is used to store a pointer to the left + neighbor page +@param cursor tree cursor; the cursor page is s- or x-latched, but see also + above! +@param mtr mini-transaction +@param autoinc PAGE_ROOT_AUTO_INC to be written (0 if none) +@return DB_SUCCESS on success or error code otherwise */ +TRANSACTIONAL_TARGET +dberr_t btr_cur_search_to_nth_level(ulint level, + const dtuple_t *tuple, + page_cur_mode_t mode, + btr_latch_mode latch_mode, + btr_cur_t *cursor, mtr_t *mtr, + ib_uint64_t autoinc) { - ut_ad(index()->is_btree() || index()->is_ibuf()); - ut_ad(!index()->is_ibuf() || ibuf_inside(mtr)); + page_t* page = NULL; /* remove warning */ + buf_block_t* block; + buf_block_t* guess; + ulint height; + ulint up_match; + ulint up_bytes; + ulint low_match; + ulint low_bytes; + ulint rw_latch; + page_cur_mode_t page_mode; + page_cur_mode_t search_mode = PAGE_CUR_UNSUPP; + ulint buf_mode; + ulint node_ptr_max_size = srv_page_size / 2; + page_cur_t* page_cursor; + btr_op_t btr_op; + ulint root_height = 0; /* remove warning */ + + btr_intention_t lock_intention; + buf_block_t* tree_blocks[BTR_MAX_LEVELS]; + ulint tree_savepoints[BTR_MAX_LEVELS]; + ulint n_blocks = 0; + ulint n_releases = 0; + bool detected_same_key_root = false; + + ulint leftmost_from_level = 0; + buf_block_t** prev_tree_blocks = NULL; + ulint* prev_tree_savepoints = NULL; + ulint prev_n_blocks = 0; + ulint prev_n_releases = 0; + bool need_path = true; + bool rtree_parent_modified = false; + bool mbr_adj = false; + bool found = false; + dict_index_t * const index = cursor->index(); + + DBUG_ENTER("btr_cur_search_to_nth_level"); - buf_block_t *guess; - btr_op_t btr_op; - btr_intention_t lock_intention; - bool detected_same_key_root= false; +#ifdef BTR_CUR_ADAPT + btr_search_t* info; +#endif /* BTR_CUR_ADAPT */ + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets2_; + rec_offs_init(offsets_); + rec_offs_init(offsets2_); + /* Currently, PAGE_CUR_LE is the only search mode used for searches + ending to upper levels */ + + ut_ad(level == 0 || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode)); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(!(index->type & DICT_FTS)); + ut_ad(index->page != FIL_NULL); - mem_heap_t* heap = NULL; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets = offsets_; - rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets2 = offsets2_; - rec_offs_init(offsets_); - rec_offs_init(offsets2_); - - ut_ad(dict_index_check_search_tuple(index(), tuple)); - ut_ad(dtuple_check_typed(tuple)); - ut_ad(index()->page != FIL_NULL); - - MEM_UNDEFINED(&up_match, sizeof up_match); - MEM_UNDEFINED(&up_bytes, sizeof up_bytes); - MEM_UNDEFINED(&low_match, sizeof low_match); - MEM_UNDEFINED(&low_bytes, sizeof low_bytes); - ut_d(up_match= ULINT_UNDEFINED); - ut_d(low_match= ULINT_UNDEFINED); - - ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED) || - mtr->memo_contains_flagged(&index()->lock, - MTR_MEMO_S_LOCK | MTR_MEMO_SX_LOCK | - MTR_MEMO_X_LOCK)); - - /* These flags are mutually exclusive, they are lumped together - with the latch mode for historical reasons. It's possible for - none of the flags to be set. */ - switch (UNIV_EXPECT(latch_mode & BTR_DELETE, 0)) { - default: - btr_op= BTR_NO_OP; - break; - case BTR_INSERT: - btr_op= (latch_mode & BTR_IGNORE_SEC_UNIQUE) - ? BTR_INSERT_IGNORE_UNIQUE_OP - : BTR_INSERT_OP; - break; - case BTR_DELETE: - btr_op= BTR_DELETE_OP; - ut_a(purge_node); - break; - case BTR_DELETE_MARK: - btr_op= BTR_DELMARK_OP; - break; - } + MEM_UNDEFINED(&cursor->up_match, sizeof cursor->up_match); + MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes); + MEM_UNDEFINED(&cursor->low_match, sizeof cursor->low_match); + MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes); +#ifdef UNIV_DEBUG + cursor->up_match = ULINT_UNDEFINED; + cursor->low_match = ULINT_UNDEFINED; +#endif /* UNIV_DEBUG */ - /* Operations on the insert buffer tree cannot be buffered. */ - ut_ad(btr_op == BTR_NO_OP || !index()->is_ibuf()); - /* Operations on the clustered index cannot be buffered. */ - ut_ad(btr_op == BTR_NO_OP || !index()->is_clust()); - /* Operations on the temporary table(indexes) cannot be buffered. */ - ut_ad(btr_op == BTR_NO_OP || !index()->table->is_temporary()); + const bool latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED; - const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; - lock_intention= btr_cur_get_and_clear_intention(&latch_mode); - latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + ut_ad(!latch_by_caller + || srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK + | MTR_MEMO_SX_LOCK)); - ut_ad(!latch_by_caller - || latch_mode == BTR_SEARCH_LEAF - || latch_mode == BTR_MODIFY_LEAF - || latch_mode == BTR_MODIFY_TREE - || latch_mode == BTR_MODIFY_ROOT_AND_LEAF); + /* These flags are mutually exclusive, they are lumped together + with the latch mode for historical reasons. It's possible for + none of the flags to be set. */ + switch (UNIV_EXPECT(latch_mode & BTR_DELETE, 0)) { + default: + btr_op = BTR_NO_OP; + break; + case BTR_INSERT: + btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE) + ? BTR_INSERT_IGNORE_UNIQUE_OP + : BTR_INSERT_OP; + break; + case BTR_DELETE: + btr_op = BTR_DELETE_OP; + ut_a(cursor->purge_node); + break; + case BTR_DELETE_MARK: + btr_op = BTR_DELMARK_OP; + break; + } + + /* Operations on the insert buffer tree cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index)); + /* Operations on the clustered index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index)); + /* Operations on the temporary table(indexes) cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary()); + /* Operation on the spatial index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index)); + + lock_intention = btr_cur_get_and_clear_intention(&latch_mode); + + /* Turn the flags unrelated to the latch mode off. */ + latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!latch_by_caller + || latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_SEARCH_TREE + || latch_mode == BTR_MODIFY_LEAF); + + ut_ad(autoinc == 0 || dict_index_is_clust(index)); + ut_ad(autoinc == 0 + || latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_LEAF); + ut_ad(autoinc == 0 || level == 0); + + cursor->flag = BTR_CUR_BINARY; - flag= BTR_CUR_BINARY; #ifndef BTR_CUR_ADAPT - guess= nullptr; + guess = NULL; #else - btr_search_t *info= btr_search_get_info(index()); - guess= info->root_guess; - -# ifdef BTR_CUR_HASH_ADAPT -# ifdef UNIV_SEARCH_PERF_STAT - info->n_searches++; -# endif - /* We do a dirty read of btr_search_enabled below, - and btr_search_guess_on_hash() will have to check it again. */ - if (!btr_search_enabled); - else if (btr_search_guess_on_hash(index(), info, tuple, mode, - latch_mode, this, mtr)) - { - /* Search using the hash index succeeded */ - ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); - ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); - ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); - ++btr_cur_n_sea; + info = btr_search_get_info(index); + guess = info->root_guess; - return DB_SUCCESS; - } - else - ++btr_cur_n_non_sea; +#ifdef BTR_CUR_HASH_ADAPT + +# ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; # endif -#endif + /* We do a dirty read of btr_search_enabled below, + and btr_search_guess_on_hash() will have to check it again. */ + if (!btr_search_enabled) { + } else if (autoinc == 0 + && latch_mode <= BTR_MODIFY_LEAF +# ifdef PAGE_CUR_LE_OR_EXTENDS + && mode != PAGE_CUR_LE_OR_EXTENDS +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + && info->last_hash_succ + && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG) + && !index->is_spatial() && !index->table->is_temporary() + && btr_search_guess_on_hash(index, info, tuple, mode, + latch_mode, cursor, mtr)) { + + /* Search using the hash index succeeded */ + + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ++btr_cur_n_sea; + + DBUG_RETURN(DB_SUCCESS); + } else { + ++btr_cur_n_non_sea; + } +# endif /* BTR_CUR_HASH_ADAPT */ +#endif /* BTR_CUR_ADAPT */ - /* If the hash search did not succeed, do binary search down the - tree */ + /* If the hash search did not succeed, do binary search down the + tree */ - /* Store the position of the tree latch we push to mtr so that we - know how to release it when we have latched leaf node(s) */ + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ - const ulint savepoint = mtr->get_savepoint(); + ulint savepoint = mtr_set_savepoint(mtr); - ulint node_ptr_max_size= 0; - rw_lock_type_t rw_latch= RW_S_LATCH; + rw_lock_type_t upper_rw_latch; - switch (latch_mode) { - case BTR_MODIFY_TREE: - rw_latch= RW_X_LATCH; - node_ptr_max_size= btr_node_ptr_max_size(index()); - if (latch_by_caller) - { - ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK)); - break; - } - if (lock_intention == BTR_INTENTION_DELETE && buf_pool.n_pend_reads && - trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) - /* Most delete-intended operations are due to the purge of history. - Prioritize them when the history list is growing huge. */ - mtr_x_lock_index(index(), mtr); - else - mtr_sx_lock_index(index(), mtr); - break; -#ifdef UNIV_DEBUG - case BTR_CONT_MODIFY_TREE: - ut_ad("invalid mode" == 0); - break; -#endif - case BTR_MODIFY_ROOT_AND_LEAF: - rw_latch= RW_SX_LATCH; - /* fall through */ - default: - if (!latch_by_caller) - mtr_s_lock_index(index(), mtr); - } + switch (latch_mode) { + case BTR_MODIFY_TREE: + /* Most of delete-intended operations are purging. + Free blocks and read IO bandwidth should be prior + for them, when the history list is glowing huge. */ + if (lock_intention == BTR_INTENTION_DELETE + && buf_pool.n_pend_reads + && trx_sys.history_size_approx() + > BTR_CUR_FINE_HISTORY_LENGTH) { +x_latch_index: + mtr_x_lock_index(index, mtr); + } else if (index->is_spatial() + && lock_intention <= BTR_INTENTION_BOTH) { + /* X lock the if there is possibility of + pessimistic delete on spatial index. As we could + lock upward for the tree */ + goto x_latch_index; + } else { + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_X_LATCH; + break; + case BTR_CONT_MODIFY_TREE: + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + if (index->is_spatial()) { + /* If we are about to locate parent page for split + and/or merge operation for R-Tree index, X latch + the parent */ + upper_rw_latch = RW_X_LATCH; + break; + } + /* fall through */ + case BTR_CONT_SEARCH_TREE: + /* Do nothing */ + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + upper_rw_latch = RW_NO_LATCH; + break; + default: + if (!srv_read_only_mode) { + if (!latch_by_caller) { + ut_ad(latch_mode != BTR_SEARCH_TREE); + mtr_s_lock_index(index, mtr); + } + upper_rw_latch = RW_S_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + } + const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf( + latch_mode); - const ulint zip_size= index()->table->space->zip_size(); + page_cursor = btr_cur_get_page_cur(cursor); + page_cursor->index = index; - /* Start with the root page. */ - page_id_t page_id(index()->table->space_id, index()->page); + const ulint zip_size = index->table->space->zip_size(); - const page_cur_mode_t page_mode= btr_cur_nonleaf_mode(mode); - ulint height= ULINT_UNDEFINED; - up_match= 0; - up_bytes= 0; - low_match= 0; - low_bytes= 0; - ulint buf_mode= BUF_GET; - search_loop: - dberr_t err; - auto block_savepoint= mtr->get_savepoint(); - buf_block_t *block= - buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr, - &err, height == 0 && !index()->is_clust()); - if (!block) - { - switch (err) { - case DB_DECRYPTION_FAILED: - btr_decryption_failed(*index()); - /* fall through */ - default: - func_exit: - if (UNIV_LIKELY_NULL(heap)) - mem_heap_free(heap); - return err; - case DB_SUCCESS: - /* This must be a search to perform an insert, delete mark, or delete; - try using the change buffer */ - ut_ad(height == 0); - ut_ad(thr); - break; - } + /* Start with the root page. */ + page_id_t page_id(index->table->space_id, index->page); - switch (btr_op) { - default: - MY_ASSERT_UNREACHABLE(); - break; - case BTR_INSERT_OP: - case BTR_INSERT_IGNORE_UNIQUE_OP: - ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + if (root_leaf_rw_latch == RW_X_LATCH) { + node_ptr_max_size = btr_node_ptr_max_size(index); + } - if (ibuf_insert(IBUF_OP_INSERT, tuple, index(), page_id, zip_size, thr)) - { - flag= BTR_CUR_INSERT_TO_IBUF; - goto func_exit; - } - break; + up_match = 0; + up_bytes = 0; + low_match = 0; + low_bytes = 0; - case BTR_DELMARK_OP: - ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + height = ULINT_UNDEFINED; - if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, - index(), page_id, zip_size, thr)) - { - flag = BTR_CUR_DEL_MARK_IBUF; - goto func_exit; - } + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. */ - break; + switch (mode) { + case PAGE_CUR_GE: + page_mode = PAGE_CUR_L; + break; + case PAGE_CUR_G: + page_mode = PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode) + || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode)); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + page_mode = mode; + break; + } - case BTR_DELETE_OP: - ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); - auto& chain = buf_pool.page_hash.cell_get(page_id.fold()); - - if (!row_purge_poss_sec(purge_node, index(), tuple)) - /* The record cannot be purged yet. */ - flag= BTR_CUR_DELETE_REF; - else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(), - page_id, zip_size, thr)) - /* The purge was buffered. */ - flag= BTR_CUR_DELETE_IBUF; - else - { - /* The purge could not be buffered. */ - buf_pool.watch_unset(page_id, chain); - break; - } + /* Loop and search until we arrive at the desired level */ + btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}}; - buf_pool.watch_unset(page_id, chain); - goto func_exit; - } +search_loop: + buf_mode = BUF_GET; + rw_latch = RW_NO_LATCH; + rtree_parent_modified = false; + + if (height != 0) { + /* We are about to fetch the root or a non-leaf page. */ + if ((latch_mode != BTR_MODIFY_TREE || height == level) + && !prev_tree_blocks) { + /* If doesn't have SX or X latch of index, + each pages should be latched before reading. */ + if (height == ULINT_UNDEFINED + && upper_rw_latch == RW_S_LATCH + && autoinc) { + /* needs sx-latch of root page + for writing PAGE_ROOT_AUTO_INC */ + rw_latch = RW_SX_LATCH; + } else { + rw_latch = upper_rw_latch; + } + } + } else if (latch_mode <= BTR_MODIFY_LEAF) { + rw_latch = latch_mode; - /* Change buffering did not succeed, we must read the page. */ - buf_mode= BUF_GET; - goto search_loop; - } + if (btr_op != BTR_NO_OP + && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) { - if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() || - btr_page_get_index_id(block->page.frame) != index()->id || - fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || - !fil_page_index_page_check(block->page.frame)) - { - corrupted: - ut_ad("corrupted" == 0); // FIXME: remove this - err= DB_CORRUPTION; - goto func_exit; - } + /* Try to buffer the operation if the leaf + page is not in the buffer pool. */ + + buf_mode = btr_op == BTR_DELETE_OP + ? BUF_GET_IF_IN_POOL_OR_WATCH + : BUF_GET_IF_IN_POOL; + } + } + +retry_page_get: + ut_ad(n_blocks < BTR_MAX_LEVELS); + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + dberr_t err; + block = buf_page_get_gen(page_id, zip_size, rw_latch, guess, + buf_mode, mtr, &err, + height == 0 && !index->is_clust()); + if (!block) { + switch (err) { + case DB_SUCCESS: + /* change buffering */ + break; + case DB_DECRYPTION_FAILED: + btr_decryption_failed(*index); + /* fall through */ + default: + goto func_exit; + } + + /* This must be a search to perform an insert/delete + mark/ delete; try using the insert/delete buffer */ + + ut_ad(height == 0); + ut_ad(cursor->thr); + + switch (btr_op) { + default: + MY_ASSERT_UNREACHABLE(); + break; + case BTR_INSERT_OP: + case BTR_INSERT_IGNORE_UNIQUE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + ut_ad(!dict_index_is_spatial(index)); + + if (ibuf_insert(IBUF_OP_INSERT, tuple, index, + page_id, zip_size, cursor->thr)) { + + cursor->flag = BTR_CUR_INSERT_TO_IBUF; + + goto func_exit; + } + break; + + case BTR_DELMARK_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + ut_ad(!dict_index_is_spatial(index)); + + if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, + index, page_id, zip_size, + cursor->thr)) { + + cursor->flag = BTR_CUR_DEL_MARK_IBUF; + + goto func_exit; + } + + break; + + case BTR_DELETE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); + ut_ad(!dict_index_is_spatial(index)); + auto& chain = buf_pool.page_hash.cell_get( + page_id.fold()); + + if (!row_purge_poss_sec(cursor->purge_node, + index, tuple)) { + + /* The record cannot be purged yet. */ + cursor->flag = BTR_CUR_DELETE_REF; + } else if (ibuf_insert(IBUF_OP_DELETE, tuple, + index, page_id, zip_size, + cursor->thr)) { + + /* The purge was buffered. */ + cursor->flag = BTR_CUR_DELETE_IBUF; + } else { + /* The purge could not be buffered. */ + buf_pool.watch_unset(page_id, chain); + break; + } + + buf_pool.watch_unset(page_id, chain); + goto func_exit; + } + + /* Insert to the insert/delete buffer did not succeed, we + must read the page from disk. */ + + buf_mode = BUF_GET; + + goto retry_page_get; + } + + tree_blocks[n_blocks] = block; + + if (height && prev_tree_blocks) { + /* also latch left sibling */ + ut_ad(rw_latch == RW_NO_LATCH); + + rw_latch = upper_rw_latch; + + /* Because we are holding index->lock, no page splits + or merges may run concurrently, and we may read + FIL_PAGE_PREV from a buffer-fixed, unlatched page. */ + uint32_t left_page_no = btr_page_get_prev(block->page.frame); + + if (left_page_no != FIL_NULL) { + ut_ad(prev_n_blocks < leftmost_from_level); + + prev_tree_savepoints[prev_n_blocks] + = mtr_set_savepoint(mtr); + buf_block_t* get_block = buf_page_get_gen( + page_id_t(page_id.space(), left_page_no), + zip_size, rw_latch, NULL, buf_mode, + mtr, &err); + if (!get_block) { + if (err == DB_DECRYPTION_FAILED) { + btr_decryption_failed(*index); + } + goto func_exit; + } + + prev_tree_blocks[prev_n_blocks++] = get_block; + /* BTR_MODIFY_TREE doesn't update prev/next_page_no, + without their parent page's lock. So, not needed to + retry here, because we have the parent page's lock. */ + } + + mtr->s_lock_register(tree_savepoints[n_blocks]); + block->page.lock.s_lock(); + } + + page = buf_block_get_frame(block); + + if (height == ULINT_UNDEFINED + && page_is_leaf(page) + && rw_latch != RW_NO_LATCH + && rw_latch != root_leaf_rw_latch) { + /* The root page is also a leaf page (root_leaf). + We should reacquire the page, because the root page + is latched differently from leaf pages. */ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH); + ut_ad(rw_latch == RW_S_LATCH || autoinc); + ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH); + + ut_ad(n_blocks == 0); + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + upper_rw_latch = root_leaf_rw_latch; + goto search_loop; + } - page_cur.block= block; - ut_ad(block == mtr->at_savepoint(block_savepoint)); - const page_t *page= buf_block_get_frame(block); #ifdef UNIV_ZIP_DEBUG - if (rw_latch != RW_NO_LATCH) - { - const page_zip_des_t *page_zip= buf_block_get_page_zip(block); - ut_a(!page_zip || page_zip_validate(page_zip, page, index())); - } + if (rw_latch != RW_NO_LATCH) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); + } #endif /* UNIV_ZIP_DEBUG */ - const uint32_t page_level= btr_page_get_level(page); - if (height == ULINT_UNDEFINED) - { - /* We are in the B-tree index root page. */ + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page); + root_height = height; + cursor->tree_height = root_height + 1; + + if (dict_index_is_spatial(index)) { + ut_ad(cursor->rtr_info); + + /* If SSN in memory is not initialized, fetch + it from root page */ + if (!rtr_get_current_ssn_id(index)) { + /* FIXME: do this in dict_load_table_one() */ + index->set_ssn(page_get_ssn_id(page) + 1); + } + + /* Save the MBR */ + cursor->rtr_info->thr = cursor->thr; + rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr); + } + #ifdef BTR_CUR_ADAPT - info->root_guess= block; + info->root_guess = block; #endif - height= page_level; - tree_height= height + 1; + } - if (!height) - { - /* The root page is also a leaf page. - We may have to reacquire the page latch in a different mode. */ - switch (rw_latch) { - case RW_S_LATCH: - if ((latch_mode & ~12) != RW_S_LATCH) - { - rw_latch= rw_lock_type_t(latch_mode & ~12); - ut_ad(rw_latch == RW_X_LATCH || rw_latch == RW_SX_LATCH); - goto relatch; - } - if (latch_mode != BTR_MODIFY_PREV) - { - if (!latch_by_caller) - /* Release the tree s-latch */ - mtr->rollback_to_savepoint(savepoint, savepoint + 1); - goto reached_latched_leaf; + if (height == 0) { + if (rw_latch == RW_NO_LATCH) { + btr_cur_latch_leaves(block, latch_mode, cursor, mtr, + &latch_leaves); + } + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + break; + default: + if (!latch_by_caller + && !srv_read_only_mode) { + /* Release the tree s-latch */ + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + &index->lock); + } + + /* release upper blocks */ + if (prev_tree_blocks) { + ut_ad(!autoinc); + for (; + prev_n_releases < prev_n_blocks; + prev_n_releases++) { + mtr_release_block_at_savepoint( + mtr, + prev_tree_savepoints[ + prev_n_releases], + prev_tree_blocks[ + prev_n_releases]); + } + } + + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0 + && (autoinc)) { + /* keep the root page latch */ + ut_ad(mtr->memo_contains_flagged( + tree_blocks[n_releases], + MTR_MEMO_PAGE_SX_FIX + | MTR_MEMO_PAGE_X_FIX)); + continue; + } + + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + page_mode = mode; + } + + if (dict_index_is_spatial(index)) { + /* Remember the page search mode */ + search_mode = page_mode; + + /* Some adjustment on search mode, when the + page search mode is PAGE_CUR_RTREE_LOCATE + or PAGE_CUR_RTREE_INSERT, as we are searching + with MBRs. When it is not the target level, we + should search all sub-trees that "CONTAIN" the + search range/MBR. When it is at the target + level, the search becomes PAGE_CUR_LE */ + if (page_mode == PAGE_CUR_RTREE_LOCATE + && level == height) { + if (level == 0) { + page_mode = PAGE_CUR_LE; + } else { + page_mode = PAGE_CUR_RTREE_GET_FATHER; + } + } + + if (page_mode == PAGE_CUR_RTREE_INSERT) { + page_mode = (level == height) + ? PAGE_CUR_LE + : PAGE_CUR_RTREE_INSERT; + + ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE); + } + + /* "need_path" indicates if we need to tracking the parent + pages, if it is not spatial comparison, then no need to + track it */ + if (page_mode < PAGE_CUR_CONTAIN) { + need_path = false; + } + + up_match = 0; + low_match = 0; + + if (latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE) { + /* Tree are locked, no need for Page Lock to protect + the "path" */ + cursor->rtr_info->need_page_lock = false; + } } - /* fall through */ - case RW_SX_LATCH: - ut_ad(rw_latch == RW_S_LATCH || - latch_mode == BTR_MODIFY_ROOT_AND_LEAF); - rw_latch= RW_X_LATCH; - relatch: - mtr->rollback_to_savepoint(block_savepoint); - height= ULINT_UNDEFINED; - goto search_loop; - case RW_X_LATCH: - if (latch_mode == BTR_MODIFY_TREE) - goto reached_index_root_and_leaf; - goto reached_root_and_leaf; - case RW_NO_LATCH: - ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK)); - } - goto reached_leaf; - } - } - else if (UNIV_UNLIKELY(height != page_level)) - goto corrupted; - else - switch (latch_mode) { - case BTR_MODIFY_TREE: - break; - case BTR_MODIFY_ROOT_AND_LEAF: - ut_ad((mtr->at_savepoint(block_savepoint - 1)->page.id().page_no() == - index()->page) == (tree_height <= height + 2)); - if (tree_height <= height + 2) - /* Retain the root page latch. */ - break; - /* fall through */ - default: - /* Release the parent page latch. */ - ut_ad(block_savepoint > savepoint); - mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint); - block_savepoint--; - } - if (!height) - { - reached_leaf: - /* We reached the leaf level. */ - ut_ad(block == mtr->at_savepoint(block_savepoint)); + page_cursor->block = block; - if (latch_mode == BTR_MODIFY_ROOT_AND_LEAF) - { - reached_root_and_leaf: - if (!latch_by_caller) - mtr->rollback_to_savepoint(savepoint, savepoint + 1); - reached_index_root_and_leaf: - ut_ad(rw_latch == RW_X_LATCH); -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif - if (page_cur_search_with_match(tuple, mode, &up_match, &low_match, - &page_cur, nullptr)) - goto corrupted; - ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); - ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); - ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); - goto func_exit; - } + if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) { + ut_ad(need_path); + found = rtr_cur_search_with_match( + block, index, tuple, page_mode, page_cursor, + cursor->rtr_info); - if (rw_latch == RW_NO_LATCH) - btr_cur_latch_leaves(block_savepoint, latch_mode, this, mtr); + /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */ + if (search_mode == PAGE_CUR_RTREE_INSERT + && cursor->rtr_info->mbr_adj) { + static_assert(BTR_MODIFY_TREE + == (8 | BTR_MODIFY_LEAF), ""); - if (latch_mode != BTR_MODIFY_TREE) - { - if (!latch_by_caller) - { - /* Release the tree s-latch */ - block_savepoint--; - mtr->rollback_to_savepoint(savepoint, savepoint + 1); - } - /* release upper blocks */ - if (savepoint < block_savepoint) - mtr->rollback_to_savepoint(savepoint, block_savepoint); - } - else - ut_ad(rw_latch == RW_NO_LATCH); + if (!(latch_mode & 8)) { + /* Parent MBR needs updated, should retry + with BTR_MODIFY_TREE */ + goto func_exit; + } + + rtree_parent_modified = true; + cursor->rtr_info->mbr_adj = false; + mbr_adj = true; + } - reached_latched_leaf: + if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) { + cursor->low_match = + DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } #ifdef BTR_CUR_HASH_ADAPT - if (btr_search_enabled && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)) - { - if (page_cur_search_with_match_bytes(tuple, mode, - &up_match, &up_bytes, - &low_match, &low_bytes, &page_cur)) - goto corrupted; - } - else + } else if (height == 0 && btr_search_enabled + && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG) + && index->is_btree()) { + /* The adaptive hash index is only used when searching + for leaf pages (height==0), but not in r-trees. + We only need the byte prefix comparison for the purpose + of updating the adaptive hash index. */ + if (page_cur_search_with_match_bytes( + tuple, page_mode, &up_match, &up_bytes, + &low_match, &low_bytes, page_cursor)) { + err = DB_CORRUPTION; + goto func_exit; + } #endif /* BTR_CUR_HASH_ADAPT */ - if (page_cur_search_with_match(tuple, mode, &up_match, &low_match, - &page_cur, nullptr)) - goto corrupted; + } else { + /* Search for complete index fields. */ + up_bytes = low_bytes = 0; + if (page_cur_search_with_match( + tuple, page_mode, &up_match, + &low_match, page_cursor, + need_path ? cursor->rtr_info : nullptr)) { + err = DB_CORRUPTION; + goto func_exit; + } + } - ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); - ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); - ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + /* If this is the desired level, leave the loop */ -#ifdef BTR_CUR_HASH_ADAPT - /* We do a dirty read of btr_search_enabled here. We will - properly check btr_search_enabled again in - btr_search_build_page_hash_index() before building a page hash - index, while holding search latch. */ - if (!btr_search_enabled); - else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) - /* This may be a search tuple for btr_pcur_t::restore_position(). */ - ut_ad(tuple->is_metadata() || - (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT))); - else if (index()->table->is_temporary()); - else if (!rec_is_metadata(page_cur.rec, *index())) - btr_search_info_update(index(), this); -#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor))); - goto func_exit; - } + /* Add Predicate lock if it is serializable isolation + and only if it is in the search case */ + if (dict_index_is_spatial(index) + && cursor->rtr_info->need_prdt_lock + && mode != PAGE_CUR_RTREE_INSERT + && mode != PAGE_CUR_RTREE_LOCATE + && mode >= PAGE_CUR_CONTAIN) { + lock_prdt_t prdt; - guess= nullptr; - if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match, - &page_cur, nullptr)) - goto corrupted; - offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED, - &heap); + { + trx_t* trx = thr_get_trx(cursor->thr); + TMLockTrxGuard g{TMLockTrxArgs(*trx)}; + lock_init_prdt_from_mbr( + &prdt, &cursor->rtr_info->mbr, mode, + trx->lock.lock_heap); + } - ut_ad(block == mtr->at_savepoint(block_savepoint)); + if (rw_latch == RW_NO_LATCH && height != 0) { + block->page.lock.s_lock(); + } - switch (latch_mode) { - default: - break; - case BTR_MODIFY_TREE: - if (btr_cur_need_opposite_intention(page, lock_intention, page_cur.rec)) - { - /* If the rec is the first or last in the page for pessimistic - delete intention, it might cause node_ptr insert for the upper - level. We should change the intention and retry. */ - need_opposite_intention: - return pessimistic_search_leaf(tuple, mode, mtr); - } + lock_prdt_lock(block, &prdt, index, LOCK_S, + LOCK_PREDICATE, cursor->thr); - if (detected_same_key_root || lock_intention != BTR_INTENTION_BOTH || - index()->is_unique() || - (up_match <= rec_offs_n_fields(offsets) && - low_match <= rec_offs_n_fields(offsets))) - break; + if (rw_latch == RW_NO_LATCH && height != 0) { + block->page.lock.s_unlock(); + } + } - /* If the first or the last record of the page or the same key - value to the first record or last record, then another page might - be chosen when BTR_CONT_MODIFY_TREE. So, the parent page should - not released to avoiding deadlock with blocking the another search - with the same key value. */ - const rec_t *first= page_rec_get_next_const(page_get_infimum_rec(page)); - ulint matched_fields; + if (level != height) { - if (UNIV_UNLIKELY(!first)) - goto corrupted; - if (page_cur.rec == first || page_rec_is_last(page_cur.rec, page)) - { - same_key_root: - detected_same_key_root= true; - break; - } + const rec_t* node_ptr; + ut_ad(height > 0); - matched_fields= 0; - offsets2= rec_get_offsets(first, index(), offsets2, 0, ULINT_UNDEFINED, - &heap); - cmp_rec_rec(page_cur.rec, first, offsets, offsets2, index(), false, - &matched_fields); - if (matched_fields >= rec_offs_n_fields(offsets) - 1) - goto same_key_root; - if (const rec_t* last= - page_rec_get_prev_const(page_get_supremum_rec(page))) - { - matched_fields= 0; - offsets2= rec_get_offsets(last, index(), offsets2, 0, ULINT_UNDEFINED, - &heap); - cmp_rec_rec(page_cur.rec, last, offsets, offsets2, index(), false, - &matched_fields); - if (matched_fields >= rec_offs_n_fields(offsets) - 1) - goto same_key_root; - } - else - goto corrupted; + height--; + guess = NULL; - /* Release the non-root parent page unless it may need to be modified. */ - if (tree_height > height + 1 && - !btr_cur_will_modify_tree(index(), page, lock_intention, - page_cur.rec, node_ptr_max_size, - zip_size, mtr)) - { - mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint); - block_savepoint--; - } - } + node_ptr = page_cur_get_rec(page_cursor); - /* Go to the child node */ - page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets)); + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); - if (!--height) - { - /* We are about to access the leaf level. */ - rw_latch= RW_NO_LATCH; + /* If the rec is the first or last in the page for + pessimistic delete intention, it might cause node_ptr insert + for the upper level. We should change the intention and retry. + */ + if (latch_mode == BTR_MODIFY_TREE + && btr_cur_need_opposite_intention( + page, lock_intention, node_ptr)) { - switch (latch_mode) { - case BTR_MODIFY_ROOT_AND_LEAF: - rw_latch= RW_X_LATCH; - break; - default: - break; - case BTR_MODIFY_PREV: - /* This is almost exclusively for ibuf_insert(), but also for - btr_pcur_move_to_prev(); the latter is not exercised by mtr */ - case BTR_SEARCH_PREV: - if (page_has_prev(page) && page_rec_is_first(page_cur.rec, page)) - { - ut_ad(block_savepoint + 1 == mtr->get_savepoint()); - /* Latch the previous page if the node pointer is the leftmost - of the current page. */ - buf_block_t *left= btr_block_get(*index(), btr_page_get_prev(page), - RW_NO_LATCH, false, mtr, &err); - if (!left) - goto func_exit; - static_assert(mtr_memo_type_t(BTR_MODIFY_PREV & ~4) == - MTR_MEMO_PAGE_X_FIX, ""); - static_assert(mtr_memo_type_t(BTR_SEARCH_PREV & ~4) == - MTR_MEMO_PAGE_S_FIX, ""); - mtr->lock_register(block_savepoint + 1, - mtr_memo_type_t(latch_mode & ~4)); - /* Because we are violating the latching order here, we will - have to temporarily release the right page latch if the left - page latch cannot be acquired without waiting. Concurrent page - splits or merges are impossible because we are holding a latch - on the parent of these sibling pages. */ - if (latch_mode == BTR_MODIFY_PREV) - { - if (!left->page.lock.x_lock_try()) - { - block->page.lock.x_unlock(); - left->page.lock.x_lock(); - } - } - else if (!left->page.lock.s_lock_try()) - { - block->page.lock.s_unlock(); - left->page.lock.s_lock(); - } -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(left, true); +need_opposite_intention: + ut_ad(upper_rw_latch == RW_X_LATCH); + + if (n_releases > 0) { + /* release root block */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* release all blocks */ + for (; n_releases <= n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + + lock_intention = BTR_INTENTION_BOTH; + + page_id.set_page_no(index->page); + up_match = 0; + low_match = 0; + height = ULINT_UNDEFINED; + + n_blocks = 0; + n_releases = 0; + + goto search_loop; + } + + if (dict_index_is_spatial(index)) { + if (page_rec_is_supremum(node_ptr)) { + cursor->low_match = 0; + cursor->up_match = 0; + goto func_exit; + } + + /* If we are doing insertion or record locating, + remember the tree nodes we visited */ + if (page_mode == PAGE_CUR_RTREE_INSERT + || (search_mode == PAGE_CUR_RTREE_LOCATE + && (latch_mode != BTR_MODIFY_LEAF))) { + bool add_latch = false; + + if (latch_mode == BTR_MODIFY_TREE + && rw_latch == RW_NO_LATCH) { + ut_ad(mtr->memo_contains_flagged( + &index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + block->page.lock.s_lock(); + add_latch = true; + } + + /* Store the parent cursor location */ +#ifdef UNIV_DEBUG + ulint num_stored = rtr_store_parent_path( + block, cursor, latch_mode, + height + 1, mtr); +#else + rtr_store_parent_path( + block, cursor, latch_mode, + height + 1, mtr); #endif - } - break; - case BTR_MODIFY_LEAF: - case BTR_SEARCH_LEAF: - if (index()->is_ibuf()) - break; - rw_latch= rw_lock_type_t(latch_mode); - if (btr_op != BTR_NO_OP && - ibuf_should_try(index(), btr_op != BTR_INSERT_OP)) - /* Try to buffer the operation if the leaf page - is not in the buffer pool. */ - buf_mode= btr_op == BTR_DELETE_OP - ? BUF_GET_IF_IN_POOL_OR_WATCH - : BUF_GET_IF_IN_POOL; - break; - case BTR_MODIFY_TREE: - if (index()->is_ibuf()) - break; - if (lock_intention == BTR_INTENTION_INSERT && page_has_next(page) && - page_rec_is_last(page_cur.rec, page)) - { - /* btr_insert_into_right_sibling() might cause deleting node_ptr - at upper level */ - mtr->rollback_to_savepoint(block_savepoint); - goto need_opposite_intention; - } - } - } - goto search_loop; -} + if (page_mode == PAGE_CUR_RTREE_INSERT) { + btr_pcur_t* r_cursor = + rtr_get_parent_cursor( + cursor, height + 1, + true); + /* If it is insertion, there should + be only one parent for each level + traverse */ +#ifdef UNIV_DEBUG + ut_ad(num_stored == 1); +#endif -ATTRIBUTE_COLD -dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, - page_cur_mode_t mode, mtr_t *mtr) -{ - ut_ad(index()->is_btree() || index()->is_ibuf()); - ut_ad(!index()->is_ibuf() || ibuf_inside(mtr)); + node_ptr = btr_pcur_get_rec(r_cursor); - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets = offsets_; - rec_offs_init(offsets_); + } - ut_ad(flag == BTR_CUR_BINARY); - ut_ad(dict_index_check_search_tuple(index(), tuple)); - ut_ad(dtuple_check_typed(tuple)); - buf_block_t *block= mtr->at_savepoint(1); - ut_ad(block->page.id().page_no() == index()->page); - block->page.fix(); - mtr->rollback_to_savepoint(1); - ut_ad(mtr->memo_contains_flagged(&index()->lock, - MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)); - - const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)}; - - mtr->page_lock(block, RW_X_LATCH); - - up_match= 0; - up_bytes= 0; - low_match= 0; - low_bytes= 0; - ulint height= btr_page_get_level(block->page.frame); - tree_height= height + 1; - mem_heap_t *heap= nullptr; + if (add_latch) { + block->page.lock.s_unlock(); + } - search_loop: - dberr_t err; - page_cur.block= block; + ut_ad(!page_rec_is_supremum(node_ptr)); + } - if (UNIV_UNLIKELY(!height)) - { - if (page_cur_search_with_match(tuple, mode, &up_match, &low_match, - &page_cur, nullptr)) - corrupted: - err= DB_CORRUPTION; - else - { - ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); - ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); - ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(page_mode == search_mode + || (page_mode == PAGE_CUR_WITHIN + && search_mode == PAGE_CUR_RTREE_LOCATE)); -#ifdef BTR_CUR_HASH_ADAPT - /* We do a dirty read of btr_search_enabled here. We will - properly check btr_search_enabled again in - btr_search_build_page_hash_index() before building a page hash - index, while holding search latch. */ - if (!btr_search_enabled); - else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) - /* This may be a search tuple for btr_pcur_t::restore_position(). */ - ut_ad(tuple->is_metadata() || - (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT))); - else if (index()->table->is_temporary()); - else if (!rec_is_metadata(page_cur.rec, *index())) - btr_search_info_update(index(), this); -#endif /* BTR_CUR_HASH_ADAPT */ - err= DB_SUCCESS; - } + page_mode = search_mode; + } - func_exit: - if (UNIV_LIKELY_NULL(heap)) - mem_heap_free(heap); - return err; - } + /* If the first or the last record of the page + or the same key value to the first record or last record, + the another page might be chosen when BTR_CONT_MODIFY_TREE. + So, the parent page should not released to avoiding deadlock + with blocking the another search with the same key value. */ + if (!detected_same_key_root + && lock_intention == BTR_INTENTION_BOTH + && !dict_index_is_unique(index) + && latch_mode == BTR_MODIFY_TREE + && (up_match >= rec_offs_n_fields(offsets) - 1 + || low_match >= rec_offs_n_fields(offsets) - 1)) { + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); + ulint matched_fields; - if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match, - &page_cur, nullptr)) - goto corrupted; + ut_ad(upper_rw_latch == RW_X_LATCH); - page_id_t page_id{block->page.id()}; + if (UNIV_UNLIKELY(!first_rec)) { + corrupted: + err = DB_CORRUPTION; + goto func_exit; + } + if (node_ptr == first_rec + || page_rec_is_last(node_ptr, page)) { + detected_same_key_root = true; + } else { + matched_fields = 0; + + offsets2 = rec_get_offsets( + first_rec, index, offsets2, + 0, ULINT_UNDEFINED, &heap); + cmp_rec_rec(node_ptr, first_rec, + offsets, offsets2, index, false, + &matched_fields); + + if (matched_fields + >= rec_offs_n_fields(offsets) - 1) { + detected_same_key_root = true; + } else if (const rec_t* last_rec + = page_rec_get_prev_const( + page_get_supremum_rec( + page))) { + matched_fields = 0; + + offsets2 = rec_get_offsets( + last_rec, index, offsets2, + 0, ULINT_UNDEFINED, &heap); + cmp_rec_rec( + node_ptr, last_rec, + offsets, offsets2, index, + false, &matched_fields); + if (matched_fields + >= rec_offs_n_fields(offsets) - 1) { + detected_same_key_root = true; + } + } else { + goto corrupted; + } + } + } - offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED, - &heap); - /* Go to the child node */ - page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets)); + /* If the page might cause modify_tree, + we should not release the parent page's lock. */ + if (!detected_same_key_root + && latch_mode == BTR_MODIFY_TREE + && !btr_cur_will_modify_tree( + index, page, lock_intention, node_ptr, + node_ptr_max_size, zip_size, mtr) + && !rtree_parent_modified) { + ut_ad(upper_rw_latch == RW_X_LATCH); + ut_ad(n_releases <= n_blocks); + + /* we can release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0) { + /* we should not release root page + to pin to same block. */ + continue; + } - const auto block_savepoint= mtr->get_savepoint(); - block= - buf_page_get_gen(page_id, block->zip_size(), RW_NO_LATCH, nullptr, BUF_GET, - mtr, &err, !--height && !index()->is_clust()); + /* release unused blocks to unpin */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } - if (!block) - { - if (err == DB_DECRYPTION_FAILED) - btr_decryption_failed(*index()); - goto func_exit; - } + if (height == level + && latch_mode == BTR_MODIFY_TREE) { + ut_ad(upper_rw_latch == RW_X_LATCH); + /* we should sx-latch root page, if released already. + It contains seg_header. */ + if (n_releases > 0) { + mtr->sx_latch_at_savepoint( + tree_savepoints[0], + tree_blocks[0]); + } - if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() || - btr_page_get_index_id(block->page.frame) != index()->id || - fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || - !fil_page_index_page_check(block->page.frame)) - goto corrupted; + /* x-latch the branch blocks not released yet. */ + for (ulint i = n_releases; i <= n_blocks; i++) { + mtr->x_latch_at_savepoint( + tree_savepoints[i], + tree_blocks[i]); + } + } - if (height != btr_page_get_level(block->page.frame)) - goto corrupted; + /* We should consider prev_page of parent page, if the node_ptr + is the leftmost of the page. because BTR_SEARCH_PREV and + BTR_MODIFY_PREV latches prev_page of the leaf page. */ + if ((latch_mode == BTR_SEARCH_PREV + || latch_mode == BTR_MODIFY_PREV) + && !prev_tree_blocks) { + /* block should be latched for consistent + btr_page_get_prev() */ + ut_ad(mtr->memo_contains_flagged( + block, MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); - if (page_has_prev(block->page.frame) && - !btr_block_get(*index(), btr_page_get_prev(block->page.frame), - RW_X_LATCH, false, mtr, &err)) - goto func_exit; - mtr->x_latch_at_savepoint(block_savepoint, block); -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); + if (page_has_prev(page) + && page_rec_is_first(node_ptr, page)) { + + if (leftmost_from_level == 0) { + leftmost_from_level = height + 1; + } + } else { + leftmost_from_level = 0; + } + + if (height == 0 && leftmost_from_level > 0) { + /* should retry to get also prev_page + from level==leftmost_from_level. */ + prev_tree_blocks = static_cast<buf_block_t**>( + ut_malloc_nokey(sizeof(buf_block_t*) + * leftmost_from_level)); + + prev_tree_savepoints = static_cast<ulint*>( + ut_malloc_nokey(sizeof(ulint) + * leftmost_from_level)); + + /* back to the level (leftmost_from_level+1) */ + ulint idx = n_blocks + - (leftmost_from_level - 1); + + page_id.set_page_no( + tree_blocks[idx]->page.id().page_no()); + + for (ulint i = n_blocks + - (leftmost_from_level - 1); + i <= n_blocks; i++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + + n_blocks -= (leftmost_from_level - 1); + height = leftmost_from_level; + ut_ad(n_releases == 0); + + /* replay up_match, low_match */ + up_match = 0; + low_match = 0; + rtr_info_t* rtr_info = need_path + ? cursor->rtr_info : NULL; + + for (ulint i = 0; i < n_blocks; i++) { + page_cursor->block = tree_blocks[i]; + if (page_cur_search_with_match( + tuple, + page_mode, &up_match, + &low_match, page_cursor, + rtr_info)) { + err = DB_CORRUPTION; + goto func_exit; + } + } + + goto search_loop; + } + } + + /* Go to the child node */ + page_id.set_page_no( + btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + n_blocks++; + + if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) { + /* We're doing a search on an ibuf tree and we're one + level above the leaf page. */ + + ut_ad(level == 0); + + buf_mode = BUF_GET; + rw_latch = RW_NO_LATCH; + goto retry_page_get; + } + + if (dict_index_is_spatial(index) + && page_mode >= PAGE_CUR_CONTAIN + && page_mode != PAGE_CUR_RTREE_INSERT) { + ut_ad(need_path); + rtr_node_path_t* path = + cursor->rtr_info->path; + + if (!path->empty() && found) { + ut_ad(path->back().page_no + == page_id.page_no()); + path->pop_back(); +#ifdef UNIV_DEBUG + if (page_mode == PAGE_CUR_RTREE_LOCATE + && (latch_mode != BTR_MODIFY_LEAF)) { + btr_pcur_t* cur + = cursor->rtr_info->parent_path->back( + ).cursor; + rec_t* my_node_ptr + = btr_pcur_get_rec(cur); + + offsets = rec_get_offsets( + my_node_ptr, index, offsets, + 0, ULINT_UNDEFINED, &heap); + + ulint my_page_no + = btr_node_ptr_get_child_page_no( + my_node_ptr, offsets); + + ut_ad(page_id.page_no() == my_page_no); + } #endif -#ifdef UNIV_ZIP_DEBUG - const page_zip_des_t *page_zip= buf_block_get_page_zip(block); - ut_a(!page_zip || page_zip_validate(page_zip, page, index())); -#endif /* UNIV_ZIP_DEBUG */ - if (page_has_next(block->page.frame) && - !btr_block_get(*index(), btr_page_get_next(block->page.frame), - RW_X_LATCH, false, mtr, &err)) - goto func_exit; - goto search_loop; -} + } + } -/********************************************************************//** -Searches an index tree and positions a tree cursor on a given non-leaf level. -NOTE: n_fields_cmp in tuple must be set so that it cannot be compared -to node pointer page number fields on the upper levels of the tree! -cursor->up_match and cursor->low_match both will have sensible values. -Cursor is left at the place where an insert of the -search tuple should be performed in the B-tree. InnoDB does an insert -immediately after the cursor. Thus, the cursor may end up on a user record, -or on a page infimum record. -@param level the tree level of search -@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that - it cannot get compared to the node ptr page number field! -@param latch RW_S_LATCH or RW_X_LATCH -@param cursor tree cursor; the cursor page is s- or x-latched, but see also - above! -@param mtr mini-transaction -@return DB_SUCCESS on success or error code otherwise */ -TRANSACTIONAL_TARGET -dberr_t btr_cur_search_to_nth_level(ulint level, - const dtuple_t *tuple, - rw_lock_type_t rw_latch, - btr_cur_t *cursor, mtr_t *mtr) -{ - dict_index_t *const index= cursor->index(); + goto search_loop; + } else if (!dict_index_is_spatial(index) + && latch_mode == BTR_MODIFY_TREE + && lock_intention == BTR_INTENTION_INSERT + && page_has_next(page) + && page_rec_is_last(page_cur_get_rec(page_cursor), page)) { + + /* btr_insert_into_right_sibling() might cause + deleting node_ptr at upper level */ + + guess = NULL; + + if (height == 0) { + /* release the leaf pages if latched */ + for (uint i = 0; i < 3; i++) { + if (latch_leaves.blocks[i] != NULL) { + mtr_release_block_at_savepoint( + mtr, latch_leaves.savepoints[i], + latch_leaves.blocks[i]); + latch_leaves.blocks[i] = NULL; + } + } + } - ut_ad(index->is_btree() || index->is_ibuf()); - mem_heap_t *heap= nullptr; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs *offsets= offsets_; - rec_offs_init(offsets_); - ut_ad(level); - ut_ad(dict_index_check_search_tuple(index, tuple)); - ut_ad(index->is_ibuf() ? ibuf_inside(mtr) : index->is_btree()); - ut_ad(dtuple_check_typed(tuple)); - ut_ad(index->page != FIL_NULL); - - MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes); - MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes); - cursor->up_match= 0; - cursor->low_match= 0; - cursor->flag= BTR_CUR_BINARY; + goto need_opposite_intention; + } -#ifndef BTR_CUR_ADAPT - buf_block_t *block= nullptr; -#else - btr_search_t *info= btr_search_get_info(index); - buf_block_t *block= info->root_guess; -#endif /* BTR_CUR_ADAPT */ + if (level != 0) { + ut_ad(!autoinc); - ut_ad(mtr->memo_contains_flagged(&index->lock, - MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + if (upper_rw_latch == RW_NO_LATCH) { + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE); + btr_block_get( + *index, page_id.page_no(), + latch_mode == BTR_CONT_MODIFY_TREE + ? RW_X_LATCH : RW_SX_LATCH, false, mtr, &err); + } else { + ut_ad(mtr->memo_contains_flagged(block, + upper_rw_latch)); + + if (latch_by_caller) { + ut_ad(latch_mode == BTR_SEARCH_TREE); + /* to exclude modifying tree operations + should sx-latch the index. */ + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_SX_LOCK)); + /* because has sx-latch of index, + can release upper blocks. */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } - const ulint zip_size= index->table->space->zip_size(); + if (page_mode <= PAGE_CUR_LE) { + cursor->low_match = low_match; + cursor->up_match = up_match; + } + } else { + cursor->low_match = low_match; + cursor->low_bytes = low_bytes; + cursor->up_match = up_match; + cursor->up_bytes = up_bytes; - /* Start with the root page. */ - page_id_t page_id(index->table->space_id, index->page); - ulint height= ULINT_UNDEFINED; + if (autoinc) { + page_set_autoinc(tree_blocks[0], autoinc, mtr, false); + } -search_loop: - dberr_t err= DB_SUCCESS; - if (buf_block_t *b= - mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch))) - block= b; - else if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch, - block, BUF_GET, mtr, &err))) - { - if (err == DB_DECRYPTION_FAILED) - btr_decryption_failed(*index); - goto func_exit; - } +#ifdef BTR_CUR_HASH_ADAPT + /* We do a dirty read of btr_search_enabled here. We + will properly check btr_search_enabled again in + btr_search_build_page_hash_index() before building a + page hash index, while holding search latch. */ + if (!btr_search_enabled) { + } else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) { + /* This may be a search tuple for + btr_pcur_t::restore_position(). */ + ut_ad(tuple->is_metadata() + || (tuple->is_metadata(tuple->info_bits + ^ REC_STATUS_INSTANT))); + } else if (index->is_spatial()) { + } else if (index->table->is_temporary()) { + } else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) { + /* Only user records belong in the adaptive + hash index. */ + } else { + btr_search_info_update(index, cursor); + } +#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + } + + /* For spatial index, remember what blocks are still latched */ + if (dict_index_is_spatial(index) + && (latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_LEAF)) { + for (ulint i = 0; i < n_releases; i++) { + cursor->rtr_info->tree_blocks[i] = NULL; + cursor->rtr_info->tree_savepoints[i] = 0; + } -#ifdef UNIV_ZIP_DEBUG - if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) - ut_a(page_zip_validate(page_zip, block->page.frame, index)); -#endif /* UNIV_ZIP_DEBUG */ + for (ulint i = n_releases; i <= n_blocks; i++) { + cursor->rtr_info->tree_blocks[i] = tree_blocks[i]; + cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i]; + } + } - if (!!page_is_comp(block->page.frame) != index->table->not_redundant() || - btr_page_get_index_id(block->page.frame) != index->id || - fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || - !fil_page_index_page_check(block->page.frame)) - { - corrupted: - err= DB_CORRUPTION; - func_exit: - if (UNIV_LIKELY_NULL(heap)) - mem_heap_free(heap); - return err; - } +func_exit: - const uint32_t page_level= btr_page_get_level(block->page.frame); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } - if (height == ULINT_UNDEFINED) - { - /* We are in the root node */ - height= page_level; - if (!height) - goto corrupted; - cursor->tree_height= height + 1; - } - else if (height != ulint{page_level}) - goto corrupted; - - cursor->page_cur.block= block; - - /* Search for complete index fields. */ - if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &cursor->up_match, - &cursor->low_match, &cursor->page_cur, - nullptr)) - goto corrupted; - - /* If this is the desired level, leave the loop */ - if (level == height) - goto func_exit; - - ut_ad(height > level); - height--; - - offsets = rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, - ULINT_UNDEFINED, &heap); - /* Go to the child node */ - page_id.set_page_no(btr_node_ptr_get_child_page_no(cursor->page_cur.rec, - offsets)); - block= nullptr; - goto search_loop; + ut_free(prev_tree_blocks); + ut_free(prev_tree_savepoints); + + if (mbr_adj) { + /* remember that we will need to adjust parent MBR */ + cursor->rtr_info->mbr_adj = true; + } + + DBUG_RETURN(err); } dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode, mtr_t *mtr) { + ulint node_ptr_max_size= srv_page_size / 2; btr_intention_t lock_intention; ulint n_blocks= 0; mem_heap_t *heap= nullptr; @@ -1860,21 +2424,29 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, rec_offs_init(offsets_); const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; - latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED); + latch_mode = btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED); lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + /* This function doesn't need to lock left page of the leaf page */ + if (latch_mode == BTR_SEARCH_PREV) + latch_mode= BTR_SEARCH_LEAF; + else if (latch_mode == BTR_MODIFY_PREV) + latch_mode= BTR_MODIFY_LEAF; + /* Store the position of the tree latch we push to mtr so that we know how to release it when we have latched the leaf node */ auto savepoint= mtr->get_savepoint(); rw_lock_type_t upper_rw_latch= RW_X_LATCH; - ulint node_ptr_max_size= 0; - if (latch_mode == BTR_MODIFY_TREE) - { - node_ptr_max_size= btr_node_ptr_max_size(index); + switch (latch_mode) { + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + abort(); + break; + case BTR_MODIFY_TREE: /* Most of delete-intended operations are purging. Free blocks and read IO bandwidth should be prioritized for them, when the history list is growing huge. */ @@ -1885,35 +2457,32 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, mtr_x_lock_index(index, mtr); else mtr_sx_lock_index(index, mtr); - } - else - { - static_assert(int{BTR_CONT_MODIFY_TREE} == (12 | BTR_MODIFY_LEAF), ""); - ut_ad(!(latch_mode & 8)); - /* This function doesn't need to lock left page of the leaf page */ - static_assert(int{BTR_SEARCH_PREV} == (4 | BTR_SEARCH_LEAF), ""); - static_assert(int{BTR_MODIFY_PREV} == (4 | BTR_MODIFY_LEAF), ""); - latch_mode= btr_latch_mode(latch_mode & ~4); + break; + default: ut_ad(!latch_by_caller || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK)); upper_rw_latch= RW_S_LATCH; - if (!latch_by_caller) - { - savepoint++; - mtr_s_lock_index(index, mtr); - } + if (latch_by_caller) + break; + ut_ad(latch_mode != BTR_SEARCH_TREE); + savepoint++; + mtr_s_lock_index(index, mtr); } ut_ad(savepoint == mtr->get_savepoint()); - const rw_lock_type_t root_leaf_rw_latch= rw_lock_type_t(latch_mode & ~12); + const rw_lock_type_t root_leaf_rw_latch= + btr_cur_latch_for_root_leaf(latch_mode); page_cur.index = index; uint32_t page= index->page; const auto zip_size= index->table->space->zip_size(); + if (root_leaf_rw_latch == RW_X_LATCH) + node_ptr_max_size= btr_node_ptr_max_size(index); + for (ulint height= ULINT_UNDEFINED;;) { ut_ad(n_blocks < BTR_MAX_LEVELS); @@ -1962,15 +2531,20 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, reached_leaf: const auto leaf_savepoint= mtr->get_savepoint(); ut_ad(leaf_savepoint); - ut_ad(block == mtr->at_savepoint(leaf_savepoint - 1)); if (rw_latch == RW_NO_LATCH) - btr_cur_latch_leaves(leaf_savepoint - 1, latch_mode, this, mtr); + btr_cur_latch_leaves(block, latch_mode, this, mtr); - if (latch_mode != BTR_MODIFY_TREE) + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + break; + default: /* Release index->lock if needed, and the non-leaf pages. */ mtr->rollback_to_savepoint(savepoint - !latch_by_caller, leaf_savepoint - 1); + } break; } } @@ -4095,15 +4669,16 @@ btr_cur_pessimistic_update( } } -#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled - if (!big_rec_vec + if (!srv_read_only_mode + && !big_rec_vec && page_is_leaf(block->page.frame) && !dict_index_is_online_ddl(index)) { +#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled mtr->release(index->lock); +#endif /* NOTE: We cannot release root block latch here, because it has segment header and already modified in most of cases.*/ } -#endif err = DB_SUCCESS; goto return_after_reservations; @@ -4845,14 +5420,15 @@ return_after_reservations: err_exit: mem_heap_free(heap); -#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled - if (page_is_leaf(page) + if (!srv_read_only_mode + && page_is_leaf(page) && !dict_index_is_online_ddl(index)) { +#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled mtr->release(index->lock); +#endif /* NOTE: We cannot release root block latch here, because it has segment header and already modified in most of cases.*/ } -#endif index->table->space->release_free_extents(n_reserved); return(ret); @@ -4969,18 +5545,16 @@ public: buf_block_t *parent_block= m_block; ulint parent_savepoint= m_savepoint; + m_savepoint= mtr_set_savepoint(&mtr); m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH, !level, &mtr, nullptr); - if (!m_block) - return false; if (parent_block && parent_block != right_parent) - mtr.rollback_to_savepoint(parent_savepoint, parent_savepoint + 1); - - m_savepoint= mtr.get_savepoint() - 1; + mtr_release_block_at_savepoint(&mtr, parent_savepoint, parent_block); - return level == ULINT_UNDEFINED || - btr_page_get_level(m_block->page.frame) == level; + return m_block && + (level == ULINT_UNDEFINED || + btr_page_get_level(buf_block_get_frame(m_block)) == level); } /** Sets page mode for leaves */ @@ -5187,18 +5761,14 @@ static ha_rows btr_estimate_n_rows_in_range_on_level( buf_block_t *prev_block= block; ulint prev_savepoint= savepoint; - savepoint= mtr.get_savepoint(); + savepoint= mtr_set_savepoint(&mtr); /* Fetch the page. */ block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, !level, &mtr, nullptr); if (prev_block) - { - mtr.rollback_to_savepoint(prev_savepoint, prev_savepoint + 1); - if (block) - savepoint--; - } + mtr_release_block_at_savepoint(&mtr, prev_savepoint, prev_block); if (!block || btr_page_get_level(buf_block_get_frame(block)) != level) goto inexact; @@ -5227,20 +5797,14 @@ static ha_rows btr_estimate_n_rows_in_range_on_level( } while (page_id.page_no() != right_page_no); if (block) - { - ut_ad(block == mtr.at_savepoint(savepoint)); - mtr.rollback_to_savepoint(savepoint, savepoint + 1); - } + mtr_release_block_at_savepoint(&mtr, savepoint, block); return (n_rows); inexact: if (block) - { - ut_ad(block == mtr.at_savepoint(savepoint)); - mtr.rollback_to_savepoint(savepoint, savepoint + 1); - } + mtr_release_block_at_savepoint(&mtr, savepoint, block); is_n_rows_exact= false; @@ -5299,7 +5863,9 @@ ha_rows btr_estimate_n_rows_in_range(dict_index_t *index, mtr.start(); - ut_ad(mtr.get_savepoint() == 0); + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + ulint savepoint= mtr_set_savepoint(&mtr); mtr_s_lock_index(index, &mtr); ha_rows table_n_rows= dict_table_get_n_rows(index->table); @@ -5354,10 +5920,10 @@ search_loop: } if (height == 0) - /* There is no need to release non-leaf pages here as they must already be + /* There is no need to unlach non-leaf pages here as they must already be unlatched in btr_est_cur_t::fetch_child(). Try to search on pages after - releasing the index latch, to decrease contention. */ - mtr.rollback_to_savepoint(0, 1); + index->lock unlatching to decrease contention. */ + mtr_release_s_latch_at_savepoint(&mtr, savepoint, &index->lock); /* There is no need to search on left page if divergence_height != ULINT_UNDEFINED, as it was already searched before diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc index 4e0a7d1f86a..76b173359da 100644 --- a/storage/innobase/btr/btr0defragment.cc +++ b/storage/innobase/btr/btr0defragment.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved. -Copyright (C) 2014, 2023, MariaDB Corporation. +Copyright (C) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -280,70 +280,6 @@ btr_defragment_calc_n_recs_for_size( return n_recs; } -MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) -/************************************************************//** -Returns the upper level node pointer to a page. It is assumed that mtr holds -an sx-latch on the tree. -@return rec_get_offsets() of the node pointer record */ -static -rec_offs* -btr_page_search_father_node_ptr( - rec_offs* offsets,/*!< in: work area for the return value */ - mem_heap_t* heap, /*!< in: memory heap to use */ - btr_cur_t* cursor, /*!< in: cursor pointing to user record, - out: cursor on node pointer record, - its page x-latched */ - mtr_t* mtr) /*!< in: mtr */ -{ - const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no(); - dict_index_t* index = btr_cur_get_index(cursor); - ut_ad(!index->is_spatial()); - - ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK - | MTR_MEMO_SX_LOCK)); - ut_ad(dict_index_get_page(index) != page_no); - - const auto level = btr_page_get_level(btr_cur_get_page(cursor)); - - const rec_t* user_rec = btr_cur_get_rec(cursor); - ut_a(page_rec_is_user_rec(user_rec)); - - if (btr_cur_search_to_nth_level(level + 1, - dict_index_build_node_ptr(index, - user_rec, 0, - heap, level), - RW_X_LATCH, - cursor, mtr) != DB_SUCCESS) { - return nullptr; - } - - const rec_t* node_ptr = btr_cur_get_rec(cursor); - ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive() - || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); - - offsets = rec_get_offsets(node_ptr, index, offsets, 0, - ULINT_UNDEFINED, &heap); - - if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { - offsets = nullptr; - } - - return(offsets); -} - -static bool btr_page_search_father(mtr_t *mtr, btr_cur_t *cursor) -{ - rec_t *rec= - page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); - if (UNIV_UNLIKELY(!rec)) - return false; - cursor->page_cur.rec= rec; - mem_heap_t *heap= mem_heap_create(100); - const bool got= btr_page_search_father_node_ptr(nullptr, heap, cursor, mtr); - mem_heap_free(heap); - return got; -} - /*************************************************************//** Merge as many records from the from_block to the to_block. Delete the from_block if all records are successfully merged to to_block. @@ -472,7 +408,7 @@ btr_defragment_merge_pages( parent.page_cur.index = index; parent.page_cur.block = from_block; - if (!btr_page_search_father(mtr, &parent)) { + if (!btr_page_get_father(mtr, &parent)) { to_block = nullptr; } else if (n_recs_to_move == n_recs) { /* The whole page is merged with the previous page, diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc index 68699ede469..d731bcbb893 100644 --- a/storage/innobase/btr/btr0pcur.cc +++ b/storage/innobase/btr/btr0pcur.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2023, MariaDB Corporation. +Copyright (c) 2016, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -212,100 +212,24 @@ btr_pcur_copy_stored_position( pcur_receive->old_n_fields = pcur_donate->old_n_fields; } -/** Optimistically latches the leaf page or pages requested. -@param[in] block guessed buffer block -@param[in,out] latch_mode BTR_SEARCH_LEAF, ... -@param[in,out] pcur cursor -@param[in,out] mtr mini-transaction -@return true if success */ -TRANSACTIONAL_TARGET -static bool btr_pcur_optimistic_latch_leaves(buf_block_t *block, - btr_pcur_t *pcur, - btr_latch_mode *latch_mode, - mtr_t *mtr) -{ - ut_ad(block->page.buf_fix_count()); - ut_ad(block->page.in_file()); - ut_ad(block->page.frame); - - static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, ""); - static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, ""); - static_assert((BTR_SEARCH_PREV ^ BTR_MODIFY_PREV) == - (RW_S_LATCH ^ RW_X_LATCH), ""); - - const rw_lock_type_t mode= - rw_lock_type_t(*latch_mode & (RW_X_LATCH | RW_S_LATCH)); - - switch (*latch_mode) { - default: - ut_ad(*latch_mode == BTR_SEARCH_LEAF || *latch_mode == BTR_MODIFY_LEAF); - return buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr); - case BTR_SEARCH_PREV: /* btr_pcur_move_backward_from_page() */ - case BTR_MODIFY_PREV: /* Ditto, or ibuf_insert() */ - page_id_t id{0}; - uint32_t left_page_no; - ulint zip_size; - { - transactional_shared_lock_guard<block_lock> g{block->page.lock}; - if (block->modify_clock != pcur->modify_clock) - return false; - id= block->page.id(); - zip_size= block->zip_size(); - left_page_no= btr_page_get_prev(block->page.frame); - } - - if (left_page_no != FIL_NULL) - { - pcur->btr_cur.left_block= - buf_page_get_gen(page_id_t(id.space(), left_page_no), zip_size, - mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr); - - if (pcur->btr_cur.left_block && - btr_page_get_next(pcur->btr_cur.left_block->page.frame) != - id.page_no()) - { -release_left_block: - mtr->release_last_page(); - return false; - } - } - else - pcur->btr_cur.left_block= nullptr; - - if (buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr)) - { - if (btr_page_get_prev(block->page.frame) == left_page_no) - { - /* block was already buffer-fixed while entering the function and - buf_page_optimistic_get() buffer-fixes it again. */ - ut_ad(2 <= block->page.buf_fix_count()); - *latch_mode= btr_latch_mode(mode); - return true; - } - - mtr->release_last_page(); - } - - ut_ad(block->page.buf_fix_count()); - if (pcur->btr_cur.left_block) - goto release_left_block; - return false; - } -} - /** Structure acts as functor to do the latching of leaf pages. It returns true if latching of leaf pages succeeded and false otherwise. */ struct optimistic_latch_leaves { btr_pcur_t *const cursor; - btr_latch_mode *const latch_mode; + btr_latch_mode *latch_mode; mtr_t *const mtr; + optimistic_latch_leaves(btr_pcur_t *cursor, btr_latch_mode *latch_mode, + mtr_t *mtr) + : cursor(cursor), latch_mode(latch_mode), mtr(mtr) {} + bool operator() (buf_block_t *hint) const { - return hint && - btr_pcur_optimistic_latch_leaves(hint, cursor, latch_mode, mtr); + return hint && btr_cur_optimistic_latch_leaves( + hint, cursor->modify_clock, latch_mode, + btr_pcur_get_btr_cur(cursor), mtr); } }; @@ -379,8 +303,8 @@ btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr) /* Try optimistic restoration. */ if (block_when_stored.run_with_hint( - optimistic_latch_leaves{this, &restore_latch_mode, - mtr})) { + optimistic_latch_leaves(this, &restore_latch_mode, + mtr))) { pos_state = BTR_PCUR_IS_POSITIONED; latch_mode = restore_latch_mode; @@ -541,9 +465,18 @@ btr_pcur_move_to_next_page( return DB_CORRUPTION; } + ulint mode = cursor->latch_mode; + switch (mode) { + case BTR_SEARCH_TREE: + mode = BTR_SEARCH_LEAF; + break; + case BTR_MODIFY_TREE: + mode = BTR_MODIFY_LEAF; + } + dberr_t err; buf_block_t* next_block = btr_block_get( - *cursor->index(), next_page_no, cursor->latch_mode & ~12, + *cursor->index(), next_page_no, mode, page_is_leaf(page), mtr, &err); if (UNIV_UNLIKELY(!next_block)) { diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc index a1609248512..fc890f9233b 100644 --- a/storage/innobase/btr/btr0sea.cc +++ b/storage/innobase/btr/btr0sea.cc @@ -1055,24 +1055,26 @@ btr_search_guess_on_hash( index_id_t index_id; ut_ad(mtr->is_active()); - ut_ad(index->is_btree() || index->is_ibuf()); - /* Note that, for efficiency, the struct info may not be protected by - any latch here! */ - - if (latch_mode > BTR_MODIFY_LEAF - || !info->last_hash_succ || !info->n_hash_potential - || (tuple->info_bits & REC_INFO_MIN_REC_FLAG)) { + if (!btr_search_enabled) { return false; } - ut_ad(index->is_btree()); - ut_ad(!index->table->is_temporary()); - + ut_ad(!index->is_ibuf()); ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF); compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH}); compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH}); + /* Not supported for spatial index */ + ut_ad(!dict_index_is_spatial(index)); + + /* Note that, for efficiency, the struct info may not be protected by + any latch here! */ + + if (info->n_hash_potential == 0) { + return false; + } + cursor->n_fields = info->n_fields; cursor->n_bytes = info->n_bytes; |