Diffstat (limited to 'storage/xtradb/btr')
-rw-r--r-- | storage/xtradb/btr/btr0btr.c  | 3789
-rw-r--r-- | storage/xtradb/btr/btr0cur.c  | 5256
-rw-r--r-- | storage/xtradb/btr/btr0pcur.c |  606
-rw-r--r-- | storage/xtradb/btr/btr0sea.c  | 2032
4 files changed, 11683 insertions, 0 deletions
diff --git a/storage/xtradb/btr/btr0btr.c b/storage/xtradb/btr/btr0btr.c
new file mode 100644
index 00000000000..ff047095aa4
--- /dev/null
+++ b/storage/xtradb/btr/btr0btr.c
@@ -0,0 +1,3789 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0btr.c
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+#ifdef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "page0page.h"
+#include "page0zip.h"
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0trx.h"
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+A tree latch protects all non-leaf nodes of the tree. Each node of a tree
+also has a latch of its own.
+
+A B-tree operation normally first acquires an S-latch on the tree. It
+searches down the tree and releases the tree latch when it has the
+leaf node latch. To save CPU time we do not acquire any latch on
+non-leaf nodes of the tree during a search; those pages are only
+buffer-fixed.
+
+If an operation needs to restructure the tree, it acquires an X-latch on
+the tree before searching to a leaf node. If it needs, for example, to
+split a leaf,
+(1) InnoDB decides the split point in the leaf,
+(2) allocates a new page,
+(3) inserts the appropriate node pointer to the first non-leaf level,
+(4) releases the tree X-latch,
+(5) and then moves records from the leaf to the newly allocated page.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree. On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. In the child page we can store node pointers or index records
+which are >= P in alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
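+
+As a purely hypothetical illustration: if a level-1 page holds node
+pointers (P = "karen", child page 5) and (P1 = "pete", child page 6),
+then child page 5 may store records r with "karen" <= r < "pete"; if
+page 5 is a leaf, it need not contain a record equal to "karen" once
+deletions have taken place.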
+ +We have predefined a special minimum record which we +define as the smallest record in any alphabetical order. +A minimum record is denoted by setting a bit in the record +header. A minimum record acts as the prefix of a node pointer +which points to a leftmost node on any level of the tree. + +File page allocation +-------------------- +In the root node of a B-tree there are two file segment headers. +The leaf pages of a tree are allocated from one file segment, to +make them consecutive on disk if possible. From the other file segment +we allocate pages for the non-leaf levels of the tree. +*/ + +#ifdef UNIV_BTR_DEBUG +/**************************************************************//** +Checks a file segment header within a B-tree root page. +@return TRUE if valid */ +static +ibool +btr_root_fseg_validate( +/*===================*/ + const fseg_header_t* seg_header, /*!< in: segment header */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space); + ut_a(offset >= FIL_PAGE_DATA); + ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END); + return(TRUE); +} +#endif /* UNIV_BTR_DEBUG */ + +/**************************************************************//** +Gets the root node of a tree and x-latches it. +@return root page, x-latched */ +static +buf_block_t* +btr_root_block_get( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + ulint root_page_no; + buf_block_t* block; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + root_page_no = dict_index_get_page(index); + + block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr); + + if (srv_pass_corrupt_table && !block) { + return(0); + } + ut_a(block); + + ut_a((ibool)!!page_is_comp(buf_block_get_frame(block)) + == dict_table_is_comp(index->table)); +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } +#endif /* UNIV_BTR_DEBUG */ + + return(block); +} + +/**************************************************************//** +Gets the root node of a tree and x-latches it. +@return root page, x-latched */ +UNIV_INTERN +page_t* +btr_root_get( +/*=========*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + return(buf_block_get_frame(btr_root_block_get(index, mtr))); +} + +/*************************************************************//** +Gets pointer to the previous user record in the tree. It is assumed that +the caller has appropriate latches on the page and its neighbor. 
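+A minimal, hypothetical caller sketch, assuming the mtr already holds
+latches on the page of rec and, when needed, on its left neighbor:
+
+	rec_t*	prev = btr_get_prev_user_rec(rec, &mtr);
+
+where a NULL result means rec was the first user record in the tree.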
+@return previous user record, NULL if there is none */ +UNIV_INTERN +rec_t* +btr_get_prev_user_rec( +/*==================*/ + rec_t* rec, /*!< in: record on leaf level */ + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if + needed, also to the previous page */ +{ + page_t* page; + page_t* prev_page; + ulint prev_page_no; + + if (!page_rec_is_infimum(rec)) { + + rec_t* prev_rec = page_rec_get_prev(rec); + + if (!page_rec_is_infimum(prev_rec)) { + + return(prev_rec); + } + } + + page = page_align(rec); + prev_page_no = btr_page_get_prev(page, mtr); + + if (prev_page_no != FIL_NULL) { + + ulint space; + ulint zip_size; + buf_block_t* prev_block; + + space = page_get_space_id(page); + zip_size = fil_space_get_zip_size(space); + + prev_block = buf_page_get_with_no_latch(space, zip_size, + prev_page_no, mtr); + prev_page = buf_block_get_frame(prev_block); + /* The caller must already have a latch to the brother */ + ut_ad(mtr_memo_contains(mtr, prev_block, + MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains(mtr, prev_block, + MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_page) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + return(page_rec_get_prev(page_get_supremum_rec(prev_page))); + } + + return(NULL); +} + +/*************************************************************//** +Gets pointer to the next user record in the tree. It is assumed that the +caller has appropriate latches on the page and its neighbor. +@return next user record, NULL if there is none */ +UNIV_INTERN +rec_t* +btr_get_next_user_rec( +/*==================*/ + rec_t* rec, /*!< in: record on leaf level */ + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if + needed, also to the next page */ +{ + page_t* page; + page_t* next_page; + ulint next_page_no; + + if (!page_rec_is_supremum(rec)) { + + rec_t* next_rec = page_rec_get_next(rec); + + if (!page_rec_is_supremum(next_rec)) { + + return(next_rec); + } + } + + page = page_align(rec); + next_page_no = btr_page_get_next(page, mtr); + + if (next_page_no != FIL_NULL) { + ulint space; + ulint zip_size; + buf_block_t* next_block; + + space = page_get_space_id(page); + zip_size = fil_space_get_zip_size(space); + + next_block = buf_page_get_with_no_latch(space, zip_size, + next_page_no, mtr); + next_page = buf_block_get_frame(next_block); + /* The caller must already have a latch to the brother */ + ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains(mtr, next_block, + MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + return(page_rec_get_next(page_get_infimum_rec(next_page))); + } + + return(NULL); +} + +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
*/ +static +void +btr_page_create( +/*============*/ + buf_block_t* block, /*!< in/out: page to be created */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_create_zip(block, index, level, mtr); + } else { + page_create(block, mtr, dict_table_is_comp(index->table)); + /* Set the level of the new index page */ + btr_page_set_level(page, NULL, level, mtr); + } + + block->check_index_page_at_flush = TRUE; + + btr_page_set_index_id(page, page_zip, index->id, mtr); +} + +/**************************************************************//** +Allocates a new file page to be used in an ibuf tree. Takes the page from +the free list of the tree, which must contain pages! +@return new allocated block, x-latched */ +static +buf_block_t* +btr_page_alloc_for_ibuf( +/*====================*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + fil_addr_t node_addr; + page_t* root; + page_t* new_page; + buf_block_t* new_block; + + root = btr_root_get(index, mtr); + + node_addr = flst_get_first(root + PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST, mtr); + ut_a(node_addr.page != FIL_NULL); + + new_block = buf_page_get(dict_index_get_space(index), + dict_table_zip_size(index->table), + node_addr.page, RW_X_LATCH, mtr); + new_page = buf_block_get_frame(new_block); + buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + + flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, + mtr); + ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + mtr)); + + return(new_block); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@return new allocated block, x-latched; NULL if out of space */ +UNIV_INTERN +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + ulint hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + fseg_header_t* seg_header; + page_t* root; + buf_block_t* new_block; + ulint new_page_no; + + if (dict_index_is_ibuf(index)) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + root = btr_root_get(index, mtr); + + if (level == 0) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + } else { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + } + + /* Parameter TRUE below states that the caller has made the + reservation for free extents, and thus we know that a page can + be allocated: */ + + new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, + file_direction, TRUE, mtr); + if (new_page_no == FIL_NULL) { + + return(NULL); + } + + new_block = buf_page_get(dict_index_get_space(index), + dict_table_zip_size(index->table), + new_page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + + return(new_block); +} + +/**************************************************************//** +Gets the number of pages in a B-tree. 
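+For example, a hypothetical caller sampling index size statistics:
+
+	ulint	n_leaf	= btr_get_size(index, BTR_N_LEAF_PAGES);
+	ulint	n_total	= btr_get_size(index, BTR_TOTAL_SIZE);
+
+Each call starts and commits its own mini-transaction internally.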
+@return number of pages */ +UNIV_INTERN +ulint +btr_get_size( +/*=========*/ + dict_index_t* index, /*!< in: index */ + ulint flag) /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ +{ + fseg_header_t* seg_header; + page_t* root; + ulint n; + ulint dummy; + mtr_t mtr; + + mtr_start(&mtr); + + mtr_s_lock(dict_index_get_lock(index), &mtr); + + root = btr_root_get(index, &mtr); + + if (srv_pass_corrupt_table && !root) { + mtr_commit(&mtr); + return(0); + } + ut_a(root); + + if (flag == BTR_N_LEAF_PAGES) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fseg_n_reserved_pages(seg_header, &n, &mtr); + + } else if (flag == BTR_TOTAL_SIZE) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + n = fseg_n_reserved_pages(seg_header, &dummy, &mtr); + + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + n += fseg_n_reserved_pages(seg_header, &dummy, &mtr); + } else { + ut_error; + } + + mtr_commit(&mtr); + + return(n); +} + +/**************************************************************//** +Frees a page used in an ibuf tree. Puts the page to the free list of the +ibuf tree. */ +static +void +btr_page_free_for_ibuf( +/*===================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* root; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + root = btr_root_get(index, mtr); + + flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + buf_block_get_frame(block) + + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + + ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + mtr)); +} + +/**************************************************************//** +Frees a file page used in an index tree. Can be used also to (BLOB) +external storage pages, because the page level 0 can be given as an +argument. */ +UNIV_INTERN +void +btr_page_free_low( +/*==============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + ulint level, /*!< in: page level */ + mtr_t* mtr) /*!< in: mtr */ +{ + fseg_header_t* seg_header; + page_t* root; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* The page gets invalid for optimistic searches: increment the frame + modify clock */ + + buf_block_modify_clock_inc(block); + + if (dict_index_is_ibuf(index)) { + + btr_page_free_for_ibuf(index, block, mtr); + + return; + } + + root = btr_root_get(index, mtr); + + if (level == 0) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + } else { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + } + + fseg_free_page(seg_header, + buf_block_get_space(block), + buf_block_get_page_no(block), mtr); +} + +/**************************************************************//** +Frees a file page used in an index tree. NOTE: cannot free field external +storage pages because the page must contain info on its level. */ +UNIV_INTERN +void +btr_page_free( +/*==========*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint level; + + level = btr_page_get_level(buf_block_get_frame(block), mtr); + + btr_page_free_low(index, block, level, mtr); +} + +/**************************************************************//** +Sets the child node file address in a node pointer. 
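+For instance, btr_attach_half_pages() below relies on this to repoint
+the parent's node pointer at the new lower half page when a page is
+split downwards (FSP_DOWN).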
*/ +UNIV_INLINE +void +btr_node_ptr_set_child_page_no( +/*===========================*/ + rec_t* rec, /*!< in: node pointer record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint page_no,/*!< in: child node address */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* field; + ulint len; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!page_is_leaf(page_align(rec))); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + /* The child address is in the last field */ + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); + + ut_ad(len == REC_NODE_PTR_SIZE); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_node_ptr(page_zip, rec, + rec_offs_data_size(offsets), + page_no, mtr); + } else { + mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr); + } +} + +/************************************************************//** +Returns the child page of a node pointer and x-latches it. +@return child page, x-latched */ +static +buf_block_t* +btr_node_ptr_get_child( +/*===================*/ + const rec_t* node_ptr,/*!< in: node pointer */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_no; + ulint space; + + ut_ad(rec_offs_validate(node_ptr, index, offsets)); + space = page_get_space_id(page_align(node_ptr)); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + + return(btr_block_get(space, dict_table_zip_size(index->table), + page_no, RW_X_LATCH, mtr)); +} + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. 
+@return rec_get_offsets() of the node pointer record */ +static +ulint* +btr_page_get_father_node_ptr_func( +/*==============================*/ + ulint* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* tuple; + rec_t* user_rec; + rec_t* node_ptr; + ulint level; + ulint page_no; + dict_index_t* index; + + page_no = buf_block_get_page_no(btr_cur_get_block(cursor)); + index = btr_cur_get_index(cursor); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + + ut_ad(dict_index_get_page(index) != page_no); + + level = btr_page_get_level(btr_cur_get_page(cursor), mtr); + user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level); + + btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE, + BTR_CONT_MODIFY_TREE, cursor, 0, + file, line, mtr); + + node_ptr = btr_cur_get_rec(cursor); + ut_ad(!page_rec_is_comp(node_ptr) + || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + + if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets) + != page_no)) { + rec_t* print_rec; + fputs("InnoDB: Dump of the child page:\n", stderr); + buf_page_print(page_align(user_rec), 0); + fputs("InnoDB: Dump of the parent page:\n", stderr); + buf_page_print(page_align(node_ptr), 0); + + fputs("InnoDB: Corruption of an index tree: table ", stderr); + ut_print_name(stderr, NULL, TRUE, index->table_name); + fputs(", index ", stderr); + ut_print_name(stderr, NULL, FALSE, index->name); + fprintf(stderr, ",\n" + "InnoDB: father ptr page no %lu, child page no %lu\n", + (ulong) + btr_node_ptr_get_child_page_no(node_ptr, offsets), + (ulong) page_no); + print_rec = page_rec_get_next( + page_get_infimum_rec(page_align(user_rec))); + offsets = rec_get_offsets(print_rec, index, + offsets, ULINT_UNDEFINED, &heap); + page_rec_print(print_rec, offsets); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(node_ptr, offsets); + + fputs("InnoDB: You should dump + drop + reimport the table" + " to fix the\n" + "InnoDB: corruption. If the crash happens at " + "the database startup, see\n" + "InnoDB: " REFMAN "forcing-recovery.html about\n" + "InnoDB: forcing recovery. " + "Then dump + drop + reimport.\n", stderr); + + ut_error; + } + + return(offsets); +} + +#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \ + btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr) + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. 
+@return rec_get_offsets() of the node pointer record */ +static +ulint* +btr_page_get_father_block( +/*======================*/ + ulint* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + dict_index_t* index, /*!< in: b-tree index */ + buf_block_t* block, /*!< in: child page in the index */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr)); +} + +/************************************************************//** +Seeks to the upper level node pointer to a page. +It is assumed that mtr holds an x-latch on the tree. */ +static +void +btr_page_get_father( +/*================*/ + dict_index_t* index, /*!< in: b-tree index */ + buf_block_t* block, /*!< in: child page in the index */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + mem_heap_t* heap; + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + + heap = mem_heap_create(100); + btr_page_get_father_node_ptr(NULL, heap, cursor, mtr); + mem_heap_free(heap); +} + +/************************************************************//** +Creates the root node for a new index tree. +@return page number of the created root, FIL_NULL if did not succeed */ +UNIV_INTERN +ulint +btr_create( +/*=======*/ + ulint type, /*!< in: type of the index */ + ulint space, /*!< in: space where created */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + dulint index_id,/*!< in: index id */ + dict_index_t* index, /*!< in: index */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint page_no; + buf_block_t* block; + buf_frame_t* frame; + page_t* page; + page_zip_des_t* page_zip; + + /* Create the two new segments (one, in the case of an ibuf tree) for + the index tree; the segment headers are put on the allocated root page + (for an ibuf tree, not in the root, but on a separate ibuf header + page) */ + + if (type & DICT_IBUF) { + /* Allocate first the ibuf header page */ + buf_block_t* ibuf_hdr_block = fseg_create( + space, 0, + IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr); + + buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW); + + ut_ad(buf_block_get_page_no(ibuf_hdr_block) + == IBUF_HEADER_PAGE_NO); + /* Allocate then the next page to the segment: it will be the + tree root page */ + + page_no = fseg_alloc_free_page(buf_block_get_frame( + ibuf_hdr_block) + + IBUF_HEADER + + IBUF_TREE_SEG_HEADER, + IBUF_TREE_ROOT_PAGE_NO, + FSP_UP, mtr); + ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO); + + block = buf_page_get(space, zip_size, page_no, + RW_X_LATCH, mtr); + } else { + block = fseg_create(space, 0, + PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr); + } + + if (block == NULL) { + + return(FIL_NULL); + } + + page_no = buf_block_get_page_no(block); + frame = buf_block_get_frame(block); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + + if (type & DICT_IBUF) { + /* It is an insert buffer tree: initialize the free list */ + + ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO); + + flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); + } else { + /* It is a non-ibuf tree: create a file segment for leaf + pages */ + if (!fseg_create(space, page_no, + 
PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) { + /* Not enough space for new segment, free root + segment before return. */ + btr_free_root(space, zip_size, page_no, mtr); + + return(FIL_NULL); + } + + /* The fseg create acquires a second latch on the page, + therefore we must declare it: */ + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + } + + /* Create a new index page on the allocated segment page */ + page_zip = buf_block_get_page_zip(block); + + if (UNIV_LIKELY_NULL(page_zip)) { + page = page_create_zip(block, index, 0, mtr); + } else { + page = page_create(block, mtr, + dict_table_is_comp(index->table)); + /* Set the level of the new index page */ + btr_page_set_level(page, NULL, 0, mtr); + } + + block->check_index_page_at_flush = TRUE; + + /* Set the index id of the page */ + btr_page_set_index_id(page, page_zip, index_id, mtr); + + /* Set the next node and previous node fields */ + btr_page_set_next(page, page_zip, FIL_NULL, mtr); + btr_page_set_prev(page, page_zip, FIL_NULL, mtr); + + /* We reset the free bits for the page to allow creation of several + trees in the same mtr, otherwise the latch on a bitmap page would + prevent it because of the latching order */ + + if (!(type & DICT_CLUSTERED)) { + ibuf_reset_free_bits(block); + } + + /* In the following assertion we test that two records of maximum + allowed size fit on the root page: this fact is needed to ensure + correctness of split algorithms */ + + ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE); + + return(page_no); +} + +/************************************************************//** +Frees a B-tree except the root page, which MUST be freed after this +by calling btr_free_root. */ +UNIV_INTERN +void +btr_free_but_not_root( +/*==================*/ + ulint space, /*!< in: space where created */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no) /*!< in: root page number */ +{ + ibool finished; + page_t* root; + mtr_t mtr; + +leaf_loop: + mtr_start(&mtr); + + root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr); + + if (srv_pass_corrupt_table && !root) { + mtr_commit(&mtr); + return; + } + ut_a(root); + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); +#endif /* UNIV_BTR_DEBUG */ + + /* NOTE: page hash indexes are dropped when a page is freed inside + fsp0fsp. */ + + finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF, + &mtr); + mtr_commit(&mtr); + + if (!finished) { + + goto leaf_loop; + } +top_loop: + mtr_start(&mtr); + + root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr); + + if (srv_pass_corrupt_table && !root) { + mtr_commit(&mtr); + return; + } + ut_a(root); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); +#endif /* UNIV_BTR_DEBUG */ + + finished = fseg_free_step_not_header( + root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr); + mtr_commit(&mtr); + + if (!finished) { + + goto top_loop; + } +} + +/************************************************************//** +Frees the B-tree root page. Other tree MUST already have been freed. 
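+A sketch of the expected calling sequence (cf. dict_drop_index_tree()):
+
+	btr_free_but_not_root(space, zip_size, root_page_no);
+	mtr_start(&mtr);
+	btr_free_root(space, zip_size, root_page_no, &mtr);
+	mtr_commit(&mtr);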
*/ +UNIV_INTERN +void +btr_free_root( +/*==========*/ + ulint space, /*!< in: space where created */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no, /*!< in: root page number */ + mtr_t* mtr) /*!< in: a mini-transaction which has already + been started */ +{ + buf_block_t* block; + fseg_header_t* header; + + block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr); + + if (srv_pass_corrupt_table && !block) { + return; + } + ut_a(block); + + btr_search_drop_page_hash_index(block); + + header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP; +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(header, space)); +#endif /* UNIV_BTR_DEBUG */ + + while (!fseg_free_step(header, mtr)); +} +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Reorganizes an index page. */ +static +ibool +btr_page_reorganize_low( +/*====================*/ + ibool recovery,/*!< in: TRUE if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + buf_block_t* block, /*!< in: page to be reorganized */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + buf_block_t* temp_block; + page_t* temp_page; + ulint log_mode; + ulint data_size1; + ulint data_size2; + ulint max_ins_size1; + ulint max_ins_size2; + ibool success = FALSE; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + data_size1 = page_get_data_size(page); + max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1); + +#ifndef UNIV_HOTBACKUP + /* Write the log record */ + mlog_open_and_write_index(mtr, page, index, page_is_comp(page) + ? MLOG_COMP_PAGE_REORGANIZE + : MLOG_PAGE_REORGANIZE, 0); +#endif /* !UNIV_HOTBACKUP */ + + /* Turn logging off */ + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + +#ifndef UNIV_HOTBACKUP + temp_block = buf_block_alloc(0); +#else /* !UNIV_HOTBACKUP */ + ut_ad(block == back_block1); + temp_block = back_block2; +#endif /* !UNIV_HOTBACKUP */ + temp_page = temp_block->frame; + + /* Copy the old page to temporary space */ + buf_frame_copy(temp_page, page); + +#ifndef UNIV_HOTBACKUP + if (UNIV_LIKELY(!recovery)) { + btr_search_drop_page_hash_index(block); + } + + block->check_index_page_at_flush = TRUE; +#endif /* !UNIV_HOTBACKUP */ + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + page_create(block, mtr, dict_table_is_comp(index->table)); + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, temp_block, + page_get_infimum_rec(temp_page), + index, mtr); + + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) { + /* Copy max trx id to recreated page */ + trx_id_t max_trx_id = page_get_max_trx_id(temp_page); + page_set_max_trx_id(block, NULL, max_trx_id, mtr); + /* In crash recovery, dict_index_is_sec_or_ibuf() always + returns TRUE, even for clustered indexes. max_trx_id is + unused in clustered index pages. 
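+Hence the assertion below tolerates a zero max_trx_id during
+recovery only.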
*/ + ut_ad(!ut_dulint_is_zero(max_trx_id) || recovery); + } + + if (UNIV_LIKELY_NULL(page_zip) + && UNIV_UNLIKELY + (!page_zip_compress(page_zip, page, index, NULL))) { + + /* Restore the old page and exit. */ + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + /* Check that the bytes that we skip are identical. */ + ut_a(!memcmp(page, temp_page, PAGE_HEADER)); + ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page, + PAGE_HEADER + PAGE_N_RECS + temp_page, + PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS))); + ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page, + UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page, + FIL_PAGE_DATA_END)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page, + PAGE_N_RECS - PAGE_N_DIR_SLOTS); + memcpy(PAGE_DATA + page, PAGE_DATA + temp_page, + UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + goto func_exit; + } + +#ifndef UNIV_HOTBACKUP + if (UNIV_LIKELY(!recovery)) { + /* Update the record lock bitmaps */ + lock_move_reorganize_page(block, temp_block); + } +#endif /* !UNIV_HOTBACKUP */ + + data_size2 = page_get_data_size(page); + max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1); + + if (UNIV_UNLIKELY(data_size1 != data_size2) + || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) { + buf_page_print(page, 0); + buf_page_print(temp_page, 0); + fprintf(stderr, + "InnoDB: Error: page old data size %lu" + " new data size %lu\n" + "InnoDB: Error: page old max ins size %lu" + " new max ins size %lu\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + (unsigned long) data_size1, (unsigned long) data_size2, + (unsigned long) max_ins_size1, + (unsigned long) max_ins_size2); + } else { + success = TRUE; + } + +func_exit: +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ +#ifndef UNIV_HOTBACKUP + buf_block_free(temp_block); +#endif /* !UNIV_HOTBACKUP */ + + /* Restore logging mode */ + mtr_set_log_mode(mtr, log_mode); + + return(success); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Reorganizes an index page. +IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf +page of a non-clustered index, the caller must update the insert +buffer free bits in the same mini-transaction in such a way that the +modification will be redo-logged. +@return TRUE on success, FALSE on failure */ +UNIV_INTERN +ibool +btr_page_reorganize( +/*================*/ + buf_block_t* block, /*!< in: page to be reorganized */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + return(btr_page_reorganize_low(FALSE, block, index, mtr)); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a redo log record of reorganizing a page. 
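+The reorganize record carries no payload beyond its initial part, so
+the parse position is returned unchanged.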
+@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_parse_page_reorganize( +/*======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr __attribute__((unused)), + /*!< in: buffer end */ + dict_index_t* index, /*!< in: record descriptor */ + buf_block_t* block, /*!< in: page to be reorganized, or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + /* The record is empty, except for the record initial part */ + + if (UNIV_LIKELY(block != NULL)) { + btr_page_reorganize_low(TRUE, block, index, mtr); + } + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Empties an index page. @see btr_page_create(). */ +static +void +btr_page_empty( +/*===========*/ + buf_block_t* block, /*!< in: page to be emptied */ + page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index of the page */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_zip == buf_block_get_page_zip(block)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + btr_search_drop_page_hash_index(block); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + if (UNIV_LIKELY_NULL(page_zip)) { + page_create_zip(block, index, level, mtr); + } else { + page_create(block, mtr, dict_table_is_comp(index->table)); + btr_page_set_level(page, NULL, level, mtr); + } + + block->check_index_page_at_flush = TRUE; +} + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. 
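+
+In outline, the raise moves all records of the root to a freshly
+allocated page, empties the root, and leaves in the root a single node
+pointer (flagged REC_INFO_MIN_REC_FLAG) to that page; the tuple itself
+is then inserted by delegating to btr_page_split_and_insert() on the
+new child page.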
+@return inserted record */ +UNIV_INTERN +rec_t* +btr_root_raise_and_insert( +/*======================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + page_t* root; + page_t* new_page; + ulint new_page_no; + rec_t* rec; + mem_heap_t* heap; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t* page_cursor; + page_zip_des_t* root_page_zip; + page_zip_des_t* new_page_zip; + buf_block_t* root_block; + buf_block_t* new_block; + + root = btr_cur_get_page(cursor); + root_block = btr_cur_get_block(cursor); + root_page_zip = buf_block_get_page_zip(root_block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!root_page_zip || page_zip_validate(root_page_zip, root)); +#endif /* UNIV_ZIP_DEBUG */ + index = btr_cur_get_index(cursor); +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + ulint space = dict_index_get_space(index); + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } + + ut_a(dict_index_get_page(index) == page_get_page_no(root)); +#endif /* UNIV_BTR_DEBUG */ + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX)); + + /* Allocate a new page to the tree. Root splitting is done by first + moving the root records to the new page, emptying the root, putting + a node pointer to the new page, and then splitting the new page. */ + + level = btr_page_get_level(root, mtr); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + ut_a(!new_page_zip == !root_page_zip); + ut_a(!new_page_zip + || page_zip_get_size(new_page_zip) + == page_zip_get_size(root_page_zip)); + + btr_page_create(new_block, new_page_zip, index, level, mtr); + + /* Set the next node and previous node fields of new page */ + btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr); + btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr); + + /* Copy the records from root to the new page one by one. */ + + if (0 +#ifdef UNIV_ZIP_COPY + || new_page_zip +#endif /* UNIV_ZIP_COPY */ + || UNIV_UNLIKELY + (!page_copy_rec_list_end(new_block, root_block, + page_get_infimum_rec(root), + index, mtr))) { + ut_a(new_page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(new_page_zip, new_page, + root_page_zip, root, index, mtr); + + /* Update the lock table and possible hash index. 
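+page_zip_copy_recs() copied the page wholesale, bypassing the
+record-by-record copy that would otherwise have maintained these
+structures.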
*/ + + lock_move_rec_list_end(new_block, root_block, + page_get_infimum_rec(root)); + + btr_search_move_or_delete_hash_entries(new_block, root_block, + index); + } + + /* If this is a pessimistic insert which is actually done to + perform a pessimistic update then we have stored the lock + information of the record to be inserted on the infimum of the + root page: we cannot discard the lock structs on the root page */ + + lock_update_root_raise(new_block, root_block); + + /* Create a memory heap where the node pointer is stored */ + heap = mem_heap_create(100); + + rec = page_rec_get_next(page_get_infimum_rec(new_page)); + new_page_no = buf_block_get_page_no(new_block); + + /* Build the node pointer (= node key and page address) for the + child */ + + node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap, + level); + /* The node pointer must be marked as the predefined minimum record, + as there is no lower alphabetical limit to records in the leftmost + node of a level: */ + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + + /* Rebuild the root page to get free space */ + btr_page_empty(root_block, root_page_zip, index, level + 1, mtr); + + /* Set the next node and previous node fields, although + they should already have been set. The previous node field + must be FIL_NULL if root_page_zip != NULL, because the + REC_INFO_MIN_REC_FLAG (of the first user record) will be + set if and only if btr_page_get_prev() == FIL_NULL. */ + btr_page_set_next(root, root_page_zip, FIL_NULL, mtr); + btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Insert node pointer to the root */ + + page_cur_set_before_first(root_block, page_cursor); + + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + index, 0, mtr); + + /* The root page should only contain the node pointer + to new_page at this point. Thus, the data should fit. */ + ut_a(node_ptr_rec); + + /* Free the memory heap */ + mem_heap_free(heap); + + /* We play safe and reset the free bits for the new page */ + +#if 0 + fprintf(stderr, "Root raise new page no %lu\n", new_page_no); +#endif + + if (!dict_index_is_clust(index)) { + ibuf_reset_free_bits(new_block); + } + + /* Reposition the cursor to the child node */ + page_cur_search(new_block, index, tuple, + PAGE_CUR_LE, page_cursor); + + /* Split the child and insert tuple */ + return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr)); +} + +/*************************************************************//** +Decides if the page should be split at the convergence point of inserts +converging to the left. +@return TRUE if split recommended */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_left( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert */ + rec_t** split_rec) /*!< out: if split recommended, + the first record on upper half page, + or NULL if tuple to be inserted should + be first */ +{ + page_t* page; + rec_t* insert_point; + rec_t* infimum; + + page = btr_cur_get_page(cursor); + insert_point = btr_cur_get_rec(cursor); + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) + == page_rec_get_next(insert_point)) { + + infimum = page_get_infimum_rec(page); + + /* If the convergence is in the middle of a page, include also + the record immediately before the new insert to the upper + page. Otherwise, we could repeatedly move from page to page + lots of records smaller than the convergence point. 
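+For example (hypothetically), a stream of descending keys ..., 52, 51,
+50 converges on the same spot; keeping one record below the
+convergence point on the upper page saves those smaller records from
+being shuffled again at every later split.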
*/ + + if (infimum != insert_point + && page_rec_get_next(infimum) != insert_point) { + + *split_rec = insert_point; + } else { + *split_rec = page_rec_get_next(insert_point); + } + + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************//** +Decides if the page should be split at the convergence point of inserts +converging to the right. +@return TRUE if split recommended */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_right( +/*============================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert */ + rec_t** split_rec) /*!< out: if split recommended, + the first record on upper half page, + or NULL if tuple to be inserted should + be first */ +{ + page_t* page; + rec_t* insert_point; + + page = btr_cur_get_page(cursor); + insert_point = btr_cur_get_rec(cursor); + + /* We use eager heuristics: if the new insert would be right after + the previous insert on the same page, we assume that there is a + pattern of sequential inserts here. */ + + if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT) + == insert_point)) { + + rec_t* next_rec; + + next_rec = page_rec_get_next(insert_point); + + if (page_rec_is_supremum(next_rec)) { +split_at_new: + /* Split at the new record to insert */ + *split_rec = NULL; + } else { + rec_t* next_next_rec = page_rec_get_next(next_rec); + if (page_rec_is_supremum(next_next_rec)) { + + goto split_at_new; + } + + /* If there are >= 2 user records up from the insert + point, split all but 1 off. We want to keep one because + then sequential inserts can use the adaptive hash + index, as they can do the necessary checks of the right + search position just by looking at the records on this + page. */ + + *split_rec = next_next_rec; + } + + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************//** +Calculates a split record such that the tuple will certainly fit on +its half-page when the split is performed. We assume in this function +only that the cursor page has at least one user record. +@return split record, or NULL if tuple will be the first record on +the lower or upper half-page (determined by btr_page_tuple_smaller()) */ +static +rec_t* +btr_page_get_split_rec( +/*===================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert should be made */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + page_t* page; + page_zip_des_t* page_zip; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + ulint total_space; + ulint incl_data; + rec_t* ins_rec; + rec_t* rec; + rec_t* next_rec; + ulint n; + mem_heap_t* heap; + ulint* offsets; + + page = btr_cur_get_page(cursor); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + page_zip = btr_cur_get_page_zip(cursor); + if (UNIV_LIKELY_NULL(page_zip)) { + /* Estimate the free space of an empty compressed page. 
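+The uncompressed estimate above may overstate what a compressed page
+can actually hold, so the smaller of the two bounds is taken below.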
*/ + ulint free_space_zip = page_zip_empty_size( + cursor->index->n_fields, + page_zip_get_size(page_zip)); + + if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) { + free_space = (ulint) free_space_zip; + } + } + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = page_get_n_recs(page) + 1; + ut_ad(total_n_recs >= 2); + total_space = total_data + page_dir_calc_reserved_space(total_n_recs); + + n = 0; + incl_data = 0; + ins_rec = btr_cur_get_rec(cursor); + rec = page_get_infimum_rec(page); + + heap = NULL; + offsets = NULL; + + /* We start to include records to the left half, and when the + space reserved by them exceeds half of total_space, then if + the included records fit on the left page, they will be put there + if something was left over also for the right page, + otherwise the last included record will be the first on the right + half page */ + + do { + /* Decide the next record to include */ + if (rec == ins_rec) { + rec = NULL; /* NULL denotes that tuple is + now included */ + } else if (rec == NULL) { + rec = page_rec_get_next(ins_rec); + } else { + rec = page_rec_get_next(rec); + } + + if (rec == NULL) { + /* Include tuple */ + incl_data += insert_size; + } else { + offsets = rec_get_offsets(rec, cursor->index, + offsets, ULINT_UNDEFINED, + &heap); + incl_data += rec_offs_size(offsets); + } + + n++; + } while (incl_data + page_dir_calc_reserved_space(n) + < total_space / 2); + + if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { + /* The next record will be the first on + the right half page if it is not the + supremum record of page */ + + if (rec == ins_rec) { + rec = NULL; + + goto func_exit; + } else if (rec == NULL) { + next_rec = page_rec_get_next(ins_rec); + } else { + next_rec = page_rec_get_next(rec); + } + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(rec); +} + +/*************************************************************//** +Returns TRUE if the insert fits on the appropriate half-page with the +chosen split_rec. 
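+The check first assumes that no records leave the page; if that does
+not fit, it subtracts, one by one, the records that will end up on the
+other half-page, returning TRUE as soon as the remainder fits.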
+@return TRUE if fits */ +static +ibool +btr_page_insert_fits( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert + should be made */ + const rec_t* split_rec,/*!< in: suggestion for first record + on upper half-page, or NULL if + tuple to be inserted should be first */ + const ulint* offsets,/*!< in: rec_get_offsets( + split_rec, cursor->index) */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + page_t* page; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + const rec_t* rec; + const rec_t* end_rec; + ulint* offs; + + page = btr_cur_get_page(cursor); + + ut_ad(!split_rec == !offsets); + ut_ad(!offsets + || !page_is_comp(page) == !rec_offs_comp(offsets)); + ut_ad(!offsets + || rec_offs_validate(split_rec, cursor->index, offsets)); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = page_get_n_recs(page) + 1; + + /* We determine which records (from rec to end_rec, not including + end_rec) will end up on the other half page from tuple when it is + inserted. */ + + if (split_rec == NULL) { + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) { + + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = split_rec; + } else { + rec = split_rec; + end_rec = page_get_supremum_rec(page); + } + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(TRUE); + } + + offs = NULL; + + while (rec != end_rec) { + /* In this loop we calculate the amount of reserved + space after rec is removed from page. */ + + offs = rec_get_offsets(rec, cursor->index, offs, + ULINT_UNDEFINED, &heap); + + total_data -= rec_offs_size(offs); + total_n_recs--; + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(TRUE); + } + + rec = page_rec_get_next_const(rec); + } + + return(FALSE); +} + +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. 
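+Node pointer records are neither row-locked nor undo-logged, hence the
+BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG | BTR_NO_UNDO_LOG_FLAG flags in
+the body; the tree x-latch alone protects the modification.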
*/ +UNIV_INTERN +void +btr_insert_on_non_leaf_level_func( +/*==============================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + big_rec_t* dummy_big_rec; + btr_cur_t cursor; + ulint err; + rec_t* rec; + + ut_ad(level > 0); + + btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE, + BTR_CONT_MODIFY_TREE, + &cursor, 0, file, line, mtr); + + err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + ut_a(err == DB_SUCCESS); +} + +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. */ +static +void +btr_attach_half_pages( +/*==================*/ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + ulint prev_page_no; + ulint next_page_no; + ulint level; + page_t* page = buf_block_get_frame(block); + page_t* lower_page; + page_t* upper_page; + ulint lower_page_no; + ulint upper_page_no; + page_zip_des_t* lower_page_zip; + page_zip_des_t* upper_page_zip; + dtuple_t* node_ptr_upper; + mem_heap_t* heap; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + /* Based on split direction, decide upper and lower pages */ + if (direction == FSP_DOWN) { + + btr_cur_t cursor; + ulint* offsets; + + lower_page = buf_block_get_frame(new_block); + lower_page_no = buf_block_get_page_no(new_block); + lower_page_zip = buf_block_get_page_zip(new_block); + upper_page = buf_block_get_frame(block); + upper_page_no = buf_block_get_page_no(block); + upper_page_zip = buf_block_get_page_zip(block); + + /* Look up the index for the node pointer to page */ + offsets = btr_page_get_father_block(NULL, heap, index, + block, mtr, &cursor); + + /* Replace the address of the old child node (= page) with the + address of the new lower half */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_rec(&cursor), + btr_cur_get_page_zip(&cursor), + offsets, lower_page_no, mtr); + mem_heap_empty(heap); + } else { + lower_page = buf_block_get_frame(block); + lower_page_no = buf_block_get_page_no(block); + lower_page_zip = buf_block_get_page_zip(block); + upper_page = buf_block_get_frame(new_block); + upper_page_no = buf_block_get_page_no(new_block); + upper_page_zip = buf_block_get_page_zip(new_block); + } + + /* Get the level of the split pages */ + level = btr_page_get_level(buf_block_get_frame(block), mtr); + ut_ad(level + == btr_page_get_level(buf_block_get_frame(new_block), mtr)); + + /* Build the node pointer (= node key and page address) for the upper + half */ + + node_ptr_upper = dict_index_build_node_ptr(index, split_rec, + upper_page_no, heap, level); + + /* Insert it next to the pointer to the lower half. Note that this + may generate recursion leading to a split on the higher level. 
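+The insert is pessimistic, so a full parent page is split in turn,
+possibly cascading all the way up to a raise of the root.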
*/ + + btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr); + + /* Free the memory heap */ + mem_heap_free(heap); + + /* Get the previous and next pages of page */ + + prev_page_no = btr_page_get_prev(page, mtr); + next_page_no = btr_page_get_next(page, mtr); + space = buf_block_get_space(block); + zip_size = buf_block_get_zip_size(block); + + /* Update page links of the level */ + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block = btr_block_get(space, zip_size, + prev_page_no, + RW_X_LATCH, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_block->frame) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_block->frame, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_next(buf_block_get_frame(prev_block), + buf_block_get_page_zip(prev_block), + lower_page_no, mtr); + } + + if (next_page_no != FIL_NULL) { + buf_block_t* next_block = btr_block_get(space, zip_size, + next_page_no, + RW_X_LATCH, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_block->frame) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_prev(buf_block_get_frame(next_block), + buf_block_get_page_zip(next_block), + upper_page_no, mtr); + } + + btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr); + btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr); + + btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr); + btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr); +} + +/*************************************************************//** +Determine if a tuple is smaller than any record on the page. +@return TRUE if smaller */ +static +ibool +btr_page_tuple_smaller( +/*===================*/ + btr_cur_t* cursor, /*!< in: b-tree cursor */ + const dtuple_t* tuple, /*!< in: tuple to consider */ + ulint* offsets,/*!< in/out: temporary storage */ + ulint n_uniq, /*!< in: number of unique fields + in the index page records */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + buf_block_t* block; + const rec_t* first_rec; + page_cur_t pcur; + + /* Read the first user record in the page. */ + block = btr_cur_get_block(cursor); + page_cur_set_before_first(block, &pcur); + page_cur_move_to_next(&pcur); + first_rec = page_cur_get_rec(&pcur); + + offsets = rec_get_offsets( + first_rec, cursor->index, offsets, + n_uniq, heap); + + return(cmp_dtuple_rec(tuple, first_rec, offsets) < 0); +} + +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. 
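+
+In outline, matching the numbered comments in the body: 1. choose the
+split record; 2. allocate the new page; 3. determine which records
+move; 4. attach the two halves in the parent; 5. move the records;
+then insert the tuple, splitting again (n_iterations > 0) if it still
+does not fit.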
+ +@return inserted record */ +UNIV_INTERN +rec_t* +btr_page_split_and_insert( +/*======================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + ulint page_no; + byte direction; + ulint hint_page_no; + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + buf_block_t* left_block; + buf_block_t* right_block; + buf_block_t* insert_block; + page_t* insert_page; + page_cur_t* page_cursor; + rec_t* first_rec; + byte* buf = 0; /* remove warning */ + rec_t* move_limit; + ibool insert_will_fit; + ibool insert_left; + ulint n_iterations = 0; + rec_t* rec; + mem_heap_t* heap; + ulint n_uniq; + ulint* offsets; + + heap = mem_heap_create(1024); + n_uniq = dict_index_get_n_unique_in_tree(cursor->index); +func_start: + mem_heap_empty(heap); + offsets = NULL; + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_get_n_recs(page) >= 1); + + page_no = buf_block_get_page_no(block); + + /* 1. Decide the split record; split_rec == NULL means that the + tuple to be inserted should be the first record on the upper + half-page */ + insert_left = FALSE; + + if (n_iterations > 0) { + direction = FSP_UP; + hint_page_no = page_no + 1; + split_rec = btr_page_get_split_rec(cursor, tuple, n_ext); + + if (UNIV_UNLIKELY(split_rec == NULL)) { + insert_left = btr_page_tuple_smaller( + cursor, tuple, offsets, n_uniq, &heap); + } + } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { + direction = FSP_UP; + hint_page_no = page_no + 1; + + } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) { + direction = FSP_DOWN; + hint_page_no = page_no - 1; + ut_ad(split_rec); + } else { + direction = FSP_UP; + hint_page_no = page_no + 1; + + /* If there is only one record in the index page, we + can't split the node in the middle by default. We need + to determine whether the new record will be inserted + to the left or right. */ + + if (page_get_n_recs(page) > 1) { + split_rec = page_get_middle_rec(page); + } else if (btr_page_tuple_smaller(cursor, tuple, + offsets, n_uniq, &heap)) { + split_rec = page_rec_get_next( + page_get_infimum_rec(page)); + } else { + split_rec = NULL; + } + } + + /* 2. Allocate a new page to the index */ + new_block = btr_page_alloc(cursor->index, hint_page_no, direction, + btr_page_get_level(page, mtr), mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + btr_page_create(new_block, new_page_zip, cursor->index, + btr_page_get_level(page, mtr), mtr); + + /* 3. 
Calculate the first record on the upper half-page, and the + first record (move_limit) on original page which ends up on the + upper half */ + + if (split_rec) { + first_rec = move_limit = split_rec; + + offsets = rec_get_offsets(split_rec, cursor->index, offsets, + n_uniq, &heap); + + insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0; + + if (UNIV_UNLIKELY(!insert_left && new_page_zip + && n_iterations > 0)) { + /* If a compressed page has already been split, + avoid further splits by inserting the record + to an empty page. */ + split_rec = NULL; + goto insert_empty; + } + } else if (UNIV_UNLIKELY(insert_left)) { + ut_a(n_iterations > 0); + first_rec = page_rec_get_next(page_get_infimum_rec(page)); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } else { +insert_empty: + ut_ad(!split_rec); + ut_ad(!insert_left); + buf = mem_alloc(rec_get_converted_size(cursor->index, + tuple, n_ext)); + + first_rec = rec_convert_dtuple_to_rec(buf, cursor->index, + tuple, n_ext); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } + + /* 4. Do first the modifications in the tree structure */ + + btr_attach_half_pages(cursor->index, block, + first_rec, new_block, direction, mtr); + + /* If the split is made on the leaf level and the insert will fit + on the appropriate half-page, we may release the tree x-latch. + We can then move the records after releasing the tree latch, + thus reducing the tree latch contention. */ + + if (split_rec) { + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, split_rec, + offsets, tuple, n_ext, heap); + } else { + if (!insert_left) { + mem_free(buf); + buf = NULL; + } + + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, NULL, + NULL, tuple, n_ext, heap); + } + + if (insert_will_fit && page_is_leaf(page)) { + + mtr_memo_release(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + } + + /* 5. Move then the records to the new page */ + if (direction == FSP_DOWN) { + /* fputs("Split left\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || UNIV_UNLIKELY + (!page_move_rec_list_start(new_block, block, move_limit, + cursor->index, mtr))) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_end(move_limit - page + new_page, + new_block, cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); + + btr_search_move_or_delete_hash_entries( + new_block, block, cursor->index); + + /* Delete the records from the source page. */ + + page_delete_rec_list_start(move_limit, block, + cursor->index, mtr); + } + + left_block = new_block; + right_block = block; + + lock_update_split_left(right_block, left_block); + } else { + /* fputs("Split right\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || UNIV_UNLIKELY + (!page_move_rec_list_end(new_block, block, move_limit, + cursor->index, mtr))) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. 
Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_start(move_limit - page + + new_page, new_block, + cursor->index, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_end(new_block, block, move_limit); + + btr_search_move_or_delete_hash_entries( + new_block, block, cursor->index); + + /* Delete the records from the source page. */ + + page_delete_rec_list_end(move_limit, block, + cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + } + + left_block = block; + right_block = new_block; + + lock_update_split_right(right_block, left_block); + } + +#ifdef UNIV_ZIP_DEBUG + if (UNIV_LIKELY_NULL(page_zip)) { + ut_a(page_zip_validate(page_zip, page)); + ut_a(page_zip_validate(new_page_zip, new_page)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* At this point, split_rec, move_limit and first_rec may point + to garbage on the old page. */ + + /* 6. The split and the tree modification is now completed. Decide the + page where the tuple should be inserted */ + + if (insert_left) { + insert_block = left_block; + } else { + insert_block = right_block; + } + + insert_page = buf_block_get_frame(insert_block); + + /* 7. Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_search(insert_block, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, n_ext, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* insert_page_zip + = buf_block_get_page_zip(insert_block); + ut_a(!insert_page_zip + || page_zip_validate(insert_page_zip, insert_page)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (UNIV_LIKELY(rec != NULL)) { + + goto func_exit; + } + + /* 8. If insert did not fit, try page reorganization */ + + if (UNIV_UNLIKELY + (!btr_page_reorganize(insert_block, cursor->index, mtr))) { + + goto insert_failed; + } + + page_cur_search(insert_block, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + n_ext, mtr); + + if (UNIV_UNLIKELY(rec == NULL)) { + /* The insert did not fit on the page: loop back to the + start of the function for a new split */ +insert_failed: + /* We play safe and reset the free bits for new_page */ + if (!dict_index_is_clust(cursor->index)) { + ibuf_reset_free_bits(new_block); + } + + /* fprintf(stderr, "Split second round %lu\n", + page_get_page_no(page)); */ + n_iterations++; + ut_ad(n_iterations < 2 + || buf_block_get_page_zip(insert_block)); + ut_ad(!insert_will_fit); + + goto func_start; + } + +func_exit: + /* Insert fit on the page: update the free bits for the + left and right pages in the same mtr */ + + if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) { + ibuf_update_free_bits_for_two_pages_low( + buf_block_get_zip_size(left_block), + left_block, right_block, mtr); + } + +#if 0 + fprintf(stderr, "Split and insert done %lu %lu\n", + buf_block_get_page_no(left_block), + buf_block_get_page_no(right_block)); +#endif + + ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index)); + ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index)); + + mem_heap_free(heap); + return(rec); +} + +/*************************************************************//** +Removes a page from the level list of pages. 
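+
+In outline, on the doubly-linked list of pages that forms one tree
+level:
+
+	before:  ... <=> prev <=> page <=> next <=> ...
+	after:   ... <=> prev <=> next <=> ...
+
+Either neighbour may be absent (FIL_NULL), in which case the
+corresponding link update is skipped. The page itself is not freed
+here; that is left to the caller.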
*/ +static +void +btr_level_list_remove( +/*==================*/ + ulint space, /*!< in: space where removed */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* page, /*!< in: page to remove */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint prev_page_no; + ulint next_page_no; + + ut_ad(page && mtr); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + ut_ad(space == page_get_space_id(page)); + /* Get the previous and next page numbers of page */ + + prev_page_no = btr_page_get_prev(page, mtr); + next_page_no = btr_page_get_next(page, mtr); + + /* Update page links of the level */ + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block + = btr_block_get(space, zip_size, prev_page_no, + RW_X_LATCH, mtr); + page_t* prev_page + = buf_block_get_frame(prev_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_page) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_next(prev_page, + buf_block_get_page_zip(prev_block), + next_page_no, mtr); + } + + if (next_page_no != FIL_NULL) { + buf_block_t* next_block + = btr_block_get(space, zip_size, next_page_no, + RW_X_LATCH, mtr); + page_t* next_page + = buf_block_get_frame(next_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_prev(next_page, + buf_block_get_page_zip(next_block), + prev_page_no, mtr); + } +} + +/****************************************************************//** +Writes the redo log record for setting an index record as the predefined +minimum record. */ +UNIV_INLINE +void +btr_set_min_rec_mark_log( +/*=====================*/ + rec_t* rec, /*!< in: record */ + byte type, /*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(rec, type, mtr); + + /* Write rec offset as a 2-byte ulint */ + mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES); +} +#else /* !UNIV_HOTBACKUP */ +# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/****************************************************************//** +Parses the redo log record for setting an index record as the predefined +minimum record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_parse_set_min_rec_mark( +/*=======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint comp, /*!< in: nonzero=compact page format */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + rec_t* rec; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + if (page) { + ut_a(!page_is_comp(page) == !comp); + + rec = page + mach_read_from_2(ptr); + + btr_set_min_rec_mark(rec, mtr); + } + + return(ptr + 2); +} + +/****************************************************************//** +Sets a record as the predefined minimum record. 
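+The flag makes the record act as "minus infinity" in comparisons; it
+is kept on the first user record of the leftmost page of each
+non-leaf level (see the assertion in btr_validate_level()). A reader
+can test it with, for example:
+
+	if (rec_get_info_bits(rec, page_rec_is_comp(rec))
+	    & REC_INFO_MIN_REC_FLAG) {
+		... rec is the predefined minimum record ...
+	}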
*/ +UNIV_INTERN +void +btr_set_min_rec_mark( +/*=================*/ + rec_t* rec, /*!< in: record */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint info_bits; + + if (UNIV_LIKELY(page_rec_is_comp(rec))) { + info_bits = rec_get_info_bits(rec, TRUE); + + rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG); + + btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr); + } else { + info_bits = rec_get_info_bits(rec, FALSE); + + rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG); + + btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr); + } +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Deletes on the upper level the node pointer to a page. */ +UNIV_INTERN +void +btr_node_ptr_delete( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page whose node pointer is deleted */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_cur_t cursor; + ibool compressed; + ulint err; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + /* Delete node pointer on father page */ + btr_page_get_father(index, block, mtr, &cursor); + + compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE, + mtr); + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&cursor, mtr); + } +} + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. */ +static +void +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* father_block; + page_t* father_page; + ulint page_level; + page_zip_des_t* father_page_zip; + page_t* page = buf_block_get_frame(block); + ulint root_page_no; + buf_block_t* blocks[BTR_MAX_LEVELS]; + ulint n_blocks; /*!< last used index in blocks[] */ + ulint i; + + ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); + ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + page_level = btr_page_get_level(page, mtr); + root_page_no = dict_index_get_page(index); + + { + btr_cur_t cursor; + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets; + buf_block_t* b; + + offsets = btr_page_get_father_block(NULL, heap, index, + block, mtr, &cursor); + father_block = btr_cur_get_block(&cursor); + father_page_zip = buf_block_get_page_zip(father_block); + father_page = buf_block_get_frame(father_block); + + n_blocks = 0; + + /* Store all ancestor pages so we can reset their + levels later on. We have to do all the searches on + the tree now because later on, after we've replaced + the first level, the tree is in an inconsistent state + and can not be searched. */ + for (b = father_block; + buf_block_get_page_no(b) != root_page_no; ) { + ut_a(n_blocks < BTR_MAX_LEVELS); + + offsets = btr_page_get_father_block(offsets, heap, + index, b, + mtr, &cursor); + + blocks[n_blocks++] = b = btr_cur_get_block(&cursor); + } + + mem_heap_free(heap); + } + + btr_search_drop_page_hash_index(block); + + /* Make the father empty */ + btr_page_empty(father_block, father_page_zip, index, page_level, mtr); + + /* Copy the records to the father page one by one. 
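+	(Sketch of the net effect: the father keeps its own page
+	number, so the node pointer pointing to the father stays
+	valid; only the levels stored in the ancestor pages are
+	decremented, further down in this function.)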
*/ + if (0 +#ifdef UNIV_ZIP_COPY + || father_page_zip +#endif /* UNIV_ZIP_COPY */ + || UNIV_UNLIKELY + (!page_copy_rec_list_end(father_block, block, + page_get_infimum_rec(page), + index, mtr))) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(father_page_zip); + ut_a(page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(father_page_zip, father_page, + page_zip, page, index, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_end(father_block, block, + page_get_infimum_rec(page)); + + btr_search_move_or_delete_hash_entries(father_block, block, + index); + } + + lock_update_copy_and_discard(father_block, block); + + /* Go upward to root page, decrementing levels by one. */ + for (i = 0; i < n_blocks; i++, page_level++) { + page_t* page = buf_block_get_frame(blocks[i]); + page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]); + + ut_ad(btr_page_get_level(page, mtr) == page_level + 1); + + btr_page_set_level(page, page_zip, page_level, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* We play it safe and reset the free bits for the father */ + if (!dict_index_is_clust(index)) { + ibuf_reset_free_bits(father_block); + } + ut_ad(page_validate(father_page, index)); + ut_ad(btr_check_node_ptr(index, father_block, mtr)); +} + +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the brother +reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to the +brothers, if they exist. 
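+
+The choice of merge target, in outline (mirroring the code below):
+
+	if (left brother exists)        merge into the left brother;
+	else if (right brother exists)  merge into the right brother;
+	else                            btr_lift_page_up();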
+@return TRUE on success */ +UNIV_INTERN +ibool +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift; + the page must not be empty: in record delete + use btr_discard_page if the page would become + empty */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + ulint space; + ulint zip_size; + ulint left_page_no; + ulint right_page_no; + buf_block_t* merge_block; + page_t* merge_page; + page_zip_des_t* merge_page_zip; + ibool is_left; + buf_block_t* block; + page_t* page; + btr_cur_t father_cursor; + mem_heap_t* heap; + ulint* offsets; + ulint data_size; + ulint n_recs; + ulint max_ins_size; + ulint max_ins_size_reorg; + ulint level; + + block = btr_cur_get_block(cursor); + page = btr_cur_get_page(cursor); + index = btr_cur_get_index(cursor); + ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table)); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + level = btr_page_get_level(page, mtr); + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + left_page_no = btr_page_get_prev(page, mtr); + right_page_no = btr_page_get_next(page, mtr); + +#if 0 + fprintf(stderr, "Merge left page %lu right %lu \n", + left_page_no, right_page_no); +#endif + + heap = mem_heap_create(100); + offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, + &father_cursor); + + /* Decide the page to which we try to merge and which will inherit + the locks */ + + is_left = left_page_no != FIL_NULL; + + if (is_left) { + + merge_block = btr_block_get(space, zip_size, left_page_no, + RW_X_LATCH, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else if (right_page_no != FIL_NULL) { + + merge_block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else { + /* The page is the only one on the level, lift the records + to the father */ + btr_lift_page_up(index, block, mtr); + mem_heap_free(heap); + return(TRUE); + } + + n_recs = page_get_n_recs(page); + data_size = page_get_data_size(page); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(merge_page) == page_is_comp(page)); +#endif /* UNIV_BTR_DEBUG */ + + max_ins_size_reorg = page_get_max_insert_size_after_reorganize( + merge_page, n_recs); + if (data_size > max_ins_size_reorg) { + + /* No space for merge */ +err_exit: + /* We play it safe and reset the free bits. 
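+	Resetting can only lower the bits, which is always safe; see
+	the longer note on insert buffer bitmap ordering near the end
+	of this function.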
*/ + if (zip_size + && page_is_leaf(merge_page) + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(merge_block); + } + + mem_heap_free(heap); + return(FALSE); + } + + ut_ad(page_validate(merge_page, index)); + + max_ins_size = page_get_max_insert_size(merge_page, n_recs); + + if (UNIV_UNLIKELY(data_size > max_ins_size)) { + + /* We have to reorganize merge_page */ + + if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block, + index, mtr))) { + + goto err_exit; + } + + max_ins_size = page_get_max_insert_size(merge_page, n_recs); + + ut_ad(page_validate(merge_page, index)); + ut_ad(max_ins_size == max_ins_size_reorg); + + if (UNIV_UNLIKELY(data_size > max_ins_size)) { + + /* Add fault tolerance, though this should + never happen */ + + goto err_exit; + } + } + + merge_page_zip = buf_block_get_page_zip(merge_block); +#ifdef UNIV_ZIP_DEBUG + if (UNIV_LIKELY_NULL(merge_page_zip)) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(page_zip); + ut_a(page_zip_validate(merge_page_zip, merge_page)); + ut_a(page_zip_validate(page_zip, page)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* Move records to the merge page */ + if (is_left) { + rec_t* orig_pred = page_copy_rec_list_start( + merge_block, block, page_get_supremum_rec(page), + index, mtr); + + if (UNIV_UNLIKELY(!orig_pred)) { + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, mtr); + + btr_node_ptr_delete(index, block, mtr); + lock_update_merge_left(merge_block, orig_pred, block); + } else { + rec_t* orig_succ; +#ifdef UNIV_BTR_DEBUG + byte fil_page_prev[4]; +#endif /* UNIV_BTR_DEBUG */ + + if (UNIV_LIKELY_NULL(merge_page_zip)) { + /* The function page_zip_compress(), which will be + invoked by page_copy_rec_list_end() below, + requires that FIL_PAGE_PREV be FIL_NULL. + Clear the field, but prepare to restore it. */ +#ifdef UNIV_BTR_DEBUG + memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4); +#endif /* UNIV_BTR_DEBUG */ +#if FIL_NULL != 0xffffffff +# error "FIL_NULL != 0xffffffff" +#endif + memset(merge_page + FIL_PAGE_PREV, 0xff, 4); + } + + orig_succ = page_copy_rec_list_end(merge_block, block, + page_get_infimum_rec(page), + cursor->index, mtr); + + if (UNIV_UNLIKELY(!orig_succ)) { + ut_a(merge_page_zip); +#ifdef UNIV_BTR_DEBUG + /* FIL_PAGE_PREV was restored from merge_page_zip. */ + ut_a(!memcmp(fil_page_prev, + merge_page + FIL_PAGE_PREV, 4)); +#endif /* UNIV_BTR_DEBUG */ + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + +#ifdef UNIV_BTR_DEBUG + if (UNIV_LIKELY_NULL(merge_page_zip)) { + /* Restore FIL_PAGE_PREV in order to avoid an assertion + failure in btr_level_list_remove(), which will set + the field again to FIL_NULL. Even though this makes + merge_page and merge_page_zip inconsistent for a + split second, it is harmless, because the pages + are X-latched. 
*/ + memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); + } +#endif /* UNIV_BTR_DEBUG */ + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, mtr); + + /* Replace the address of the old child node (= page) with the + address of the merge page to the right */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_rec(&father_cursor), + btr_cur_get_page_zip(&father_cursor), + offsets, right_page_no, mtr); + btr_node_ptr_delete(index, merge_block, mtr); + + lock_update_merge_right(merge_block, orig_succ, block); + } + + mem_heap_free(heap); + + if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (zip_size) { + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); + } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full(merge_block, + UNIV_PAGE_SIZE, + ULINT_UNDEFINED); + } + } + + ut_ad(page_validate(merge_page, index)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page)); +#endif /* UNIV_ZIP_DEBUG */ + + /* Free the file page */ + btr_page_free(index, block, mtr); + + ut_ad(btr_check_node_ptr(index, merge_block, mtr)); + return(TRUE); +} + +/*************************************************************//** +Discards a page that is the only page on its level. This will empty +the whole B-tree, leaving just an empty root page. This function +should never be reached, because btr_compress(), which is invoked in +delete operations, calls btr_lift_page_up() to flatten the B-tree. */ +static +void +btr_discard_only_page_on_level( +/*===========================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_level = 0; + trx_id_t max_trx_id; + + /* Save the PAGE_MAX_TRX_ID from the leaf page. 
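+	The value is written back at the end of this function if the
+	root ends up as an empty leaf page of a secondary index, so
+	that PAGE_MAX_TRX_ID stays valid on leaf pages.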
*/ + max_trx_id = page_get_max_trx_id(buf_block_get_frame(block)); + + while (buf_block_get_page_no(block) != dict_index_get_page(index)) { + btr_cur_t cursor; + buf_block_t* father; + const page_t* page = buf_block_get_frame(block); + + ut_a(page_get_n_recs(page) == 1); + ut_a(page_level == btr_page_get_level(page, mtr)); + ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); + ut_a(btr_page_get_next(page, mtr) == FIL_NULL); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + btr_search_drop_page_hash_index(block); + + btr_page_get_father(index, block, mtr, &cursor); + father = btr_cur_get_block(&cursor); + + lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block); + + /* Free the file page */ + btr_page_free(index, block, mtr); + + block = father; + page_level++; + } + + /* block is the root page, which must be empty, except + for the node pointer to the (now discarded) block(s). */ + +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + const ulint space = dict_index_get_space(index); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } +#endif /* UNIV_BTR_DEBUG */ + + btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); + + if (!dict_index_is_clust(index)) { + /* We play it safe and reset the free bits for the root */ + ibuf_reset_free_bits(block); + + if (page_is_leaf(buf_block_get_frame(block))) { + ut_a(!ut_dulint_is_zero(max_trx_id)); + page_set_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, mtr); + } + } +} + +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. 
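+
+Lock inheritance, in outline: the explicit locks on the records of
+the discarded page are moved to a single point on a surviving
+neighbour (compare the lock_update_discard() calls below):
+
+	left brother exists -> inherited by the supremum of the left
+			       brother;
+	otherwise	    -> inherited by the leftmost record of the
+			       right brother.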
*/ +UNIV_INTERN +void +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + ulint space; + ulint zip_size; + ulint left_page_no; + ulint right_page_no; + buf_block_t* merge_block; + page_t* merge_page; + buf_block_t* block; + page_t* page; + rec_t* node_ptr; + + block = btr_cur_get_block(cursor); + index = btr_cur_get_index(cursor); + + ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block)); + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + /* Decide the page which will inherit the locks */ + + left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr); + right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr); + + if (left_page_no != FIL_NULL) { + merge_block = btr_block_get(space, zip_size, left_page_no, + RW_X_LATCH, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else if (right_page_no != FIL_NULL) { + merge_block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else { + btr_discard_only_page_on_level(index, block, mtr); + + return; + } + + page = buf_block_get_frame(block); + ut_a(page_is_comp(merge_page) == page_is_comp(page)); + btr_search_drop_page_hash_index(block); + + if (left_page_no == FIL_NULL && !page_is_leaf(page)) { + + /* We have to mark the leftmost node pointer on the right + side page as the predefined minimum record */ + node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page)); + + ut_ad(page_rec_is_user_rec(node_ptr)); + + /* This will make page_zip_validate() fail on merge_page + until btr_level_list_remove() completes. This is harmless, + because everything will take place within a single + mini-transaction and because writing to the redo log + is an atomic operation (performed by mtr_commit()). */ + btr_set_min_rec_mark(node_ptr, mtr); + } + + btr_node_ptr_delete(index, block, mtr); + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, mtr); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block); + ut_a(!merge_page_zip + || page_zip_validate(merge_page_zip, merge_page)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (left_page_no != FIL_NULL) { + lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, + block); + } else { + lock_update_discard(merge_block, + lock_get_min_heap_no(merge_block), + block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + ut_ad(btr_check_node_ptr(index, merge_block, mtr)); +} + +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. 
*/ +UNIV_INTERN +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ +{ + page_t* root; + fseg_header_t* seg; + mtr_t mtr; + + if (dict_index_is_ibuf(index)) { + fputs("Sorry, cannot print info of an ibuf tree:" + " use ibuf functions\n", stderr); + + return; + } + + mtr_start(&mtr); + + root = btr_root_get(index, &mtr); + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + + if (!(index->type & DICT_UNIVERSAL)) { + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + } + + mtr_commit(&mtr); +} + +/************************************************************//** +Prints recursively index tree pages. */ +static +void +btr_print_recursive( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + ulint width, /*!< in: print this many entries from start + and end */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + ulint** offsets,/*!< in/out: buffer for rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + page_cur_t cursor; + ulint n_recs; + ulint i = 0; + mtr_t mtr2; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n", + (ulong) btr_page_get_level(page, mtr), + (ulong) buf_block_get_page_no(block)); + + page_print(block, index, width, width); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + while (!page_cur_is_after_last(&cursor)) { + + if (page_is_leaf(page)) { + + /* If this is the leaf level, do nothing */ + + } else if ((i <= width) || (i >= n_recs - width)) { + + const rec_t* node_ptr; + + mtr_start(&mtr2); + + node_ptr = page_cur_get_rec(&cursor); + + *offsets = rec_get_offsets(node_ptr, index, *offsets, + ULINT_UNDEFINED, heap); + btr_print_recursive(index, + btr_node_ptr_get_child(node_ptr, + index, + *offsets, + &mtr2), + width, heap, offsets, &mtr2); + mtr_commit(&mtr2); + } + + page_cur_move_to_next(&cursor); + i++; + } +} + +/**************************************************************//** +Prints directories and other info of all nodes in the tree. */ +UNIV_INTERN +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ +{ + mtr_t mtr; + buf_block_t* root; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + fputs("--------------------------\n" + "INDEX TREE PRINT\n", stderr); + + mtr_start(&mtr); + + root = btr_root_block_get(index, &mtr); + + btr_print_recursive(index, root, width, &heap, &offsets, &mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + btr_validate_index(index, NULL); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. 
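+For a non-leaf, non-root page the check amounts to the following
+sketch (the rec names are shorthand, not variables of the code
+below):
+
+	tuple = dict_index_build_node_ptr(index, first_rec_on_page, 0,
+					  heap, level);
+	ut_a(!cmp_dtuple_rec(tuple, node_ptr_rec_in_father, offsets));
+
+The child page number field does not take part in the comparison.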
+@return TRUE */ +UNIV_INTERN +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* tuple; + ulint* offsets; + btr_cur_t cursor; + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + if (dict_index_get_page(index) == buf_block_get_page_no(block)) { + + return(TRUE); + } + + heap = mem_heap_create(256); + offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, + &cursor); + + if (page_is_leaf(page)) { + + goto func_exit; + } + + tuple = dict_index_build_node_ptr( + index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, + btr_page_get_level(page, mtr)); + + ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets)); +func_exit: + mem_heap_free(heap); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +Display identification information for a record. */ +static +void +btr_index_rec_validate_report( +/*==========================*/ + const page_t* page, /*!< in: index page */ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index) /*!< in: index */ +{ + fputs("InnoDB: Record in ", stderr); + dict_index_name_print(stderr, NULL, index); + fprintf(stderr, ", page %lu, at offset %lu\n", + page_get_page_no(page), (ulint) page_offset(rec)); +} + +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. +@return TRUE if ok */ +UNIV_INTERN +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ +{ + ulint len; + ulint n; + ulint i; + const page_t* page; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page = page_align(rec); + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { + /* The insert buffer index tree can contain records from any + other index: we cannot check the number of fields or + their length */ + + return(TRUE); + } + + if (UNIV_UNLIKELY((ibool)!!page_is_comp(page) + != dict_table_is_comp(index->table))) { + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n", + (ulong) !!page_is_comp(page), + (ulong) dict_table_is_comp(index->table)); + + return(FALSE); + } + + n = dict_index_get_n_fields(index); + + if (!page_is_comp(page) + && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) { + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n", + (ulong) rec_get_n_fields_old(rec), (ulong) n); + + if (dump_on_error) { + buf_page_print(page, 0); + + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); + } + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + for (i = 0; i < n; i++) { + ulint fixed_size = dict_col_get_fixed_size( + dict_index_get_nth_col(index, i), page_is_comp(page)); + + rec_get_nth_field_offs(offsets, i, &len); + + /* Note that if fixed_size != 0, it equals the + length of a fixed-size column in the clustered index. + A prefix index of the column is of fixed, but different + length. 
When fixed_size == 0, prefix_len is the maximum + length of the prefix index column. */ + + if ((dict_index_get_nth_field(index, i)->prefix_len == 0 + && len != UNIV_SQL_NULL && fixed_size + && len != fixed_size) + || (dict_index_get_nth_field(index, i)->prefix_len > 0 + && len != UNIV_SQL_NULL + && len + > dict_index_get_nth_field(index, i)->prefix_len)) { + + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, + "InnoDB: field %lu len is %lu," + " should be %lu\n", + (ulong) i, (ulong) len, (ulong) fixed_size); + + if (dump_on_error) { + buf_page_print(page, 0); + + fputs("InnoDB: corrupt record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(FALSE); + } + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(TRUE); +} + +/************************************************************//** +Checks the size and number of fields in records based on the definition of +the index. +@return TRUE if ok */ +static +ibool +btr_index_page_validate( +/*====================*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index) /*!< in: index */ +{ + page_cur_t cur; + ibool ret = TRUE; + + page_cur_set_before_first(block, &cur); + page_cur_move_to_next(&cur); + + for (;;) { + if (page_cur_is_after_last(&cur)) { + + break; + } + + if (!btr_index_rec_validate(cur.rec, index, TRUE)) { + + return(FALSE); + } + + page_cur_move_to_next(&cur); + } + + return(ret); +} + +/************************************************************//** +Report an error on one page of an index tree. */ +static +void +btr_validate_report1( +/*=================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block) /*!< in: index page */ +{ + fprintf(stderr, "InnoDB: Error in page %lu of ", + buf_block_get_page_no(block)); + dict_index_name_print(stderr, NULL, index); + if (level) { + fprintf(stderr, ", index tree level %lu", level); + } + putc('\n', stderr); +} + +/************************************************************//** +Report an error on two pages of an index tree. */ +static +void +btr_validate_report2( +/*=================*/ + const dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block1, /*!< in: first index page */ + const buf_block_t* block2) /*!< in: second index page */ +{ + fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ", + buf_block_get_page_no(block1), + buf_block_get_page_no(block2)); + dict_index_name_print(stderr, NULL, index); + if (level) { + fprintf(stderr, ", index tree level %lu", level); + } + putc('\n', stderr); +} + +/************************************************************//** +Validates index tree level. 
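+
+For each page on the level, roughly the following is verified:
+
+	- page_validate(): record ordering and structure within the
+	  page;
+	- on level 0, btr_index_page_validate(): field counts and
+	  lengths against the index definition;
+	- the FIL_PAGE_PREV/FIL_PAGE_NEXT links agree with the
+	  neighbouring pages, and the last record of a page sorts
+	  below the first record of its right neighbour;
+	- the node pointer in the father points back to the page and,
+	  on non-leaf levels, equals the node pointer built from the
+	  first record of the page.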
+@return TRUE if ok */ +static +ibool +btr_validate_level( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + trx_t* trx, /*!< in: transaction or NULL */ + ulint level) /*!< in: level number */ +{ + ulint space; + ulint zip_size; + buf_block_t* block; + page_t* page; + buf_block_t* right_block = 0; /* remove warning */ + page_t* right_page = 0; /* remove warning */ + page_t* father_page; + btr_cur_t node_cur; + btr_cur_t right_node_cur; + rec_t* rec; + ulint right_page_no; + ulint left_page_no; + page_cur_t cursor; + dtuple_t* node_ptr_tuple; + ibool ret = TRUE; + mtr_t mtr; + mem_heap_t* heap = mem_heap_create(256); + ulint* offsets = NULL; + ulint* offsets2= NULL; +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip; +#endif /* UNIV_ZIP_DEBUG */ + + mtr_start(&mtr); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + block = btr_root_block_get(index, &mtr); + page = buf_block_get_frame(block); + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + while (level != btr_page_get_level(page, &mtr)) { + const rec_t* node_ptr; + + ut_a(space == buf_block_get_space(block)); + ut_a(space == page_get_space_id(page)); +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(!page_is_leaf(page)); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + node_ptr = page_cur_get_rec(&cursor); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr); + page = buf_block_get_frame(block); + } + + /* Now we are on the desired level. Loop through the pages on that + level. */ +loop: + if (trx_is_interrupted(trx)) { + mtr_commit(&mtr); + mem_heap_free(heap); + return(ret); + } + mem_heap_empty(heap); + offsets = offsets2 = NULL; + mtr_x_lock(dict_index_get_lock(index), &mtr); + +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + /* Check ordering etc. of records */ + + if (!page_validate(page, index)) { + btr_validate_report1(index, level, block); + + ret = FALSE; + } else if (level == 0) { + /* We are on level 0. Check that the records have the right + number of fields, and field lengths are right. 
*/ + + if (!btr_index_page_validate(block, index)) { + + ret = FALSE; + } + } + + ut_a(btr_page_get_level(page, &mtr) == level); + + right_page_no = btr_page_get_next(page, &mtr); + left_page_no = btr_page_get_prev(page, &mtr); + + ut_a(page_get_n_recs(page) > 0 || (level == 0 + && page_get_page_no(page) + == dict_index_get_page(index))); + + if (right_page_no != FIL_NULL) { + const rec_t* right_rec; + right_block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, &mtr); + right_page = buf_block_get_frame(right_block); + if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr) + != page_get_page_no(page))) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: broken FIL_PAGE_NEXT" + " or FIL_PAGE_PREV links\n", stderr); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + + ret = FALSE; + } + + if (UNIV_UNLIKELY(page_is_comp(right_page) + != page_is_comp(page))) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: 'compact' flag mismatch\n", stderr); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + + ret = FALSE; + + goto node_ptr_fails; + } + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + right_rec = page_rec_get_next(page_get_infimum_rec( + right_page)); + offsets = rec_get_offsets(rec, index, + offsets, ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, + offsets2, ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec, + offsets, offsets2, + index) >= 0)) { + + btr_validate_report2(index, level, block, right_block); + + fputs("InnoDB: records in wrong order" + " on adjacent pages\n", stderr); + + buf_page_print(page, 0); + buf_page_print(right_page, 0); + + fputs("InnoDB: record ", stderr); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + fputs("InnoDB: record ", stderr); + rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + + ret = FALSE; + } + } + + if (level > 0 && left_page_no == FIL_NULL) { + ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); + } + + if (buf_block_get_page_no(block) != dict_index_get_page(index)) { + + /* Check father node pointers */ + + rec_t* node_ptr; + + offsets = btr_page_get_father_block(offsets, heap, index, + block, &mtr, &node_cur); + father_page = btr_cur_get_page(&node_cur); + node_ptr = btr_cur_get_rec(&node_cur); + + btr_cur_position( + index, page_rec_get_prev(page_get_supremum_rec(page)), + block, &node_cur); + offsets = btr_page_get_father_node_ptr(offsets, heap, + &node_cur, &mtr); + + if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur)) + || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, + offsets) + != buf_block_get_page_no(block))) { + + btr_validate_report1(index, level, block); + + fputs("InnoDB: node pointer to the page is wrong\n", + stderr); + + buf_page_print(father_page, 0); + buf_page_print(page, 0); + + fputs("InnoDB: node ptr ", stderr); + rec_print(stderr, node_ptr, index); + + rec = btr_cur_get_rec(&node_cur); + fprintf(stderr, "\n" + "InnoDB: node ptr child page n:o %lu\n", + (ulong) btr_node_ptr_get_child_page_no( + rec, offsets)); + + fputs("InnoDB: record on page ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + ret = FALSE; + + goto node_ptr_fails; + } + + if (!page_is_leaf(page)) { + node_ptr_tuple = dict_index_build_node_ptr( + index, + 
page_rec_get_next(page_get_infimum_rec(page)), + 0, heap, btr_page_get_level(page, &mtr)); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, + offsets)) { + const rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + btr_validate_report1(index, level, block); + + buf_page_print(father_page, 0); + buf_page_print(page, 0); + + fputs("InnoDB: Error: node ptrs differ" + " on levels > 0\n" + "InnoDB: node ptr ", stderr); + rec_print_new(stderr, node_ptr, offsets); + fputs("InnoDB: first rec ", stderr); + rec_print(stderr, first_rec, index); + putc('\n', stderr); + ret = FALSE; + + goto node_ptr_fails; + } + } + + if (left_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_next( + page_get_infimum_rec(father_page))); + ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL); + } + + if (right_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_prev( + page_get_supremum_rec(father_page))); + ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL); + } else { + const rec_t* right_node_ptr + = page_rec_get_next(node_ptr); + + offsets = btr_page_get_father_block( + offsets, heap, index, right_block, + &mtr, &right_node_cur); + if (right_node_ptr + != page_get_supremum_rec(father_page)) { + + if (btr_cur_get_rec(&right_node_cur) + != right_node_ptr) { + ret = FALSE; + fputs("InnoDB: node pointer to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print(father_page, 0); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + } + } else { + page_t* right_father_page + = btr_cur_get_page(&right_node_cur); + + if (btr_cur_get_rec(&right_node_cur) + != page_rec_get_next( + page_get_infimum_rec( + right_father_page))) { + ret = FALSE; + fputs("InnoDB: node pointer 2 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print(father_page, 0); + buf_page_print(right_father_page, 0); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + } + + if (page_get_page_no(right_father_page) + != btr_page_get_next(father_page, &mtr)) { + + ret = FALSE; + fputs("InnoDB: node pointer 3 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print(father_page, 0); + buf_page_print(right_father_page, 0); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + } + } + } + } + +node_ptr_fails: + /* Commit the mini-transaction to release the latch on 'page'. + Re-acquire the latch on right_page, which will become 'page' + on the next loop. The page has already been checked. */ + mtr_commit(&mtr); + + if (right_page_no != FIL_NULL) { + mtr_start(&mtr); + + block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, &mtr); + page = buf_block_get_frame(block); + + goto loop; + } + + mem_heap_free(heap); + return(ret); +} + +/**************************************************************//** +Checks the consistency of an index tree. 
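+
+Typical debug usage is simply (compare the call in btr_print_index()):
+
+	ut_a(btr_validate_index(index, NULL));
+
+A non-NULL trx only allows the level scans to be interrupted.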
+@return TRUE if ok */ +UNIV_INTERN +ibool +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction or NULL */ +{ + mtr_t mtr; + page_t* root; + ulint i; + ulint n; + + mtr_start(&mtr); + mtr_x_lock(dict_index_get_lock(index), &mtr); + + root = btr_root_get(index, &mtr); + n = btr_page_get_level(root, &mtr); + + for (i = 0; i <= n && !trx_is_interrupted(trx); i++) { + if (!btr_validate_level(index, trx, n - i)) { + + mtr_commit(&mtr); + + return(FALSE); + } + } + + mtr_commit(&mtr); + + return(TRUE); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/btr/btr0cur.c b/storage/xtradb/btr/btr0cur.c new file mode 100644 index 00000000000..3fc2b48162a --- /dev/null +++ b/storage/xtradb/btr/btr0cur.c @@ -0,0 +1,5256 @@ +/***************************************************************************** + +Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0cur.c +The index tree cursor + +All changes that row operations make to a B-tree or the records +there must go through this module! Undo log records are written here +for every modify or insert of a clustered index record. + + NOTE!!! +To make sure we do not run out of disk space during a pessimistic +insert or update, we have to reserve a number of pages equal to +2 x the height of the index tree in the tablespace before we start +the operation, because if leaf splitting has been started, it is +difficult to undo, except by crashing the database and doing a +roll-forward. + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0cur.h" + +#ifdef UNIV_NONINL +#include "btr0cur.ic" +#endif + +#include "row0upd.h" +#ifndef UNIV_HOTBACKUP +#include "mtr0log.h" +#include "page0page.h" +#include "page0zip.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "buf0lru.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "trx0rec.h" +#include "trx0roll.h" /* trx_is_recv() */ +#include "que0que.h" +#include "row0row.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "zlib.h" + +#ifdef UNIV_DEBUG +/** If the following is set to TRUE, this module prints a lot of +trace information about individual record operations */ +UNIV_INTERN ibool btr_cur_print_record_ops = FALSE; +#endif /* UNIV_DEBUG */ + +/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). 
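+The monitor code derives a rate from this counter and the _old copy
+below, essentially
+
+	(btr_cur_n_non_sea - btr_cur_n_non_sea_old) / time_elapsed
+
+per srv_refresh_innodb_monitor_stats() interval.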
*/ +UNIV_INTERN ulint btr_cur_n_non_sea = 0; +/** Number of successful adaptive hash index lookups in +btr_cur_search_to_nth_level(). */ +UNIV_INTERN ulint btr_cur_n_sea = 0; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +UNIV_INTERN ulint btr_cur_n_non_sea_old = 0; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +UNIV_INTERN ulint btr_cur_n_sea_old = 0; + +/** In the optimistic insert, if the insert does not fit, but this much space +can be released by page reorganize, then it is reorganized */ +#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32) + +/** The structure of a BLOB part header */ +/* @{ */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this + page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no, + FIL_NULL if none */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB + part header, in bytes */ +/* @} */ +#endif /* !UNIV_HOTBACKUP */ + +/** A BLOB field reference full of zero, for use in assertions and tests. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). */ +UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]; + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr); /*!< in: mtr, or NULL if not logged */ +/*******************************************************************//** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor positioned on a page */ + ulint height, /*!< in: height of the page in tree; + 0 means leaf node */ + ulint root_height); /*!< in: root node height in tree */ +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in: record */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +/***********************************************************//** +Frees the externally stored fields for a record. 
*/ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in: record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +/***********************************************************//** +Gets the externally stored size of a record, in units of a database page. +@return externally stored part, in units of a database page */ +static +ulint +btr_rec_get_externally_stored_len( +/*==============================*/ + rec_t* rec, /*!< in: record */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +#endif /* !UNIV_HOTBACKUP */ + +/******************************************************//** +The following function is used to set the deleted bit of a record. */ +UNIV_INLINE +void +btr_rec_set_deleted_flag( +/*=====================*/ + rec_t* rec, /*!< in/out: physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page (or NULL) */ + ulint flag) /*!< in: nonzero if delete marked */ +{ + if (page_rec_is_comp(rec)) { + rec_set_deleted_flag_new(rec, page_zip, flag); + } else { + ut_ad(!page_zip); + rec_set_deleted_flag_old(rec, flag); + } +} + +#ifndef UNIV_HOTBACKUP +/*==================== B-TREE SEARCH =========================*/ + +/********************************************************************//** +Latches the leaf page or pages requested. */ +static +void +btr_cur_latch_leaves( +/*=================*/ + page_t* page, /*!< in: leaf page where the search + converged */ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the leaf */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< in: cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint mode; + ulint left_page_no; + ulint right_page_no; + buf_block_t* get_block; + + ut_ad(page && mtr); + + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + mode = latch_mode == BTR_SEARCH_LEAF ? 
RW_S_LATCH : RW_X_LATCH; + get_block = btr_block_get(space, zip_size, page_no, mode, mtr); + + if (srv_pass_corrupt_table && !get_block) { + return; + } + ut_a(get_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) == page_is_comp(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + return; + case BTR_MODIFY_TREE: + /* x-latch also brothers from left to right */ + left_page_no = btr_page_get_prev(page, mtr); + + if (left_page_no != FIL_NULL) { + get_block = btr_block_get(space, zip_size, + left_page_no, + RW_X_LATCH, mtr); + + if (srv_pass_corrupt_table && !get_block) { + return; + } + ut_a(get_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(page)); + ut_a(btr_page_get_next(get_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + } + + get_block = btr_block_get(space, zip_size, page_no, + RW_X_LATCH, mtr); + + if (srv_pass_corrupt_table && !get_block) { + return; + } + ut_a(get_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) == page_is_comp(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + + right_page_no = btr_page_get_next(page, mtr); + + if (right_page_no != FIL_NULL) { + get_block = btr_block_get(space, zip_size, + right_page_no, + RW_X_LATCH, mtr); + + if (srv_pass_corrupt_table && !get_block) { + return; + } + ut_a(get_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(page)); + ut_a(btr_page_get_prev(get_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + } + + return; + + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH; + /* latch also left brother */ + left_page_no = btr_page_get_prev(page, mtr); + + if (left_page_no != FIL_NULL) { + get_block = btr_block_get(space, zip_size, + left_page_no, mode, mtr); + cursor->left_block = get_block; + + if (srv_pass_corrupt_table && !get_block) { + return; + } + ut_a(get_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(page)); + ut_a(btr_page_get_next(get_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + } + + get_block = btr_block_get(space, zip_size, page_no, mode, mtr); + + if (srv_pass_corrupt_table && !get_block) { + return; + } + ut_a(get_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) == page_is_comp(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + return; + } + + ut_error; +} + +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +Note that if mode is PAGE_CUR_LE, which is used in inserts, then +cursor->up_match and cursor->low_match both will have sensible values. +If mode is PAGE_CUR_GE, then up_match will a have a sensible value. + +If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. 
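+
+As an illustration only (the mtr and cursor variables here are
+hypothetical; this sketch is not part of the module), a caller that
+wants to insert would position the cursor as follows:
+
+	mtr_start(&mtr);
+	btr_cur_search_to_nth_level(index, 0, tuple, PAGE_CUR_LE,
+				    BTR_MODIFY_LEAF, &cursor, 0,
+				    __FILE__, __LINE__, &mtr);
+	... perform the insert immediately after the cursor ...
+	mtr_commit(&mtr);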
*/ +UNIV_INTERN +void +btr_cur_search_to_nth_level( +/*========================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the tree level of search */ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + ulint mode, /*!< in: PAGE_CUR_L, ...; + Inserts should always be made using + PAGE_CUR_LE to search the position! */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with + BTR_INSERT and BTR_ESTIMATE; + cursor->left_block is used to store a pointer + to the left neighbor page, in the cases + BTR_SEARCH_PREV and BTR_MODIFY_PREV; + NOTE that if has_search_latch + is != 0, we maybe do not have a latch set + on the cursor page, we assume + the caller uses his search latch + to protect the record! */ + btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! */ + ulint has_search_latch,/*!< in: info on the latch mode the + caller currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + page_t* page; + buf_block_t* guess; + rec_t* node_ptr; + ulint page_no; + ulint space; + ulint up_match; + ulint up_bytes; + ulint low_match; + ulint low_bytes; + ulint height; + ulint savepoint; + ulint page_mode; + ulint insert_planned; + ulint estimate; + ulint ignore_sec_unique; + ulint root_height = 0; /* remove warning */ +#ifdef BTR_CUR_ADAPT + btr_search_t* info; +#endif + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + /* Currently, PAGE_CUR_LE is the only search mode used for searches + ending to upper levels */ + + ut_ad(level == 0 || mode == PAGE_CUR_LE); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(!dict_index_is_ibuf(index) || ibuf_inside()); + ut_ad(dtuple_check_typed(tuple)); + +#ifdef UNIV_DEBUG + cursor->up_match = ULINT_UNDEFINED; + cursor->low_match = ULINT_UNDEFINED; +#endif + insert_planned = latch_mode & BTR_INSERT; + estimate = latch_mode & BTR_ESTIMATE; + ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE; + latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE + | BTR_IGNORE_SEC_UNIQUE); + + ut_ad(!insert_planned || (mode == PAGE_CUR_LE)); + + cursor->flag = BTR_CUR_BINARY; + cursor->index = index; + +#ifndef BTR_CUR_ADAPT + guess = NULL; +#else + info = btr_search_get_info(index); + + guess = info->root_guess; + +#ifdef BTR_CUR_HASH_ADAPT + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; +#endif + if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED + && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ + && !estimate +#ifdef PAGE_CUR_LE_OR_EXTENDS + && mode != PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + /* If !has_search_latch, we do a dirty read of + btr_search_enabled below, and btr_search_guess_on_hash() + will have to check it again. 
*/ + && UNIV_LIKELY(btr_search_enabled) + && btr_search_guess_on_hash(index, info, tuple, mode, + latch_mode, cursor, + has_search_latch, mtr)) { + + /* Search using the hash index succeeded */ + + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + btr_cur_n_sea++; + + return; + } +#endif /* BTR_CUR_HASH_ADAPT */ +#endif /* BTR_CUR_ADAPT */ + btr_cur_n_non_sea++; + + /* If the hash search did not succeed, do binary search down the + tree */ + + if (has_search_latch) { + /* Release possible search latch to obey latching order */ + rw_lock_s_unlock(&btr_search_latch); + } + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + savepoint = mtr_set_savepoint(mtr); + + if (latch_mode == BTR_MODIFY_TREE) { + mtr_x_lock(dict_index_get_lock(index), mtr); + + } else if (latch_mode == BTR_CONT_MODIFY_TREE) { + /* Do nothing */ + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + } else { + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + + space = dict_index_get_space(index); + page_no = dict_index_get_page(index); + + up_match = 0; + up_bytes = 0; + low_match = 0; + low_bytes = 0; + + height = ULINT_UNDEFINED; + + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. */ + + switch (mode) { + case PAGE_CUR_GE: + page_mode = PAGE_CUR_L; + break; + case PAGE_CUR_G: + page_mode = PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + page_mode = mode; + break; + } + + /* Loop and search until we arrive at the desired level */ + + for (;;) { + ulint zip_size; + buf_block_t* block; + ulint rw_latch; + ulint buf_mode; + + zip_size = dict_table_zip_size(index->table); + rw_latch = RW_NO_LATCH; + buf_mode = BUF_GET; + + if (height == 0 && latch_mode <= BTR_MODIFY_LEAF) { + + rw_latch = latch_mode; + + if (insert_planned + && ibuf_should_try(index, ignore_sec_unique)) { + + /* Try insert to the insert buffer if the + page is not in the buffer pool */ + + buf_mode = BUF_GET_IF_IN_POOL; + } + } + +retry_page_get: + block = buf_page_get_gen(space, zip_size, page_no, + rw_latch, guess, buf_mode, + file, line, mtr); + if (block == NULL) { + if (srv_pass_corrupt_table && buf_mode != BUF_GET_IF_IN_POOL) { + page_cursor->block = 0; + page_cursor->rec = 0; + if (estimate) { + cursor->path_arr->nth_rec = ULINT_UNDEFINED; + } + break; + } + ut_a(buf_mode == BUF_GET_IF_IN_POOL); + + /* This must be a search to perform an insert; + try insert to the insert buffer */ + + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + ut_ad(insert_planned); + ut_ad(cursor->thr); + + if (ibuf_insert(tuple, index, space, zip_size, + page_no, cursor->thr)) { + /* Insertion to the insert buffer succeeded */ + cursor->flag = BTR_CUR_INSERT_TO_IBUF; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + goto func_exit; + } + + /* Insert to the insert buffer did not succeed: + retry page get */ + + buf_mode = BUF_GET; + + goto retry_page_get; + } + + page = buf_block_get_frame(block); + + if 
(srv_pass_corrupt_table && !page) { + page_cursor->block = 0; + page_cursor->rec = 0; + if (estimate) { + cursor->path_arr->nth_rec = ULINT_UNDEFINED; + } + break; + } + ut_a(page); + + block->check_index_page_at_flush = TRUE; + + if (rw_latch != RW_NO_LATCH) { +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + } + + ut_ad(0 == ut_dulint_cmp(index->id, + btr_page_get_index_id(page))); + + if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + root_height = height; + cursor->tree_height = root_height + 1; +#ifdef BTR_CUR_ADAPT + if (block != guess) { + info->root_guess = block; + } +#endif + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) { + + btr_cur_latch_leaves(page, space, zip_size, + page_no, latch_mode, + cursor, mtr); + } + + if ((latch_mode != BTR_MODIFY_TREE) + && (latch_mode != BTR_CONT_MODIFY_TREE)) { + + /* Release the tree s-latch */ + + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + + page_mode = mode; + } + + page_cur_search_with_match(block, index, tuple, page_mode, + &up_match, &up_bytes, + &low_match, &low_bytes, + page_cursor); + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + /* If this is the desired level, leave the loop */ + + ut_ad(height == btr_page_get_level( + page_cur_get_page(page_cursor), mtr)); + + if (level == height) { + + if (level > 0) { + /* x-latch the page */ + page = btr_page_get(space, zip_size, + page_no, RW_X_LATCH, mtr); + ut_a((ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + } + + break; + } + + ut_ad(height > 0); + + height--; + + guess = NULL; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (level == 0) { + cursor->low_match = low_match; + cursor->low_bytes = low_bytes; + cursor->up_match = up_match; + cursor->up_bytes = up_bytes; + +#ifdef BTR_CUR_ADAPT + /* We do a dirty read of btr_search_enabled here. We + will properly check btr_search_enabled again in + btr_search_build_page_hash_index() before building a + page hash index, while holding btr_search_latch. */ + if (UNIV_LIKELY(btr_search_enabled)) { + + btr_search_info_update(index, cursor); + } +#endif + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + } + +func_exit: + if (has_search_latch) { + + rw_lock_s_lock(&btr_search_latch); + } +} + +/*****************************************************************//** +Opens a cursor at either end of an index. 
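+For example, a full index scan could be started from the low end with
+a call like the following (illustrative only; the variables are
+hypothetical):
+
+	btr_cur_open_at_index_side_func(TRUE, index, BTR_SEARCH_LEAF,
+					&cursor, __FILE__, __LINE__,
+					&mtr);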
*/ +UNIV_INTERN +void +btr_cur_open_at_index_side_func( +/*============================*/ + ibool from_left, /*!< in: TRUE if open to the low end, + FALSE if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_cur_t* cursor, /*!< in: cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + ulint page_no; + ulint space; + ulint zip_size; + ulint height; + ulint root_height = 0; /* remove warning */ + rec_t* node_ptr; + ulint estimate; + ulint savepoint; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + estimate = latch_mode & BTR_ESTIMATE; + latch_mode = latch_mode & ~BTR_ESTIMATE; + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched the leaf node */ + + savepoint = mtr_set_savepoint(mtr); + + if (latch_mode == BTR_MODIFY_TREE) { + mtr_x_lock(dict_index_get_lock(index), mtr); + } else { + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + page_no = dict_index_get_page(index); + + height = ULINT_UNDEFINED; + + for (;;) { + buf_block_t* block; + page_t* page; + block = buf_page_get_gen(space, zip_size, page_no, + RW_NO_LATCH, NULL, BUF_GET, + file, line, mtr); + page = buf_block_get_frame(block); + + if (srv_pass_corrupt_table && !page) { + page_cursor->block = 0; + page_cursor->rec = 0; + if (estimate) { + cursor->path_arr->nth_rec = ULINT_UNDEFINED; + } + break; + } + ut_a(page); + + ut_ad(0 == ut_dulint_cmp(index->id, + btr_page_get_index_id(page))); + + block->check_index_page_at_flush = TRUE; + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + root_height = height; + } + + if (height == 0) { + btr_cur_latch_leaves(page, space, zip_size, page_no, + latch_mode, cursor, mtr); + + /* In versions <= 3.23.52 we had forgotten to + release the tree latch here. If in an index scan + we had to scan far to find a record visible to the + current transaction, that could starve others + waiting for the tree latch. */ + + if ((latch_mode != BTR_MODIFY_TREE) + && (latch_mode != BTR_CONT_MODIFY_TREE)) { + + /* Release the tree s-latch */ + + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + } + + if (from_left) { + page_cur_set_before_first(block, page_cursor); + } else { + page_cur_set_after_last(block, page_cursor); + } + + if (height == 0) { + if (estimate) { + btr_cur_add_path_info(cursor, height, + root_height); + } + + break; + } + + ut_ad(height > 0); + + if (from_left) { + page_cur_move_to_next(page_cursor); + } else { + page_cur_move_to_prev(page_cursor); + } + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. 
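+Note that the choice is made level by level: on each page one user
+record is picked at random and its child pointer is followed, so the
+resulting leaf position is in general not uniformly distributed over
+the records of the index.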
*/ +UNIV_INTERN +void +btr_cur_open_at_rnd_pos_func( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< in/out: B-tree cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + ulint page_no; + ulint space; + ulint zip_size; + ulint height; + rec_t* node_ptr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + if (latch_mode == BTR_MODIFY_TREE) { + mtr_x_lock(dict_index_get_lock(index), mtr); + } else { + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + page_no = dict_index_get_page(index); + + height = ULINT_UNDEFINED; + + for (;;) { + buf_block_t* block; + page_t* page; + + block = buf_page_get_gen(space, zip_size, page_no, + RW_NO_LATCH, NULL, BUF_GET, + file, line, mtr); + page = buf_block_get_frame(block); + + if (srv_pass_corrupt_table && !page) { + page_cursor->block = 0; + page_cursor->rec = 0; + break; + } + ut_a(page); + + ut_ad(0 == ut_dulint_cmp(index->id, + btr_page_get_index_id(page))); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + } + + if (height == 0) { + btr_cur_latch_leaves(page, space, zip_size, page_no, + latch_mode, cursor, mtr); + } + + page_cur_open_on_rnd_user_rec(block, page_cursor); + + if (height == 0) { + + break; + } + + ut_ad(height > 0); + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree +after the given path +@return TRUE if the position is at the first page, and cursor must point + the first record for used by the caller.*/ +UNIV_INTERN +ibool +btr_cur_open_at_rnd_pos_after_path( +/*====================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... 
*/ + btr_path_t* first_rec_path, + btr_cur_t* cursor, /*!< in/out: B-tree cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + btr_path_t* slot; + ibool is_first_rec = TRUE; + ulint page_no; + ulint space; + ulint zip_size; + ulint height; + rec_t* node_ptr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + if (latch_mode == BTR_MODIFY_TREE) { + mtr_x_lock(dict_index_get_lock(index), mtr); + } else { + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + page_no = dict_index_get_page(index); + + height = ULINT_UNDEFINED; + slot = first_rec_path; + + for (;;) { + buf_block_t* block; + page_t* page; + + block = buf_page_get_gen(space, zip_size, page_no, + RW_NO_LATCH, NULL, BUF_GET, + __FILE__, __LINE__, mtr); + page = buf_block_get_frame(block); + ut_ad(0 == ut_dulint_cmp(index->id, + btr_page_get_index_id(page))); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + } + + if (height == 0) { + btr_cur_latch_leaves(page, space, zip_size, page_no, + latch_mode, cursor, mtr); + } + + if (is_first_rec && slot->nth_rec != ULINT_UNDEFINED) { + if (height == 0) { + /* must open the first rec */ + page_cur_open_on_nth_user_rec(block, page_cursor, slot->nth_rec); + } else { + is_first_rec = page_cur_open_on_rnd_user_rec_after_nth(block, + page_cursor, slot->nth_rec); + } + } else { + is_first_rec = FALSE; + page_cur_open_on_rnd_user_rec(block, page_cursor); + } + + if (height == 0) { + break; + } + + ut_ad(height > 0); + + height--; + slot++; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return (is_first_rec); +} + +/*==================== B-TREE INSERT =========================*/ + +/*************************************************************//** +Inserts a record if there is enough space, or if enough space can +be freed by reorganizing. Differs from btr_cur_optimistic_insert because +no heuristics is applied to whether it pays to use CPU time for +reorganizing the page or not. 
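+If the tuple does not fit as the page stands, the page is reorganized
+once and the insert is retried; NULL is returned only if it still does
+not fit.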
+@return pointer to inserted record if succeed, else NULL */ +static +rec_t* +btr_cur_insert_if_possible( +/*=======================*/ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not + have been stored to tuple */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + buf_block_t* block; + rec_t* rec; + + ut_ad(dtuple_check_typed(tuple)); + + block = btr_cur_get_block(cursor); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, n_ext, mtr); + + if (UNIV_UNLIKELY(!rec)) { + /* If record did not fit, reorganize */ + + if (btr_page_reorganize(block, cursor->index, mtr)) { + + page_cur_search(block, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, n_ext, mtr); + } + } + + return(rec); +} + +/*************************************************************//** +For an insert, checks the locks and does the undo logging if desired. +@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +UNIV_INLINE +ulint +btr_cur_ins_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if + not zero, the parameters index and thr + should be specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert */ + const dtuple_t* entry, /*!< in: entry to insert */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ibool* inherit)/*!< out: TRUE if the inserted new record maybe + should inherit LOCK_GAP type locks from the + successor record */ +{ + dict_index_t* index; + ulint err; + rec_t* rec; + roll_ptr_t roll_ptr; + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + err = lock_rec_insert_check_and_lock(flags, rec, + btr_cur_get_block(cursor), + index, thr, mtr, inherit); + + if (err != DB_SUCCESS) { + + return(err); + } + + if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) { + + err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP, + thr, index, entry, + NULL, 0, NULL, + &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + /* Now we can fill in the roll ptr field in entry */ + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + + row_upd_index_entry_sys_field(entry, index, + DATA_ROLL_PTR, roll_ptr); + } + } + + return(DB_SUCCESS); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Report information about a transaction. */ +static +void +btr_cur_trx_report( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + const dict_index_t* index, /*!< in: index */ + const char* op) /*!< in: operation */ +{ + fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", + TRX_ID_PREP_PRINTF(trx->id)); + fputs(op, stderr); + dict_index_name_print(stderr, trx, index); + putc('\n', stderr); +} +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. 
If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. +@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +UNIV_INTERN +ulint +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr) /*!< in: mtr; if this function returns + DB_SUCCESS on a leaf page of a secondary + index in a compressed tablespace, the + mtr must be committed before latching + any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + dict_index_t* index; + page_cur_t* page_cursor; + buf_block_t* block; + page_t* page; + ulint max_size; + rec_t* dummy_rec; + ibool leaf; + ibool reorg; + ibool inherit; + ulint zip_size; + ulint rec_size; + ulint err; + + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + + if (srv_pass_corrupt_table && !block) { + return(DB_CORRUPTION); + } + ut_a(block); + + page = buf_block_get_frame(block); + index = cursor->index; + zip_size = buf_block_get_zip_size(block); +#ifdef UNIV_DEBUG_VALGRIND + if (zip_size) { + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + if (!dtuple_check_typed_no_assert(entry)) { + fputs("InnoDB: Error in a tuple to insert into ", stderr); + dict_index_name_print(stderr, thr_get_trx(thr), index); + } +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr), index, "insert into "); + dtuple_print(stderr, entry); + } +#endif /* UNIV_DEBUG */ + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + max_size = page_get_max_insert_size_after_reorganize(page, 1); + leaf = page_is_leaf(page); + + /* Calculate the record size when entry is converted to a record */ + rec_size = rec_get_converted_size(index, entry, n_ext); + + if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), + dtuple_get_n_fields(entry), zip_size)) { + + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext); + + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(index, entry, n_ext); + } + + if (UNIV_UNLIKELY(zip_size)) { + /* Estimate the free space of an empty compressed page. + Subtract one byte for the encoded heap_no in the + modification log. */ + ulint free_space_zip = page_zip_empty_size( + cursor->index->n_fields, zip_size) - 1; + ulint n_uniq = dict_index_get_n_unique_in_tree(index); + + ut_ad(dict_table_is_comp(index->table)); + + /* There should be enough room for two node pointer + records on an empty non-leaf page. This prevents + infinite page splits. 
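+Roughly, the check below requires
+
+	size of a node pointer record built from the first
+	n_uniq fields of entry	<=  free_space_zip / 2
+
+so that two node pointer records always fit on an empty compressed
+page; otherwise the insert is refused with DB_TOO_BIG_RECORD.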
*/ + + if (UNIV_LIKELY(entry->n_fields >= n_uniq) + && UNIV_UNLIKELY(REC_NODE_PTR_SIZE + + rec_get_converted_size_comp_prefix( + index, entry->fields, n_uniq, + NULL) + /* On a compressed page, there is + a two-byte entry in the dense + page directory for every record. + But there is no record header. */ + - (REC_N_NEW_EXTRA_BYTES - 2) + > free_space_zip / 2)) { + + if (big_rec_vec) { + dtuple_convert_back_big_rec( + index, entry, big_rec_vec); + } + + return(DB_TOO_BIG_RECORD); + } + } + + /* If there have been many consecutive inserts, and we are on the leaf + level, check if we have to split the page to reserve enough free space + for future updates of records. */ + + if (dict_index_is_clust(index) + && (page_get_n_recs(page) >= 2) + && UNIV_LIKELY(leaf) + && (dict_index_get_space_reserve() + rec_size > max_size) + && (btr_page_get_split_rec_to_right(cursor, &dummy_rec) + || btr_page_get_split_rec_to_left(cursor, &dummy_rec))) { +fail: + err = DB_FAIL; +fail_err: + + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(err); + } + + if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + || max_size < rec_size) + && UNIV_LIKELY(page_get_n_recs(page) > 1) + && page_get_max_insert_size(page, 1) < rec_size) { + + goto fail; + } + + /* Check locks and write to the undo log, if specified */ + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + + goto fail_err; + } + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + + { + const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor); + *rec = page_cur_tuple_insert(page_cursor, entry, index, + n_ext, mtr); + reorg = page_cursor_rec != page_cur_get_rec(page_cursor); + + if (UNIV_UNLIKELY(reorg)) { + ut_a(zip_size); + ut_a(*rec); + } + } + + if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) { + /* If the record did not fit, reorganize */ + if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) { + ut_a(zip_size); + + goto fail; + } + + ut_ad(zip_size + || page_get_max_insert_size(page, 1) == max_size); + + reorg = TRUE; + + page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor); + + *rec = page_cur_tuple_insert(page_cursor, entry, index, + n_ext, mtr); + + if (UNIV_UNLIKELY(!*rec)) { + if (UNIV_LIKELY(zip_size != 0)) { + + goto fail; + } + + fputs("InnoDB: Error: cannot insert tuple ", stderr); + dtuple_print(stderr, entry); + fputs(" into ", stderr); + dict_index_name_print(stderr, thr_get_trx(thr), index); + fprintf(stderr, "\nInnoDB: max insert size %lu\n", + (ulong) max_size); + ut_error; + } + } + +#ifdef BTR_CUR_HASH_ADAPT + if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) { + btr_search_update_hash_node_on_insert(cursor); + } else { + btr_search_update_hash_on_insert(cursor); + } +#endif + + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { + + lock_update_insert(block, *rec); + } + +#if 0 + fprintf(stderr, "Insert into page %lu, max ins size %lu," + " rec %lu ind type %lu\n", + buf_block_get_page_no(block), max_size, + rec_size + PAGE_DIR_SLOT_SIZE, index->type); +#endif + if (leaf && !dict_index_is_clust(index)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. 
*/ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (zip_size) { + /* Update the bits in the same mini-transaction. */ + ibuf_update_free_bits_zip(block, mtr); + } else { + /* Decrement the bits in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full( + block, max_size, + rec_size + PAGE_DIR_SLOT_SIZE); + } + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. +@return DB_SUCCESS or error number */ +UNIV_INTERN +ulint +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index = cursor->index; + ulint zip_size = dict_table_zip_size(index->table); + big_rec_t* big_rec_vec = NULL; + mem_heap_t* heap = NULL; + ulint err; + ibool dummy_inh; + ibool success; + ulint n_extents = 0; + ulint n_reserved; + + ut_ad(dtuple_check_typed(entry)); + + *big_rec = NULL; + + ut_ad(mtr_memo_contains(mtr, + dict_index_get_lock(btr_cur_get_index(cursor)), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + /* Try first an optimistic insert; reset the cursor flag: we do not + assume anything of how it was positioned */ + + cursor->flag = BTR_CUR_BINARY; + + err = btr_cur_optimistic_insert(flags, cursor, entry, rec, + big_rec, n_ext, thr, mtr); + if (err != DB_FAIL) { + + return(err); + } + + /* Retry with a pessimistic insert. Check locks and write to undo log, + if specified */ + + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &dummy_inh); + + if (err != DB_SUCCESS) { + + return(err); + } + + if (!(flags & BTR_NO_UNDO_LOG_FLAG)) { + /* First reserve enough free space for the file segments + of the index tree, so that the insert will not fail because + of lack of space */ + + n_extents = cursor->tree_height / 16 + 3; + + success = fsp_reserve_free_extents(&n_reserved, index->space, + n_extents, FSP_NORMAL, mtr); + if (!success) { + return(DB_OUT_OF_FILE_SPACE); + } + } + + if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), + dict_table_is_comp(index->table), + dict_index_get_n_fields(index), + zip_size)) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + + if (UNIV_LIKELY_NULL(big_rec_vec)) { + /* This should never happen, but we handle + the situation in a robust manner. 
*/ + ut_ad(0); + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext); + + if (big_rec_vec == NULL) { + + if (n_extents > 0) { + fil_space_release_free_extents(index->space, + n_reserved); + } + return(DB_TOO_BIG_RECORD); + } + } + + if (dict_index_get_page(index) + == buf_block_get_page_no(btr_cur_get_block(cursor))) { + + /* The page is the root page */ + *rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr); + } else { + *rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec); + +#ifdef BTR_CUR_ADAPT + btr_search_update_hash_on_insert(cursor); +#endif + if (!(flags & BTR_NO_LOCKING_FLAG)) { + + lock_update_insert(btr_cur_get_block(cursor), *rec); + } + + if (n_extents > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*==================== B-TREE UPDATE =========================*/ + +/*************************************************************//** +For an update, checks the locks and does the undo logging. +@return DB_SUCCESS, DB_WAIT_LOCK, or error number */ +UNIV_INLINE +ulint +btr_cur_upd_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on record to update */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + roll_ptr_t* roll_ptr)/*!< out: roll pointer */ +{ + dict_index_t* index; + rec_t* rec; + ulint err; + + ut_ad(cursor && update && thr && roll_ptr); + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + if (!dict_index_is_clust(index)) { + /* We do undo logging only when we update a clustered index + record */ + return(lock_sec_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, + index, thr, mtr)); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + err = DB_SUCCESS; + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + err = lock_clust_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, index, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), thr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + if (err != DB_SUCCESS) { + + return(err); + } + } + + /* Append the info about the update in the undo log */ + + err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr, + index, NULL, update, + cmpl_info, rec, roll_ptr); + return(err); +} + +/***********************************************************//** +Writes a redo log record of updating a record in-place. 
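+The log record body written below, and decoded again by
+btr_cur_parse_update_in_place(), consists of:
+
+	1 byte		flags
+	(variable)	position of the trx id field, the trx id and
+			the roll ptr, written by
+			row_upd_write_sys_vals_to_log()
+	2 bytes		offset of the record on its page
+	(variable)	the update vector, written by
+			row_upd_index_write_log()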
*/ +UNIV_INLINE +void +btr_cur_update_in_place_log( +/*========================*/ + ulint flags, /*!< in: flags */ + rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index where cursor positioned */ + const upd_t* update, /*!< in: update vector */ + trx_t* trx, /*!< in: transaction */ + roll_ptr_t roll_ptr, /*!< in: roll ptr */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + page_t* page = page_align(rec); + ut_ad(flags < 256); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page) + ? MLOG_COMP_REC_UPDATE_IN_PLACE + : MLOG_REC_UPDATE_IN_PLACE, + 1 + DATA_ROLL_PTR_LEN + 14 + 2 + + MLOG_BUF_MARGIN); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } + + /* The code below assumes index is a clustered index: change index to + the clustered index if we are updating a secondary index record (or we + could as well skip writing the sys col values to the log in this case + because they are not needed for a secondary index record update) */ + + index = dict_table_get_first_index(index->table); + + mach_write_to_1(log_ptr, flags); + log_ptr++; + + log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, + mtr); + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + row_upd_index_write_log(update, log_ptr, mtr); +} +#endif /* UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a redo log record of updating a record in-place. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_update_in_place( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index) /*!< in: index corresponding to page */ +{ + ulint flags; + rec_t* rec; + upd_t* update; + ulint pos; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + ulint rec_offset; + mem_heap_t* heap; + ulint* offsets; + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + flags = mach_read_from_1(ptr); + ptr++; + + ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + rec_offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(rec_offset <= UNIV_PAGE_SIZE); + + heap = mem_heap_create(256); + + ptr = row_upd_index_parse(ptr, end_ptr, heap, &update); + + if (!ptr || !page) { + + goto func_exit; + } + + ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table)); + rec = page + rec_offset; + + /* We do not need to reserve btr_search_latch, as the page is only + being recovered, and there cannot be a hash index to it. */ + + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets, + pos, trx_id, roll_ptr); + } + + row_upd_rec_in_place(rec, index, offsets, update, page_zip); + +func_exit: + mem_heap_free(heap); + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. 
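+If there is not, but the modification log is non-empty, the page is
+recompressed once and the check is repeated; a freshly compressed page
+is not recompressed, because that could not create more room.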
+@return TRUE if enough place */ +static +ibool +btr_cur_update_alloc_zip( +/*=====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + buf_block_t* block, /*!< in/out: buffer page */ + dict_index_t* index, /*!< in: the index corresponding to the block */ + ulint length, /*!< in: size needed */ + ibool create, /*!< in: TRUE=delete-and-insert, + FALSE=update-in-place */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ut_a(page_zip == buf_block_get_page_zip(block)); + ut_ad(page_zip); + ut_ad(!dict_index_is_ibuf(index)); + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(TRUE); + } + + if (!page_zip->m_nonempty) { + /* The page has been freshly compressed, so + recompressing it will not help. */ + return(FALSE); + } + + if (!page_zip_compress(page_zip, buf_block_get_frame(block), + index, mtr)) { + /* Unable to compress the page */ + return(FALSE); + } + + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the page_zip_compress() above did not reduce + the free space available on the page. */ + + if (!page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + /* Out of space: reset the free bits. */ + if (!dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { + ibuf_reset_free_bits(block); + } + return(FALSE); + } + + return(TRUE); +} + +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. +@return DB_SUCCESS or error number */ +UNIV_INTERN +ulint +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + dict_index_t* index; + buf_block_t* block; + page_zip_des_t* page_zip; + ulint err; + rec_t* rec; + roll_ptr_t roll_ptr = ut_dulint_zero; + trx_t* trx; + ulint was_delete_marked; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + + trx = thr_get_trx(thr); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(trx, index, "update "); + rec_print_new(stderr, rec, offsets); + } +#endif /* UNIV_DEBUG */ + + block = btr_cur_get_block(cursor); + page_zip = buf_block_get_page_zip(block); + + /* Check that enough space is available on the compressed page. 
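+If even recompression cannot make room, DB_ZIP_OVERFLOW is returned so
+that the caller can fall back to a pessimistic update.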
*/ + if (UNIV_LIKELY_NULL(page_zip) + && !btr_cur_update_alloc_zip(page_zip, block, index, + rec_offs_size(offsets), FALSE, mtr)) { + return(DB_ZIP_OVERFLOW); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, + thr, mtr, &roll_ptr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); + } + + if (block->is_hashed) { + /* The function row_upd_changes_ord_field_binary works only + if the update vector was built for a clustered index, we must + NOT call it if index is secondary */ + + if (!dict_index_is_clust(index) + || row_upd_changes_ord_field_binary(NULL, index, update)) { + + /* Remove possible hash index pointer to this record */ + btr_search_update_hash_on_delete(cursor); + } + + rw_lock_x_lock(&btr_search_latch); + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_rec_sys_fields(rec, NULL, + index, offsets, trx, roll_ptr); + } + + was_delete_marked = rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block))); + + row_upd_rec_in_place(rec, index, offsets, update, page_zip); + + if (block->is_hashed) { + rw_lock_x_unlock(&btr_search_latch); + } + + if (page_zip && !dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_zip(block, mtr); + } + + btr_cur_update_in_place_log(flags, rec, index, update, + trx, roll_ptr, mtr); + + if (was_delete_marked + && !rec_get_deleted_flag(rec, page_is_comp( + buf_block_get_frame(block)))) { + /* The new updated record owns its possible externally + stored fields */ + + btr_cur_unmark_extern_fields(page_zip, + rec, index, offsets, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(DB_SUCCESS); +} + +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. 
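+If the record contains externally stored fields, or the update would
+create one, no update is attempted here either: DB_OVERFLOW is
+returned and the work is left to the pessimistic code path.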
+@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit, +DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if +there is not enough space left on the compressed page */ +UNIV_INTERN +ulint +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + dict_index_t* index; + page_cur_t* page_cursor; + ulint err; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + rec_t* orig_rec; + ulint max_size; + ulint new_rec_size; + ulint old_rec_size; + dtuple_t* new_entry; + roll_ptr_t roll_ptr; + trx_t* trx; + mem_heap_t* heap; + ulint i; + ulint n_ext; + ulint* offsets; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + orig_rec = rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + + heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr), index, "update "); + rec_print_new(stderr, rec, offsets); + } +#endif /* UNIV_DEBUG */ + + if (!row_upd_changes_field_size_or_external(index, offsets, update)) { + + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + + mem_heap_free(heap); + return(btr_cur_update_in_place(flags, cursor, update, + cmpl_info, thr, mtr)); + } + + if (rec_offs_any_extern(offsets)) { +any_extern: + /* Externally stored fields are treated in pessimistic + update */ + + mem_heap_free(heap); + return(DB_OVERFLOW); + } + + for (i = 0; i < upd_get_n_fields(update); i++) { + if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { + + goto any_extern; + } + } + + page_cursor = btr_cur_get_page_cur(cursor); + + new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, + &n_ext, heap); + /* We checked above that there are no externally stored fields. */ + ut_a(!n_ext); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. + Thus the following call is safe. 
*/ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + FALSE, heap); + old_rec_size = rec_offs_size(offsets); + new_rec_size = rec_get_converted_size(index, new_entry, 0); + + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (UNIV_LIKELY_NULL(page_zip) + && !btr_cur_update_alloc_zip(page_zip, block, index, + new_rec_size, TRUE, mtr)) { + err = DB_ZIP_OVERFLOW; + goto err_exit; + } + + if (UNIV_UNLIKELY(new_rec_size + >= (page_get_free_space_of_empty(page_is_comp(page)) + / 2))) { + + err = DB_OVERFLOW; + goto err_exit; + } + + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT)) { + + /* The page would become too empty */ + + err = DB_UNDERFLOW; + goto err_exit; + } + + max_size = old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1); + + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) + && (max_size >= new_rec_size)) + || (page_get_n_recs(page) <= 1))) { + + /* There was not enough space, or it did not pay to + reorganize: for simplicity, we decide what to do assuming a + reorganization is needed, though it might not be necessary */ + + err = DB_OVERFLOW; + goto err_exit; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + + goto err_exit; + } + + /* Ok, we may do the replacement. Store on the page infimum the + explicit locks on rec, before deleting rec (see the comment in + btr_cur_pessimistic_update). */ + + lock_rec_store_on_page_infimum(block, rec); + + btr_search_update_hash_on_delete(cursor); + + /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above + invokes rec_offs_make_valid() to point to the copied record that + the fields of new_entry point to. We have to undo it here. */ + ut_ad(rec_offs_validate(NULL, index, offsets)); + rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets); + + page_cur_delete_rec(page_cursor, index, offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + trx = thr_get_trx(thr); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, + roll_ptr); + row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, + trx->id); + } + + /* There are no externally stored columns in new_entry */ + rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr); + ut_a(rec); /* <- We calculated above the insert would fit */ + + if (page_zip && !dict_index_is_clust(index) + && page_is_leaf(page)) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_zip(block, mtr); + } + + /* Restore the old explicit lock state on the record */ + + lock_rec_restore_from_page_infimum(block, rec, block); + + page_cur_move_to_next(page_cursor); + + err = DB_SUCCESS; +err_exit: + mem_heap_free(heap); + return(err); +} + +/*************************************************************//** +If, in a split, a new supremum record was created as the predecessor of the +updated record, the supremum record must inherit exactly the locks on the +updated record. In the split it may have inherited locks from the successor +of the updated record, which is not correct. This function restores the +right locks for the new supremum. 
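+Schematically, with L the left brother of the page P of the updated
+record rec:
+
+	L: [.., supremum]	P: [infimum, rec, ..]
+
+if rec is the first user record on P, the supremum of L must carry
+exactly the gap locks of rec; this is what the call to
+lock_rec_reset_and_inherit_gap_locks() below arranges.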
*/ +static +void +btr_cur_pess_upd_restore_supremum( +/*==============================*/ + buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: updated record */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + buf_block_t* prev_block; + ulint space; + ulint zip_size; + ulint prev_page_no; + + page = buf_block_get_frame(block); + + if (page_rec_get_next(page_get_infimum_rec(page)) != rec) { + /* Updated record is not the first user record on its page */ + + return; + } + + space = buf_block_get_space(block); + zip_size = buf_block_get_zip_size(block); + prev_page_no = btr_page_get_prev(page, mtr); + + ut_ad(prev_page_no != FIL_NULL); + prev_block = buf_page_get_with_no_latch(space, zip_size, + prev_page_no, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(prev_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + /* We must already have an x-latch on prev_block! */ + ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX)); + + lock_rec_reset_and_inherit_gap_locks(prev_block, block, + PAGE_HEAP_NO_SUPREMUM, + page_rec_get_heap_no(rec)); +} + +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. We assume +here that the ordering fields of the record do not change. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ulint +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller, or NULL */ + const upd_t* update, /*!< in: update vector; this is allowed also + contain trx id and roll ptr fields, but + the values in update vector have no effect */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + big_rec_t* dummy_big_rec; + dict_index_t* index; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + page_cur_t* page_cursor; + dtuple_t* new_entry; + ulint err; + ulint optim_err; + roll_ptr_t roll_ptr; + trx_t* trx; + ibool was_first; + ulint n_extents = 0; + ulint n_reserved; + ulint n_ext; + ulint* offsets = NULL; + + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + /* The insert buffer tree should never be updated in place. 
*/ + ut_ad(!dict_index_is_ibuf(index)); + + optim_err = btr_cur_optimistic_update(flags, cursor, update, + cmpl_info, thr, mtr); + + switch (optim_err) { + case DB_UNDERFLOW: + case DB_OVERFLOW: + case DB_ZIP_OVERFLOW: + break; + default: + return(optim_err); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + if (optim_err == DB_OVERFLOW) { + ulint reserve_flag; + + /* First reserve enough free space for the file segments + of the index tree, so that the update will not fail because + of lack of space */ + + n_extents = cursor->tree_height / 16 + 3; + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + reserve_flag = FSP_CLEANING; + } else { + reserve_flag = FSP_NORMAL; + } + + if (!fsp_reserve_free_extents(&n_reserved, index->space, + n_extents, reserve_flag, mtr)) { + return(DB_OUT_OF_FILE_SPACE); + } + } + + if (!*heap) { + *heap = mem_heap_create(1024); + } + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap); + + trx = thr_get_trx(thr); + + new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, + &n_ext, *heap); + /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above + invokes rec_offs_make_valid() to point to the copied record that + the fields of new_entry point to. We have to undo it here. */ + ut_ad(rec_offs_validate(NULL, index, offsets)); + rec_offs_make_valid(rec, index, offsets); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. If the + clustered index record is delete-marked, then its externally + stored fields cannot have been purged yet, because then the + purge would also have removed the clustered index record + itself. Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + FALSE, *heap); + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, + roll_ptr); + row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, + trx->id); + } + + if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) { + /* We are in a transaction rollback undoing a row + update: we must free possible externally stored fields + which got new values in the update, if they are not + inherited values. They can be inherited if we have + updated the primary key to another value, and then + update it back again. */ + + ut_ad(big_rec_vec == NULL); + + btr_rec_free_updated_extern_fields( + index, rec, page_zip, offsets, update, + trx_is_recv(trx) ? 
RB_RECOVERY : RB_NORMAL, mtr);
+	}
+
+	/* We have to set appropriate extern storage bits in the new
+	record to be inserted: we have to remember which fields were such */
+
+	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
+	n_ext += btr_push_update_extern_fields(new_entry, update, *heap);
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		ut_ad(page_is_comp(page));
+		if (page_zip_rec_needs_ext(
+			    rec_get_converted_size(index, new_entry, n_ext),
+			    TRUE,
+			    dict_index_get_n_fields(index),
+			    page_zip_get_size(page_zip))) {
+
+			goto make_external;
+		}
+	} else if (page_zip_rec_needs_ext(
+			   rec_get_converted_size(index, new_entry, n_ext),
+			   page_is_comp(page), 0, 0)) {
+make_external:
+		big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
+		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+			err = DB_TOO_BIG_RECORD;
+			goto return_after_reservations;
+		}
+	}
+
+	/* Store state of explicit locks on rec on the page infimum record,
+	before deleting rec. The page infimum acts as a dummy carrier of the
+	locks, taking care also of lock releases, before we can move the locks
+	back on the actual record. There is a special case: the insert may
+	cause a call of btr_root_raise_and_insert if we are inserting on the
+	root page. Therefore we cannot delete in the lock system the lock
+	structs set on the root page, even if the root page carries just node
+	pointers. */
+
+	lock_rec_store_on_page_infimum(block, rec);
+
+	btr_search_update_hash_on_delete(cursor);
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	page_cur_delete_rec(page_cursor, index, offsets, mtr);
+
+	page_cur_move_to_prev(page_cursor);
+
+	rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);
+
+	if (rec) {
+		lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
+						   rec, block);
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, heap);
+
+		if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+			/* The new inserted record owns its possible externally
+			stored fields */
+			btr_cur_unmark_extern_fields(page_zip,
+						     rec, index, offsets, mtr);
+		}
+
+		btr_cur_compress_if_useful(cursor, mtr);
+
+		if (page_zip && !dict_index_is_clust(index)
+		    && page_is_leaf(page)) {
+			/* Update the free bits in the insert buffer. */
+			ibuf_update_free_bits_zip(block, mtr);
+		}
+
+		err = DB_SUCCESS;
+		goto return_after_reservations;
+	} else {
+		ut_a(optim_err != DB_UNDERFLOW);
+
+		/* Out of space: reset the free bits. */
+		if (!dict_index_is_clust(index)
+		    && page_is_leaf(page)) {
+			ibuf_reset_free_bits(block);
+		}
+	}
+
+	/* Was the record to be updated positioned as the first user
+	record on its page? */
+	was_first = page_cur_is_before_first(page_cursor);
+
+	/* The first parameter means that no lock checking and undo logging
+	is made in the insert */
+
+	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
+					 | BTR_NO_LOCKING_FLAG
+					 | BTR_KEEP_SYS_FLAG,
+					 cursor, new_entry, &rec,
+					 &dummy_big_rec, n_ext, NULL, mtr);
+	ut_a(rec);
+	ut_a(err == DB_SUCCESS);
+	ut_a(dummy_big_rec == NULL);
+
+	if (dict_index_is_sec_or_ibuf(index)) {
+		/* Update PAGE_MAX_TRX_ID in the index page header.
+		It was not updated by btr_cur_pessimistic_insert()
+		because of BTR_NO_LOCKING_FLAG.
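+		(PAGE_MAX_TRX_ID on a secondary index leaf page is used
+		by consistent reads to decide whether the clustered
+		index record may need to be consulted for visibility,
+		so it must reflect this insert as well.)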
*/ + buf_block_t* rec_block; + + rec_block = btr_cur_get_block(cursor); + + page_update_max_trx_id(rec_block, + buf_block_get_page_zip(rec_block), + trx->id, mtr); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + /* The new inserted record owns its possible externally + stored fields */ + buf_block_t* rec_block = btr_cur_get_block(cursor); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); + page = buf_block_get_frame(rec_block); +#endif /* UNIV_ZIP_DEBUG */ + page_zip = buf_block_get_page_zip(rec_block); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, heap); + btr_cur_unmark_extern_fields(page_zip, + rec, index, offsets, mtr); + } + + lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), + rec, block); + + /* If necessary, restore also the correct lock state for a new, + preceding supremum record created in a page split. While the old + record was nonexistent, the supremum might have inherited its locks + from a wrong record. */ + + if (!was_first) { + btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor), + rec, mtr); + } + +return_after_reservations: +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (n_extents > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + *big_rec = big_rec_vec; + + return(err); +} + +/*==================== B-TREE DELETE MARK AND UNMARK ===============*/ + +/****************************************************************//** +Writes the redo log record for delete marking or unmarking of an index +record. */ +UNIV_INLINE +void +btr_cur_del_mark_set_clust_rec_log( +/*===============================*/ + ulint flags, /*!< in: flags */ + rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index of the record */ + ibool val, /*!< in: value to set */ + trx_t* trx, /*!< in: deleting transaction */ + roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + ut_ad(flags < 256); + ut_ad(val <= 1); + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? MLOG_COMP_REC_CLUST_DELETE_MARK + : MLOG_REC_CLUST_DELETE_MARK, + 1 + 1 + DATA_ROLL_PTR_LEN + + 14 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } + + mach_write_to_1(log_ptr, flags); + log_ptr++; + mach_write_to_1(log_ptr, val); + log_ptr++; + + log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, + mtr); + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + mlog_close(mtr, log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/****************************************************************//** +Parses the redo log record for delete marking or unmarking of a clustered +index record. 
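+The body of the log record contains, in order: the flags (1 byte), the
+delete-mark value (1 byte), the system field values (record position,
+trx id, roll ptr) as written by row_upd_write_sys_vals_to_log(), and
+finally the offset of the record on its page (2 bytes); the parsing
+below mirrors this layout.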
+@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_clust_rec( +/*=================================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index) /*!< in: index corresponding to page */ +{ + ulint flags; + ulint val; + ulint pos; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + ulint offset; + rec_t* rec; + + ut_ad(!page + || !!page_is_comp(page) == dict_table_is_comp(index->table)); + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + flags = mach_read_from_1(ptr); + ptr++; + val = mach_read_from_1(ptr); + ptr++; + + ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (page) { + rec = page + offset; + + /* We do not need to reserve btr_search_latch, as the page + is only being recovered, and there cannot be a hash index to + it. */ + + btr_rec_set_deleted_flag(rec, page_zip, val); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + row_upd_rec_sys_fields_in_recovery( + rec, page_zip, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + pos, trx_id, roll_ptr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + } + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. 
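+The steps are performed in this order: the lock system is consulted
+first (lock_clust_rec_modify_check_and_lock()), then the undo log
+record is written, then the delete-mark flag and the system fields are
+set on the record, and only then is the redo log record written.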
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INTERN +ulint +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor */ + ibool val, /*!< in: value to set */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + buf_block_t* block; + roll_ptr_t roll_ptr; + ulint err; + rec_t* rec; + page_zip_des_t* page_zip; + trx_t* trx; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr), index, "del mark "); + rec_print_new(stderr, rec, offsets); + } +#endif /* UNIV_DEBUG */ + + ut_ad(dict_index_is_clust(index)); + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + + err = lock_clust_rec_modify_check_and_lock(flags, + btr_cur_get_block(cursor), + rec, index, offsets, thr); + + if (err != DB_SUCCESS) { + + goto func_exit; + } + + err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr, + index, NULL, NULL, 0, rec, + &roll_ptr); + if (err != DB_SUCCESS) { + + goto func_exit; + } + + block = btr_cur_get_block(cursor); + + if (block->is_hashed) { + rw_lock_x_lock(&btr_search_latch); + } + + page_zip = buf_block_get_page_zip(block); + + btr_rec_set_deleted_flag(rec, page_zip, val); + + trx = thr_get_trx(thr); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_rec_sys_fields(rec, page_zip, + index, offsets, trx, roll_ptr); + } + + if (block->is_hashed) { + rw_lock_x_unlock(&btr_search_latch); + } + + btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx, + roll_ptr, mtr); + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/****************************************************************//** +Writes the redo log record for a delete mark setting of a secondary +index record. */ +UNIV_INLINE +void +btr_cur_del_mark_set_sec_rec_log( +/*=============================*/ + rec_t* rec, /*!< in: record */ + ibool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + ut_ad(val <= 1); + + log_ptr = mlog_open(mtr, 11 + 1 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr); + mach_write_to_1(log_ptr, val); + log_ptr++; + + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + mlog_close(mtr, log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/****************************************************************//** +Parses the redo log record for delete marking or unmarking of a secondary +index record. 
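+The body of the log record contains just the delete-mark value
+(1 byte) followed by the offset of the record on its page (2 bytes),
+which is why the parsing below requires at least 3 bytes of input.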
+@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_sec_rec( +/*===============================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */ +{ + ulint val; + ulint offset; + rec_t* rec; + + if (end_ptr < ptr + 3) { + + return(NULL); + } + + val = mach_read_from_1(ptr); + ptr++; + + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (page) { + rec = page + offset; + + /* We do not need to reserve btr_search_latch, as the page + is only being recovered, and there cannot be a hash index to + it. */ + + btr_rec_set_deleted_flag(rec, page_zip, val); + } + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************//** +Sets a secondary index record delete mark to TRUE or FALSE. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INTERN +ulint +btr_cur_del_mark_set_sec_rec( +/*=========================*/ + ulint flags, /*!< in: locking flag */ + btr_cur_t* cursor, /*!< in: cursor */ + ibool val, /*!< in: value to set */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + rec_t* rec; + ulint err; + + block = btr_cur_get_block(cursor); + rec = btr_cur_get_rec(cursor); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr), cursor->index, + "del mark "); + rec_print(stderr, rec, cursor->index); + } +#endif /* UNIV_DEBUG */ + + err = lock_sec_rec_modify_check_and_lock(flags, + btr_cur_get_block(cursor), + rec, cursor->index, thr, mtr); + if (err != DB_SUCCESS) { + + return(err); + } + + ut_ad(!!page_rec_is_comp(rec) + == dict_table_is_comp(cursor->index->table)); + + if (block->is_hashed) { + rw_lock_x_lock(&btr_search_latch); + } + + btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val); + + if (block->is_hashed) { + rw_lock_x_unlock(&btr_search_latch); + } + + btr_cur_del_mark_set_sec_rec_log(rec, val, mtr); + + return(DB_SUCCESS); +} + +/***********************************************************//** +Clear a secondary index record's delete mark. This function is only +used by the insert buffer insert merge mechanism. */ +UNIV_INTERN +void +btr_cur_del_unmark_for_ibuf( +/*========================*/ + rec_t* rec, /*!< in/out: record to delete unmark */ + page_zip_des_t* page_zip, /*!< in/out: compressed page + corresponding to rec, or NULL + when the tablespace is + uncompressed */ + mtr_t* mtr) /*!< in: mtr */ +{ + /* We do not need to reserve btr_search_latch, as the page has just + been read to the buffer pool and there cannot be a hash index to it. */ + + btr_rec_set_deleted_flag(rec, page_zip, FALSE); + + btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr); +} + +/*==================== B-TREE RECORD REMOVE =========================*/ + +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! 
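+The function is a composition of btr_cur_compress_recommendation(),
+which applies the fill factor heuristic, and btr_compress(), which
+performs the actual merge; btr_cur_pessimistic_delete() below, for
+example, calls it after removing a record.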
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to compress;
+				cursor does not stay valid if compression
+				occurs */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(mtr_memo_contains(mtr,
+				dict_index_get_lock(btr_cur_get_index(cursor)),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
+
+	return(btr_cur_compress_recommendation(cursor, mtr)
+	       && btr_compress(cursor, mtr));
+}
+
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned on a leaf page.
+It is assumed that the mtr has an x-latch on the page where the cursor is
+positioned, but no latch on the whole tree.
+@return TRUE if success, i.e., the page did not become too empty */
+UNIV_INTERN
+ibool
+btr_cur_optimistic_delete(
+/*======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
+				delete; cursor stays valid: if deletion
+				succeeds, on function exit it points to the
+				successor of the deleted record */
+	mtr_t*		mtr)	/*!< in: mtr; if this function returns
+				TRUE on a leaf page of a secondary
+				index, the mtr must be committed
+				before latching any further pages */
+{
+	buf_block_t*	block;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	ibool		no_compress_needed;
+	rec_offs_init(offsets_);
+
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
+	/* This is intended only for leaf page deletions */
+
+	block = btr_cur_get_block(cursor);
+
+	if (srv_pass_corrupt_table && !block) {
+		/* Note: DB_CORRUPTION is an error code returned through
+		the ibool return type; callers only test the return
+		value for being nonzero. */
+		return(DB_CORRUPTION);
+	}
+	ut_a(block);
+
+	ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+	rec = btr_cur_get_rec(cursor);
+	offsets = rec_get_offsets(rec, cursor->index, offsets,
+				  ULINT_UNDEFINED, &heap);
+
+	no_compress_needed = !rec_offs_any_extern(offsets)
+		&& btr_cur_can_delete_without_compress(
+			cursor, rec_offs_size(offsets), mtr);
+
+	if (no_compress_needed) {
+
+		page_t*		page	 = buf_block_get_frame(block);
+		page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
+		ulint		max_ins	 = 0;
+
+		lock_update_delete(block, rec);
+
+		btr_search_update_hash_on_delete(cursor);
+
+		if (!page_zip) {
+			max_ins = page_get_max_insert_size_after_reorganize(
+				page, 1);
+		}
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+				    cursor->index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+		if (dict_index_is_clust(cursor->index)
+		    || dict_index_is_ibuf(cursor->index)
+		    || !page_is_leaf(page)) {
+			/* The insert buffer does not handle
+			inserts to clustered indexes, to
+			non-leaf pages of secondary index B-trees,
+			or to the insert buffer. */
+		} else if (page_zip) {
+			ibuf_update_free_bits_zip(block, mtr);
+		} else {
+			ibuf_update_free_bits_low(block, max_ins, mtr);
+		}
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(no_compress_needed);
+}
+
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page.
To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. +@return TRUE if compression occurred */ +UNIV_INTERN +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + ulint* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + dict_index_t* index; + rec_t* rec; + dtuple_t* node_ptr; + ulint n_extents = 0; + ulint n_reserved; + ibool success; + ibool ret = FALSE; + ulint level; + mem_heap_t* heap; + ulint* offsets; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = btr_cur_get_index(cursor); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + if (!has_reserved_extents) { + /* First reserve enough free space for the file segments + of the index tree, so that the node pointer updates will + not fail because of lack of space */ + + n_extents = cursor->tree_height / 32 + 1; + + success = fsp_reserve_free_extents(&n_reserved, + index->space, + n_extents, + FSP_CLEANING, mtr); + if (!success) { + *err = DB_OUT_OF_FILE_SPACE; + + return(FALSE); + } + } + + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + + if (rec_offs_any_extern(offsets)) { + btr_rec_free_externally_stored_fields(index, + rec, offsets, page_zip, + rb_ctx, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + if (UNIV_UNLIKELY(page_get_n_recs(page) < 2) + && UNIV_UNLIKELY(dict_index_get_page(index) + != buf_block_get_page_no(block))) { + + /* If there is only one record, drop the whole page in + btr_discard_page, if this is not the root page */ + + btr_discard_page(cursor, mtr); + + *err = DB_SUCCESS; + ret = TRUE; + + goto return_after_reservations; + } + + lock_update_delete(block, rec); + level = btr_page_get_level(page, mtr); + + if (level > 0 + && UNIV_UNLIKELY(rec == page_rec_get_next( + page_get_infimum_rec(page)))) { + + rec_t* next_rec = page_rec_get_next(rec); + + if (btr_page_get_prev(page, mtr) == FIL_NULL) { + + /* If we delete the leftmost node pointer on a + non-leaf level, we must mark the new leftmost node + pointer as the predefined minimum record */ + + /* This will make page_zip_validate() fail until + page_cur_delete_rec() completes. This is harmless, + because everything will take place within a single + mini-transaction and because writing to the redo log + is an atomic operation (performed by mtr_commit()). 
*/ + btr_set_min_rec_mark(next_rec, mtr); + } else { + /* Otherwise, if we delete the leftmost node pointer + on a page, we have to change the father node pointer + so that it is equal to the new leftmost node pointer + on the page */ + + btr_node_ptr_delete(index, block, mtr); + + node_ptr = dict_index_build_node_ptr( + index, next_rec, buf_block_get_page_no(block), + heap, level); + + btr_insert_on_non_leaf_level(index, + level + 1, node_ptr, mtr); + } + } + + btr_search_update_hash_on_delete(cursor); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(btr_check_node_ptr(index, block, mtr)); + + *err = DB_SUCCESS; + +return_after_reservations: + mem_heap_free(heap); + + if (ret == FALSE) { + ret = btr_cur_compress_if_useful(cursor, mtr); + } + + if (n_extents > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + return(ret); +} + +/*******************************************************************//** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor positioned on a page */ + ulint height, /*!< in: height of the page in tree; + 0 means leaf node */ + ulint root_height) /*!< in: root node height in tree */ +{ + btr_path_t* slot; + rec_t* rec; + + ut_a(cursor->path_arr); + + if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) { + /* Do nothing; return empty path */ + + slot = cursor->path_arr; + slot->nth_rec = ULINT_UNDEFINED; + + return; + } + + if (height == 0) { + /* Mark end of slots for path */ + slot = cursor->path_arr + root_height + 1; + slot->nth_rec = ULINT_UNDEFINED; + } + + rec = btr_cur_get_rec(cursor); + + slot = cursor->path_arr + (root_height - height); + + slot->nth_rec = page_rec_get_n_recs_before(rec); + slot->n_recs = page_get_n_recs(page_align(rec)); +} + +/*******************************************************************//** +Estimates the number of rows in a given index range. 
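+The estimate is derived from the two search paths that the
+BTR_ESTIMATE flag records in path1 and path2: walking from the root
+towards the leaves, the number of records lying between the two paths
+is accumulated level by level, and once the paths have diverged by
+more than one record the running count is scaled by the record counts
+of the pages on the paths.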
+@return estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple1,	/*!< in: range start, may also be empty tuple */
+	ulint		mode1,	/*!< in: search mode for range start */
+	const dtuple_t*	tuple2,	/*!< in: range end, may also be empty tuple */
+	ulint		mode2)	/*!< in: search mode for range end */
+{
+	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
+	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
+	btr_cur_t	cursor;
+	btr_path_t*	slot1;
+	btr_path_t*	slot2;
+	ibool		diverged;
+	ibool		diverged_lot;
+	ulint		divergence_level;
+	ib_int64_t	n_rows;
+	ulint		i;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	cursor.path_arr = path1;
+
+	if (dtuple_get_n_fields(tuple1) > 0) {
+
+		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
+					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
+					    &cursor, 0,
+					    __FILE__, __LINE__, &mtr);
+	} else {
+		btr_cur_open_at_index_side(TRUE, index,
+					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
+					   &cursor, &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	mtr_start(&mtr);
+
+	cursor.path_arr = path2;
+
+	if (dtuple_get_n_fields(tuple2) > 0) {
+
+		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
+					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
+					    &cursor, 0,
+					    __FILE__, __LINE__, &mtr);
+	} else {
+		btr_cur_open_at_index_side(FALSE, index,
+					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
+					   &cursor, &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	/* We have the path information for the range in path1 and path2 */
+
+	n_rows = 1;
+	diverged = FALSE;	    /* This becomes true when the path is not
+				    the same any more */
+	diverged_lot = FALSE;	    /* This becomes true when the paths are
+				    not the same or adjacent any more */
+	divergence_level = 1000000; /* This is the level where paths diverged
+				    a lot */
+	for (i = 0; ; i++) {
+		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
+
+		slot1 = path1 + i;
+		slot2 = path2 + i;
+
+		if (slot1->nth_rec == ULINT_UNDEFINED
+		    || slot2->nth_rec == ULINT_UNDEFINED) {
+
+			if (i > divergence_level + 1) {
+				/* In trees whose height is > 1 our algorithm
+				tends to underestimate: multiply the estimate
+				by 2: */
+
+				n_rows = n_rows * 2;
+			}
+
+			/* Do not estimate the number of rows in the range
+			to be more than 1 / 2 of the estimated number of
+			rows in the whole table */
+
+			if (n_rows > index->table->stat_n_rows / 2) {
+				n_rows = index->table->stat_n_rows / 2;
+
+				/* If there are just 0 or 1 rows in the table,
+				then we estimate all rows are in the range */
+
+				if (n_rows == 0) {
+					n_rows = index->table->stat_n_rows;
+				}
+			}
+
+			return(n_rows);
+		}
+
+		if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+
+			diverged = TRUE;
+
+			if (slot1->nth_rec < slot2->nth_rec) {
+				n_rows = slot2->nth_rec - slot1->nth_rec;
+
+				if (n_rows > 1) {
+					diverged_lot = TRUE;
+					divergence_level = i;
+				}
+			} else {
+				/* Maybe the tree has changed between
+				searches */
+
+				return(10);
+			}
+
+		} else if (diverged && !diverged_lot) {
+
+			if (slot1->nth_rec < slot1->n_recs
+			    || slot2->nth_rec > 1) {
+
+				diverged_lot = TRUE;
+				divergence_level = i;
+
+				n_rows = 0;
+
+				if (slot1->nth_rec < slot1->n_recs) {
+					n_rows += slot1->n_recs
+						- slot1->nth_rec;
+				}
+
+				if (slot2->nth_rec > 1) {
+					n_rows += slot2->nth_rec - 1;
+				}
+			}
+		} else if (diverged_lot) {
+
+			n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
+				/ 2;
+		}
+	}
+}
+
+/*******************************************************************//**
+Estimates the number of leaf pages that contain non-NULL values in the
+first n_cols columns of the index key.
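+This is a helper for btr_estimate_number_of_different_key_vals() when
+srv_stats_method is SRV_STATS_METHOD_IGNORE_NULLS: the range start is
+searched with a tuple of n_cols NULL fields and PAGE_CUR_G, so the
+estimated range covers exactly the leaf pages that hold non-NULL
+values.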
+@return estimated number of pages */
+UNIV_INTERN
+ulint
+btr_estimate_n_pages_not_null(
+/*=========================*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		n_cols,	/*!< in: number of leading key columns
+				that must be non-NULL */
+	btr_path_t*	path1)	/*!< in/out: array of
+				BTR_PATH_ARRAY_N_SLOTS slots, filled with
+				the path to the first non-NULL record */
+{
+	dtuple_t*	tuple1;
+	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
+	btr_cur_t	cursor;
+	btr_path_t*	slot1;
+	btr_path_t*	slot2;
+	ibool		diverged;
+	ibool		diverged_lot;
+	ulint		divergence_level;
+	ulint		n_pages;
+	ulint		i;
+	mtr_t		mtr;
+	mem_heap_t*	heap;
+
+	heap = mem_heap_create(n_cols * sizeof(dfield_t)
+			       + sizeof(dtuple_t));
+
+	/* Build tuple1 = (NULL, NULL, ...) with n_cols fields */
+	tuple1 = dtuple_create(heap, n_cols);
+	dict_index_copy_types(tuple1, index, n_cols);
+
+	for (i = 0; i < n_cols; i++) {
+		dfield_set_null(dtuple_get_nth_field(tuple1, i));
+	}
+
+	mtr_start(&mtr);
+
+	cursor.path_arr = path1;
+
+	btr_cur_search_to_nth_level(index, 0, tuple1, PAGE_CUR_G,
+				    BTR_SEARCH_LEAF | BTR_ESTIMATE,
+				    &cursor, 0, __FILE__, __LINE__, &mtr);
+
+	mtr_commit(&mtr);
+
+	mtr_start(&mtr);
+
+	cursor.path_arr = path2;
+
+	btr_cur_open_at_index_side(FALSE, index,
+				   BTR_SEARCH_LEAF | BTR_ESTIMATE,
+				   &cursor, &mtr);
+
+	mtr_commit(&mtr);
+
+	mem_heap_free(heap);
+
+	/* We have the path information for the range in path1 and path2 */
+
+	n_pages = 1;
+	diverged = FALSE;	    /* This becomes true when the path is not
+				    the same any more */
+	diverged_lot = FALSE;	    /* This becomes true when the paths are
+				    not the same or adjacent any more */
+	divergence_level = 1000000; /* This is the level where paths diverged
+				    a lot */
+	for (i = 0; ; i++) {
+		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
+
+		slot1 = path1 + i;
+		slot2 = path2 + i;
+
+		if ((slot1 + 1)->nth_rec == ULINT_UNDEFINED
+		    || (slot2 + 1)->nth_rec == ULINT_UNDEFINED) {
+
+			if (i > divergence_level + 1) {
+				/* In trees whose height is > 1 our algorithm
+				tends to underestimate: multiply the estimate
+				by 2: */
+
+				n_pages = n_pages * 2;
+			}
+
+			/* Do not estimate the number of pages in the range
+			to be more than 1 / 2 of the estimated number of
+			leaf pages in the whole index */
+
+			if (n_pages > index->stat_n_leaf_pages / 2) {
+				n_pages = index->stat_n_leaf_pages / 2;
+
+				/* If the index has just 0 or 1 leaf pages,
+				then we estimate all of them are in the
+				range */
+
+				if (n_pages == 0) {
+					n_pages = index->stat_n_leaf_pages;
+				}
+			}
+
+			return(n_pages);
+		}
+
+		if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+
+			diverged = TRUE;
+
+			if (slot1->nth_rec < slot2->nth_rec) {
+				n_pages = slot2->nth_rec - slot1->nth_rec;
+
+				if (n_pages > 1) {
+					diverged_lot = TRUE;
+					divergence_level = i;
+				}
+			} else {
+				/* Maybe the tree has changed between
+				searches */
+
+				return(10);
+			}
+
+		} else if (diverged && !diverged_lot) {
+
+			if (slot1->nth_rec < slot1->n_recs
+			    || slot2->nth_rec > 1) {
+
+				diverged_lot = TRUE;
+				divergence_level = i;
+
+				n_pages = 0;
+
+				if (slot1->nth_rec < slot1->n_recs) {
+					n_pages += slot1->n_recs
+						- slot1->nth_rec;
+				}
+
+				if (slot2->nth_rec > 1) {
+					n_pages += slot2->nth_rec - 1;
+				}
+			}
+		} else if (diverged_lot) {
+
+			n_pages = (n_pages * (slot1->n_recs + slot2->n_recs))
+				/ 2;
+		}
+	}
+}
+
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals.
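+
+Schematically, for each prefix length j the estimate computed below is
+
+	stat_n_diff_key_vals[j] ~ n_diff[j] * effective_pages
+					/ n_sample_pages
+
+where n_diff[j] counts the borders between different j-column prefixes
+seen on the sampled pages; the actual formula additionally accounts
+for pages used for external storage of fields and adds a correction
+term for trees so big that the sample is unlikely to see any borders.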
*/ +UNIV_INTERN +void +btr_estimate_number_of_different_key_vals( +/*======================================*/ + dict_index_t* index) /*!< in: index */ +{ + btr_cur_t cursor; + page_t* page; + rec_t* rec; + ulint n_cols; + ulint matched_fields; + ulint matched_bytes; + ib_int64_t* n_diff; + ullint n_sample_pages; /* number of pages to sample */ + ulint not_empty_flag = 0; + ulint total_external_size = 0; + ulint i; + ulint j; + ullint add_on; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_rec_[REC_OFFS_NORMAL_SIZE]; + ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets_rec = offsets_rec_; + ulint* offsets_next_rec= offsets_next_rec_; + ulint stats_method = srv_stats_method; + btr_path_t first_rec_path[BTR_PATH_ARRAY_N_SLOTS]; + ulint effective_pages; /* effective leaf pages */ + rec_offs_init(offsets_rec_); + rec_offs_init(offsets_next_rec_); + + n_cols = dict_index_get_n_unique(index); + + if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) { + /* estimate effective pages and path for the first effective record */ + /* TODO: make it work also for n_cols > 1. */ + effective_pages = btr_estimate_n_pages_not_null(index, 1 /*k*/, first_rec_path); + + if (!effective_pages) { + dict_index_stat_mutex_enter(index); + for (j = 0; j <= n_cols; j++) { + index->stat_n_diff_key_vals[j] = (ib_int64_t)index->stat_n_leaf_pages; + } + dict_index_stat_mutex_exit(index); + return; + } else if (effective_pages > index->stat_n_leaf_pages) { + effective_pages = index->stat_n_leaf_pages; + } + } else { + effective_pages = index->stat_n_leaf_pages; + } + + n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t)); + + /* It makes no sense to test more pages than are contained + in the index, thus we lower the number if it is too high */ + if (srv_stats_sample_pages > effective_pages) { + if (effective_pages > 0) { + n_sample_pages = effective_pages; + } else { + n_sample_pages = 1; + } + } else { + n_sample_pages = srv_stats_sample_pages; + } + + /* We sample some pages in the index to get an estimate */ + + for (i = 0; i < n_sample_pages; i++) { + rec_t* supremum; + ibool is_first_page = TRUE; + mtr_start(&mtr); + + if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) { + is_first_page = btr_cur_open_at_rnd_pos_after_path(index, BTR_SEARCH_LEAF, + first_rec_path, &cursor, &mtr); + } else { + btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr); + } + + /* Count the number of different key values for each prefix of + the key on this index page. If the prefix does not determine + the index record uniquely in the B-tree, then we subtract one + because otherwise our algorithm would give a wrong estimate + for an index where there is just one key value. */ + + page = btr_cur_get_page(&cursor); + + if (srv_pass_corrupt_table && !page) { + break; + } + ut_a(page); + + supremum = page_get_supremum_rec(page); + if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS && is_first_page) { + /* the cursor should be the first record of the page. */ + /* Counting should be started from here. 
*/
+			rec = btr_cur_get_rec(&cursor);
+		} else {
+			rec = page_rec_get_next(page_get_infimum_rec(page));
+		}
+
+		if (rec != supremum) {
+			not_empty_flag = 1;
+			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+						      ULINT_UNDEFINED, &heap);
+		}
+
+		while (rec != supremum) {
+			rec_t*	next_rec;
+			next_rec = page_rec_get_next(rec);
+			if (next_rec == supremum) {
+				break;
+			}
+
+			matched_fields = 0;
+			matched_bytes = 0;
+			offsets_next_rec = rec_get_offsets(next_rec, index,
+							   offsets_next_rec,
+							   n_cols, &heap);
+
+			cmp_rec_rec_with_match(rec, next_rec,
+					       offsets_rec, offsets_next_rec,
+					       index, &matched_fields,
+					       &matched_bytes,
+					       (stats_method
+						== SRV_STATS_METHOD_NULLS_NOT_EQUAL)
+					       ? SRV_STATS_METHOD_NULLS_NOT_EQUAL
+					       : SRV_STATS_METHOD_NULLS_EQUAL);
+
+			for (j = matched_fields + 1; j <= n_cols; j++) {
+				/* We add one if this index record has
+				a different prefix from the previous */
+
+				n_diff[j]++;
+			}
+
+			total_external_size
+				+= btr_rec_get_externally_stored_len(
+					rec, offsets_rec);
+
+			rec = next_rec;
+			/* Initialize offsets_rec for the next round
+			and assign the old offsets_rec buffer to
+			offsets_next_rec. */
+			{
+				ulint*	offsets_tmp = offsets_rec;
+				offsets_rec = offsets_next_rec;
+				offsets_next_rec = offsets_tmp;
+			}
+		}
+
+		if (n_cols == dict_index_get_n_unique_in_tree(index)) {
+
+			/* If there is more than one leaf page in the tree,
+			we add one because we know that the first record
+			on the page certainly had a different prefix than the
+			last record on the previous index page in the
+			alphabetical order. Before this fix, if there was
+			just one big record on each clustered index page, the
+			algorithm grossly underestimated the number of rows
+			in the table. */
+
+			if (btr_page_get_prev(page, &mtr) != FIL_NULL
+			    || btr_page_get_next(page, &mtr) != FIL_NULL) {
+
+				n_diff[n_cols]++;
+			}
+		}
+
+		offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+					      ULINT_UNDEFINED, &heap);
+		total_external_size += btr_rec_get_externally_stored_len(
+			rec, offsets_rec);
+		mtr_commit(&mtr);
+	}
+
+	/* If we saw k borders between different key values on
+	n_sample_pages leaf pages, we can estimate how many
+	there will be in index->stat_n_leaf_pages */
+
+	/* We must take into account that our sample actually represents
+	also the pages used for external storage of fields (those pages are
+	included in index->stat_n_leaf_pages) */
+
+	dict_index_stat_mutex_enter(index);
+
+	for (j = 0; j <= n_cols; j++) {
+		index->stat_n_diff_key_vals[j]
+			= ((n_diff[j]
+			    * (ib_int64_t)effective_pages
+			    + n_sample_pages - 1
+			    + total_external_size
+			    + not_empty_flag)
+			   / (n_sample_pages
+			      + total_external_size));
+
+		/* If the tree is small, smaller than
+		10 * n_sample_pages + total_external_size, then
+		the above estimate is ok. For bigger trees it is common that we
+		do not see any borders between key values in the few pages
+		we pick. But still there may be n_sample_pages
+		different key values, or even more. Let us try to approximate
+		that: */
+
+		add_on = effective_pages
+			/ (10 * (n_sample_pages
+				 + total_external_size));
+
+		if (add_on > n_sample_pages) {
+			add_on = n_sample_pages;
+		}
+
+		index->stat_n_diff_key_vals[j] += add_on;
+
+		if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) {
+			/* index->stat_n_diff_key_vals[j] is used to
+			calculate rec_per_key, as
+			"stats.records / index->stat_n_diff_key_vals[x]".
+			So it should be scaled to a value that is based
+			on the whole index.
*/ + index->stat_n_diff_key_vals[j] = + index->stat_n_diff_key_vals[j] * (ib_int64_t)index->stat_n_leaf_pages + / (ib_int64_t)effective_pages; + } + } + + dict_index_stat_mutex_exit(index); + + mem_free(n_diff); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/***********************************************************//** +Gets the externally stored size of a record, in units of a database page. +@return externally stored part, in units of a database page */ +static +ulint +btr_rec_get_externally_stored_len( +/*==============================*/ + rec_t* rec, /*!< in: record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + byte* data; + ulint local_len; + ulint extern_len; + ulint total_extern_len = 0; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + data = rec_get_nth_field(rec, offsets, i, &local_len); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + extern_len = mach_read_from_4(data + local_len + + BTR_EXTERN_LEN + 4); + + total_extern_len += ut_calc_align(extern_len, + UNIV_PAGE_SIZE); + } + } + + return(total_extern_len / UNIV_PAGE_SIZE); +} + +/*******************************************************************//** +Sets the ownership bit of an externally stored field in a record. */ +static +void +btr_cur_set_ownership_of_extern_field( +/*==================================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: clustered index record */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint i, /*!< in: field number */ + ibool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + byte* data; + ulint local_len; + ulint byte_val; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN); + + if (val) { + byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG); + } else { + byte_val = byte_val | BTR_EXTERN_OWNER_FLAG; + } + + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr); + } else if (UNIV_LIKELY(mtr != NULL)) { + + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val, + MLOG_1BYTE, mtr); + } else { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + } +} + +/*******************************************************************//** +Marks not updated extern fields as not-owned by this record. The ownership +is transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. 
+@return TRUE if BLOB ownership was transferred */ +UNIV_INTERN +ibool +btr_cur_mark_extern_inherited_fields( +/*=================================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + ulint n; + ulint j; + ulint i; + ibool change_ownership = FALSE; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + + return(FALSE); + } + + n = rec_offs_n_fields(offsets); + + for (i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + /* Check it is not in updated fields */ + + if (update) { + for (j = 0; j < upd_get_n_fields(update); + j++) { + if (upd_get_nth_field(update, j) + ->field_no == i) { + + goto updated; + } + } + } + + btr_cur_set_ownership_of_extern_field( + page_zip, rec, index, offsets, i, FALSE, mtr); + + change_ownership = TRUE; +updated: + ; + } + } + + return(change_ownership); +} + +/*******************************************************************//** +The complement of the previous function: in an update entry may inherit +some externally stored fields from a record. We must mark them as inherited +in entry, so that they are not freed in a rollback. */ +UNIV_INTERN +void +btr_cur_mark_dtuple_inherited_extern( +/*=================================*/ + dtuple_t* entry, /*!< in/out: updated entry to be + inserted to clustered index */ + const upd_t* update) /*!< in: update vector */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + dfield_t* dfield = dtuple_get_nth_field(entry, i); + byte* data; + ulint len; + ulint j; + + if (!dfield_is_ext(dfield)) { + continue; + } + + /* Check if it is in updated fields */ + + for (j = 0; j < upd_get_n_fields(update); j++) { + if (upd_get_nth_field(update, j)->field_no == i) { + + goto is_updated; + } + } + + data = dfield_get_data(dfield); + len = dfield_get_len(dfield); + data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN] + |= BTR_EXTERN_INHERITED_FLAG; + +is_updated: + ; + } +} + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + ulint n; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); + + if (!rec_offs_any_extern(offsets)) { + + return; + } + + for (i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + btr_cur_set_ownership_of_extern_field( + page_zip, rec, index, offsets, i, TRUE, mtr); + } + } +} + +/*******************************************************************//** +Marks all extern fields in a dtuple as owned by the record. 
*/ +UNIV_INTERN +void +btr_cur_unmark_dtuple_extern_fields( +/*================================*/ + dtuple_t* entry) /*!< in/out: clustered index entry */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + dfield_t* dfield = dtuple_get_nth_field(entry, i); + + if (dfield_is_ext(dfield)) { + byte* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN] + &= ~BTR_EXTERN_OWNER_FLAG; + } + } +} + +/*******************************************************************//** +Flags the data tuple fields that are marked as extern storage in the +update vector. We use this function to remember which fields we must +mark as extern storage in a record inserted for an update. +@return number of flagged external columns */ +UNIV_INTERN +ulint +btr_push_update_extern_fields( +/*==========================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const upd_t* update, /*!< in: update vector */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint n_pushed = 0; + ulint n; + const upd_field_t* uf; + + ut_ad(tuple); + ut_ad(update); + + uf = update->fields; + n = upd_get_n_fields(update); + + for (; n--; uf++) { + if (dfield_is_ext(&uf->new_val)) { + dfield_t* field + = dtuple_get_nth_field(tuple, uf->field_no); + + if (!dfield_is_ext(field)) { + dfield_set_ext(field); + n_pushed++; + } + + switch (uf->orig_len) { + byte* data; + ulint len; + byte* buf; + case 0: + break; + case BTR_EXTERN_FIELD_REF_SIZE: + /* Restore the original locally stored + part of the column. In the undo log, + InnoDB writes a longer prefix of externally + stored columns, so that column prefixes + in secondary indexes can be reconstructed. */ + dfield_set_data(field, (byte*) dfield_get_data(field) + + dfield_get_len(field) + - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(field); + break; + default: + /* Reconstruct the original locally + stored part of the column. The data + will have to be copied. */ + ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE); + + data = dfield_get_data(field); + len = dfield_get_len(field); + + buf = mem_heap_alloc(heap, uf->orig_len); + /* Copy the locally stored prefix. */ + memcpy(buf, data, + uf->orig_len + - BTR_EXTERN_FIELD_REF_SIZE); + /* Copy the BLOB pointer. */ + memcpy(buf + uf->orig_len + - BTR_EXTERN_FIELD_REF_SIZE, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + + dfield_set_data(field, buf, uf->orig_len); + dfield_set_ext(field); + } + } + } + + return(n_pushed); +} + +/*******************************************************************//** +Returns the length of a BLOB part stored on the header page. +@return part length */ +static +ulint +btr_blob_get_part_len( +/*==================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN)); +} + +/*******************************************************************//** +Returns the page number where the next BLOB part is stored. +@return page number or FIL_NULL if no more pages */ +static +ulint +btr_blob_get_next_page_no( +/*======================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO)); +} + +/*******************************************************************//** +Deallocate a buffer block that was reserved for a BLOB part. 
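+Note that the function commits the passed mini-transaction before
+acquiring the LRU list and block mutexes, and frees the block only if
+it is still allocated to the same file page after the commit.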
*/
+static
+void
+btr_blob_free(
+/*==========*/
+	buf_block_t*	block,	/*!< in: buffer block */
+	ibool		all,	/*!< in: TRUE=remove also the compressed page
+				if there is one */
+	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
+{
+	ulint	space	= buf_block_get_space(block);
+	ulint	page_no	= buf_block_get_page_no(block);
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	mtr_commit(mtr);
+
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+	mutex_enter(&block->mutex);
+
+	/* Only free the block if it is still allocated to
+	the same file page. */
+
+	if (buf_block_get_state(block)
+	    == BUF_BLOCK_FILE_PAGE
+	    && buf_block_get_space(block) == space
+	    && buf_block_get_page_no(block) == page_no) {
+
+		if (buf_LRU_free_block(&block->page, all, NULL, TRUE)
+		    != BUF_LRU_FREED
+		    && all && block->page.zip.data
+		    /* Now, buf_LRU_free_block() may release
+		    the mutexes temporarily */
+		    && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
+		    && buf_block_get_space(block) == space
+		    && buf_block_get_page_no(block) == page_no) {
+			/* Attempt to deallocate the uncompressed page
+			if the whole block cannot be deallocated. */
+
+			buf_LRU_free_block(&block->page, FALSE, NULL, TRUE);
+		}
+	}
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&LRU_list_mutex);
+	mutex_exit(&block->mutex);
+}
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from the leaf node
+file segment of the index tree.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ulint
+btr_store_big_rec_extern_fields(
+/*============================*/
+	dict_index_t*	index,		/*!< in: index of rec; the index tree
+					MUST be X-latched */
+	buf_block_t*	rec_block,	/*!< in/out: block containing rec */
+	rec_t*		rec,		/*!< in/out: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index);
+					the "external storage" flags in offsets
+					will not correspond to rec when
+					this function returns */
+	big_rec_t*	big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+	mtr_t*		local_mtr __attribute__((unused))) /*!< in: mtr
+					containing the latch to rec and to the
+					tree */
+{
+	ulint	rec_page_no;
+	byte*	field_ref;
+	ulint	extern_len;
+	ulint	store_len;
+	ulint	page_no;
+	ulint	space_id;
+	ulint	zip_size;
+	ulint	prev_page_no;
+	ulint	hint_page_no;
+	ulint	i;
+	mtr_t	mtr;
+	mem_heap_t*	heap = NULL;
+	page_zip_des_t*	page_zip;
+	z_stream	c_stream;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
+	ut_a(dict_index_is_clust(index));
+
+	page_zip = buf_block_get_page_zip(rec_block);
+	ut_a(dict_table_zip_size(index->table)
+	     == buf_block_get_zip_size(rec_block));
+
+	space_id = buf_block_get_space(rec_block);
+	zip_size = buf_block_get_zip_size(rec_block);
+	rec_page_no = buf_block_get_page_no(rec_block);
+	ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		int	err;
+
+		/* Zlib deflate needs 128 kilobytes for the default
+		window size, plus 512 << memLevel, plus a few
+		kilobytes for small objects. We use reduced memLevel
+		to limit the memory consumption, and preallocate the
+		heap, hoping to avoid memory fragmentation.
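+		With the parameters passed to deflateInit2() below
+		(windowBits = 15, memLevel = 7) this is roughly
+		128 KiB for the window plus 512 << 7 = 64 KiB for the
+		hash tables, about 192 KiB in total, which is why a
+		250000-byte heap is preallocated.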
*/ + heap = mem_heap_create(250000); + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + } + + /* We have to create a file segment to the tablespace + for each field and put the pointer to the field in rec */ + + for (i = 0; i < big_rec_vec->n_fields; i++) { + ut_ad(rec_offs_nth_extern(offsets, + big_rec_vec->fields[i].field_no)); + { + ulint local_len; + field_ref = rec_get_nth_field( + rec, offsets, big_rec_vec->fields[i].field_no, + &local_len); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + field_ref += local_len; + } + extern_len = big_rec_vec->fields[i].len; + UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data, + extern_len); + + ut_a(extern_len > 0); + + prev_page_no = FIL_NULL; + + if (UNIV_LIKELY_NULL(page_zip)) { + int err = deflateReset(&c_stream); + ut_a(err == Z_OK); + + c_stream.next_in = (void*) big_rec_vec->fields[i].data; + c_stream.avail_in = extern_len; + } + + for (;;) { + buf_block_t* block; + page_t* page; + + mtr_start(&mtr); + + if (prev_page_no == FIL_NULL) { + hint_page_no = 1 + rec_page_no; + } else { + hint_page_no = prev_page_no + 1; + } + + block = btr_page_alloc(index, hint_page_no, + FSP_NO_DIR, 0, &mtr); + if (UNIV_UNLIKELY(block == NULL)) { + + mtr_commit(&mtr); + + if (UNIV_LIKELY_NULL(page_zip)) { + deflateEnd(&c_stream); + mem_heap_free(heap); + } + + return(DB_OUT_OF_FILE_SPACE); + } + + page_no = buf_block_get_page_no(block); + page = buf_block_get_frame(block); + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block; + page_t* prev_page; + + prev_block = buf_page_get(space_id, zip_size, + prev_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(prev_block, + SYNC_EXTERN_STORAGE); + prev_page = buf_block_get_frame(prev_block); + + if (UNIV_LIKELY_NULL(page_zip)) { + mlog_write_ulint( + prev_page + FIL_PAGE_NEXT, + page_no, MLOG_4BYTES, &mtr); + memcpy(buf_block_get_page_zip( + prev_block) + ->data + FIL_PAGE_NEXT, + prev_page + FIL_PAGE_NEXT, 4); + } else { + mlog_write_ulint( + prev_page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO, + page_no, MLOG_4BYTES, &mtr); + } + + } + + if (UNIV_LIKELY_NULL(page_zip)) { + int err; + page_zip_des_t* blob_page_zip; + + /* Write FIL_PAGE_TYPE to the redo log + separately, before logging any other + changes to the page, so that the debug + assertions in + recv_parse_or_apply_log_rec_body() can + be made simpler. Before InnoDB Plugin + 1.0.4, the initialization of + FIL_PAGE_TYPE was logged as part of + the mlog_log_string() below. */ + + mlog_write_ulint(page + FIL_PAGE_TYPE, + prev_page_no == FIL_NULL + ? FIL_PAGE_TYPE_ZBLOB + : FIL_PAGE_TYPE_ZBLOB2, + MLOG_2BYTES, &mtr); + + c_stream.next_out = page + + FIL_PAGE_DATA; + c_stream.avail_out + = page_zip_get_size(page_zip) + - FIL_PAGE_DATA; + + err = deflate(&c_stream, Z_FINISH); + ut_a(err == Z_OK || err == Z_STREAM_END); + ut_a(err == Z_STREAM_END + || c_stream.avail_out == 0); + + /* Write the "next BLOB page" pointer */ + mlog_write_ulint(page + FIL_PAGE_NEXT, + FIL_NULL, MLOG_4BYTES, &mtr); + /* Initialize the unused "prev page" pointer */ + mlog_write_ulint(page + FIL_PAGE_PREV, + FIL_NULL, MLOG_4BYTES, &mtr); + /* Write a back pointer to the record + into the otherwise unused area. This + information could be useful in + debugging. Later, we might want to + implement the possibility to relocate + BLOB pages. Then, we would need to be + able to adjust the BLOB pointer in the + record. 
We do not store the heap + number of the record, because it can + change in page_zip_reorganize() or + btr_page_reorganize(). However, also + the page number of the record may + change when B-tree nodes are split or + merged. */ + mlog_write_ulint(page + + FIL_PAGE_FILE_FLUSH_LSN, + space_id, + MLOG_4BYTES, &mtr); + mlog_write_ulint(page + + FIL_PAGE_FILE_FLUSH_LSN + 4, + rec_page_no, + MLOG_4BYTES, &mtr); + + /* Zero out the unused part of the page. */ + memset(page + page_zip_get_size(page_zip) + - c_stream.avail_out, + 0, c_stream.avail_out); + mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN, + page_zip_get_size(page_zip) + - FIL_PAGE_FILE_FLUSH_LSN, + &mtr); + /* Copy the page to compressed storage, + because it will be flushed to disk + from there. */ + blob_page_zip = buf_block_get_page_zip(block); + ut_ad(blob_page_zip); + ut_ad(page_zip_get_size(blob_page_zip) + == page_zip_get_size(page_zip)); + memcpy(blob_page_zip->data, page, + page_zip_get_size(page_zip)); + + if (err == Z_OK && prev_page_no != FIL_NULL) { + + goto next_zip_page; + } + + rec_block = buf_page_get(space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(rec_block, + SYNC_NO_ORDER_CHECK); + + if (err == Z_STREAM_END) { + mach_write_to_4(field_ref + + BTR_EXTERN_LEN, 0); + mach_write_to_4(field_ref + + BTR_EXTERN_LEN + 4, + c_stream.total_in); + } else { + memset(field_ref + BTR_EXTERN_LEN, + 0, 8); + } + + if (prev_page_no == FIL_NULL) { + mach_write_to_4(field_ref + + BTR_EXTERN_SPACE_ID, + space_id); + + mach_write_to_4(field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mach_write_to_4(field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_NEXT); + } + + page_zip_write_blob_ptr( + page_zip, rec, index, offsets, + big_rec_vec->fields[i].field_no, &mtr); + +next_zip_page: + prev_page_no = page_no; + + /* Commit mtr and release the + uncompressed page frame to save memory. 
 */
+			btr_blob_free(block, FALSE, &mtr);
+
+			if (err == Z_STREAM_END) {
+				break;
+			}
+		} else {
+			mlog_write_ulint(page + FIL_PAGE_TYPE,
+					 FIL_PAGE_TYPE_BLOB,
+					 MLOG_2BYTES, &mtr);
+
+			if (extern_len > (UNIV_PAGE_SIZE
+					  - FIL_PAGE_DATA
+					  - BTR_BLOB_HDR_SIZE
+					  - FIL_PAGE_DATA_END)) {
+				store_len = UNIV_PAGE_SIZE
+					- FIL_PAGE_DATA
+					- BTR_BLOB_HDR_SIZE
+					- FIL_PAGE_DATA_END;
+			} else {
+				store_len = extern_len;
+			}
+
+			mlog_write_string(page + FIL_PAGE_DATA
+					  + BTR_BLOB_HDR_SIZE,
+					  (const byte*)
+					  big_rec_vec->fields[i].data
+					  + big_rec_vec->fields[i].len
+					  - extern_len,
+					  store_len, &mtr);
+			mlog_write_ulint(page + FIL_PAGE_DATA
+					 + BTR_BLOB_HDR_PART_LEN,
+					 store_len, MLOG_4BYTES, &mtr);
+			mlog_write_ulint(page + FIL_PAGE_DATA
+					 + BTR_BLOB_HDR_NEXT_PAGE_NO,
+					 FIL_NULL, MLOG_4BYTES, &mtr);
+
+			extern_len -= store_len;
+
+			rec_block = buf_page_get(space_id, zip_size,
+						 rec_page_no,
+						 RW_X_LATCH, &mtr);
+			buf_block_dbg_add_level(rec_block,
+						SYNC_NO_ORDER_CHECK);
+
+			mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
+					 MLOG_4BYTES, &mtr);
+			mlog_write_ulint(field_ref
+					 + BTR_EXTERN_LEN + 4,
+					 big_rec_vec->fields[i].len
+					 - extern_len,
+					 MLOG_4BYTES, &mtr);
+
+			if (prev_page_no == FIL_NULL) {
+				mlog_write_ulint(field_ref
+						 + BTR_EXTERN_SPACE_ID,
+						 space_id,
+						 MLOG_4BYTES, &mtr);
+
+				mlog_write_ulint(field_ref
+						 + BTR_EXTERN_PAGE_NO,
+						 page_no,
+						 MLOG_4BYTES, &mtr);
+
+				mlog_write_ulint(field_ref
+						 + BTR_EXTERN_OFFSET,
+						 FIL_PAGE_DATA,
+						 MLOG_4BYTES, &mtr);
+			}
+
+			prev_page_no = page_no;
+
+			mtr_commit(&mtr);
+
+			if (extern_len == 0) {
+				break;
+			}
+		}
+	}
+	}
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		deflateEnd(&c_stream);
+		mem_heap_free(heap);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
+static
+void
+btr_check_blob_fil_page_type(
+/*=========================*/
+	ulint		space_id,	/*!< in: space id */
+	ulint		page_no,	/*!< in: page number */
+	const page_t*	page,		/*!< in: page */
+	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
+{
+	ulint	type = fil_page_get_type(page);
+
+	ut_a(space_id == page_get_space_id(page));
+	ut_a(page_no == page_get_page_no(page));
+
+	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
+		ulint	flags = fil_space_get_flags(space_id);
+
+		if (UNIV_LIKELY
+		    ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
+			/* Old versions of InnoDB did not initialize
+			FIL_PAGE_TYPE on BLOB pages. Do not print
+			anything about the type mismatch when reading
+			a BLOB page that is in Antelope format. */
+			return;
+		}
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: FIL_PAGE_TYPE=%lu"
+			" on BLOB %s space %lu page %lu flags %lx\n",
+			(ulong) type, read ? "read" : "purge",
+			(ulong) space_id, (ulong) page_no, (ulong) flags);
+		ut_error;
+	}
+}
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field.
+In a rollback we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+	dict_index_t*	index,		/*!< in: index of the data, the index
+					tree MUST be X-latched; if the tree
+					height is 1, then also the root page
+					must be X-latched! (this is relevant
+					in the case this function is called
+					from purge where 'data' is located on
+					an undo log page, not an index
+					page) */
+	byte*		field_ref,	/*!< in/out: field reference */
+	const rec_t*	rec,		/*!< in: record containing field_ref, for
+					page_zip_write_blob_ptr(), or NULL */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
+					or NULL */
+	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
+					to rec, or NULL if rec == NULL */
+	ulint		i,		/*!< in: field number of field_ref;
+					ignored if rec == NULL */
+	enum trx_rb_ctx	rb_ctx,		/*!< in: rollback context */
+	mtr_t*		local_mtr __attribute__((unused))) /*!< in: mtr
+					containing the latch to data and an
+					X-latch to the index tree */
+{
+	page_t*		page;
+	ulint		space_id;
+	ulint		rec_zip_size = dict_table_zip_size(index->table);
+	ulint		ext_zip_size;
+	ulint		page_no;
+	ulint		next_page_no;
+	mtr_t		mtr;
+#ifdef UNIV_DEBUG
+	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
+				     MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+
+	if (rec) {
+		ulint	local_len;
+		const byte*	f = rec_get_nth_field(rec, offsets,
+						      i, &local_len);
+		ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+		local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+		f += local_len;
+		ut_ad(f == field_ref);
+	}
+#endif /* UNIV_DEBUG */
+
+	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
+				  BTR_EXTERN_FIELD_REF_SIZE))) {
+		/* In the rollback of uncommitted transactions, we may
+		encounter a clustered index record whose BLOBs have
+		not been written. There is nothing to free then. */
+		ut_a(rb_ctx == RB_RECOVERY || rb_ctx == RB_RECOVERY_PURGE_REC);
+		return;
+	}
+
+	space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
+
+	if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
+		ext_zip_size = fil_space_get_zip_size(space_id);
+		/* This must be an undo log record in the system tablespace,
+		that is, in row_purge_upd_exist_or_extern().
+		Currently, externally stored records are stored in the
+		same tablespace as the referring records. */
+		ut_ad(!page_get_space_id(page_align(field_ref)));
+		ut_ad(!rec);
+		ut_ad(!page_zip);
+	} else {
+		ext_zip_size = rec_zip_size;
+	}
+
+	if (!rec) {
+		/* This is a call from row_purge_upd_exist_or_extern(). */
+		ut_ad(!page_zip);
+		rec_zip_size = 0;
+	}
+
+	for (;;) {
+		buf_block_t*	rec_block;
+		buf_block_t*	ext_block;
+
+		mtr_start(&mtr);
+
+		rec_block = buf_page_get(page_get_space_id(
+						 page_align(field_ref)),
+					 rec_zip_size,
+					 page_get_page_no(
+						 page_align(field_ref)),
+					 RW_X_LATCH, &mtr);
+		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
+		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+		if (/* There is no external storage data */
+		    page_no == FIL_NULL
+		    /* This field does not own the externally stored field */
+		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+			& BTR_EXTERN_OWNER_FLAG)
+		    /* Rollback and inherited field */
+		    || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
+			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+			    & BTR_EXTERN_INHERITED_FLAG))) {
+
+			/* Do not free */
+			mtr_commit(&mtr);
+
+			return;
+		}
+
+		ext_block = buf_page_get(space_id, ext_zip_size, page_no,
+					 RW_X_LATCH, &mtr);
+		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
+		page = buf_block_get_frame(ext_block);
+
+		if (ext_zip_size) {
+			/* Note that page_zip will be NULL
+			in row_purge_upd_exist_or_extern(). */
+			switch (fil_page_get_type(page)) {
+			case FIL_PAGE_TYPE_ZBLOB:
+			case FIL_PAGE_TYPE_ZBLOB2:
+				break;
+			default:
+				ut_error;
+			}
+			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
+
+			btr_page_free_low(index, ext_block, 0, &mtr);
+
+			if (UNIV_LIKELY(page_zip != NULL)) {
+				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
+						next_page_no);
+				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
+						0);
+				page_zip_write_blob_ptr(page_zip, rec, index,
+							offsets, i, &mtr);
+			} else {
+				mlog_write_ulint(field_ref
+						 + BTR_EXTERN_PAGE_NO,
+						 next_page_no,
+						 MLOG_4BYTES, &mtr);
+				mlog_write_ulint(field_ref
+						 + BTR_EXTERN_LEN + 4, 0,
+						 MLOG_4BYTES, &mtr);
+			}
+		} else {
+			ut_a(!page_zip);
+			btr_check_blob_fil_page_type(space_id, page_no, page,
+						     FALSE);
+
+			next_page_no = mach_read_from_4(
+				page + FIL_PAGE_DATA
+				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
+
+			/* We must supply the page level (= 0) as an argument
+			because we did not store it on the page (we save the
+			space overhead from an index page header). */
+
+			btr_page_free_low(index, ext_block, 0, &mtr);
+
+			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
+					 next_page_no,
+					 MLOG_4BYTES, &mtr);
+			/* Zero out the BLOB length. If the server
+			crashes during the execution of this function,
+			trx_rollback_or_clean_all_recovered() could
+			dereference the half-deleted BLOB, fetching a
+			wrong prefix for the BLOB. */
+			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
+					 0,
+					 MLOG_4BYTES, &mtr);
+		}
+
+		/* Commit mtr and release the BLOB block to save memory. */
+		btr_blob_free(ext_block, TRUE, &mtr);
+	}
+}
+
+/***********************************************************//**
+Frees the externally stored fields for a record. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+	dict_index_t*	index,	/*!< in: index of the data, the index
+				tree MUST be X-latched */
+	rec_t*		rec,	/*!< in/out: record */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
+				an X-latch to record page and to the index
+				tree */
+{
+	ulint	n_fields;
+	ulint	i;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+	/* Free possible externally stored fields in the record */
+
+	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
+	n_fields = rec_offs_n_fields(offsets);
+
+	for (i = 0; i < n_fields; i++) {
+		if (rec_offs_nth_extern(offsets, i)) {
+			ulint	len;
+			byte*	data
+				= rec_get_nth_field(rec, offsets, i, &len);
+			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+			btr_free_externally_stored_field(
+				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
+				rec, offsets, page_zip, i, rb_ctx, mtr);
+		}
+	}
+}
+
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector.
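+For example (a sketch of what the loop below does), an update vector that
+touches only externally stored field 3 of the record results in a single
+call (the field number 3 is just an illustration):
+
+	btr_free_externally_stored_field(index,
+		data + len - BTR_EXTERN_FIELD_REF_SIZE,
+		rec, offsets, page_zip, 3, rb_ctx, mtr);
+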
 */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
+				X-latched */
+	rec_t*		rec,	/*!< in/out: record */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update,	/*!< in: update vector */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
+				an X-latch to record page and to the tree */
+{
+	ulint	n_fields;
+	ulint	i;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+
+	/* Free possible externally stored fields in the record */
+
+	n_fields = upd_get_n_fields(update);
+
+	for (i = 0; i < n_fields; i++) {
+		const upd_field_t* ufield = upd_get_nth_field(update, i);
+
+		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
+			ulint	len;
+			byte*	data = rec_get_nth_field(
+				rec, offsets, ufield->field_no, &len);
+			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+			btr_free_externally_stored_field(
+				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
+				rec, offsets, page_zip,
+				ufield->field_no, rb_ctx, mtr);
+		}
+	}
+}
+
+/*******************************************************************//**
+Copies the prefix of an uncompressed BLOB. The clustered index record
+that points to this BLOB must be protected by a lock or a page latch.
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_blob_prefix(
+/*=================*/
+	byte*		buf,	/*!< out: the externally stored part of
+				the field, or a prefix of it */
+	ulint		len,	/*!< in: length of buf, in bytes */
+	ulint		space_id,/*!< in: space id of the BLOB pages */
+	ulint		page_no,/*!< in: page number of the first BLOB page */
+	ulint		offset)	/*!< in: offset on the first BLOB page */
+{
+	ulint	copied_len	= 0;
+
+	for (;;) {
+		mtr_t		mtr;
+		buf_block_t*	block;
+		const page_t*	page;
+		const byte*	blob_header;
+		ulint		part_len;
+		ulint		copy_len;
+
+		mtr_start(&mtr);
+
+		block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
+		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+		page = buf_block_get_frame(block);
+
+		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
+
+		blob_header = page + offset;
+		part_len = btr_blob_get_part_len(blob_header);
+		copy_len = ut_min(part_len, len - copied_len);
+
+		memcpy(buf + copied_len,
+		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
+		copied_len += copy_len;
+
+		page_no = btr_blob_get_next_page_no(blob_header);
+
+		mtr_commit(&mtr);
+
+		if (page_no == FIL_NULL || copy_len != part_len) {
+			UNIV_MEM_ASSERT_RW(buf, copied_len);
+			return(copied_len);
+		}
+
+		/* On all BLOB pages except the first, the BLOB header
+		is at the start of the page data: */
+
+		offset = FIL_PAGE_DATA;
+
+		ut_ad(copied_len <= len);
+	}
+}
+
+/*******************************************************************//**
+Copies the prefix of a compressed BLOB. The clustered index record
+that points to this BLOB must be protected by a lock or a page latch.
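+The zlib stream d_stream must already be initialized by the caller and
+bound to the output buffer; btr_copy_externally_stored_field_prefix_low()
+below does this, in outline:
+
+	page_zip_set_alloc(&d_stream, heap);
+	inflateInit(&d_stream);
+	d_stream.next_out = buf;
+	d_stream.avail_out = len;
+	btr_copy_zblob_prefix(&d_stream, zip_size, space_id, page_no, offset);
+	inflateEnd(&d_stream);
+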
 */
+static
+void
+btr_copy_zblob_prefix(
+/*==================*/
+	z_stream*	d_stream,/*!< in/out: the decompressing stream */
+	ulint		zip_size,/*!< in: compressed BLOB page size */
+	ulint		space_id,/*!< in: space id of the BLOB pages */
+	ulint		page_no,/*!< in: page number of the first BLOB page */
+	ulint		offset)	/*!< in: offset on the first BLOB page */
+{
+	ulint	page_type = FIL_PAGE_TYPE_ZBLOB;
+
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(space_id);
+
+	for (;;) {
+		buf_page_t*	bpage;
+		int		err;
+		ulint		next_page_no;
+
+		/* There is no latch on bpage directly. Instead,
+		bpage is protected by the B-tree page latch that
+		is being held on the clustered index record, or,
+		in row_merge_copy_blobs(), by an exclusive table lock. */
+		bpage = buf_page_get_zip(space_id, zip_size, page_no);
+
+		if (UNIV_UNLIKELY(!bpage)) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Cannot load"
+				" compressed BLOB"
+				" page %lu space %lu\n",
+				(ulong) page_no, (ulong) space_id);
+			return;
+		}
+
+		if (UNIV_UNLIKELY
+		    (fil_page_get_type(bpage->zip.data) != page_type)) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Unexpected type %lu of"
+				" compressed BLOB"
+				" page %lu space %lu\n",
+				(ulong) fil_page_get_type(bpage->zip.data),
+				(ulong) page_no, (ulong) space_id);
+			goto end_of_blob;
+		}
+
+		next_page_no = mach_read_from_4(bpage->zip.data + offset);
+
+		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
+			/* When the BLOB begins at page header,
+			the compressed data payload does not
+			immediately follow the next page pointer. */
+			offset = FIL_PAGE_DATA;
+		} else {
+			offset += 4;
+		}
+
+		d_stream->next_in = bpage->zip.data + offset;
+		d_stream->avail_in = zip_size - offset;
+
+		err = inflate(d_stream, Z_NO_FLUSH);
+		switch (err) {
+		case Z_OK:
+			if (!d_stream->avail_out) {
+				goto end_of_blob;
+			}
+			break;
+		case Z_STREAM_END:
+			if (next_page_no == FIL_NULL) {
+				goto end_of_blob;
+			}
+			/* fall through */
+		default:
+inflate_error:
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: inflate() of"
+				" compressed BLOB"
+				" page %lu space %lu returned %d (%s)\n",
+				(ulong) page_no, (ulong) space_id,
+				err, d_stream->msg);
+		case Z_BUF_ERROR:
+			goto end_of_blob;
+		}
+
+		if (next_page_no == FIL_NULL) {
+			if (!d_stream->avail_in) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: unexpected end of"
+					" compressed BLOB"
+					" page %lu space %lu\n",
+					(ulong) page_no,
+					(ulong) space_id);
+			} else {
+				err = inflate(d_stream, Z_FINISH);
+				switch (err) {
+				case Z_STREAM_END:
+				case Z_BUF_ERROR:
+					break;
+				default:
+					goto inflate_error;
+				}
+			}
+
+end_of_blob:
+			buf_page_release_zip(bpage);
+			return;
+		}
+
+		buf_page_release_zip(bpage);
+
+		/* On all BLOB pages except the first,
+		the BLOB header is at the page header: */
+
+		page_no = next_page_no;
+		offset = FIL_PAGE_NEXT;
+		page_type = FIL_PAGE_TYPE_ZBLOB2;
+	}
+}
+
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record that points to this BLOB must be protected by a
+lock or a page latch.
+@return number of bytes written to buf */ +static +ulint +btr_copy_externally_stored_field_prefix_low( +/*========================================*/ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint space_id,/*!< in: space id of the first BLOB page */ + ulint page_no,/*!< in: page number of the first BLOB page */ + ulint offset) /*!< in: offset on the first BLOB page */ +{ + if (UNIV_UNLIKELY(len == 0)) { + return(0); + } + + if (UNIV_UNLIKELY(zip_size)) { + int err; + z_stream d_stream; + mem_heap_t* heap; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); + + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + + d_stream.next_out = buf; + d_stream.avail_out = len; + d_stream.avail_in = 0; + + btr_copy_zblob_prefix(&d_stream, zip_size, + space_id, page_no, offset); + inflateEnd(&d_stream); + mem_heap_free(heap); + UNIV_MEM_ASSERT_RW(buf, d_stream.total_out); + return(d_stream.total_out); + } else { + return(btr_copy_blob_prefix(buf, len, space_id, + page_no, offset)); + } +} + +/*******************************************************************//** +Copies the prefix of an externally stored field of a record. The +clustered index record must be protected by a lock or a page latch. +@return the length of the copied field, or 0 if the column was being +or has been deleted */ +UNIV_INTERN +ulint +btr_copy_externally_stored_field_prefix( +/*====================================*/ + byte* buf, /*!< out: the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + const byte* data, /*!< in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint local_len)/*!< in: length of data, in bytes */ +{ + ulint space_id; + ulint page_no; + ulint offset; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(local_len >= len)) { + memcpy(buf, data, len); + return(len); + } + + memcpy(buf, data, local_len); + data += local_len; + + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) { + /* The externally stored part of the column has been + (partially) deleted. Signal the half-deleted BLOB + to the caller. */ + + return(0); + } + + space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID); + + page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO); + + offset = mach_read_from_4(data + BTR_EXTERN_OFFSET); + + return(local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + len - local_len, + zip_size, + space_id, page_no, + offset)); +} + +/*******************************************************************//** +Copies an externally stored field of a record to mem heap. The +clustered index record must be protected by a lock or a page latch. 
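+The reference occupies the last BTR_EXTERN_FIELD_REF_SIZE (20) bytes of the
+locally stored part of the column, laid out as read by the function below:
+
+	bytes 0..3	BTR_EXTERN_SPACE_ID	space id of the BLOB pages
+	bytes 4..7	BTR_EXTERN_PAGE_NO	first BLOB page number
+	bytes 8..11	BTR_EXTERN_OFFSET	offset of the BLOB header
+						on that page
+	bytes 12..19	BTR_EXTERN_LEN		length of the extern part;
+						only the low 4 bytes are
+						used, as a BLOB currently
+						cannot exceed 4 GB
+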
+@return the whole field copied to heap */ +static +byte* +btr_copy_externally_stored_field( +/*=============================*/ + ulint* len, /*!< out: length of the whole field */ + const byte* data, /*!< in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint local_len,/*!< in: length of data */ + mem_heap_t* heap) /*!< in: mem heap */ +{ + ulint space_id; + ulint page_no; + ulint offset; + ulint extern_len; + byte* buf; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID); + + page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO); + + offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger than 4 GB; we + leave the 4 upper bytes in the length field unused */ + + extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4); + + buf = mem_heap_alloc(heap, local_len + extern_len); + + memcpy(buf, data, local_len); + *len = local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + extern_len, + zip_size, + space_id, + page_no, offset); + + return(buf); +} + +/*******************************************************************//** +Copies an externally stored field of a record to mem heap. +@return the field copied to heap, or NULL if the field is incomplete */ +UNIV_INTERN +byte* +btr_rec_copy_externally_stored_field( +/*=================================*/ + const rec_t* rec, /*!< in: record in a clustered index; + must be protected by a lock or a page latch */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint no, /*!< in: field number */ + ulint* len, /*!< out: length of the field */ + mem_heap_t* heap) /*!< in: mem heap */ +{ + ulint local_len; + const byte* data; + + ut_a(rec_offs_nth_extern(offsets, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. */ + + data = rec_get_nth_field(rec, offsets, no, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY + (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + recv_recovery_rollback_active() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + return(NULL); + } + + return(btr_copy_externally_stored_field(len, data, + zip_size, local_len, heap)); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/btr/btr0pcur.c b/storage/xtradb/btr/btr0pcur.c new file mode 100644 index 00000000000..537c26f6bf2 --- /dev/null +++ b/storage/xtradb/btr/btr0pcur.c @@ -0,0 +1,606 @@ +/***************************************************************************** + +Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0pcur.c
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0pcur.h"
+
+#ifdef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#include "ut0byte.h"
+#include "rem0cmp.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return own: persistent cursor */
+UNIV_INTERN
+btr_pcur_t*
+btr_pcur_create_for_mysql(void)
+/*============================*/
+{
+	btr_pcur_t*	pcur;
+
+	pcur = mem_alloc(sizeof(btr_pcur_t));
+
+	pcur->btr_cur.index = NULL;
+	btr_pcur_init(pcur);
+
+	return(pcur);
+}
+
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+	btr_pcur_t*	cursor)	/*!< in, own: persistent cursor */
+{
+	if (cursor->old_rec_buf != NULL) {
+
+		mem_free(cursor->old_rec_buf);
+
+		cursor->old_rec_buf = NULL;
+	}
+
+	cursor->btr_cur.page_cur.rec = NULL;
+	cursor->old_rec = NULL;
+	cursor->old_n_fields = 0;
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+	cursor->latch_mode = BTR_NO_LATCHES;
+	cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+	mem_free(cursor);
+}
+
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty!
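+The typical calling pattern, used for example by
+btr_pcur_move_backward_from_page() below, is:
+
+	btr_pcur_store_position(cursor, mtr);
+	mtr_commit(mtr);
+	/* no latches are held at this point */
+	mtr_start(mtr);
+	btr_pcur_restore_position(latch_mode, cursor, mtr);
+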
*/ +UNIV_INTERN +void +btr_pcur_store_position( +/*====================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + buf_block_t* block; + rec_t* rec; + dict_index_t* index; + page_t* page; + ulint offs; + + ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + block = btr_pcur_get_block(cursor); + + if (srv_pass_corrupt_table && !block) { + return; + } + ut_a(block); + + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + page_cursor = btr_pcur_get_page_cur(cursor); + + rec = page_cur_get_rec(page_cursor); + page = page_align(rec); + offs = page_offset(rec); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_a(cursor->latch_mode != BTR_NO_LATCHES); + + if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) { + /* It must be an empty index tree; NOTE that in this case + we do not store the modify_clock, but always do a search + if we restore the cursor position */ + + ut_a(btr_page_get_next(page, mtr) == FIL_NULL); + ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); + + cursor->old_stored = BTR_PCUR_OLD_STORED; + + if (page_rec_is_supremum_low(offs)) { + + cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; + } else { + cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE; + } + + return; + } + + if (page_rec_is_supremum_low(offs)) { + + rec = page_rec_get_prev(rec); + + cursor->rel_pos = BTR_PCUR_AFTER; + + } else if (page_rec_is_infimum_low(offs)) { + + rec = page_rec_get_next(rec); + + cursor->rel_pos = BTR_PCUR_BEFORE; + } else { + cursor->rel_pos = BTR_PCUR_ON; + } + + cursor->old_stored = BTR_PCUR_OLD_STORED; + cursor->old_rec = dict_index_copy_rec_order_prefix( + index, rec, &cursor->old_n_fields, + &cursor->old_rec_buf, &cursor->buf_size); + + cursor->block_when_stored = block; + cursor->modify_clock = buf_block_get_modify_clock(block); +} + +/**************************************************************//** +Copies the stored position of a pcur to another pcur. */ +UNIV_INTERN +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate) /*!< in: pcur from which the info is + copied */ +{ + if (pcur_receive->old_rec_buf) { + mem_free(pcur_receive->old_rec_buf); + } + + ut_memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t)); + + if (pcur_donate->old_rec_buf) { + + pcur_receive->old_rec_buf = mem_alloc(pcur_donate->buf_size); + + ut_memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf, + pcur_donate->buf_size); + pcur_receive->old_rec = pcur_receive->old_rec_buf + + (pcur_donate->old_rec - pcur_donate->old_rec_buf); + } + + pcur_receive->old_n_fields = pcur_donate->old_n_fields; +} + +/**************************************************************//** +Restores the stored position of a persistent cursor bufferfixing the page and +obtaining the specified latches. If the cursor position was saved when the +(1) cursor was positioned on a user record: this function restores the position +to the last record LESS OR EQUAL to the stored record; +(2) cursor was positioned on a page infimum record: restores the position to +the last record LESS than the user record which was the successor of the page +infimum; +(3) cursor was positioned on the page supremum: restores to the first record +GREATER than the user record which was the predecessor of the supremum. 
+(4) cursor was positioned before the first or after the last in an empty tree: +restores to before first or after the last in the tree. +@return TRUE if the cursor position was stored when it was on a user +record and it can be restored on a user record whose ordering fields +are identical to the ones of the original user record */ +UNIV_INTERN +ibool +btr_pcur_restore_position_func( +/*===========================*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in: detached persistent cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + dtuple_t* tuple; + ulint mode; + ulint old_mode; + mem_heap_t* heap; + + ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); + + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + if (UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED) + || UNIV_UNLIKELY(cursor->pos_state != BTR_PCUR_WAS_POSITIONED + && cursor->pos_state != BTR_PCUR_IS_POSITIONED)) { + ut_print_buf(stderr, cursor, sizeof(btr_pcur_t)); + putc('\n', stderr); + if (cursor->trx_if_known) { + trx_print(stderr, cursor->trx_if_known, 0); + } + + ut_error; + } + + if (UNIV_UNLIKELY + (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE + || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) { + + /* In these cases we do not try an optimistic restoration, + but always do a search */ + + btr_cur_open_at_index_side( + cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE, + index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr); + + cursor->block_when_stored = btr_pcur_get_block(cursor); + + return(FALSE); + } + + ut_a(cursor->old_rec); + ut_a(cursor->old_n_fields); + + if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF) + || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) { + /* Try optimistic restoration */ + + if (UNIV_LIKELY(buf_page_optimistic_get( + latch_mode, + cursor->block_when_stored, + cursor->modify_clock, + file, line, mtr))) { + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + buf_block_dbg_add_level(btr_pcur_get_block(cursor), + SYNC_TREE_NODE); + + if (cursor->rel_pos == BTR_PCUR_ON) { +#ifdef UNIV_DEBUG + const rec_t* rec; + const ulint* offsets1; + const ulint* offsets2; +#endif /* UNIV_DEBUG */ + cursor->latch_mode = latch_mode; +#ifdef UNIV_DEBUG + rec = btr_pcur_get_rec(cursor); + + heap = mem_heap_create(256); + offsets1 = rec_get_offsets( + cursor->old_rec, index, NULL, + cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, NULL, + cursor->old_n_fields, &heap); + + ut_ad(!cmp_rec_rec(cursor->old_rec, + rec, offsets1, offsets2, + index)); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ + return(TRUE); + } + + return(FALSE); + } + } + + /* If optimistic restoration did not succeed, open the cursor anew */ + + heap = mem_heap_create(256); + + tuple = dict_index_build_data_tuple(index, cursor->old_rec, + cursor->old_n_fields, heap); + + /* Save the old search mode of the cursor */ + old_mode = cursor->search_mode; + + if (UNIV_LIKELY(cursor->rel_pos == BTR_PCUR_ON)) { + mode = PAGE_CUR_LE; + } else if (cursor->rel_pos == BTR_PCUR_AFTER) { + mode = PAGE_CUR_G; + } else { + ut_ad(cursor->rel_pos == BTR_PCUR_BEFORE); + mode = PAGE_CUR_L; + } + + btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode, + cursor, 0, file, line, mtr); + + /* Restore the old search mode */ + cursor->search_mode = old_mode; + + if (cursor->rel_pos == BTR_PCUR_ON + && btr_pcur_is_on_user_rec(cursor) + && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), 
+			       rec_get_offsets(
+				       btr_pcur_get_rec(cursor), index,
+				       NULL, ULINT_UNDEFINED, &heap))) {
+
+		/* We have to store the NEW value for the modify clock, since
+		the cursor can now be on a different page! But we can retain
+		the value of old_rec */
+
+		cursor->block_when_stored = btr_pcur_get_block(cursor);
+		cursor->modify_clock = buf_block_get_modify_clock(
+			cursor->block_when_stored);
+		cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+		mem_heap_free(heap);
+
+		return(TRUE);
+	}
+
+	mem_heap_free(heap);
+
+	/* We have to store new position information, modify_clock etc.,
+	to the cursor because it can now be on a different page, the record
+	under it may have been removed, etc. */
+
+	btr_pcur_store_position(cursor, mtr);
+
+	return(FALSE);
+}
+
+/**************************************************************//**
+If the latch mode of the cursor is BTR_SEARCH_LEAF or BTR_MODIFY_LEAF,
+releases the page latch and bufferfix reserved by the cursor.
+NOTE! In the case of BTR_MODIFY_LEAF, there should not exist changes
+made by the current mini-transaction to the data protected by the
+cursor latch, as then the latch must not be released until mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_release_leaf(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block;
+
+	ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	block = btr_pcur_get_block(cursor);
+
+	btr_leaf_page_release(block, cursor->latch_mode, mtr);
+
+	cursor->latch_mode = BTR_NO_LATCHES;
+
+	cursor->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page. Releases the
+latch on the current page, and bufferunfixes it. Note that there must not be
+modifications on the current page, as then the x-latch can be released only in
+mtr_commit.
*/ +UNIV_INTERN +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint next_page_no; + ulint space; + ulint zip_size; + page_t* page; + buf_block_t* next_block; + page_t* next_page; + + ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_after_last_on_page(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + page = btr_pcur_get_page(cursor); + next_page_no = btr_page_get_next(page, mtr); + space = buf_block_get_space(btr_pcur_get_block(cursor)); + zip_size = buf_block_get_zip_size(btr_pcur_get_block(cursor)); + + ut_ad(next_page_no != FIL_NULL); + + next_block = btr_block_get(space, zip_size, next_page_no, + cursor->latch_mode, mtr); + next_page = buf_block_get_frame(next_block); + + if (srv_pass_corrupt_table && !next_page) { + btr_leaf_page_release(btr_pcur_get_block(cursor), + cursor->latch_mode, mtr); + btr_pcur_get_page_cur(cursor)->block = 0; + btr_pcur_get_page_cur(cursor)->rec = 0; + return; + } + ut_a(next_page); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == buf_block_get_page_no(btr_pcur_get_block(cursor))); +#endif /* UNIV_BTR_DEBUG */ + next_block->check_index_page_at_flush = TRUE; + + btr_leaf_page_release(btr_pcur_get_block(cursor), + cursor->latch_mode, mtr); + + page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor)); + + page_check_dir(next_page); +} + +/*********************************************************//** +Moves the persistent cursor backward if it is on the first record of the page. +Commits mtr. Note that to prevent a possible deadlock, the operation +first stores the position of the cursor, commits mtr, acquires the necessary +latches and restores the cursor position again before returning. The +alphabetical position of the cursor is guaranteed to be sensible on +return, but it may happen that the cursor is not positioned on the last +record of any page, because the structure of the tree may have changed +during the time when the cursor had no latches. 
*/ +UNIV_INTERN +void +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first + record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint prev_page_no; + ulint space; + page_t* page; + buf_block_t* prev_block; + ulint latch_mode; + ulint latch_mode2; + + ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_before_first_on_page(cursor)); + ut_ad(!btr_pcur_is_before_first_in_tree(cursor, mtr)); + + latch_mode = cursor->latch_mode; + + if (latch_mode == BTR_SEARCH_LEAF) { + + latch_mode2 = BTR_SEARCH_PREV; + + } else if (latch_mode == BTR_MODIFY_LEAF) { + + latch_mode2 = BTR_MODIFY_PREV; + } else { + latch_mode2 = 0; /* To eliminate compiler warning */ + ut_error; + } + + btr_pcur_store_position(cursor, mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + btr_pcur_restore_position(latch_mode2, cursor, mtr); + + page = btr_pcur_get_page(cursor); + + prev_page_no = btr_page_get_prev(page, mtr); + space = buf_block_get_space(btr_pcur_get_block(cursor)); + + if (prev_page_no == FIL_NULL) { + } else if (btr_pcur_is_before_first_on_page(cursor)) { + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(btr_pcur_get_block(cursor), + latch_mode, mtr); + + page_cur_set_after_last(prev_block, + btr_pcur_get_page_cur(cursor)); + } else { + + /* The repositioned cursor did not end on an infimum record on + a page. Cursor repositioning acquired a latch also on the + previous page, but we do not need the latch: release it. */ + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(prev_block, latch_mode, mtr); + } + + cursor->latch_mode = latch_mode; + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +UNIV_INTERN +ibool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + if (btr_pcur_is_before_first_on_page(cursor)) { + + if (btr_pcur_is_before_first_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_backward_from_page(cursor, mtr); + + return(TRUE); + } + + btr_pcur_move_to_prev_on_page(cursor); + + return(TRUE); +} + +/**************************************************************//** +If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first +user record satisfying the search condition, in the case PAGE_CUR_L or +PAGE_CUR_LE, on the last user record. If no such user record exists, then +in the first case sets the cursor after last in tree, and in the latter case +before first in tree. The latching mode must be BTR_SEARCH_LEAF or +BTR_MODIFY_LEAF. */ +UNIV_INTERN +void +btr_pcur_open_on_user_rec_func( +/*===========================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + ulint mode, /*!< in: PAGE_CUR_L, ... 
*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent + cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_pcur_open_func(index, tuple, mode, latch_mode, cursor, + file, line, mtr); + + if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) { + + if (btr_pcur_is_after_last_on_page(cursor)) { + + btr_pcur_move_to_next_user_rec(cursor, mtr); + } + } else { + ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L)); + + /* Not implemented yet */ + + ut_error; + } +} diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.c new file mode 100644 index 00000000000..6628333d32a --- /dev/null +++ b/storage/xtradb/btr/btr0sea.c @@ -0,0 +1,2032 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************//** +@file btr/btr0sea.c +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "btr0sea.h" +#ifdef UNIV_NONINL +#include "btr0sea.ic" +#endif + +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "ha0ha.h" +#include "srv0srv.h" +/** Flag: has the search system been enabled? +Protected by btr_search_latch and btr_search_enabled_mutex. */ +UNIV_INTERN char btr_search_enabled = TRUE; +UNIV_INTERN ibool btr_search_fully_disabled = FALSE; + +/** Mutex protecting btr_search_enabled */ +static mutex_t btr_search_enabled_mutex; + +/** A dummy variable to fool the compiler */ +UNIV_INTERN ulint btr_search_this_is_zero = 0; + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +UNIV_INTERN ulint btr_search_n_succ = 0; +/** Number of failed adaptive hash index lookups */ +UNIV_INTERN ulint btr_search_n_hash_fail = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/** padding to prevent other memory update +hotspots from residing on the same memory +cache line as btr_search_latch */ +UNIV_INTERN byte btr_sea_pad1[64]; + +/** The latch protecting the adaptive search system: this latch protects the +(1) positions of records on those pages where a hash index has been built. 
+NOTE: It does not protect values of non-ordering fields within a record from
+being updated in-place! We can use fact (1) to perform unique searches on
+indexes. */
+
+/* We will allocate the latch from dynamic memory to get it to the
+same DRAM page as other hotspot semaphores */
+UNIV_INTERN rw_lock_t*	btr_search_latch_temp;
+
+/** padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte	btr_sea_pad2[64];
+
+/** The adaptive hash index */
+UNIV_INTERN btr_search_sys_t*	btr_search_sys;
+
+/** If the number of potentially successful hash searches on a page exceeds
+the number of records on the page divided by this parameter, a hash index
+is built on the page, assuming the global limit below has been reached */
+#define BTR_SEARCH_PAGE_BUILD_LIMIT	16
+
+/** The global limit for consecutive potentially successful hash searches,
+before hash index building is started */
+#define BTR_SEARCH_BUILD_LIMIT		100
+
+/********************************************************************//**
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+If index is non-NULL, this function checks if n_fields and n_bytes are
+sensible values, and does not build a hash index if not. */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+	dict_index_t*	index,	/*!< in: index for which to build, or NULL if
+				not known */
+	buf_block_t*	block,	/*!< in: index page, s- or x-latched */
+	ulint		n_fields,/*!< in: hash this many full fields */
+	ulint		n_bytes,/*!< in: hash this many bytes from the next
+				field */
+	ibool		left_side);/*!< in: hash for searches from left side? */
+
+/*****************************************************************//**
+This function should be called before reserving any btr search mutex, if
+the intended operation might add nodes to the search system hash table.
+Because of the latching order, once we have reserved the btr search system
+latch, we cannot allocate a free frame from the buffer pool. Checks that
+there is a free buffer frame allocated for the hash table heap in the btr
+search system. If not, allocates a free frame for the heap. This check makes
+it probable that, when we have reserved the btr search system latch and we
+need to allocate a new node to the hash table, it will succeed. However, the
+check will not guarantee success. */
+static
+void
+btr_search_check_free_space_in_heap(void)
+/*=====================================*/
+{
+	hash_table_t*	table;
+	mem_heap_t*	heap;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	table = btr_search_sys->hash_index;
+
+	heap = table->heap;
+
+	/* Note that we peek the value of heap->free_block without reserving
+	the latch: this is ok, because we will not guarantee that there will
+	be enough free space in the hash table. */
+
+	if (heap->free_block == NULL) {
+		buf_block_t*	block = buf_block_alloc(0);
+
+		rw_lock_x_lock(&btr_search_latch);
+
+		if (heap->free_block == NULL) {
+			heap->free_block = block;
+		} else {
+			buf_block_free(block);
+		}
+
+		rw_lock_x_unlock(&btr_search_latch);
+	}
+}
+
+/*****************************************************************//**
+Creates and initializes the adaptive search system at a database start. */
+UNIV_INTERN
+void
+btr_search_sys_create(
+/*==================*/
+	ulint	hash_size)	/*!< in: hash index hash table size */
+{
+	/* We allocate the search latch from dynamic memory:
+	see above at the global variable definition */
+
+	btr_search_latch_temp = mem_alloc(sizeof(rw_lock_t));
+
+	rw_lock_create(&btr_search_latch, SYNC_SEARCH_SYS);
+	mutex_create(&btr_search_enabled_mutex, SYNC_SEARCH_SYS_CONF);
+
+	btr_search_sys = mem_alloc(sizeof(btr_search_sys_t));
+
+	btr_search_sys->hash_index = ha_create(hash_size, 0, 0);
+}
+
+/*****************************************************************//**
+Frees the adaptive search system at a database shutdown. */
+UNIV_INTERN
+void
+btr_search_sys_free(void)
+/*=====================*/
+{
+	rw_lock_free(&btr_search_latch);
+	mem_free(btr_search_latch_temp);
+	btr_search_latch_temp = NULL;
+	mem_heap_free(btr_search_sys->hash_index->heap);
+	hash_table_free(btr_search_sys->hash_index);
+	mem_free(btr_search_sys);
+	btr_search_sys = NULL;
+}
+
+/********************************************************************//**
+Disable the adaptive hash search system and empty the index. */
+UNIV_INTERN
+void
+btr_search_disable(void)
+/*====================*/
+{
+	mutex_enter(&btr_search_enabled_mutex);
+	rw_lock_x_lock(&btr_search_latch);
+
+	/* Disable access to hash index, and also tell
+	ha_insert_for_fold() to stop adding new nodes to hash index,
+	but still allow updating existing nodes */
+	btr_search_enabled = FALSE;
+
+	/* Clear all block->is_hashed flags and remove all entries
+	from btr_search_sys->hash_index. */
+	buf_pool_drop_hash_index();
+
+	/* The hash index has been cleaned up; disallow any operation
+	on the hash index */
+	btr_search_fully_disabled = TRUE;
+
+	/* btr_search_enabled_mutex should guarantee this. */
+	ut_ad(!btr_search_enabled);
+
+	rw_lock_x_unlock(&btr_search_latch);
+	mutex_exit(&btr_search_enabled_mutex);
+}
+
+/********************************************************************//**
+Enable the adaptive hash search system. */
+UNIV_INTERN
+void
+btr_search_enable(void)
+/*====================*/
+{
+	mutex_enter(&btr_search_enabled_mutex);
+	rw_lock_x_lock(&btr_search_latch);
+
+	btr_search_enabled = TRUE;
+	btr_search_fully_disabled = FALSE;
+
+	rw_lock_x_unlock(&btr_search_latch);
+	mutex_exit(&btr_search_enabled_mutex);
+}
+
+/*****************************************************************//**
+Creates and initializes a search info struct.
+@return own: search info struct */
+UNIV_INTERN
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+	mem_heap_t*	heap)	/*!< in: heap where created */
+{
+	btr_search_t*	info;
+
+	info = mem_heap_alloc(heap, sizeof(btr_search_t));
+
+#ifdef UNIV_DEBUG
+	info->magic_n = BTR_SEARCH_MAGIC_N;
+#endif /* UNIV_DEBUG */
+
+	info->ref_count = 0;
+	info->root_guess = NULL;
+
+	info->hash_analysis = 0;
+	info->n_hash_potential = 0;
+
+	info->last_hash_succ = FALSE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+	info->n_hash_succ = 0;
+	info->n_hash_fail = 0;
+	info->n_patt_succ = 0;
+	info->n_searches = 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+	/* Set some sensible values */
+	info->n_fields = 1;
+	info->n_bytes = 0;
+
+	info->left_side = TRUE;
+
+	return(info);
+}
+
+/*****************************************************************//**
+Returns the value of ref_count. The value is protected by
+btr_search_latch.
+@return ref_count value. */
+UNIV_INTERN
+ulint
+btr_search_info_get_ref_count(
+/*==========================*/
+	btr_search_t*	info)	/*!< in: search info.
*/ +{ + ulint ret; + + ut_ad(info); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_lock(&btr_search_latch); + ret = info->ref_count; + rw_lock_s_unlock(&btr_search_latch); + + return(ret); +} + +/*********************************************************************//** +Updates the search info of an index about hash successes. NOTE that info +is NOT protected by any semaphore, to save CPU time! Do not assume its fields +are consistent. */ +static +void +btr_search_info_update_hash( +/*========================*/ + btr_search_t* info, /*!< in/out: search info */ + btr_cur_t* cursor) /*!< in: cursor which was just positioned */ +{ + dict_index_t* index; + ulint n_unique; + int cmp; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + index = cursor->index; + + if (dict_index_is_ibuf(index)) { + /* So many deletes are performed on an insert buffer tree + that we do not consider a hash index useful on it: */ + + return; + } + + n_unique = dict_index_get_n_unique_in_tree(index); + + if (info->n_hash_potential == 0) { + + goto set_new_recomm; + } + + /* Test if the search would have succeeded using the recommended + hash prefix */ + + if (info->n_fields >= n_unique && cursor->up_match >= n_unique) { +increment_potential: + info->n_hash_potential++; + + return; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->low_match, cursor->low_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto set_new_recomm; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->up_match, cursor->up_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto increment_potential; + } + +set_new_recomm: + /* We have to set a new recommendation; skip the hash analysis + for a while to avoid unnecessary CPU time usage when there is no + chance for success */ + + info->hash_analysis = 0; + + cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes, + cursor->low_match, cursor->low_bytes); + if (cmp == 0) { + info->n_hash_potential = 0; + + /* For extra safety, we set some sensible values here */ + + info->n_fields = 1; + info->n_bytes = 0; + + info->left_side = TRUE; + + } else if (cmp > 0) { + info->n_hash_potential = 1; + + if (cursor->up_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + + } else if (cursor->low_match < cursor->up_match) { + + info->n_fields = cursor->low_match + 1; + info->n_bytes = 0; + } else { + info->n_fields = cursor->low_match; + info->n_bytes = cursor->low_bytes + 1; + } + + info->left_side = TRUE; + } else { + info->n_hash_potential = 1; + + if (cursor->low_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + + } else if (cursor->low_match > cursor->up_match) { + + info->n_fields = cursor->up_match + 1; + info->n_bytes = 0; + } else { + info->n_fields = cursor->up_match; + info->n_bytes = cursor->up_bytes + 1; + } + + info->left_side = FALSE; + } +} + +/*********************************************************************//** +Updates the block search info on hash successes. NOTE that info and +block->n_hash_helps, n_fields, n_bytes, side are NOT protected by any +semaphore, to save CPU time! Do not assume the fields are consistent. 
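+For a worked example: with BTR_SEARCH_PAGE_BUILD_LIMIT == 16 and
+BTR_SEARCH_BUILD_LIMIT == 100, a page holding 160 user records is
+recommended for hashing only once the same search pattern has helped more
+than 160 / 16 = 10 times on this page (block->n_hash_helps) and at least
+100 consecutive searches in the index have looked hashable
+(info->n_hash_potential); see the condition at the end of the function.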
+@return TRUE if building a (new) hash index on the block is recommended */ +static +ibool +btr_search_update_block_hash_info( +/*==============================*/ + btr_search_t* info, /*!< in: search info */ + buf_block_t* block, /*!< in: buffer block */ + btr_cur_t* cursor __attribute__((unused))) + /*!< in: cursor */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); + ut_ad(rw_lock_own(&block->lock, RW_LOCK_SHARED) + || rw_lock_own(&block->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(cursor); + + info->last_hash_succ = FALSE; + + ut_a(buf_block_state_valid(block)); + ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N); + + if ((block->n_hash_helps > 0) + && (info->n_hash_potential > 0) + && (block->n_fields == info->n_fields) + && (block->n_bytes == info->n_bytes) + && (block->left_side == info->left_side)) { + + if ((block->is_hashed) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + + /* The search would presumably have succeeded using + the hash index */ + + info->last_hash_succ = TRUE; + } + + block->n_hash_helps++; + } else { + block->n_hash_helps = 1; + block->n_fields = info->n_fields; + block->n_bytes = info->n_bytes; + block->left_side = info->left_side; + } + +#ifdef UNIV_DEBUG + if (cursor->index->table->does_not_fit_in_memory) { + block->n_hash_helps = 0; + } +#endif /* UNIV_DEBUG */ + + if ((block->n_hash_helps > page_get_n_recs(block->frame) + / BTR_SEARCH_PAGE_BUILD_LIMIT) + && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) { + + if ((!block->is_hashed) + || (block->n_hash_helps + > 2 * page_get_n_recs(block->frame)) + || (block->n_fields != block->curr_n_fields) + || (block->n_bytes != block->curr_n_bytes) + || (block->left_side != block->curr_left_side)) { + + /* Build a new hash index on the page */ + + return(TRUE); + } + } + + return(FALSE); +} + +/*********************************************************************//** +Updates a hash node reference when it has been unsuccessfully used in a +search which could have succeeded with the used hash parameters. This can +happen because when building a hash index for a page, we do not check +what happens at page boundaries, and therefore there can be misleading +hash nodes. Also, collisions in the fold value can lead to misleading +references. This function lazily fixes these imperfections in the hash +index. 
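+For example, after a hash lookup has failed (cursor->flag ==
+BTR_CUR_HASH_FAIL) but the B-tree search positioned the cursor on a user
+record, the code below recomputes the fold value of that record with the
+currently recommended prefix (block->curr_n_fields, block->curr_n_bytes)
+and re-inserts the pair into btr_search_sys->hash_index with
+ha_insert_for_fold().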
*/ +static +void +btr_search_update_hash_ref( +/*=======================*/ + btr_search_t* info, /*!< in: search info */ + buf_block_t* block, /*!< in: buffer block where cursor positioned */ + btr_cur_t* cursor) /*!< in: cursor */ +{ + ulint fold; + rec_t* rec; + dulint index_id; + + ut_ad(cursor->flag == BTR_CUR_HASH_FAIL); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX)); + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(page_align(btr_cur_get_rec(cursor)) + == buf_block_get_frame(block)); + + if (!block->is_hashed) { + + return; + } + + ut_a(block->index == cursor->index); + ut_a(!dict_index_is_ibuf(cursor->index)); + + if ((info->n_hash_potential > 0) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_user_rec(rec)) { + + return; + } + + index_id = cursor->index->id; + fold = rec_fold(rec, + rec_get_offsets(rec, cursor->index, offsets_, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, index_id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ha_insert_for_fold(btr_search_sys->hash_index, fold, + block, rec); + } +} + +/*********************************************************************//** +Updates the search info. */ +UNIV_INTERN +void +btr_search_info_update_slow( +/*========================*/ + btr_search_t* info, /*!< in/out: search info */ + btr_cur_t* cursor) /*!< in: cursor which was just positioned */ +{ + buf_block_t* block; + ibool build_index; + ulint* params; + ulint* params2; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + block = btr_cur_get_block(cursor); + + if (srv_pass_corrupt_table && !block) { + return; + } + ut_a(block); + + /* NOTE that the following two function calls do NOT protect + info or block->n_fields etc. with any semaphore, to save CPU time! + We cannot assume the fields are consistent when we return from + those functions! */ + + btr_search_info_update_hash(info, cursor); + + build_index = btr_search_update_block_hash_info(info, block, cursor); + + if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { + + btr_search_check_free_space_in_heap(); + } + + if (cursor->flag == BTR_CUR_HASH_FAIL) { + /* Update the hash node reference, if appropriate */ + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_hash_fail++; +#endif /* UNIV_SEARCH_PERF_STAT */ + + rw_lock_x_lock(&btr_search_latch); + + btr_search_update_hash_ref(info, block, cursor); + + rw_lock_x_unlock(&btr_search_latch); + } + + if (build_index) { + /* Note that since we did not protect block->n_fields etc. + with any semaphore, the values can be inconsistent. We have + to check inside the function call that they make sense. We + also malloc an array and store the values there to make sure + the compiler does not let the function call parameters change + inside the called function. It might be that the compiler + would optimize the call just to pass pointers to block. 
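+	In effect, the params array acts as a compiler barrier:
+	adding btr_search_this_is_zero (a global that is always
+	zero, see its definition above) to the pointer keeps the
+	compiler from proving that params2 is params and folding
+	the loads back to block->n_fields etc. at the call site.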
*/ + + params = mem_alloc(3 * sizeof(ulint)); + params[0] = block->n_fields; + params[1] = block->n_bytes; + params[2] = block->left_side; + + /* Make sure the compiler cannot deduce the values and do + optimizations */ + + params2 = params + btr_search_this_is_zero; + + btr_search_build_page_hash_index(cursor->index, + block, + params2[0], + params2[1], + params2[2]); + mem_free(params); + } +} + +/******************************************************************//** +Checks if a guessed position for a tree cursor is right. Note that if +mode is PAGE_CUR_LE, which is used in inserts, and the function returns +TRUE, then cursor->up_match and cursor->low_match both have sensible values. +@return TRUE if success */ +static +ibool +btr_search_check_guess( +/*===================*/ + btr_cur_t* cursor, /*!< in: guessed cursor position */ + ibool can_only_compare_to_cursor_rec, + /*!< in: if we do not have a latch on the page + of cursor, but only a latch on + btr_search_latch, then ONLY the columns + of the record UNDER the cursor are + protected, not the next or previous record + in the chain: we cannot look at the next or + previous record to check our guess! */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, + or PAGE_CUR_GE */ + mtr_t* mtr) /*!< in: mtr */ +{ + rec_t* rec; + ulint n_unique; + ulint match; + ulint bytes; + int cmp; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ibool success = FALSE; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique_in_tree(cursor->index); + + rec = btr_cur_get_rec(cursor); + + ut_ad(page_rec_is_user_rec(rec)); + + match = 0; + bytes = 0; + + offsets = rec_get_offsets(rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, rec, + offsets, &match, &bytes); + + if (mode == PAGE_CUR_GE) { + if (cmp == 1) { + goto exit_func; + } + + cursor->up_match = match; + + if (match >= n_unique) { + success = TRUE; + goto exit_func; + } + } else if (mode == PAGE_CUR_LE) { + if (cmp == -1) { + goto exit_func; + } + + cursor->low_match = match; + + } else if (mode == PAGE_CUR_G) { + if (cmp != -1) { + goto exit_func; + } + } else if (mode == PAGE_CUR_L) { + if (cmp != 1) { + goto exit_func; + } + } + + if (can_only_compare_to_cursor_rec) { + /* Since we could not determine if our guess is right just by + looking at the record under the cursor, return FALSE */ + goto exit_func; + } + + match = 0; + bytes = 0; + + if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) { + rec_t* prev_rec; + + ut_ad(!page_rec_is_infimum(rec)); + + prev_rec = page_rec_get_prev(rec); + + if (page_rec_is_infimum(prev_rec)) { + success = btr_page_get_prev(page_align(prev_rec), mtr) + == FIL_NULL; + + goto exit_func; + } + + offsets = rec_get_offsets(prev_rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec, + offsets, &match, &bytes); + if (mode == PAGE_CUR_GE) { + success = cmp == 1; + } else { + success = cmp != -1; + } + + goto exit_func; + } else { + rec_t* next_rec; + + ut_ad(!page_rec_is_supremum(rec)); + + next_rec = page_rec_get_next(rec); + + if (page_rec_is_supremum(next_rec)) { + if (btr_page_get_next(page_align(next_rec), mtr) + == FIL_NULL) { + + cursor->up_match = 0; + success = TRUE; + } + + goto exit_func; + } + + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, + offsets, &match, 
&bytes); + if (mode == PAGE_CUR_LE) { + success = cmp == -1; + cursor->up_match = match; + } else { + success = cmp != 1; + } + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +/******************************************************************//** +Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@return TRUE if succeeded */ +UNIV_INTERN +ibool +btr_search_guess_on_hash( +/*=====================*/ + dict_index_t* index, /*!< in: index */ + btr_search_t* info, /*!< in: index search info */ + const dtuple_t* tuple, /*!< in: logical record */ + ulint mode, /*!< in: PAGE_CUR_L, ... */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ...; + NOTE that only if has_search_latch + is 0, we will have a latch set on + the cursor page, otherwise we assume + the caller uses his search latch + to protect the record! */ + btr_cur_t* cursor, /*!< out: tree cursor */ + ulint has_search_latch,/*!< in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, RW_X_LATCH, or 0 */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + rec_t* rec; + ulint fold; + dulint index_id; +#ifdef notdefined + btr_cur_t cursor2; + btr_pcur_t pcur; +#endif + ut_ad(index && info && tuple && cursor && mtr); + ut_ad((latch_mode == BTR_SEARCH_LEAF) + || (latch_mode == BTR_MODIFY_LEAF)); + + /* Note that, for efficiency, the struct info may not be protected by + any latch here! */ + + if (UNIV_UNLIKELY(info->n_hash_potential == 0)) { + + return(FALSE); + } + + cursor->n_fields = info->n_fields; + cursor->n_bytes = info->n_bytes; + + if (UNIV_UNLIKELY(dtuple_get_n_fields(tuple) + < cursor->n_fields + (cursor->n_bytes > 0))) { + + return(FALSE); + } + + index_id = index->id; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_succ++; +#endif + fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id); + + cursor->fold = fold; + cursor->flag = BTR_CUR_HASH; + + if (UNIV_LIKELY(!has_search_latch)) { + rw_lock_s_lock(&btr_search_latch); + + if (UNIV_UNLIKELY(!btr_search_enabled)) { + goto failure_unlock; + } + } + + ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX); + ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0); + + rec = ha_search_and_get_data(btr_search_sys->hash_index, fold); + + if (UNIV_UNLIKELY(!rec)) { + goto failure_unlock; + } + + block = buf_block_align(rec); + + if (UNIV_LIKELY(!has_search_latch)) { + + if (UNIV_UNLIKELY( + !buf_page_get_known_nowait(latch_mode, block, + BUF_MAKE_YOUNG, + __FILE__, __LINE__, + mtr))) { + goto failure_unlock; + } + + rw_lock_s_unlock(&btr_search_latch); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + } + + if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) { + ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH); + + if (UNIV_LIKELY(!has_search_latch)) { + + btr_leaf_page_release(block, latch_mode, mtr); + } + + goto failure; + } + + ut_ad(page_rec_is_user_rec(rec)); + + btr_cur_position(index, rec, block, cursor); + + /* Check the validity of the guess within the page */ + + /* If we only have the latch on btr_search_latch, not on the + page, it only protects the columns of the record the cursor + is positioned on. We cannot look at the next of the previous + record to determine if our guess for the cursor position is + right. 
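+This is why btr_search_check_guess() below receives has_search_latch as
+its can_only_compare_to_cursor_rec argument: when it is nonzero, the
+guess is verified against the cursor record alone.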
*/ + if (UNIV_EXPECT + (ut_dulint_cmp(index_id, btr_page_get_index_id(block->frame)), 0) + || !btr_search_check_guess(cursor, + has_search_latch, + tuple, mode, mtr)) { + if (UNIV_LIKELY(!has_search_latch)) { + btr_leaf_page_release(block, latch_mode, mtr); + } + + goto failure; + } + + if (UNIV_LIKELY(info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5)) { + + info->n_hash_potential++; + } + +#ifdef notdefined + /* These lines of code can be used in a debug version to check + the correctness of the searched cursor position: */ + + info->last_hash_succ = FALSE; + + /* Currently, does not work if the following fails: */ + ut_ad(!has_search_latch); + + btr_leaf_page_release(block, latch_mode, mtr); + + btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode, + &cursor2, 0, mtr); + if (mode == PAGE_CUR_GE + && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) { + + /* If mode is PAGE_CUR_GE, then the binary search + in the index tree may actually take us to the supremum + of the previous page */ + + info->last_hash_succ = FALSE; + + btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode, + &pcur, mtr); + ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor)); + } else { + ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor)); + } + + /* NOTE that it is theoretically possible that the above assertions + fail if the page of the cursor gets removed from the buffer pool + meanwhile! Thus it might not be a bug. */ +#endif + info->last_hash_succ = TRUE; + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_succ++; +#endif + if (UNIV_LIKELY(!has_search_latch) + && buf_page_peek_if_too_old(&block->page)) { + + buf_page_make_young(&block->page); + } + + /* Increment the page get statistics though we did not really + fix the page: for user info only */ + + buf_pool->stat.n_page_gets++; + + return(TRUE); + + /*-------------------------------------------*/ +failure_unlock: + if (UNIV_LIKELY(!has_search_latch)) { + rw_lock_s_unlock(&btr_search_latch); + } +failure: + cursor->flag = BTR_CUR_HASH_FAIL; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_fail++; + + if (info->n_hash_succ > 0) { + info->n_hash_succ--; + } +#endif + info->last_hash_succ = FALSE; + + return(FALSE); +} + +/********************************************************************//** +Drops a page hash index. 
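+The hash parameters of the block are read under an s-latch on
+btr_search_latch; the fold values are then computed without holding the
+latch, and the function retries from the start if, by the time the
+x-latch is acquired, some other thread has rebuilt the hash index on the
+page with different parameters.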
*/ +UNIV_INTERN +void +btr_search_drop_page_hash_index( +/*============================*/ + buf_block_t* block) /*!< in: block containing index page, + s- or x-latched, or an index page + for which we know that + block->buf_fix_count == 0 */ +{ + hash_table_t* table; + ulint n_fields; + ulint n_bytes; + const page_t* page; + const rec_t* rec; + ulint fold; + ulint prev_fold; + dulint index_id; + ulint n_cached; + ulint n_recs; + ulint* folds; + ulint i; + mem_heap_t* heap; + const dict_index_t* index; + ulint* offsets; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + +retry: + rw_lock_s_lock(&btr_search_latch); + page = block->frame; + + if (UNIV_LIKELY(!block->is_hashed)) { + + rw_lock_s_unlock(&btr_search_latch); + + return; + } + + table = btr_search_sys->hash_index; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX) + || (block->page.buf_fix_count == 0)); +#endif /* UNIV_SYNC_DEBUG */ + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + index = block->index; + ut_a(!dict_index_is_ibuf(index)); + + /* NOTE: The fields of block must not be accessed after + releasing btr_search_latch, as the index page might only + be s-latched! */ + + rw_lock_s_unlock(&btr_search_latch); + + ut_a(n_fields + n_bytes > 0); + + n_recs = page_get_n_recs(page); + + /* Calculate and cache fold values into an array for fast deletion + from the hash index */ + + folds = mem_alloc(n_recs * sizeof(ulint)); + + n_cached = 0; + + rec = page_get_infimum_rec(page); + rec = page_rec_get_next_low(rec, page_is_comp(page)); + + index_id = btr_page_get_index_id(page); + + ut_a(0 == ut_dulint_cmp(index_id, index->id)); + + prev_fold = 0; + + heap = NULL; + offsets = NULL; + + while (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0)); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + + if (fold == prev_fold && prev_fold != 0) { + + goto next_rec; + } + + /* Remove all hash nodes pointing to this page from the + hash chain */ + + folds[n_cached] = fold; + n_cached++; +next_rec: + rec = page_rec_get_next_low(rec, page_rec_is_comp(rec)); + prev_fold = fold; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + rw_lock_x_lock(&btr_search_latch); + + if (UNIV_UNLIKELY(!block->is_hashed)) { + /* Someone else has meanwhile dropped the hash index */ + + goto cleanup; + } + + ut_a(block->index == index); + + if (UNIV_UNLIKELY(block->curr_n_fields != n_fields) + || UNIV_UNLIKELY(block->curr_n_bytes != n_bytes)) { + + /* Someone else has meanwhile built a new hash index on the + page, with different parameters */ + + rw_lock_x_unlock(&btr_search_latch); + + mem_free(folds); + goto retry; + } + + for (i = 0; i < n_cached; i++) { + + ha_remove_all_nodes_to_page(table, folds[i], page); + } + + ut_a(index->search_info->ref_count > 0); + index->search_info->ref_count--; + + block->is_hashed = FALSE; + block->index = NULL; + +cleanup: +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (UNIV_UNLIKELY(block->n_pointers)) { + /* Corruption */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Corruption of adaptive hash index." 
+ " After dropping\n" + "InnoDB: the hash index to a page of %s," + " still %lu hash nodes remain.\n", + index->name, (ulong) block->n_pointers); + rw_lock_x_unlock(&btr_search_latch); + + btr_search_validate(); + } else { + rw_lock_x_unlock(&btr_search_latch); + } +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + rw_lock_x_unlock(&btr_search_latch); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + mem_free(folds); +} + +/************************************************************************ +Drops a page hash index based on index */ +UNIV_INTERN +void +btr_search_drop_page_hash_index_on_index( +/*=====================================*/ + dict_index_t* index) /* in: record descriptor */ +{ + buf_page_t* bpage; + hash_table_t* table; + buf_block_t* block; + ulint n_fields; + ulint n_bytes; + const page_t* page; + const rec_t* rec; + ulint fold; + ulint prev_fold; + dulint index_id; + ulint n_cached; + ulint n_recs; + ulint* folds; + ulint i; + mem_heap_t* heap = NULL; + ulint* offsets; + + rw_lock_x_lock(&btr_search_latch); + mutex_enter(&LRU_list_mutex); + + table = btr_search_sys->hash_index; + + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + + while (bpage != NULL) { + block = (buf_block_t*) bpage; + if (block->index == index && block->is_hashed) { + page = block->frame; + + /* from btr_search_drop_page_hash_index() */ + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + + ut_a(n_fields + n_bytes > 0); + + n_recs = page_get_n_recs(page); + + /* Calculate and cache fold values into an array for fast deletion + from the hash index */ + + folds = mem_alloc(n_recs * sizeof(ulint)); + + n_cached = 0; + + rec = page_get_infimum_rec(page); + rec = page_rec_get_next_low(rec, page_is_comp(page)); + + index_id = btr_page_get_index_id(page); + + ut_a(0 == ut_dulint_cmp(index_id, index->id)); + + prev_fold = 0; + + offsets = NULL; + + while (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0)); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + + if (fold == prev_fold && prev_fold != 0) { + + goto next_rec; + } + + /* Remove all hash nodes pointing to this page from the + hash chain */ + + folds[n_cached] = fold; + n_cached++; +next_rec: + rec = page_rec_get_next_low(rec, page_rec_is_comp(rec)); + prev_fold = fold; + } + + for (i = 0; i < n_cached; i++) { + + ha_remove_all_nodes_to_page(table, folds[i], page); + } + + ut_a(index->search_info->ref_count > 0); + index->search_info->ref_count--; + + block->is_hashed = FALSE; + block->index = NULL; + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (UNIV_UNLIKELY(block->n_pointers)) { + /* Corruption */ + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Corruption of adaptive hash index. After dropping\n" +"InnoDB: the hash index to a page of %s, still %lu hash nodes remain.\n", + index->name, (ulong) block->n_pointers); + } +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + mem_free(folds); + } + + bpage = UT_LIST_GET_PREV(LRU, bpage); + } + + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&btr_search_latch); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/********************************************************************//** +Drops a page hash index when a page is freed from a fseg to the file system. +Drops possible hash index if the page happens to be in the buffer pool. 
*/ +UNIV_INTERN +void +btr_search_drop_page_hash_when_freed( +/*=================================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no) /*!< in: page number */ +{ + buf_block_t* block; + mtr_t mtr; + + if (!buf_page_peek_if_search_hashed(space, page_no)) { + + return; + } + + mtr_start(&mtr); + + /* We assume that if the caller has a latch on the page, then the + caller has already dropped the hash index for the page, and we never + get here. Therefore we can acquire the s-latch to the page without + having to fear a deadlock. */ + + block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL, + BUF_GET_IF_IN_POOL, __FILE__, __LINE__, + &mtr); + /* Because the buffer pool mutex was released by + buf_page_peek_if_search_hashed(), it is possible that the + block was removed from the buffer pool by another thread + before buf_page_get_gen() got a chance to acquire the buffer + pool mutex again. Thus, we must check for a NULL return. */ + + if (UNIV_LIKELY(block != NULL)) { + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + + btr_search_drop_page_hash_index(block); + } + + mtr_commit(&mtr); +} + +/********************************************************************//** +Builds a hash index on a page with the given parameters. If the page already +has a hash index with different parameters, the old hash index is removed. +If index is non-NULL, this function checks if n_fields and n_bytes are +sensible values, and does not build a hash index if not. */ +static +void +btr_search_build_page_hash_index( +/*=============================*/ + dict_index_t* index, /*!< in: index for which to build */ + buf_block_t* block, /*!< in: index page, s- or x-latched */ + ulint n_fields,/*!< in: hash this many full fields */ + ulint n_bytes,/*!< in: hash this many bytes from the next + field */ + ibool left_side)/*!< in: hash for searches from left side? 
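+				(TRUE: the leftmost record of each group
+				of records with the same fold value is
+				hashed; FALSE: the rightmost, as the build
+				loop below shows)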
*/ +{ + hash_table_t* table; + page_t* page; + rec_t* rec; + rec_t* next_rec; + ulint fold; + ulint next_fold; + dulint index_id; + ulint n_cached; + ulint n_recs; + ulint* folds; + rec_t** recs; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(index); + ut_a(!dict_index_is_ibuf(index)); + + table = btr_search_sys->hash_index; + page = buf_block_get_frame(block); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_lock(&btr_search_latch); + + if (block->is_hashed && ((block->curr_n_fields != n_fields) + || (block->curr_n_bytes != n_bytes) + || (block->curr_left_side != left_side))) { + + rw_lock_s_unlock(&btr_search_latch); + + btr_search_drop_page_hash_index(block); + } else { + rw_lock_s_unlock(&btr_search_latch); + } + + n_recs = page_get_n_recs(page); + + if (n_recs == 0) { + + return; + } + + /* Check that the values for hash index build are sensible */ + + if (n_fields + n_bytes == 0) { + + return; + } + + if (dict_index_get_n_unique_in_tree(index) < n_fields + || (dict_index_get_n_unique_in_tree(index) == n_fields + && n_bytes > 0)) { + return; + } + + /* Calculate and cache fold values and corresponding records into + an array for fast insertion to the hash index */ + + folds = mem_alloc(n_recs * sizeof(ulint)); + recs = mem_alloc(n_recs * sizeof(rec_t*)); + + n_cached = 0; + + index_id = btr_page_get_index_id(page); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + + if (!page_rec_is_supremum(rec)) { + ut_a(n_fields <= rec_offs_n_fields(offsets)); + + if (n_bytes > 0) { + ut_a(n_fields < rec_offs_n_fields(offsets)); + } + } + + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + + if (left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + for (;;) { + next_rec = page_rec_get_next(rec); + + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + break; + } + + offsets = rec_get_offsets(next_rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index_id); + + if (fold != next_fold) { + /* Insert an entry into the hash index */ + + if (left_side) { + + folds[n_cached] = next_fold; + recs[n_cached] = next_rec; + n_cached++; + } else { + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + } + + rec = next_rec; + fold = next_fold; + } + + btr_search_check_free_space_in_heap(); + + rw_lock_x_lock(&btr_search_latch); + + if (UNIV_UNLIKELY(btr_search_fully_disabled)) { + goto exit_func; + } + + if (block->is_hashed && ((block->curr_n_fields != n_fields) + || (block->curr_n_bytes != n_bytes) + || (block->curr_left_side != left_side))) { + goto exit_func; + } + + /* This counter is decremented every time we drop page + hash index entries and is incremented here. Since we can + rebuild hash index for a page that is already hashed, we + have to take care not to increment the counter in that + case. 
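+The reference count tells how many buffer pool pages currently hold hash
+entries built on this index; it must drop back to zero before the
+in-memory index object can safely be freed.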
*/
+	if (!block->is_hashed) {
+		index->search_info->ref_count++;
+	}
+
+	block->is_hashed = TRUE;
+	block->n_hash_helps = 0;
+
+	block->curr_n_fields = n_fields;
+	block->curr_n_bytes = n_bytes;
+	block->curr_left_side = left_side;
+	block->index = index;
+
+	for (i = 0; i < n_cached; i++) {
+
+		ha_insert_for_fold(table, folds[i], block, recs[i]);
+	}
+
+exit_func:
+	rw_lock_x_unlock(&btr_search_latch);
+
+	mem_free(folds);
+	mem_free(recs);
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/********************************************************************//**
+Moves or deletes hash entries for moved records. If new_block is already
+hashed, then the hash index for block, if any, is dropped. If new_block is
+not hashed, and block is hashed, then a new hash index is built on
+new_block with the same parameters as block (this often happens when a
+page is split). */
+UNIV_INTERN
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+	buf_block_t*	new_block,	/*!< in: records are copied
+					to this page */
+	buf_block_t*	block,		/*!< in: index page from which
+					records were copied, and the
+					copied records will be deleted
+					from this page */
+	dict_index_t*	index)		/*!< in: record descriptor */
+{
+	ulint	n_fields;
+	ulint	n_bytes;
+	ibool	left_side;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+	ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_a(!new_block->is_hashed || new_block->index == index);
+	ut_a(!block->is_hashed || block->index == index);
+	ut_a(!(new_block->is_hashed || block->is_hashed)
+	     || !dict_index_is_ibuf(index));
+
+	rw_lock_s_lock(&btr_search_latch);
+
+	if (new_block->is_hashed) {
+
+		rw_lock_s_unlock(&btr_search_latch);
+
+		btr_search_drop_page_hash_index(block);
+
+		return;
+	}
+
+	if (block->is_hashed) {
+
+		n_fields = block->curr_n_fields;
+		n_bytes = block->curr_n_bytes;
+		left_side = block->curr_left_side;
+
+		new_block->n_fields = block->curr_n_fields;
+		new_block->n_bytes = block->curr_n_bytes;
+		new_block->left_side = left_side;
+
+		rw_lock_s_unlock(&btr_search_latch);
+
+		ut_a(n_fields + n_bytes > 0);
+
+		btr_search_build_page_hash_index(index, new_block, n_fields,
+						 n_bytes, left_side);
+		ut_ad(n_fields == block->curr_n_fields);
+		ut_ad(n_bytes == block->curr_n_bytes);
+		ut_ad(left_side == block->curr_left_side);
+		return;
+	}
+
+	rw_lock_s_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is deleted from a page.
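+Only the hash node pointing to the record being deleted, if any, is
+removed; ha_search_and_delete_if_found() does nothing if the record was
+never hashed.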
*/
+UNIV_INTERN
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+	btr_cur_t*	cursor)	/*!< in: cursor which was positioned on the
+				record to delete using btr_cur_search_...,
+				the record is not yet deleted */
+{
+	hash_table_t*	table;
+	buf_block_t*	block;
+	rec_t*		rec;
+	ulint		fold;
+	dulint		index_id;
+	ibool		found;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	mem_heap_t*	heap	= NULL;
+	rec_offs_init(offsets_);
+
+	rec = btr_cur_get_rec(cursor);
+
+	block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!block->is_hashed) {
+
+		return;
+	}
+
+	ut_a(block->index == cursor->index);
+	ut_a(block->curr_n_fields + block->curr_n_bytes > 0);
+	ut_a(!dict_index_is_ibuf(cursor->index));
+
+	table = btr_search_sys->hash_index;
+
+	index_id = cursor->index->id;
+	fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, offsets_,
+					     ULINT_UNDEFINED, &heap),
+			block->curr_n_fields, block->curr_n_bytes, index_id);
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	rw_lock_x_lock(&btr_search_latch);
+
+	found = ha_search_and_delete_if_found(table, fold, rec);
+
+	rw_lock_x_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Updates an existing hash node when a single record is inserted on a page:
+if the cursor was positioned using the hash index with the current hash
+parameters, the node can simply be moved to point at the inserted record;
+otherwise the general btr_search_update_hash_on_insert() is used. */
+UNIV_INTERN
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+	btr_cur_t*	cursor)	/*!< in: cursor which was positioned to the
+				place to insert using btr_cur_search_...,
+				and the new record has been inserted next
+				to the cursor */
+{
+	hash_table_t*	table;
+	buf_block_t*	block;
+	rec_t*		rec;
+
+	rec = btr_cur_get_rec(cursor);
+
+	block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!block->is_hashed) {
+
+		return;
+	}
+
+	ut_a(block->index == cursor->index);
+	ut_a(!dict_index_is_ibuf(cursor->index));
+
+	rw_lock_x_lock(&btr_search_latch);
+
+	if ((cursor->flag == BTR_CUR_HASH)
+	    && (cursor->n_fields == block->curr_n_fields)
+	    && (cursor->n_bytes == block->curr_n_bytes)
+	    && !block->curr_left_side) {
+
+		table = btr_search_sys->hash_index;
+
+		ha_search_and_update_if_found(table, cursor->fold, rec,
+					      block, page_rec_get_next(rec));
+
+		rw_lock_x_unlock(&btr_search_latch);
+	} else {
+		rw_lock_x_unlock(&btr_search_latch);
+
+		btr_search_update_hash_on_insert(cursor);
+	}
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+	btr_cur_t*	cursor)	/*!< in: cursor which was positioned to the
+				place to insert using btr_cur_search_...,
+				and the new record has been inserted next
+				to the cursor */
+{
+	hash_table_t*	table;
+	buf_block_t*	block;
+	rec_t*		rec;
+	rec_t*		ins_rec;
+	rec_t*		next_rec;
+	dulint		index_id;
+	ulint		fold;
+	ulint		ins_fold;
+	ulint		next_fold = 0; /* initialized only to silence a
+				compiler warning about use before
+				assignment; next_fold is in fact assigned
+				below before every place where it is read
*/ + ulint n_fields; + ulint n_bytes; + ibool left_side; + ibool locked = FALSE; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + table = btr_search_sys->hash_index; + + btr_search_check_free_space_in_heap(); + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!block->is_hashed) { + + return; + } + + ut_a(block->index == cursor->index); + ut_a(!dict_index_is_ibuf(cursor->index)); + + index_id = cursor->index->id; + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + left_side = block->curr_left_side; + + ins_rec = page_rec_get_next(rec); + next_rec = page_rec_get_next(ins_rec); + + offsets = rec_get_offsets(ins_rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index_id); + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index_id); + } + + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + n_fields + (n_bytes > 0), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + } else { + if (left_side) { + + rw_lock_x_lock(&btr_search_latch); + + locked = TRUE; + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + + goto check_next_rec; + } + + if (fold != ins_fold) { + + if (!locked) { + + rw_lock_x_lock(&btr_search_latch); + + locked = TRUE; + } + + if (!left_side) { + ha_insert_for_fold(table, fold, block, rec); + } else { + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + } + +check_next_rec: + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + + if (!locked) { + rw_lock_x_lock(&btr_search_latch); + + locked = TRUE; + } + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + + goto function_exit; + } + + if (ins_fold != next_fold) { + + if (!locked) { + + rw_lock_x_lock(&btr_search_latch); + + locked = TRUE; + } + + if (!left_side) { + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + /* + fputs("Hash insert for ", stderr); + dict_index_name_print(stderr, cursor->index); + fprintf(stderr, " fold %lu\n", ins_fold); + */ + } else { + ha_insert_for_fold(table, next_fold, block, next_rec); + } + } + +function_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + if (locked) { + rw_lock_x_unlock(&btr_search_latch); + } +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/********************************************************************//** +Validates the search system. +@return TRUE if ok */ +UNIV_INTERN +ibool +btr_search_validate(void) +/*=====================*/ +{ + ha_node_t* node; + ulint n_page_dumps = 0; + ibool ok = TRUE; + ulint i; + ulint cell_count; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + /* How many cells to check before temporarily releasing + btr_search_latch. */ + ulint chunk_size = 10000; + + rec_offs_init(offsets_); + + rw_lock_x_lock(&btr_search_latch); + //buf_pool_mutex_enter(); + rw_lock_x_lock(&page_hash_latch); + + cell_count = hash_get_n_cells(btr_search_sys->hash_index); + + for (i = 0; i < cell_count; i++) { + /* We release btr_search_latch every once in a while to + give other queries a chance to run. 
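+Otherwise both btr_search_latch and page_hash_latch would be held in
+exclusive mode for the whole scan, stalling all searches; chunk_size
+bounds the number of hash cells inspected per latch hold.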
*/
+		if ((i != 0) && ((i % chunk_size) == 0)) {
+			//buf_pool_mutex_exit();
+			rw_lock_x_unlock(&page_hash_latch);
+			rw_lock_x_unlock(&btr_search_latch);
+			os_thread_yield();
+			rw_lock_x_lock(&btr_search_latch);
+			//buf_pool_mutex_enter();
+			rw_lock_x_lock(&page_hash_latch);
+		}
+
+		node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node;
+
+		for (; node != NULL; node = node->next) {
+			const buf_block_t*	block
+				= buf_block_align(node->data);
+			const buf_block_t*	hash_block;
+
+			if (UNIV_LIKELY(buf_block_get_state(block)
+					== BUF_BLOCK_FILE_PAGE)) {
+
+				/* The space and offset are only valid
+				for file blocks. It is possible that
+				the block is being freed
+				(BUF_BLOCK_REMOVE_HASH, see the
+				assertion and the comment below) */
+				hash_block = buf_block_hash_get(
+					buf_block_get_space(block),
+					buf_block_get_page_no(block));
+			} else {
+				hash_block = NULL;
+			}
+
+			if (hash_block) {
+				ut_a(hash_block == block);
+			} else {
+				/* When a block is being freed,
+				buf_LRU_search_and_free_block() first
+				removes the block from
+				buf_pool->page_hash by calling
+				buf_LRU_block_remove_hashed_page().
+				After that, it invokes
+				btr_search_drop_page_hash_index() to
+				remove the block from
+				btr_search_sys->hash_index. */
+
+				ut_a(buf_block_get_state(block)
+				     == BUF_BLOCK_REMOVE_HASH);
+			}
+
+			ut_a(!dict_index_is_ibuf(block->index));
+
+			offsets = rec_get_offsets((const rec_t*) node->data,
+						  block->index, offsets,
+						  block->curr_n_fields
+						  + (block->curr_n_bytes > 0),
+						  &heap);
+
+			if (!block->is_hashed || node->fold
+			    != rec_fold((rec_t*)(node->data),
+					offsets,
+					block->curr_n_fields,
+					block->curr_n_bytes,
+					btr_page_get_index_id(block->frame))) {
+				const page_t*	page = block->frame;
+
+				ok = FALSE;
+				ut_print_timestamp(stderr);
+
+				fprintf(stderr,
+					" InnoDB: Error in an adaptive hash"
+					" index pointer to page %lu\n"
+					"InnoDB: ptr mem address %p"
+					" index id %lu %lu,"
+					" node fold %lu, rec fold %lu\n",
+					(ulong) page_get_page_no(page),
+					node->data,
+					(ulong) ut_dulint_get_high(
+						btr_page_get_index_id(page)),
+					(ulong) ut_dulint_get_low(
+						btr_page_get_index_id(page)),
+					(ulong) node->fold,
+					(ulong) rec_fold((rec_t*)(node->data),
+							 offsets,
+							 block->curr_n_fields,
+							 block->curr_n_bytes,
+							 btr_page_get_index_id(
+								 page)));
+
+				fputs("InnoDB: Record ", stderr);
+				rec_print_new(stderr, (rec_t*)node->data,
+					      offsets);
+				fprintf(stderr, "\nInnoDB: on that page."
+					" Page mem address %p, is hashed %lu,"
+					" n fields %lu, n bytes %lu\n"
+					"InnoDB: side %lu\n",
+					(void*) page, (ulong) block->is_hashed,
+					(ulong) block->curr_n_fields,
+					(ulong) block->curr_n_bytes,
+					(ulong) block->curr_left_side);
+
+				if (n_page_dumps < 20) {
+					buf_page_print(page, 0);
+					n_page_dumps++;
+				}
+			}
+		}
+	}
+
+	for (i = 0; i < cell_count; i += chunk_size) {
+		ulint	end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+		/* We release btr_search_latch every once in a while to
+		give other queries a chance to run. */
+		if (i != 0) {
+			//buf_pool_mutex_exit();
+			rw_lock_x_unlock(&page_hash_latch);
+			rw_lock_x_unlock(&btr_search_latch);
+			os_thread_yield();
+			rw_lock_x_lock(&btr_search_latch);
+			//buf_pool_mutex_enter();
+			rw_lock_x_lock(&page_hash_latch);
+		}
+
+		if (!ha_validate(btr_search_sys->hash_index, i, end_index)) {
+			ok = FALSE;
+		}
+	}
+
+	//buf_pool_mutex_exit();
+	rw_lock_x_unlock(&page_hash_latch);
+	rw_lock_x_unlock(&btr_search_latch);
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(ok);
+}
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */