Diffstat (limited to 'storage/xtradb/btr/btr0cur.cc')
-rw-r--r--	storage/xtradb/btr/btr0cur.cc	6148
1 file changed, 0 insertions(+), 6148 deletions(-)
diff --git a/storage/xtradb/btr/btr0cur.cc b/storage/xtradb/btr/btr0cur.cc
deleted file mode 100644
index ffd7ebc7504..00000000000
--- a/storage/xtradb/btr/btr0cur.cc
+++ /dev/null
@@ -1,6148 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2008, Google Inc.
-Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2015, 2017, MariaDB Corporation.
-
-Portions of this file contain modifications contributed and copyrighted by
-Google, Inc. Those modifications are gratefully acknowledged and are described
-briefly in the InnoDB documentation. The contributions by Google are
-incorporated with their permission, and subject to the conditions contained in
-the file COPYING.Google.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file btr/btr0cur.cc
-The index tree cursor
-
-All changes that row operations make to a B-tree or the records
-there must go through this module! Undo log records are written here
-for every modify or insert of a clustered index record.
-
- NOTE!!!
-To make sure we do not run out of disk space during a pessimistic
-insert or update, we have to reserve 2 x the height of the index tree
-in pages in the tablespace before we start the operation, because
-once leaf splitting has been started, it is difficult to undo, except
-by crashing the database and doing a roll-forward.
-
-Created 10/16/1994 Heikki Tuuri
-*******************************************************/
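-
-/* Editor's note, not part of the original source: a worked instance of
-the rule above. For an index tree of height 4, a pessimistic operation
-reserves 2 x 4 = 8 pages up front, so that every page split on the path
-from the leaf back to the root can complete without running out of
-space. In the code below, btr_cur_pessimistic_insert() actually
-reserves whole extents: cursor->tree_height / 16 + 3 of them. */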
-
-#include "btr0cur.h"
-
-#ifdef UNIV_NONINL
-#include "btr0cur.ic"
-#endif
-
-#include "row0upd.h"
-#ifndef UNIV_HOTBACKUP
-#include "mtr0log.h"
-#include "page0page.h"
-#include "page0zip.h"
-#include "rem0rec.h"
-#include "rem0cmp.h"
-#include "buf0lru.h"
-#include "btr0btr.h"
-#include "btr0sea.h"
-#include "row0log.h"
-#include "row0purge.h"
-#include "row0upd.h"
-#include "trx0rec.h"
-#include "trx0roll.h" /* trx_is_recv() */
-#include "que0que.h"
-#include "row0row.h"
-#include "srv0srv.h"
-#include "ibuf0ibuf.h"
-#include "lock0lock.h"
-#include "zlib.h"
-
-/** Buffered B-tree operation types, introduced as part of delete buffering. */
-enum btr_op_t {
- BTR_NO_OP = 0, /*!< Not buffered */
- BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */
- BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */
- BTR_DELETE_OP, /*!< Purge a delete-marked record */
- BTR_DELMARK_OP /*!< Mark a record for deletion */
-};
-
-#ifdef UNIV_DEBUG
-/** If the following is set to TRUE, this module prints a lot of
-trace information about individual record operations */
-UNIV_INTERN ibool btr_cur_print_record_ops = FALSE;
-#endif /* UNIV_DEBUG */
-
-/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
-UNIV_INTERN ulint btr_cur_n_non_sea = 0;
-/** Number of successful adaptive hash index lookups in
-btr_cur_search_to_nth_level(). */
-UNIV_INTERN ulint btr_cur_n_sea = 0;
-/** Old value of btr_cur_n_non_sea. Copied by
-srv_refresh_innodb_monitor_stats(). Referenced by
-srv_printf_innodb_monitor(). */
-UNIV_INTERN ulint btr_cur_n_non_sea_old = 0;
-/** Old value of btr_cur_n_sea. Copied by
-srv_refresh_innodb_monitor_stats(). Referenced by
-srv_printf_innodb_monitor(). */
-UNIV_INTERN ulint btr_cur_n_sea_old = 0;
-
-#ifdef UNIV_DEBUG
-/* Debug flag: if nonzero, limits the number of records allowed on a
-page before optimistic inserts are made to fail, forcing page splits */
-UNIV_INTERN uint btr_cur_limit_optimistic_insert_debug = 0;
-#endif /* UNIV_DEBUG */
-
-/** In an optimistic insert, if the record does not fit but at least this
-much space could be freed by reorganizing the page, the page is reorganized */
-#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32)
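-
-/* Editor's note, not part of the original source: with the default
-UNIV_PAGE_SIZE of 16384 bytes this limit is 16384 / 32 = 512 bytes,
-i.e. a reorganize is only considered worthwhile if it can make at
-least half a kilobyte available. */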
-
-/** The structure of a BLOB part header */
-/* @{ */
-/*--------------------------------------*/
-#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this
- page */
-#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no,
- FIL_NULL if none */
-/*--------------------------------------*/
-#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
- part header, in bytes */
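-
-/* Editor's sketch, not part of the original source: reading the two
-fields of a BLOB part header with the big-endian accessor
-mach_read_from_4(). The helper name is hypothetical. */
-static void
-btr_blob_hdr_read_example(const byte* blob_hdr)
-{
-	ulint	part_len  = mach_read_from_4(
-		blob_hdr + BTR_BLOB_HDR_PART_LEN);
-	ulint	next_page = mach_read_from_4(
-		blob_hdr + BTR_BLOB_HDR_NEXT_PAGE_NO);
-
-	/* part_len bytes of BLOB data follow the header on this page;
-	next_page is FIL_NULL for the last BLOB part */
-	(void) part_len;
-	(void) next_page;
-}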
-
-/** Estimates table-level stats from a sampled value.
-@param value sampled stats
-@param index index being sampled
-@param sample number of sampled rows
-@param ext_size externally stored data size
-@param not_empty nonzero if the table is not empty
-@return estimated table-wide stats from the sampled value */
-#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\
- (((value) * (ib_int64_t) index->stat_n_leaf_pages \
- + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
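-
-/* Editor's note, not part of the original source: a worked instance of
-the macro above. With value = 40 (say, distinct keys in the sample),
-index->stat_n_leaf_pages = 100, sample = 10, ext_size = 0 and
-not_empty = 1, the estimate is
-(40 * 100 + 10 - 1 + 0 + 1) / (10 + 0) = 4010 / 10 = 401. */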
-
-/* @} */
-#endif /* !UNIV_HOTBACKUP */
-
-/** A BLOB field reference full of zero, for use in assertions and tests.
-Initially, BLOB field references are set to zero, in
-dtuple_convert_big_rec(). */
-const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE] = {
- 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0,
-};
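-
-/* Editor's note, not part of the original source: field_ref_zero is
-typically used in assertions that a 20-byte BLOB reference has (or has
-not yet) been initialized, e.g.
-
-	ut_ad(memcmp(field_ref, field_ref_zero,
-		     BTR_EXTERN_FIELD_REF_SIZE));
-
-where field_ref points at the reference stored inside the record. */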
-
-#ifndef UNIV_HOTBACKUP
-/*******************************************************************//**
-Marks all extern fields in a record as owned by the record. This function
-should be called if the delete mark of a record is removed: a record that
-is not delete-marked always owns all its extern fields. */
-static
-void
-btr_cur_unmark_extern_fields(
-/*=========================*/
- page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
- part will be updated, or NULL */
- rec_t* rec, /*!< in/out: record in a clustered index */
- dict_index_t* index, /*!< in: index of the page */
- const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
- mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
-/*******************************************************************//**
-Adds path information to the cursor for the current page, for which
-the binary search has been performed. */
-static
-void
-btr_cur_add_path_info(
-/*==================*/
- btr_cur_t* cursor, /*!< in: cursor positioned on a page */
- ulint height, /*!< in: height of the page in tree;
- 0 means leaf node */
- ulint root_height); /*!< in: root node height in tree */
-/***********************************************************//**
-Frees the externally stored fields of a record that are mentioned
-in the update vector. */
-static
-void
-btr_rec_free_updated_extern_fields(
-/*===============================*/
- dict_index_t* index, /*!< in: index of rec; the index tree MUST be
- X-latched */
- rec_t* rec, /*!< in: record */
- page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
- part will be updated, or NULL */
- const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
- const upd_t* update, /*!< in: update vector */
- enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
- mtr_t* mtr); /*!< in: mini-transaction handle which contains
- an X-latch to record page and to the tree */
-/***********************************************************//**
-Frees the externally stored fields for a record. */
-static
-void
-btr_rec_free_externally_stored_fields(
-/*==================================*/
- dict_index_t* index, /*!< in: index of the data, the index
- tree MUST be X-latched */
- rec_t* rec, /*!< in: record */
- const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
- page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
- part will be updated, or NULL */
- enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
- mtr_t* mtr); /*!< in: mini-transaction handle which contains
- an X-latch to record page and to the index
- tree */
-#endif /* !UNIV_HOTBACKUP */
-
-/******************************************************//**
-The following function is used to set the deleted bit of a record. */
-UNIV_INLINE
-void
-btr_rec_set_deleted_flag(
-/*=====================*/
- rec_t* rec, /*!< in/out: physical record */
- page_zip_des_t* page_zip,/*!< in/out: compressed page (or NULL) */
- ulint flag) /*!< in: nonzero if delete marked */
-{
- if (page_rec_is_comp(rec)) {
- rec_set_deleted_flag_new(rec, page_zip, flag);
- } else {
- ut_ad(!page_zip);
- rec_set_deleted_flag_old(rec, flag);
- }
-}
-
-#ifndef UNIV_HOTBACKUP
-/*==================== B-TREE SEARCH =========================*/
-
-/********************************************************************//**
-Latches the leaf page or pages requested. */
-static
-void
-btr_cur_latch_leaves(
-/*=================*/
- page_t* page, /*!< in: leaf page where the search
- converged */
- ulint space, /*!< in: space id */
- ulint zip_size, /*!< in: compressed page size in bytes
- or 0 for uncompressed pages */
- ulint page_no, /*!< in: page number of the leaf */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_cur_t* cursor, /*!< in: cursor */
- mtr_t* mtr) /*!< in: mtr */
-{
- ulint mode;
- ulint sibling_mode;
- ulint left_page_no;
- ulint right_page_no;
- buf_block_t* get_block;
-
- ut_ad(page && mtr);
-
- switch (latch_mode) {
- case BTR_SEARCH_LEAF:
- case BTR_MODIFY_LEAF:
- mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
- get_block = btr_block_get(
- space, zip_size, page_no, mode, cursor->index, mtr);
-
- SRV_CORRUPT_TABLE_CHECK(get_block, return;);
-
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
-#endif /* UNIV_BTR_DEBUG */
- get_block->check_index_page_at_flush = TRUE;
- return;
- case BTR_SEARCH_TREE:
- case BTR_MODIFY_TREE:
- if (UNIV_UNLIKELY(latch_mode == BTR_SEARCH_TREE)) {
- mode = RW_S_LATCH;
- sibling_mode = RW_NO_LATCH;
- } else {
- mode = sibling_mode = RW_X_LATCH;
- }
-		/* Fetch and possibly latch the brothers too, from left to right */
- left_page_no = btr_page_get_prev(page, mtr);
-
- if (left_page_no != FIL_NULL) {
- get_block = btr_block_get(
- space, zip_size, left_page_no,
- sibling_mode, cursor->index, mtr);
-
- SRV_CORRUPT_TABLE_CHECK(get_block, return;);
-
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame)
- == page_is_comp(page));
-
-			/* For fake_change mode we avoid detailed validation,
-			as it operates on a tweaked format in which validation
-			may fail. */
- ut_a(sibling_mode == RW_NO_LATCH
- || btr_page_get_next(get_block->frame, mtr)
- == page_get_page_no(page));
-#endif /* UNIV_BTR_DEBUG */
- if (sibling_mode == RW_NO_LATCH) {
- /* btr_block_get() called with RW_NO_LATCH will
- fix the read block in the buffer. This serves
- no purpose for the fake changes prefetching,
- thus we unfix the sibling blocks immediately.*/
- mtr_memo_release(mtr, get_block,
- MTR_MEMO_BUF_FIX);
- } else {
- get_block->check_index_page_at_flush = TRUE;
- }
- }
-
- get_block = btr_block_get(
- space, zip_size, page_no,
- mode, cursor->index, mtr);
-
- SRV_CORRUPT_TABLE_CHECK(get_block, return;);
-
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
-#endif /* UNIV_BTR_DEBUG */
- get_block->check_index_page_at_flush = TRUE;
-
- right_page_no = btr_page_get_next(page, mtr);
-
- if (right_page_no != FIL_NULL) {
- get_block = btr_block_get(
- space, zip_size, right_page_no,
- sibling_mode, cursor->index, mtr);
-
- SRV_CORRUPT_TABLE_CHECK(get_block, return;);
-
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame)
- == page_is_comp(page));
- ut_a(btr_page_get_prev(get_block->frame, mtr)
- == page_get_page_no(page));
-#endif /* UNIV_BTR_DEBUG */
- if (sibling_mode == RW_NO_LATCH) {
- mtr_memo_release(mtr, get_block,
- MTR_MEMO_BUF_FIX);
- } else {
- get_block->check_index_page_at_flush = TRUE;
- }
- }
-
- return;
-
- case BTR_SEARCH_PREV:
- case BTR_MODIFY_PREV:
- mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
- /* latch also left brother */
- left_page_no = btr_page_get_prev(page, mtr);
-
- if (left_page_no != FIL_NULL) {
- get_block = btr_block_get(
- space, zip_size,
- left_page_no, mode, cursor->index, mtr);
- cursor->left_block = get_block;
-
- SRV_CORRUPT_TABLE_CHECK(get_block, return;);
-
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame)
- == page_is_comp(page));
- ut_a(btr_page_get_next(get_block->frame, mtr)
- == page_get_page_no(page));
-#endif /* UNIV_BTR_DEBUG */
- get_block->check_index_page_at_flush = TRUE;
- }
-
- get_block = btr_block_get(
- space, zip_size, page_no, mode, cursor->index, mtr);
-
- SRV_CORRUPT_TABLE_CHECK(get_block, return;);
-
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
-#endif /* UNIV_BTR_DEBUG */
- get_block->check_index_page_at_flush = TRUE;
- return;
- }
-
- ut_error;
-}
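-
-/* Editor's note, not part of the original source: in summary, the
-latches taken by btr_cur_latch_leaves() per mode are:
-BTR_SEARCH_LEAF / BTR_MODIFY_LEAF: s- / x-latch on the leaf only;
-BTR_SEARCH_TREE: s-latch on the leaf, brothers only buffer-fixed and
-immediately released; BTR_MODIFY_TREE: x-latches on the left brother,
-the leaf and the right brother, in that order; BTR_SEARCH_PREV /
-BTR_MODIFY_PREV: s- / x-latches on the left brother and the leaf. */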
-
-/********************************************************************//**
-Searches an index tree and positions a tree cursor on a given level.
-NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
-to node pointer page number fields on the upper levels of the tree!
-Note that if mode is PAGE_CUR_LE, which is used in inserts, then
-cursor->up_match and cursor->low_match both will have sensible values.
-If mode is PAGE_CUR_GE, then up_match will have a sensible value.
-
-If mode is PAGE_CUR_LE, cursor is left at the place where an insert of the
-search tuple should be performed in the B-tree. InnoDB does an insert
-immediately after the cursor. Thus, the cursor may end up on a user record,
-or on a page infimum record. */
-UNIV_INTERN
-dberr_t
-btr_cur_search_to_nth_level(
-/*========================*/
- dict_index_t* index, /*!< in: index */
- ulint level, /*!< in: the tree level of search */
- const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
- tuple must be set so that it cannot get
- compared to the node ptr page number field! */
- ulint mode, /*!< in: PAGE_CUR_L, ...;
- Inserts should always be made using
- PAGE_CUR_LE to search the position! */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
- at most one of BTR_INSERT, BTR_DELETE_MARK,
- BTR_DELETE, or BTR_ESTIMATE;
- cursor->left_block is used to store a pointer
- to the left neighbor page, in the cases
- BTR_SEARCH_PREV and BTR_MODIFY_PREV;
-				NOTE that if has_search_latch
-				is != 0, we may not have a latch set
-				on the cursor page; we assume
-				the caller uses its search latch
-				to protect the record! */
- btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
- s- or x-latched, but see also above! */
- ulint has_search_latch,/*!< in: info on the latch mode the
- caller currently has on btr_search_latch:
- RW_S_LATCH, or 0 */
- const char* file, /*!< in: file name */
- ulint line, /*!< in: line where called */
- mtr_t* mtr) /*!< in: mtr */
-{
- page_t* page;
- buf_block_t* block;
- ulint space;
- buf_block_t* guess;
- ulint height;
- ulint page_no;
- ulint up_match;
- ulint up_bytes;
- ulint low_match;
- ulint low_bytes;
- ulint savepoint;
- ulint rw_latch;
- ulint page_mode;
- ulint buf_mode;
- ulint estimate;
- ulint zip_size;
- page_cur_t* page_cursor;
- btr_op_t btr_op;
- ulint root_height = 0; /* remove warning */
- dberr_t err = DB_SUCCESS;
-
-#ifdef BTR_CUR_ADAPT
- btr_search_t* info;
-#endif
- mem_heap_t* heap = NULL;
- ulint offsets_[REC_OFFS_NORMAL_SIZE];
- ulint* offsets = offsets_;
- rec_offs_init(offsets_);
-	/* Currently, PAGE_CUR_LE is the only search mode used for searches
-	ending at upper levels */
-
- ut_ad(level == 0 || mode == PAGE_CUR_LE);
- ut_ad(dict_index_check_search_tuple(index, tuple));
- ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
- ut_ad(dtuple_check_typed(tuple));
- ut_ad(!(index->type & DICT_FTS));
- ut_ad(index->page != FIL_NULL);
-
- UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
- UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
- UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
- UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
-#ifdef UNIV_DEBUG
- cursor->up_match = ULINT_UNDEFINED;
- cursor->low_match = ULINT_UNDEFINED;
-#endif
-
- ibool s_latch_by_caller;
-
- s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
-
- ut_ad(!s_latch_by_caller
- || mtr_memo_contains(mtr, dict_index_get_lock(index),
- MTR_MEMO_S_LOCK));
-
- /* These flags are mutually exclusive, they are lumped together
- with the latch mode for historical reasons. It's possible for
- none of the flags to be set. */
- switch (UNIV_EXPECT(latch_mode
- & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
- 0)) {
- case 0:
- btr_op = BTR_NO_OP;
- break;
- case BTR_INSERT:
- btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
- ? BTR_INSERT_IGNORE_UNIQUE_OP
- : BTR_INSERT_OP;
- break;
- case BTR_DELETE:
- btr_op = BTR_DELETE_OP;
- ut_a(cursor->purge_node);
- break;
- case BTR_DELETE_MARK:
- btr_op = BTR_DELMARK_OP;
- break;
- default:
- /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
- should be specified at a time */
- ut_error;
- }
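-
-	/* Editor's note, not part of the original source: for example,
-	latch_mode = BTR_MODIFY_LEAF | BTR_INSERT | BTR_IGNORE_SEC_UNIQUE
-	selects btr_op = BTR_INSERT_IGNORE_UNIQUE_OP, while a plain
-	BTR_MODIFY_LEAF leaves btr_op = BTR_NO_OP. */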
-
- /* Operations on the insert buffer tree cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
- /* Operations on the clustered index cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
-
- estimate = latch_mode & BTR_ESTIMATE;
-
- /* Turn the flags unrelated to the latch mode off. */
- latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
-
- ut_ad(!s_latch_by_caller
- || latch_mode == BTR_SEARCH_LEAF
- || latch_mode == BTR_MODIFY_LEAF);
-
- cursor->flag = BTR_CUR_BINARY;
- cursor->index = index;
-
-#ifndef BTR_CUR_ADAPT
- guess = NULL;
-#else
- info = btr_search_get_info(index);
-
- guess = info->root_guess;
-
-#ifdef BTR_CUR_HASH_ADAPT
-
-# ifdef UNIV_SEARCH_PERF_STAT
- info->n_searches++;
-# endif
- if (rw_lock_get_writer(btr_search_get_latch(cursor->index)) ==
- RW_LOCK_NOT_LOCKED
- && latch_mode <= BTR_MODIFY_LEAF
- && info->last_hash_succ
- && !estimate
-# ifdef PAGE_CUR_LE_OR_EXTENDS
- && mode != PAGE_CUR_LE_OR_EXTENDS
-# endif /* PAGE_CUR_LE_OR_EXTENDS */
- /* If !has_search_latch, we do a dirty read of
- btr_search_enabled below, and btr_search_guess_on_hash()
- will have to check it again. */
- && UNIV_LIKELY(btr_search_enabled)
- && btr_search_guess_on_hash(index, info, tuple, mode,
- latch_mode, cursor,
- has_search_latch, mtr)) {
-
- /* Search using the hash index succeeded */
-
- ut_ad(cursor->up_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_GE);
- ut_ad(cursor->up_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_LE);
- ut_ad(cursor->low_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_LE);
- btr_cur_n_sea++;
-
- return err;
- }
-# endif /* BTR_CUR_HASH_ADAPT */
-#endif /* BTR_CUR_ADAPT */
- btr_cur_n_non_sea++;
-
- /* If the hash search did not succeed, do binary search down the
- tree */
-
- if (has_search_latch) {
- /* Release possible search latch to obey latching order */
- rw_lock_s_unlock(btr_search_get_latch(cursor->index));
- }
-
- /* Store the position of the tree latch we push to mtr so that we
- know how to release it when we have latched leaf node(s) */
-
- savepoint = mtr_set_savepoint(mtr);
-
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- mtr_x_lock(dict_index_get_lock(index), mtr);
- break;
- case BTR_CONT_MODIFY_TREE:
- /* Do nothing */
- ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
- MTR_MEMO_X_LOCK));
- break;
- default:
- if (!s_latch_by_caller) {
- mtr_s_lock(dict_index_get_lock(index), mtr);
- }
- }
-
- page_cursor = btr_cur_get_page_cur(cursor);
-
- space = dict_index_get_space(index);
- page_no = dict_index_get_page(index);
-
- up_match = 0;
- up_bytes = 0;
- low_match = 0;
- low_bytes = 0;
-
- height = ULINT_UNDEFINED;
-
- /* We use these modified search modes on non-leaf levels of the
- B-tree. These let us end up in the right B-tree leaf. In that leaf
- we use the original search mode. */
-
- switch (mode) {
- case PAGE_CUR_GE:
- page_mode = PAGE_CUR_L;
- break;
- case PAGE_CUR_G:
- page_mode = PAGE_CUR_LE;
- break;
- default:
-#ifdef PAGE_CUR_LE_OR_EXTENDS
- ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
- || mode == PAGE_CUR_LE_OR_EXTENDS);
-#else /* PAGE_CUR_LE_OR_EXTENDS */
- ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
-#endif /* PAGE_CUR_LE_OR_EXTENDS */
- page_mode = mode;
- break;
- }
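-
-	/* Editor's note, not part of the original source: the reason for
-	this mapping. A node pointer key equals the first key of its child
-	page, so with duplicates the first record >= tuple may live on the
-	page to the left of the child whose node pointer equals tuple.
-	Descending with PAGE_CUR_L (strictly less) for a PAGE_CUR_GE
-	search therefore lands on the leftmost leaf that can contain the
-	first matching record; analogously, PAGE_CUR_G descends with
-	PAGE_CUR_LE. */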
-
- /* Loop and search until we arrive at the desired level */
-
-search_loop:
- buf_mode = BUF_GET;
- rw_latch = RW_NO_LATCH;
-
- if (height != 0) {
- /* We are about to fetch the root or a non-leaf page. */
- } else if (latch_mode <= BTR_MODIFY_LEAF) {
- rw_latch = latch_mode;
-
- if (btr_op != BTR_NO_OP
- && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
-
- /* Try to buffer the operation if the leaf
- page is not in the buffer pool. */
-
- buf_mode = btr_op == BTR_DELETE_OP
- ? BUF_GET_IF_IN_POOL_OR_WATCH
- : BUF_GET_IF_IN_POOL;
- }
- }
-
- zip_size = dict_table_zip_size(index->table);
-
-retry_page_get:
- block = buf_page_get_gen(
- space, zip_size, page_no, rw_latch, guess, buf_mode,
- file, line, mtr, &err);
-
- /* Note that block==NULL signifies either an error or change
- buffering. */
- if (err != DB_SUCCESS) {
- ut_ad(block == NULL);
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name);
- index->table->file_unreadable = true;
- }
-
- goto func_exit;
- }
-
- if (block == NULL) {
- SRV_CORRUPT_TABLE_CHECK(buf_mode == BUF_GET_IF_IN_POOL ||
- buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH,
- {
- page_cursor->block = 0;
- page_cursor->rec = 0;
- if (estimate) {
-
- cursor->path_arr->nth_rec =
- ULINT_UNDEFINED;
- }
-
- goto func_exit;
- });
-
-		/* This must be a search to perform an insert, delete-mark,
-		or delete; try using the insert/delete buffer */
-
- ut_ad(height == 0);
- ut_ad(cursor->thr);
-
- switch (btr_op) {
- case BTR_INSERT_OP:
- case BTR_INSERT_IGNORE_UNIQUE_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
-
- if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
- space, zip_size, page_no,
- cursor->thr)) {
-
- cursor->flag = BTR_CUR_INSERT_TO_IBUF;
-
- goto func_exit;
- }
- break;
-
- case BTR_DELMARK_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
-
- if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
- index, space, zip_size,
- page_no, cursor->thr)) {
-
- cursor->flag = BTR_CUR_DEL_MARK_IBUF;
-
- goto func_exit;
- }
-
- break;
-
- case BTR_DELETE_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
-
- if (!row_purge_poss_sec(cursor->purge_node,
- index, tuple)) {
-
- /* The record cannot be purged yet. */
- cursor->flag = BTR_CUR_DELETE_REF;
- } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
- index, space, zip_size,
- page_no,
- cursor->thr)) {
-
- /* The purge was buffered. */
- cursor->flag = BTR_CUR_DELETE_IBUF;
- } else {
- /* The purge could not be buffered. */
- buf_pool_watch_unset(space, page_no);
- break;
- }
-
- buf_pool_watch_unset(space, page_no);
- goto func_exit;
-
- default:
- ut_error;
- }
-
- /* Insert to the insert/delete buffer did not succeed, we
- must read the page from disk. */
-
- buf_mode = BUF_GET;
-
- goto retry_page_get;
- }
-
- block->check_index_page_at_flush = TRUE;
- page = buf_block_get_frame(block);
-
- SRV_CORRUPT_TABLE_CHECK(page,
- {
- page_cursor->block = 0;
- page_cursor->rec = 0;
-
- if (estimate) {
-
- cursor->path_arr->nth_rec = ULINT_UNDEFINED;
- }
-
- goto func_exit;
- });
-
- if (rw_latch != RW_NO_LATCH) {
-#ifdef UNIV_ZIP_DEBUG
- const page_zip_des_t* page_zip
- = buf_block_get_page_zip(block);
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
-
- buf_block_dbg_add_level(
- block, dict_index_is_ibuf(index)
- ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
- }
-
- ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
- ut_ad(index->id == btr_page_get_index_id(page));
-
- if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
- /* We are in the root node */
-
- height = btr_page_get_level(page, mtr);
- root_height = height;
- cursor->tree_height = root_height + 1;
-
-#ifdef BTR_CUR_ADAPT
- if (block != guess) {
- info->root_guess = block;
- }
-#endif
- }
-
- if (height == 0) {
- if (rw_latch == RW_NO_LATCH) {
-
- btr_cur_latch_leaves(
- page, space, zip_size, page_no, latch_mode,
- cursor, mtr);
- }
-
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- case BTR_CONT_MODIFY_TREE:
- break;
- default:
- if (!s_latch_by_caller) {
- /* Release the tree s-latch */
- mtr_release_s_latch_at_savepoint(
- mtr, savepoint,
- dict_index_get_lock(index));
- }
- }
-
- page_mode = mode;
- }
-
- page_cur_search_with_match(
- block, index, tuple, page_mode, &up_match, &up_bytes,
- &low_match, &low_bytes, page_cursor);
-
- if (estimate) {
- btr_cur_add_path_info(cursor, height, root_height);
- }
-
- /* If this is the desired level, leave the loop */
-
- ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
- mtr));
-
- if (level != height) {
-
- const rec_t* node_ptr;
- ut_ad(height > 0);
-
- height--;
- guess = NULL;
-
- node_ptr = page_cur_get_rec(page_cursor);
-
- offsets = rec_get_offsets(
- node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
-
- /* Go to the child node */
- page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
-
- if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
- /* We're doing a search on an ibuf tree and we're one
- level above the leaf page. */
-
- ut_ad(level == 0);
-
- buf_mode = BUF_GET;
- rw_latch = RW_NO_LATCH;
- goto retry_page_get;
- }
-
- goto search_loop;
- }
-
- if (level != 0) {
- /* x-latch the page */
- buf_block_t* child_block = btr_block_get(
- space, zip_size, page_no, RW_X_LATCH, index, mtr);
-
- page = buf_block_get_frame(child_block);
- btr_assert_not_corrupted(child_block, index);
- } else {
- cursor->low_match = low_match;
- cursor->low_bytes = low_bytes;
- cursor->up_match = up_match;
- cursor->up_bytes = up_bytes;
-
-#ifdef BTR_CUR_ADAPT
- /* We do a dirty read of btr_search_enabled here. We
- will properly check btr_search_enabled again in
- btr_search_build_page_hash_index() before building a
- page hash index, while holding btr_search_latch. */
- if (btr_search_enabled) {
- btr_search_info_update(index, cursor);
- }
-#endif
- ut_ad(cursor->up_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_GE);
- ut_ad(cursor->up_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_LE);
- ut_ad(cursor->low_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_LE);
- }
-
-func_exit:
-
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
-
- if (has_search_latch) {
-
- rw_lock_s_lock(btr_search_get_latch(cursor->index));
- }
-
- return err;
-}
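-
-/* Editor's sketch, not part of the original source: a minimal,
-self-contained model of the descent loop above, ignoring latching,
-change buffering and the adaptive hash index. Each level picks a child
-by a PAGE_CUR_LE-style binary search and follows the child pointer
-until a leaf is reached. All names are hypothetical. */
-#include <vector>
-#include <algorithm>
-
-struct toy_node_t {
-	/* keys[i] is the first key of children[i]; empty children
-	means this is a leaf page */
-	std::vector<int>		keys;
-	std::vector<toy_node_t*>	children;
-};
-
-static toy_node_t*
-toy_descend_to_leaf(toy_node_t* root, int key)
-{
-	toy_node_t*	node = root;
-
-	while (!node->children.empty()) {
-		/* greatest node pointer <= key (PAGE_CUR_LE); the first
-		node pointer acts as "minus infinity" */
-		std::vector<int>::iterator it = std::upper_bound(
-			node->keys.begin() + 1, node->keys.end(), key);
-
-		node = node->children[(it - node->keys.begin()) - 1];
-	}
-
-	return(node);
-}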
-
-/*****************************************************************//**
-Opens a cursor at either end of an index. */
-UNIV_INTERN
-dberr_t
-btr_cur_open_at_index_side_func(
-/*============================*/
- bool from_left, /*!< in: true if open to the low end,
- false if to the high end */
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: latch mode */
- btr_cur_t* cursor, /*!< in/out: cursor */
- ulint level, /*!< in: level to search for
- (0=leaf). */
- const char* file, /*!< in: file name */
- ulint line, /*!< in: line where called */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- page_cur_t* page_cursor;
- ulint page_no;
- ulint space;
- ulint zip_size;
- ulint height;
- ulint root_height = 0; /* remove warning */
- rec_t* node_ptr;
- ulint estimate;
- ulint savepoint;
- mem_heap_t* heap = NULL;
- ulint offsets_[REC_OFFS_NORMAL_SIZE];
- ulint* offsets = offsets_;
- dberr_t err = DB_SUCCESS;
-
- rec_offs_init(offsets_);
-
- estimate = latch_mode & BTR_ESTIMATE;
- latch_mode &= ~BTR_ESTIMATE;
-
- ut_ad(level != ULINT_UNDEFINED);
-
- /* Store the position of the tree latch we push to mtr so that we
- know how to release it when we have latched the leaf node */
-
- savepoint = mtr_set_savepoint(mtr);
-
- switch (latch_mode) {
- case BTR_CONT_MODIFY_TREE:
- break;
- case BTR_MODIFY_TREE:
- mtr_x_lock(dict_index_get_lock(index), mtr);
- break;
- case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED:
- case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED:
- ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
- MTR_MEMO_S_LOCK));
- break;
- default:
- mtr_s_lock(dict_index_get_lock(index), mtr);
- }
-
- page_cursor = btr_cur_get_page_cur(cursor);
- cursor->index = index;
-
- space = dict_index_get_space(index);
- zip_size = dict_table_zip_size(index->table);
- page_no = dict_index_get_page(index);
-
- height = ULINT_UNDEFINED;
-
- for (;;) {
- buf_block_t* block=NULL;
- page_t* page=NULL;
-
- block = buf_page_get_gen(space, zip_size, page_no,
- RW_NO_LATCH, NULL, BUF_GET,
- file, line, mtr, &err);
-
- ut_ad((block != NULL) == (err == DB_SUCCESS));
-
- if (err != DB_SUCCESS) {
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name);
- index->table->file_unreadable = true;
- }
-
- goto exit_loop;
- }
-
- page = buf_block_get_frame(block);
-
- SRV_CORRUPT_TABLE_CHECK(page,
- {
- page_cursor->block = 0;
- page_cursor->rec = 0;
-
- if (estimate) {
-
- cursor->path_arr->nth_rec =
- ULINT_UNDEFINED;
- }
- /* Can't use break with the macro */
- goto exit_loop;
- });
-
- ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
-
- ut_ad(index->id == btr_page_get_index_id(page));
-
- block->check_index_page_at_flush = TRUE;
-
- if (height == ULINT_UNDEFINED) {
- /* We are in the root node */
-
- height = btr_page_get_level(page, mtr);
- root_height = height;
- ut_a(height >= level);
- } else {
- /* TODO: flag the index corrupted if this fails */
- ut_ad(height == btr_page_get_level(page, mtr));
- }
-
- if (height == level) {
- btr_cur_latch_leaves(
- page, space, zip_size, page_no,
- latch_mode & ~BTR_ALREADY_S_LATCHED,
- cursor, mtr);
-
- if (height == 0) {
- /* In versions <= 3.23.52 we had
- forgotten to release the tree latch
- here. If in an index scan we had to
- scan far to find a record visible to
- the current transaction, that could
- starve others waiting for the tree
- latch. */
-
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- case BTR_CONT_MODIFY_TREE:
- case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED:
- case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED:
- break;
- default:
- /* Release the tree s-latch */
-
- mtr_release_s_latch_at_savepoint(
- mtr, savepoint,
- dict_index_get_lock(index));
- }
- }
- }
-
- if (from_left) {
- page_cur_set_before_first(block, page_cursor);
- } else {
- page_cur_set_after_last(block, page_cursor);
- }
-
- if (height == level) {
- if (estimate) {
- btr_cur_add_path_info(cursor, height,
- root_height);
- }
-
- break;
- }
-
- ut_ad(height > 0);
-
- if (from_left) {
- page_cur_move_to_next(page_cursor);
- } else {
- page_cur_move_to_prev(page_cursor);
- }
-
- if (estimate) {
- btr_cur_add_path_info(cursor, height, root_height);
- }
-
- height--;
-
- node_ptr = page_cur_get_rec(page_cursor);
- offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
- ULINT_UNDEFINED, &heap);
- /* Go to the child node */
- page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
- }
-
-exit_loop:
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
-
- return err;
-}
-
-/**********************************************************************//**
-Positions a cursor at a randomly chosen position within a B-tree. */
-UNIV_INTERN
-void
-btr_cur_open_at_rnd_pos_func(
-/*=========================*/
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_cur_t* cursor, /*!< in/out: B-tree cursor */
- const char* file, /*!< in: file name */
- ulint line, /*!< in: line where called */
- mtr_t* mtr) /*!< in: mtr */
-{
- page_cur_t* page_cursor;
- ulint page_no;
- ulint space;
- ulint zip_size;
- ulint height;
- rec_t* node_ptr;
- mem_heap_t* heap = NULL;
- ulint offsets_[REC_OFFS_NORMAL_SIZE];
- ulint* offsets = offsets_;
- rec_offs_init(offsets_);
-
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- mtr_x_lock(dict_index_get_lock(index), mtr);
- break;
- default:
- ut_ad(latch_mode != BTR_CONT_MODIFY_TREE);
- mtr_s_lock(dict_index_get_lock(index), mtr);
- }
-
- page_cursor = btr_cur_get_page_cur(cursor);
- cursor->index = index;
-
- space = dict_index_get_space(index);
- zip_size = dict_table_zip_size(index->table);
- page_no = dict_index_get_page(index);
-
- height = ULINT_UNDEFINED;
-
- for (;;) {
- buf_block_t* block;
- page_t* page;
- dberr_t err=DB_SUCCESS;
-
- block = buf_page_get_gen(space, zip_size, page_no,
- RW_NO_LATCH, NULL, BUF_GET,
- file, line, mtr, &err);
-
- ut_ad((block != NULL) == (err == DB_SUCCESS));
-
- if (err != DB_SUCCESS) {
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name);
- index->table->file_unreadable = true;
- }
-
- goto exit_loop;
- }
-
- page = buf_block_get_frame(block);
-
- SRV_CORRUPT_TABLE_CHECK(page,
- {
- page_cursor->block = 0;
- page_cursor->rec = 0;
-
- goto exit_loop;
- });
-
- ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
-
- ut_ad(index->id == btr_page_get_index_id(page));
-
- if (height == ULINT_UNDEFINED) {
- /* We are in the root node */
-
- height = btr_page_get_level(page, mtr);
- }
-
- if (height == 0) {
- btr_cur_latch_leaves(page, space, zip_size, page_no,
- latch_mode, cursor, mtr);
- }
-
- page_cur_open_on_rnd_user_rec(block, page_cursor);
-
- if (height == 0) {
-
- break;
- }
-
- ut_ad(height > 0);
-
- height--;
-
- node_ptr = page_cur_get_rec(page_cursor);
- offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
- ULINT_UNDEFINED, &heap);
- /* Go to the child node */
- page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
- }
-
-exit_loop:
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
-}
-
-/*==================== B-TREE INSERT =========================*/
-
-/*************************************************************//**
-Inserts a record if there is enough space, or if enough space can
-be freed by reorganizing. Differs from btr_cur_optimistic_insert in that
-no heuristic is applied as to whether it pays to use CPU time for
-reorganizing the page.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return pointer to inserted record if succeed, else NULL */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
-rec_t*
-btr_cur_insert_if_possible(
-/*=======================*/
- btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
- cursor stays valid */
- const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
- have been stored to tuple */
- ulint** offsets,/*!< out: offsets on *rec */
- mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
- ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- page_cur_t* page_cursor;
- rec_t* rec;
-
- ut_ad(dtuple_check_typed(tuple));
-
- ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
- MTR_MEMO_PAGE_X_FIX));
- page_cursor = btr_cur_get_page_cur(cursor);
-
- /* Now, try the insert */
- rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
- offsets, heap, n_ext, mtr);
-
- /* If the record did not fit, reorganize.
- For compressed pages, page_cur_tuple_insert()
- attempted this already. */
- if (!rec && !page_cur_get_page_zip(page_cursor)
- && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
- rec = page_cur_tuple_insert(
- page_cursor, tuple, cursor->index,
- offsets, heap, n_ext, mtr);
- }
-
- ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
- return(rec);
-}
-
-/*************************************************************//**
-For an insert, checks the locks and does the undo logging if desired.
-@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
-UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
-dberr_t
-btr_cur_ins_lock_and_undo(
-/*======================*/
- ulint flags, /*!< in: undo logging and locking flags: if
- not zero, the parameters index and thr
- should be specified */
- btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
- dtuple_t* entry, /*!< in/out: entry to insert */
- que_thr_t* thr, /*!< in: query thread or NULL */
- mtr_t* mtr, /*!< in/out: mini-transaction */
-	ibool*		inherit)/*!< out: TRUE if the inserted new record may
-				need to inherit LOCK_GAP type locks from the
-				successor record */
-{
- dict_index_t* index;
- dberr_t err;
- rec_t* rec;
- roll_ptr_t roll_ptr;
-
- if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) {
- /* skip LOCK, UNDO */
- return(DB_SUCCESS);
- }
-
- /* Check if we have to wait for a lock: enqueue an explicit lock
- request if yes */
-
- rec = btr_cur_get_rec(cursor);
- index = cursor->index;
-
- ut_ad(!dict_index_is_online_ddl(index)
- || dict_index_is_clust(index)
- || (flags & BTR_CREATE_FLAG));
-
- err = lock_rec_insert_check_and_lock(flags, rec,
- btr_cur_get_block(cursor),
- index, thr, mtr, inherit);
-
- if (err != DB_SUCCESS
- || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))
- || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
-
- return(err);
- }
-
- if (flags & BTR_NO_UNDO_LOG_FLAG) {
- roll_ptr = 0;
- } else {
- err = trx_undo_report_row_operation(thr, index, entry,
- NULL, 0, NULL, NULL,
- &roll_ptr);
- if (err != DB_SUCCESS) {
- return(err);
- }
- }
-
- /* Now we can fill in the roll ptr field in entry */
-
- if (!(flags & BTR_KEEP_SYS_FLAG)) {
-
- row_upd_index_entry_sys_field(entry, index,
- DATA_ROLL_PTR, roll_ptr);
- }
-
- return(DB_SUCCESS);
-}
-
-#ifdef UNIV_DEBUG
-/*************************************************************//**
-Report information about a transaction. */
-static
-void
-btr_cur_trx_report(
-/*===============*/
- trx_id_t trx_id, /*!< in: transaction id */
- const dict_index_t* index, /*!< in: index */
- const char* op) /*!< in: operation */
-{
- fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx_id);
- fputs(op, stderr);
- dict_index_name_print(stderr, NULL, index);
- putc('\n', stderr);
-}
-#endif /* UNIV_DEBUG */
-
-/*************************************************************//**
-Tries to perform an insert to a page in an index tree, next to cursor.
-It is assumed that mtr holds an x-latch on the page. The operation does
-not succeed if there is too little space on the page. If there is just
-one record on the page, the insert will always succeed; this is to
-prevent trying to split a page with just one record.
-@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
-UNIV_INTERN
-dberr_t
-btr_cur_optimistic_insert(
-/*======================*/
- ulint flags, /*!< in: undo logging and locking flags: if not
- zero, the parameters index and thr should be
- specified */
- btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
- cursor stays valid */
- ulint** offsets,/*!< out: offsets on *rec */
- mem_heap_t** heap, /*!< in/out: pointer to memory heap */
- dtuple_t* entry, /*!< in/out: entry to insert */
- rec_t** rec, /*!< out: pointer to inserted record if
- succeed */
- big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
- be stored externally by the caller */
- ulint n_ext, /*!< in: number of externally stored columns */
- que_thr_t* thr, /*!< in/out: query thread; can be NULL if
- !(~flags
- & (BTR_NO_LOCKING_FLAG
- | BTR_NO_UNDO_LOG_FLAG)) */
- mtr_t* mtr) /*!< in/out: mini-transaction;
- if this function returns DB_SUCCESS on
- a leaf page of a secondary index in a
- compressed tablespace, the caller must
- mtr_commit(mtr) before latching
- any further pages */
-{
- big_rec_t* big_rec_vec = NULL;
- dict_index_t* index;
- page_cur_t* page_cursor;
- buf_block_t* block;
- page_t* page;
- rec_t* dummy;
- ibool leaf;
- ibool reorg;
- ibool inherit = TRUE;
- ulint zip_size;
- ulint rec_size;
- dberr_t err;
-
- ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
- *big_rec = NULL;
-
- block = btr_cur_get_block(cursor);
-
- SRV_CORRUPT_TABLE_CHECK(block, return(DB_CORRUPTION););
-
- page = buf_block_get_frame(block);
- index = cursor->index;
-
- const bool fake_changes = (~flags & (BTR_NO_LOCKING_FLAG
- | BTR_NO_UNDO_LOG_FLAG))
- && thr_get_trx(thr)->fake_changes;
- ut_ad(fake_changes
- || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
- ut_ad(!dict_index_is_online_ddl(index)
- || dict_index_is_clust(index)
- || (flags & BTR_CREATE_FLAG));
- ut_ad(dtuple_check_typed(entry));
-
- zip_size = buf_block_get_zip_size(block);
-#ifdef UNIV_DEBUG_VALGRIND
- if (zip_size) {
- UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
- UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
- }
-#endif /* UNIV_DEBUG_VALGRIND */
-
-#ifdef UNIV_DEBUG
- if (btr_cur_print_record_ops && thr) {
- btr_cur_trx_report(thr_get_trx(thr)->id, index, "insert ");
- dtuple_print(stderr, entry);
- }
-#endif /* UNIV_DEBUG */
-
- leaf = page_is_leaf(page);
-
- /* Calculate the record size when entry is converted to a record */
- rec_size = rec_get_converted_size(index, entry, n_ext);
-
- if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
- dtuple_get_n_fields(entry), zip_size)) {
-
- /* The record is so big that we have to store some fields
- externally on separate database pages */
- big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
-
- if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
-
- return(DB_TOO_BIG_RECORD);
- }
-
- rec_size = rec_get_converted_size(index, entry, n_ext);
- }
-
- if (zip_size) {
-		/* Estimate the free space of an empty compressed page. */
- ulint free_space_zip = page_zip_empty_size(
- cursor->index->n_fields, zip_size);
- ulint n_uniq = dict_index_get_n_unique_in_tree(index);
-
- ut_ad(dict_table_is_comp(index->table));
-
- if (free_space_zip == 0) {
-too_big:
- if (big_rec_vec) {
- dtuple_convert_back_big_rec(
- index, entry, big_rec_vec);
- }
-
- return(DB_TOO_BIG_RECORD);
- }
-
- /* Subtract one byte for the encoded heap_no in the
- modification log. */
- free_space_zip--;
-
- /* There should be enough room for two node pointer
- records on an empty non-leaf page. This prevents
- infinite page splits. */
-
- if (entry->n_fields >= n_uniq
- && (REC_NODE_PTR_SIZE
- + rec_get_converted_size_comp_prefix(
- index, entry->fields, n_uniq, NULL)
- /* On a compressed page, there is
- a two-byte entry in the dense
- page directory for every record.
- But there is no record header. */
- - (REC_N_NEW_EXTRA_BYTES - 2)
- > free_space_zip / 2)) {
- goto too_big;
- }
- }
-
- LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
- goto fail);
-
- if (leaf && zip_size
- && (page_get_data_size(page) + rec_size
- >= dict_index_zip_pad_optimal_page_size(index))) {
-		/* If compression padding tells us that insertion will
-		result in a page packed so densely that compression is
-		likely to fail, then don't attempt an optimistic
-		insertion. */
-fail:
- err = DB_FAIL;
-fail_err:
-
- if (big_rec_vec) {
- dtuple_convert_back_big_rec(index, entry, big_rec_vec);
- }
-
- return(err);
- }
-
- ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
-
- if (page_has_garbage(page)) {
- if ((max_size < rec_size
- || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
- && page_get_n_recs(page) > 1
- && page_get_max_insert_size(page, 1) < rec_size) {
-
- goto fail;
- }
- } else if (max_size < rec_size) {
- goto fail;
- }
-
- /* If there have been many consecutive inserts to the
- clustered index leaf page of an uncompressed table, check if
- we have to split the page to reserve enough free space for
- future updates of records. */
-
- if (leaf && !zip_size && dict_index_is_clust(index)
- && page_get_n_recs(page) >= 2
- && dict_index_get_space_reserve() + rec_size > max_size
- && (btr_page_get_split_rec_to_right(cursor, &dummy)
- || btr_page_get_split_rec_to_left(cursor, &dummy))) {
- goto fail;
- }
-
- /* Check locks and write to the undo log, if specified */
- err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
- thr, mtr, &inherit);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
-
- goto fail_err;
- }
-
- if (UNIV_UNLIKELY(fake_changes)) {
- /* skip CHANGE, LOG */
- *big_rec = big_rec_vec;
- return(err); /* == DB_SUCCESS */
- }
-
- page_cursor = btr_cur_get_page_cur(cursor);
-
- /* Now, try the insert */
-
- {
- const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
- *rec = page_cur_tuple_insert(page_cursor, entry, index,
- offsets, heap, n_ext, mtr);
- reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
- }
-
- if (*rec) {
- } else if (zip_size) {
- /* Reset the IBUF_BITMAP_FREE bits, because
- page_cur_tuple_insert() will have attempted page
- reorganize before failing. */
- if (leaf && !dict_index_is_clust(index)) {
- ibuf_reset_free_bits(block);
- }
-
- goto fail;
- } else {
- ut_ad(!reorg);
-
- /* If the record did not fit, reorganize */
- if (!btr_page_reorganize(page_cursor, index, mtr)) {
- ut_ad(0);
- goto fail;
- }
-
- ut_ad(page_get_max_insert_size(page, 1) == max_size);
-
- reorg = TRUE;
-
- *rec = page_cur_tuple_insert(page_cursor, entry, index,
- offsets, heap, n_ext, mtr);
-
- if (UNIV_UNLIKELY(!*rec)) {
- fputs("InnoDB: Error: cannot insert tuple ", stderr);
- dtuple_print(stderr, entry);
- fputs(" into ", stderr);
- dict_index_name_print(stderr, thr_get_trx(thr), index);
- fprintf(stderr, "\nInnoDB: max insert size %lu\n",
- (ulong) max_size);
- ut_error;
- }
- }
-
-#ifdef BTR_CUR_HASH_ADAPT
- if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
- btr_search_update_hash_node_on_insert(cursor);
- } else {
- btr_search_update_hash_on_insert(cursor);
- }
-#endif
-
- if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
-
- lock_update_insert(block, *rec);
- }
-
- if (leaf && !dict_index_is_clust(index)) {
- /* Update the free bits of the B-tree page in the
- insert buffer bitmap. */
-
- /* The free bits in the insert buffer bitmap must
- never exceed the free space on a page. It is safe to
- decrement or reset the bits in the bitmap in a
- mini-transaction that is committed before the
- mini-transaction that affects the free space. */
-
- /* It is unsafe to increment the bits in a separately
- committed mini-transaction, because in crash recovery,
- the free bits could momentarily be set too high. */
-
- if (zip_size) {
- /* Update the bits in the same mini-transaction. */
- ibuf_update_free_bits_zip(block, mtr);
- } else {
- /* Decrement the bits in a separate
- mini-transaction. */
- ibuf_update_free_bits_if_full(
- block, max_size,
- rec_size + PAGE_DIR_SLOT_SIZE);
- }
- }
-
- *big_rec = big_rec_vec;
-
- return(DB_SUCCESS);
-}
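-
-/* Editor's sketch, not part of the original source: the essence of the
-free-space decision in btr_cur_optimistic_insert() above, with the page
-state passed in as plain numbers. The function name is hypothetical. */
-static bool
-toy_optimistic_insert_fits(
-	ulint	rec_size,	/* converted size of the new record */
-	ulint	max_size,	/* max insert size after a reorganize */
-	ulint	max_size_now,	/* max insert size without a reorganize */
-	bool	has_garbage,	/* page contains deleted (garbage) space */
-	ulint	n_recs)		/* records currently on the page */
-{
-	if (has_garbage) {
-		if ((max_size < rec_size
-		     || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
-		    && n_recs > 1
-		    && max_size_now < rec_size) {
-
-			/* caller falls back to the pessimistic
-			(page split) path */
-			return(false);
-		}
-
-		return(true);	/* fits now, or after a reorganize */
-	}
-
-	return(max_size >= rec_size);
-}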
-
-/*************************************************************//**
-Performs an insert on a page of an index tree. It is assumed that mtr
-holds an x-latch on the tree and on the cursor page. If the insert is
-made on the leaf level, to avoid deadlocks, mtr must also own x-latches
-to brothers of page, if those brothers exist.
-@return DB_SUCCESS or error number */
-UNIV_INTERN
-dberr_t
-btr_cur_pessimistic_insert(
-/*=======================*/
- ulint flags, /*!< in: undo logging and locking flags: if not
- zero, the parameter thr should be
- specified; if no undo logging is specified,
- then the caller must have reserved enough
- free extents in the file space so that the
- insertion will certainly succeed */
- btr_cur_t* cursor, /*!< in: cursor after which to insert;
- cursor stays valid */
- ulint** offsets,/*!< out: offsets on *rec */
- mem_heap_t** heap, /*!< in/out: pointer to memory heap
- that can be emptied */
- dtuple_t* entry, /*!< in/out: entry to insert */
- rec_t** rec, /*!< out: pointer to inserted record if
- succeed */
- big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
- be stored externally by the caller */
- ulint n_ext, /*!< in: number of externally stored columns */
- que_thr_t* thr, /*!< in/out: query thread; can be NULL if
- !(~flags
- & (BTR_NO_LOCKING_FLAG
- | BTR_NO_UNDO_LOG_FLAG)) */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- dict_index_t* index = cursor->index;
- ulint zip_size = dict_table_zip_size(index->table);
- big_rec_t* big_rec_vec = NULL;
- dberr_t err;
- ibool inherit = FALSE;
- ibool success;
- ulint n_reserved = 0;
-
- ut_ad(dtuple_check_typed(entry));
- ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
-
- *big_rec = NULL;
-
- const bool fake_changes = (~flags & (BTR_NO_LOCKING_FLAG
- | BTR_NO_UNDO_LOG_FLAG))
- && thr_get_trx(thr)->fake_changes;
- ut_ad(fake_changes || mtr_memo_contains(mtr,
- dict_index_get_lock(btr_cur_get_index(cursor)),
- MTR_MEMO_X_LOCK));
- ut_ad(fake_changes || mtr_memo_contains(mtr, btr_cur_get_block(cursor),
- MTR_MEMO_PAGE_X_FIX));
- ut_ad(!dict_index_is_online_ddl(index)
- || dict_index_is_clust(index)
- || (flags & BTR_CREATE_FLAG));
-
- cursor->flag = BTR_CUR_BINARY;
-
- /* Check locks and write to undo log, if specified */
-
- err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
- thr, mtr, &inherit);
-
- if (err != DB_SUCCESS) {
-
- return(err);
- }
-
- if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
-
- ut_a(cursor->tree_height != ULINT_UNDEFINED);
-
- /* First reserve enough free space for the file segments
- of the index tree, so that the insert will not fail because
- of lack of space */
-
- ulint n_extents = cursor->tree_height / 16 + 3;
-
- success = fsp_reserve_free_extents(&n_reserved, index->space,
- n_extents, FSP_NORMAL, mtr);
- if (!success) {
- return(DB_OUT_OF_FILE_SPACE);
- }
- }
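-
-	/* Editor's note, not part of the original source: for a tree of
-	height 3 the reservation above is 3 / 16 + 3 = 3 extents; at the
-	usual 64 pages per extent that is 192 pages, comfortably more
-	than the "2 x tree height" pages the module header calls for. */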
-
- if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
- dict_table_is_comp(index->table),
- dtuple_get_n_fields(entry),
- zip_size)) {
- /* The record is so big that we have to store some fields
- externally on separate database pages */
-
- if (UNIV_LIKELY_NULL(big_rec_vec)) {
- /* This should never happen, but we handle
- the situation in a robust manner. */
- ut_ad(0);
- dtuple_convert_back_big_rec(index, entry, big_rec_vec);
- }
-
- big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
-
- if (big_rec_vec == NULL) {
-
- if (n_reserved > 0) {
- fil_space_release_free_extents(index->space,
- n_reserved);
- }
- return(DB_TOO_BIG_RECORD);
- }
- }
-
- if (UNIV_UNLIKELY(fake_changes)) {
- /* skip CHANGE, LOG */
- if (n_reserved > 0) {
- fil_space_release_free_extents(index->space,
- n_reserved);
- }
- *big_rec = big_rec_vec;
- return(DB_SUCCESS);
- }
-
- if (dict_index_get_page(index)
- == buf_block_get_page_no(btr_cur_get_block(cursor))) {
-
- /* The page is the root page */
- *rec = btr_root_raise_and_insert(
- flags, cursor, offsets, heap, entry, n_ext, mtr);
- } else {
- *rec = btr_page_split_and_insert(
- flags, cursor, offsets, heap, entry, n_ext, mtr);
- }
-
- if (*rec == NULL && os_has_said_disk_full) {
- return(DB_OUT_OF_FILE_SPACE);
- }
-
- ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
-
- if (!(flags & BTR_NO_LOCKING_FLAG)) {
-		/* The cursor might have been moved to another page,
-		so the max trx id field should be updated after
-		the cursor has been fixed. */
- if (!dict_index_is_clust(index)) {
- page_update_max_trx_id(
- btr_cur_get_block(cursor),
- btr_cur_get_page_zip(cursor),
- thr_get_trx(thr)->id, mtr);
- }
-
-		if (!page_rec_is_infimum(btr_cur_get_rec(cursor))) {
-			/* A split-and-insert always needs to call
-			lock_update_insert(). */
-			inherit = TRUE;
-		}
-
- buf_block_t* block = btr_cur_get_block(cursor);
- buf_frame_t* frame = NULL;
-
- if (block) {
- frame = buf_block_get_frame(block);
- }
-		/* A split-and-insert always needs to call
-		lock_update_insert(). */
- if (frame && btr_page_get_prev(frame, mtr) == FIL_NULL) {
- inherit = TRUE;
- }
- }
-
-#ifdef BTR_CUR_ADAPT
- btr_search_update_hash_on_insert(cursor);
-#endif
- if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
-
- lock_update_insert(btr_cur_get_block(cursor), *rec);
- }
-
- if (n_reserved > 0) {
- fil_space_release_free_extents(index->space, n_reserved);
- }
-
- *big_rec = big_rec_vec;
-
- return(DB_SUCCESS);
-}
-
-/*==================== B-TREE UPDATE =========================*/
-
-/*************************************************************//**
-For an update, checks the locks and does the undo logging.
-@return DB_SUCCESS, DB_WAIT_LOCK, or error number */
-UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
-dberr_t
-btr_cur_upd_lock_and_undo(
-/*======================*/
- ulint flags, /*!< in: undo logging and locking flags */
- btr_cur_t* cursor, /*!< in: cursor on record to update */
- const ulint* offsets,/*!< in: rec_get_offsets() on cursor */
- const upd_t* update, /*!< in: update vector */
- ulint cmpl_info,/*!< in: compiler info on secondary index
- updates */
- que_thr_t* thr, /*!< in: query thread
- (can be NULL if BTR_NO_LOCKING_FLAG) */
- mtr_t* mtr, /*!< in/out: mini-transaction */
- roll_ptr_t* roll_ptr)/*!< out: roll pointer */
-{
- dict_index_t* index;
- const rec_t* rec;
- dberr_t err;
-
- ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));
-
- if (!(flags & BTR_NO_LOCKING_FLAG) && thr_get_trx(thr)->fake_changes) {
- /* skip LOCK, UNDO */
- return(DB_SUCCESS);
- }
-
- rec = btr_cur_get_rec(cursor);
- index = cursor->index;
-
- ut_ad(rec_offs_validate(rec, index, offsets));
-
- if (!dict_index_is_clust(index)) {
- ut_ad(dict_index_is_online_ddl(index)
- == !!(flags & BTR_CREATE_FLAG));
-
- /* We do undo logging only when we update a clustered index
- record */
- return(lock_sec_rec_modify_check_and_lock(
- flags, btr_cur_get_block(cursor), rec,
- index, thr, mtr));
- }
-
- /* Check if we have to wait for a lock: enqueue an explicit lock
- request if yes */
-
- if (!(flags & BTR_NO_LOCKING_FLAG)) {
- err = lock_clust_rec_modify_check_and_lock(
- flags, btr_cur_get_block(cursor), rec, index,
- offsets, thr);
- if (err != DB_SUCCESS) {
- return(err);
- }
- }
-
- /* Append the info about the update in the undo log */
-
- return((flags & BTR_NO_UNDO_LOG_FLAG)
- ? DB_SUCCESS
- : trx_undo_report_row_operation(
- thr, index, NULL, update,
- cmpl_info, rec, offsets, roll_ptr));
-}
-
-/***********************************************************//**
-Writes a redo log record of updating a record in-place. */
-UNIV_INTERN
-void
-btr_cur_update_in_place_log(
-/*========================*/
- ulint flags, /*!< in: flags */
- const rec_t* rec, /*!< in: record */
- dict_index_t* index, /*!< in: index of the record */
- const upd_t* update, /*!< in: update vector */
- trx_id_t trx_id, /*!< in: transaction id */
- roll_ptr_t roll_ptr, /*!< in: roll ptr */
- mtr_t* mtr) /*!< in: mtr */
-{
- byte* log_ptr;
- const page_t* page = page_align(rec);
- ut_ad(flags < 256);
- ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
-
- log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
- ? MLOG_COMP_REC_UPDATE_IN_PLACE
- : MLOG_REC_UPDATE_IN_PLACE,
- 1 + DATA_ROLL_PTR_LEN + 14 + 2
- + MLOG_BUF_MARGIN);
-
- if (!log_ptr) {
- /* Logging in mtr is switched off during crash recovery */
- return;
- }
-
-	/* For secondary indexes, we could skip writing the dummy system fields
-	to the redo log, but that would require changing the redo log parsing
-	of MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or adding a
-	new redo log record type. For now, just write dummy sys fields to the
-	redo log when updating a secondary index record. */
- mach_write_to_1(log_ptr, flags);
- log_ptr++;
-
- if (dict_index_is_clust(index)) {
- log_ptr = row_upd_write_sys_vals_to_log(
- index, trx_id, roll_ptr, log_ptr, mtr);
- } else {
- /* Dummy system fields for a secondary index */
- /* TRX_ID Position */
- log_ptr += mach_write_compressed(log_ptr, 0);
- /* ROLL_PTR */
- trx_write_roll_ptr(log_ptr, 0);
- log_ptr += DATA_ROLL_PTR_LEN;
- /* TRX_ID */
- log_ptr += mach_ull_write_compressed(log_ptr, 0);
- }
-
- mach_write_to_2(log_ptr, page_offset(rec));
- log_ptr += 2;
-
- row_upd_index_write_log(update, log_ptr, mtr);
-}
-#endif /* UNIV_HOTBACKUP */
-
-/***********************************************************//**
-Parses a redo log record of updating a record in-place.
-@return end of log record or NULL */
-UNIV_INTERN
-byte*
-btr_cur_parse_update_in_place(
-/*==========================*/
- byte* ptr, /*!< in: buffer */
- byte* end_ptr,/*!< in: buffer end */
- page_t* page, /*!< in/out: page or NULL */
- page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
- dict_index_t* index) /*!< in: index corresponding to page */
-{
- ulint flags;
- rec_t* rec;
- upd_t* update;
- ulint pos;
- trx_id_t trx_id;
- roll_ptr_t roll_ptr;
- ulint rec_offset;
- mem_heap_t* heap;
- ulint* offsets;
-
- if (end_ptr < ptr + 1) {
-
- return(NULL);
- }
-
- flags = mach_read_from_1(ptr);
- ptr++;
-
- ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
-
- if (ptr == NULL) {
-
- return(NULL);
- }
-
- if (end_ptr < ptr + 2) {
-
- return(NULL);
- }
-
- rec_offset = mach_read_from_2(ptr);
- ptr += 2;
-
- ut_a(rec_offset <= UNIV_PAGE_SIZE);
-
- heap = mem_heap_create(256);
-
- ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
-
- if (!ptr || !page) {
-
- goto func_exit;
- }
-
- ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
- rec = page + rec_offset;
-
- /* We do not need to reserve btr_search_latch, as the page is only
- being recovered, and there cannot be a hash index to it. */
-
- offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
-
- if (!(flags & BTR_KEEP_SYS_FLAG)) {
- row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
- pos, trx_id, roll_ptr);
- }
-
- row_upd_rec_in_place(rec, index, offsets, update, page_zip);
-
-func_exit:
- mem_heap_free(heap);
-
- return(ptr);
-}
-
-#ifndef UNIV_HOTBACKUP
-/*************************************************************//**
-See if there is enough space in the page modification log to log
-an update-in-place.
-
-@retval false if out of space; IBUF_BITMAP_FREE will be reset
-outside mtr if the page was recompressed
-@retval true if there is enough space
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
-a secondary index leaf page. This has to be done either within the
-same mini-transaction, or by invoking ibuf_reset_free_bits() before
-mtr_commit(mtr). */
-UNIV_INTERN
-bool
-btr_cur_update_alloc_zip_func(
-/*==========================*/
- page_zip_des_t* page_zip,/*!< in/out: compressed page */
- page_cur_t* cursor, /*!< in/out: B-tree page cursor */
- dict_index_t* index, /*!< in: the index corresponding to cursor */
-#ifdef UNIV_DEBUG
- ulint* offsets,/*!< in/out: offsets of the cursor record */
-#endif /* UNIV_DEBUG */
- ulint length, /*!< in: size needed */
- bool create, /*!< in: true=delete-and-insert,
- false=update-in-place */
- mtr_t* mtr, /*!< in/out: mini-transaction */
- trx_t* trx) /*!< in: NULL or transaction */
-{
- const page_t* page = page_cur_get_page(cursor);
-
- ut_ad(page_zip == page_cur_get_page_zip(cursor));
- ut_ad(!dict_index_is_ibuf(index));
- ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
-
- if (page_zip_available(page_zip, dict_index_is_clust(index),
- length, create)) {
- return(true);
- }
-
- if (!page_zip->m_nonempty && !page_has_garbage(page)) {
- /* The page has been freshly compressed, so
- reorganizing it will not help. */
- return(false);
- }
-
- if (create && page_is_leaf(page)
- && (length + page_get_data_size(page)
- >= dict_index_zip_pad_optimal_page_size(index))) {
- return(false);
- }
-
- if (UNIV_UNLIKELY(trx && trx->fake_changes)) {
-		/* Don't call page_zip_compress_write_log_no_data(), as it
-		contains an assertion that would fail. Assume there won't
-		be a compression failure. */
-
- return(true);
- }
-
- if (!btr_page_reorganize(cursor, index, mtr)) {
- goto out_of_space;
- }
-
- rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
-
- /* After recompressing a page, we must make sure that the free
- bits in the insert buffer bitmap will not exceed the free
- space on the page. Because this function will not attempt
- recompression unless page_zip_available() fails above, it is
- safe to reset the free bits if page_zip_available() fails
- again, below. The free bits can safely be reset in a separate
- mini-transaction. If page_zip_available() succeeds below, we
- can be sure that the btr_page_reorganize() above did not reduce
- the free space available on the page. */
-
- if (page_zip_available(page_zip, dict_index_is_clust(index),
- length, create)) {
- return(true);
- }
-
-out_of_space:
- ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
-
- /* Out of space: reset the free bits. */
- if (!dict_index_is_clust(index) && page_is_leaf(page)) {
- ibuf_reset_free_bits(page_cur_get_block(cursor));
- }
-
- return(false);
-}
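-
-/* A minimal sketch of how a caller is expected to invoke the
-btr_cur_update_alloc_zip() wrapper of this function; compare the
-real call in btr_cur_update_in_place() below (names as used there):
-
-	if (page_zip
-	    && !btr_cur_update_alloc_zip(
-		    page_zip, btr_cur_get_page_cur(cursor), index,
-		    offsets, rec_offs_size(offsets), false, mtr, trx)) {
-		return(DB_ZIP_OVERFLOW);
-	}
-
-	rec = btr_cur_get_rec(cursor);
-
-The record pointer must be re-read, because a reorganize inside
-this function may have moved the record within the page. */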
-
-/*************************************************************//**
-Updates a record when the update causes no size changes in its fields.
-We assume here that the ordering fields of the record do not change.
-@return locking or undo log related error code, or
-@retval DB_SUCCESS on success
-@retval DB_ZIP_OVERFLOW if there is not enough space left
-on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
-UNIV_INTERN
-dberr_t
-btr_cur_update_in_place(
-/*====================*/
- ulint flags, /*!< in: undo logging and locking flags */
- btr_cur_t* cursor, /*!< in: cursor on the record to update;
- cursor stays valid and positioned on the
- same record */
- ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
- const upd_t* update, /*!< in: update vector */
- ulint cmpl_info,/*!< in: compiler info on secondary index
- updates */
- que_thr_t* thr, /*!< in: query thread */
- trx_id_t trx_id, /*!< in: transaction id */
- mtr_t* mtr) /*!< in/out: mini-transaction; if this
- is a secondary index, the caller must
- mtr_commit(mtr) before latching any
- further pages */
-{
- dict_index_t* index;
- buf_block_t* block;
- page_zip_des_t* page_zip;
- dberr_t err;
- rec_t* rec;
- roll_ptr_t roll_ptr = 0;
- ulint was_delete_marked;
- ibool is_hashed;
- trx_t* trx;
-
- rec = btr_cur_get_rec(cursor);
- index = cursor->index;
- ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
- /* The insert buffer tree should never be updated in place. */
- ut_ad(!dict_index_is_ibuf(index));
- ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
- || dict_index_is_clust(index));
- ut_ad(thr_get_trx(thr)->id == trx_id
- || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
- == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
- | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
- ut_ad(fil_page_get_type(btr_cur_get_page(cursor)) == FIL_PAGE_INDEX);
- ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
-
-#ifdef UNIV_DEBUG
- if (btr_cur_print_record_ops) {
- btr_cur_trx_report(trx_id, index, "update ");
- rec_print_new(stderr, rec, offsets);
- }
-#endif /* UNIV_DEBUG */
-
- block = btr_cur_get_block(cursor);
- page_zip = buf_block_get_page_zip(block);
- trx = thr_get_trx(thr);
-
- /* Check that enough space is available on the compressed page. */
- if (page_zip) {
- if (!btr_cur_update_alloc_zip(
- page_zip, btr_cur_get_page_cur(cursor),
- index, offsets, rec_offs_size(offsets),
- false, mtr, trx)) {
- return(DB_ZIP_OVERFLOW);
- }
-
- rec = btr_cur_get_rec(cursor);
- }
-
- /* Do lock checking and undo logging */
- err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
- update, cmpl_info,
- thr, mtr, &roll_ptr);
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
- goto func_exit;
- }
-
- if (UNIV_UNLIKELY(trx->fake_changes)) {
- /* skip CHANGE, LOG */
- return(err); /* == DB_SUCCESS */
- }
-
- if (!(flags & BTR_KEEP_SYS_FLAG)) {
- row_upd_rec_sys_fields(rec, NULL, index, offsets,
- thr_get_trx(thr), roll_ptr);
- }
-
- was_delete_marked = rec_get_deleted_flag(
- rec, page_is_comp(buf_block_get_frame(block)));
-
- is_hashed = (block->index != NULL);
-
- if (is_hashed) {
-		/* TODO: can we skip this if none of the first
-		index->search_info->curr_n_fields fields
-		are being updated? */
-
-		/* The function row_upd_changes_ord_field_binary() works
-		only if the update vector was built for a clustered index;
-		we must NOT call it if the index is a secondary one. */
-
- if (!dict_index_is_clust(index)
- || row_upd_changes_ord_field_binary(index, update, thr,
- NULL, NULL)) {
-
- /* Remove possible hash index pointer to this record */
- btr_search_update_hash_on_delete(cursor);
- }
-
- rw_lock_x_lock(btr_search_get_latch(cursor->index));
- }
-
- row_upd_rec_in_place(rec, index, offsets, update, page_zip);
-
- if (is_hashed) {
- rw_lock_x_unlock(btr_search_get_latch(cursor->index));
- }
-
- btr_cur_update_in_place_log(flags, rec, index, update,
- trx_id, roll_ptr, mtr);
-
- if (was_delete_marked
- && !rec_get_deleted_flag(
- rec, page_is_comp(buf_block_get_frame(block)))) {
- /* The new updated record owns its possible externally
- stored fields */
-
- btr_cur_unmark_extern_fields(page_zip,
- rec, index, offsets, mtr);
- }
-
- ut_ad(err == DB_SUCCESS);
-
-func_exit:
- if (page_zip
- && !(flags & BTR_KEEP_IBUF_BITMAP)
- && !dict_index_is_clust(index)
- && block) {
- buf_frame_t* frame = buf_block_get_frame(block);
- if (frame && page_is_leaf(frame)) {
- /* Update the free bits in the insert buffer. */
- ibuf_update_free_bits_zip(block, mtr);
- }
- }
-
- return(err);
-}
-
-/*************************************************************//**
-Tries to update a record on a page in an index tree. It is assumed that mtr
-holds an x-latch on the page. The operation does not succeed if there is too
-little space on the page or if the update would result in too empty a page,
-so that tree compression is recommended. We assume here that the ordering
-fields of the record do not change.
-@return error code, including
-@retval DB_SUCCESS on success
-@retval DB_OVERFLOW if the updated record does not fit
-@retval DB_UNDERFLOW if the page would become too empty
-@retval DB_ZIP_OVERFLOW if there is not enough space left
-on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
-UNIV_INTERN
-dberr_t
-btr_cur_optimistic_update(
-/*======================*/
- ulint flags, /*!< in: undo logging and locking flags */
- btr_cur_t* cursor, /*!< in: cursor on the record to update;
- cursor stays valid and positioned on the
- same record */
- ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
- mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
- const upd_t* update, /*!< in: update vector; this must also
- contain trx id and roll ptr fields */
- ulint cmpl_info,/*!< in: compiler info on secondary index
- updates */
- que_thr_t* thr, /*!< in: query thread */
- trx_id_t trx_id, /*!< in: transaction id */
- mtr_t* mtr) /*!< in/out: mini-transaction; if this
- is a secondary index, the caller must
- mtr_commit(mtr) before latching any
- further pages */
-{
- dict_index_t* index;
- page_cur_t* page_cursor;
- dberr_t err;
- buf_block_t* block;
- page_t* page;
- page_zip_des_t* page_zip;
- rec_t* rec;
- ulint max_size;
- ulint new_rec_size;
- ulint old_rec_size;
- ulint max_ins_size = 0;
- dtuple_t* new_entry;
- roll_ptr_t roll_ptr;
- ulint i;
- ulint n_ext;
-
- block = btr_cur_get_block(cursor);
- page = buf_block_get_frame(block);
- rec = btr_cur_get_rec(cursor);
- index = cursor->index;
- ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
- ut_ad(thr_get_trx(thr)->fake_changes
- || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
- /* The insert buffer tree should never be updated in place. */
- ut_ad(!dict_index_is_ibuf(index));
- ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
- || dict_index_is_clust(index));
- ut_ad(thr_get_trx(thr)->id == trx_id
- || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
- == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
- | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
- ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
- ut_ad(btr_page_get_index_id(page) == index->id);
-
- *offsets = rec_get_offsets(rec, index, *offsets,
- ULINT_UNDEFINED, heap);
-#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
- ut_a(!rec_offs_any_null_extern(rec, *offsets)
- || trx_is_recv(thr_get_trx(thr)));
-#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
-
-#ifdef UNIV_DEBUG
- if (btr_cur_print_record_ops) {
- btr_cur_trx_report(trx_id, index, "update ");
- rec_print_new(stderr, rec, *offsets);
- }
-#endif /* UNIV_DEBUG */
-
- if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
-
- /* The simplest and the most common case: the update does not
- change the size of any field and none of the updated fields is
- externally stored in rec or update, and there is enough space
- on the compressed page to log the update. */
-
- return(btr_cur_update_in_place(
- flags, cursor, *offsets, update,
- cmpl_info, thr, trx_id, mtr));
- }
-
- if (rec_offs_any_extern(*offsets)) {
-any_extern:
- /* Externally stored fields are treated in pessimistic
- update */
-
- return(DB_OVERFLOW);
- }
-
- for (i = 0; i < upd_get_n_fields(update); i++) {
- if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
-
- goto any_extern;
- }
- }
-
- page_cursor = btr_cur_get_page_cur(cursor);
-
- if (!*heap) {
- *heap = mem_heap_create(
- rec_offs_size(*offsets)
- + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
- }
-
- new_entry = row_rec_to_index_entry(rec, index, *offsets,
- &n_ext, *heap);
- /* We checked above that there are no externally stored fields. */
- ut_a(!n_ext);
-
- /* The page containing the clustered index record
- corresponding to new_entry is latched in mtr.
- Thus the following call is safe. */
- row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
- FALSE, *heap);
- old_rec_size = rec_offs_size(*offsets);
- new_rec_size = rec_get_converted_size(index, new_entry, 0);
-
- page_zip = buf_block_get_page_zip(block);
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
-
- if (page_zip) {
- if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
- dict_index_get_n_fields(index),
- page_zip_get_size(page_zip))) {
- goto any_extern;
- }
-
- if (!btr_cur_update_alloc_zip(
- page_zip, page_cursor, index, *offsets,
- new_rec_size, true, mtr, thr_get_trx(thr))) {
- return(DB_ZIP_OVERFLOW);
- }
-
- rec = page_cur_get_rec(page_cursor);
- }
-
- if (UNIV_UNLIKELY(new_rec_size
- >= (page_get_free_space_of_empty(page_is_comp(page))
- / 2))) {
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
- err = DB_OVERFLOW;
- goto func_exit;
- }
-
- if (UNIV_UNLIKELY(page_get_data_size(page)
- - old_rec_size + new_rec_size
- < BTR_CUR_PAGE_COMPRESS_LIMIT)) {
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
-
- /* The page would become too empty */
- err = DB_UNDERFLOW;
- goto func_exit;
- }
-
- /* We do not attempt to reorganize if the page is compressed.
- This is because the page may fail to compress after reorganization. */
- max_size = page_zip
- ? page_get_max_insert_size(page, 1)
- : (old_rec_size
- + page_get_max_insert_size_after_reorganize(page, 1));
-
-	if (!page_zip) {
-		max_ins_size = page_get_max_insert_size_after_reorganize(
-			page, 1);
-	}
-
- if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
- && (max_size >= new_rec_size))
- || (page_get_n_recs(page) <= 1))) {
-
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
-
- /* There was not enough space, or it did not pay to
- reorganize: for simplicity, we decide what to do assuming a
- reorganization is needed, though it might not be necessary */
-
- err = DB_OVERFLOW;
- goto func_exit;
- }
-
- /* Do lock checking and undo logging */
- err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
- update, cmpl_info,
- thr, mtr, &roll_ptr);
- if (err != DB_SUCCESS) {
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
- goto func_exit;
- }
-
- if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
- /* skip CHANGE, LOG */
- ut_ad(err == DB_SUCCESS);
- return(DB_SUCCESS);
- }
-
- /* Ok, we may do the replacement. Store on the page infimum the
- explicit locks on rec, before deleting rec (see the comment in
- btr_cur_pessimistic_update). */
-
- lock_rec_store_on_page_infimum(block, rec);
-
- btr_search_update_hash_on_delete(cursor);
-
- page_cur_delete_rec(page_cursor, index, *offsets, mtr);
-
- page_cur_move_to_prev(page_cursor);
-
- if (!(flags & BTR_KEEP_SYS_FLAG)) {
- row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
- roll_ptr);
- row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
- trx_id);
- }
-
- /* There are no externally stored columns in new_entry */
- rec = btr_cur_insert_if_possible(
- cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
- ut_a(rec); /* <- We calculated above the insert would fit */
-
- /* Restore the old explicit lock state on the record */
-
- lock_rec_restore_from_page_infimum(block, rec, block);
-
- page_cur_move_to_next(page_cursor);
- ut_ad(err == DB_SUCCESS);
-
-func_exit:
- if (!(flags & BTR_KEEP_IBUF_BITMAP)
- && !dict_index_is_clust(index)
- && page_is_leaf(page)) {
-
- if (page_zip) {
- ibuf_update_free_bits_zip(block, mtr);
- } else {
- ibuf_update_free_bits_low(block, max_ins_size, mtr);
- }
- }
-
- return(err);
-}
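-
-/* When the record size changes, the optimistic update above boils
-down to this delete-and-reinsert sequence (a condensed sketch of
-the code above):
-
-	lock_rec_store_on_page_infimum(block, rec);
-	btr_search_update_hash_on_delete(cursor);
-	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
-	page_cur_move_to_prev(page_cursor);
-	rec = btr_cur_insert_if_possible(cursor, new_entry, ...);
-	lock_rec_restore_from_page_infimum(block, rec, block);
-
-The page infimum acts as a temporary carrier of the explicit locks
-while rec does not exist on the page. */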
-
-/*************************************************************//**
-If, in a split, a new supremum record was created as the predecessor of the
-updated record, the supremum record must inherit exactly the locks on the
-updated record. In the split it may have inherited locks from the successor
-of the updated record, which is not correct. This function restores the
-right locks for the new supremum. */
-static
-void
-btr_cur_pess_upd_restore_supremum(
-/*==============================*/
- buf_block_t* block, /*!< in: buffer block of rec */
- const rec_t* rec, /*!< in: updated record */
- mtr_t* mtr) /*!< in: mtr */
-{
- page_t* page;
- buf_block_t* prev_block;
- ulint space;
- ulint zip_size;
- ulint prev_page_no;
-
- page = buf_block_get_frame(block);
-
- if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
- /* Updated record is not the first user record on its page */
-
- return;
- }
-
- space = buf_block_get_space(block);
- zip_size = buf_block_get_zip_size(block);
- prev_page_no = btr_page_get_prev(page, mtr);
-
- ut_ad(prev_page_no != FIL_NULL);
- prev_block = buf_page_get_with_no_latch(space, zip_size,
- prev_page_no, mtr);
-#ifdef UNIV_BTR_DEBUG
- ut_a(btr_page_get_next(prev_block->frame, mtr)
- == page_get_page_no(page));
-#endif /* UNIV_BTR_DEBUG */
-
- /* We must already have an x-latch on prev_block! */
- ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
-
- lock_rec_reset_and_inherit_gap_locks(prev_block, block,
- PAGE_HEAP_NO_SUPREMUM,
- page_rec_get_heap_no(rec));
-}
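-
-/* Sketch of the situation handled above: after a split, the
-updated record may have become the first user record of block, so
-the gap before it is guarded by the supremum of the previous page:
-
-	prev_block: [ ... | supremum ]	block: [ infimum | rec | ... ]
-
-That supremum must therefore inherit exactly the locks that were
-set on rec, not the locks it may have inherited in the split from
-rec's successor. */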
-
-/*************************************************************//**
-Check if the total length of the modified blob for the row is within 10%
-of the total redo log size. This constraint on the blob length is to
-avoid overwriting the redo logs beyond the last checkpoint lsn.
-@return DB_SUCCESS or DB_TOO_BIG_FOR_REDO. */
-static
-dberr_t
-btr_check_blob_limit(const big_rec_t* big_rec_vec)
-{
- const ib_uint64_t redo_size = srv_n_log_files * srv_log_file_size
- * UNIV_PAGE_SIZE;
- const ib_uint64_t redo_10p = redo_size / 10;
- ib_uint64_t total_blob_len = 0;
- dberr_t err = DB_SUCCESS;
-
- /* Calculate the total number of bytes for blob data */
- for (ulint i = 0; i < big_rec_vec->n_fields; i++) {
- total_blob_len += big_rec_vec->fields[i].len;
- }
-
- if (total_blob_len > redo_10p) {
- ib_logf(IB_LOG_LEVEL_ERROR, "The total blob data"
- " length (" UINT64PF ") is greater than"
- " 10%% of the total redo log size (" UINT64PF
- "). Please increase total redo log size.",
- total_blob_len, redo_size);
- err = DB_TOO_BIG_FOR_REDO;
- }
-
- return(err);
-}
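-
-/* A worked example of the limit above, with illustrative (not
-default) settings: srv_n_log_files = 2 and srv_log_file_size = 3072
-pages of UNIV_PAGE_SIZE = 16384 bytes give
-
-	redo_size = 2 * 3072 * 16384 = 96 MiB, redo_10p = 9.6 MiB
-
-so a row whose BLOB fields total more than about 9.6 MiB would be
-rejected with DB_TOO_BIG_FOR_REDO. */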
-
-/*************************************************************//**
-Performs an update of a record on a page of a tree. It is assumed
-that mtr holds an x-latch on the tree and on the cursor page. If the
-update is made on the leaf level, to avoid deadlocks, mtr must also
-own x-latches to brothers of page, if those brothers exist. We assume
-here that the ordering fields of the record do not change.
-@return DB_SUCCESS or error code */
-UNIV_INTERN
-dberr_t
-btr_cur_pessimistic_update(
-/*=======================*/
- ulint flags, /*!< in: undo logging, locking, and rollback
- flags */
- btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
- cursor may become invalid if *big_rec == NULL
- || !(flags & BTR_KEEP_POS_FLAG) */
- ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
- mem_heap_t** offsets_heap,
- /*!< in/out: pointer to memory heap
- that can be emptied */
- mem_heap_t* entry_heap,
- /*!< in/out: memory heap for allocating
- big_rec and the index tuple */
- big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
- be stored externally by the caller */
-	const upd_t*	update,	/*!< in: update vector; this is also
-					allowed to contain trx id and roll ptr
-					fields, but those values have no effect */
- ulint cmpl_info,/*!< in: compiler info on secondary index
- updates */
- que_thr_t* thr, /*!< in: query thread */
- trx_id_t trx_id, /*!< in: transaction id */
- mtr_t* mtr) /*!< in/out: mini-transaction; must be
- committed before latching any further pages */
-{
- big_rec_t* big_rec_vec = NULL;
- big_rec_t* dummy_big_rec;
- dict_index_t* index;
- buf_block_t* block;
- page_t* page;
- page_zip_des_t* page_zip;
- rec_t* rec;
- page_cur_t* page_cursor;
- dberr_t err;
- dberr_t optim_err;
- roll_ptr_t roll_ptr;
- ibool was_first;
- ulint n_reserved = 0;
- ulint n_ext;
- trx_t* trx;
- ulint max_ins_size = 0;
-
- *offsets = NULL;
- *big_rec = NULL;
-
- block = btr_cur_get_block(cursor);
- page = buf_block_get_frame(block);
- page_zip = buf_block_get_page_zip(block);
- index = cursor->index;
-
- ut_ad(thr_get_trx(thr)->fake_changes
- || mtr_memo_contains(mtr, dict_index_get_lock(index),
- MTR_MEMO_X_LOCK));
- ut_ad(thr_get_trx(thr)->fake_changes
- || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
- /* The insert buffer tree should never be updated in place. */
- ut_ad(!dict_index_is_ibuf(index));
- ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
- || dict_index_is_clust(index));
- ut_ad(thr_get_trx(thr)->id == trx_id
- || (flags & ~BTR_KEEP_POS_FLAG)
- == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
- | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
-
- err = optim_err = btr_cur_optimistic_update(
- flags | BTR_KEEP_IBUF_BITMAP,
- cursor, offsets, offsets_heap, update,
- cmpl_info, thr, trx_id, mtr);
-
- switch (err) {
- case DB_ZIP_OVERFLOW:
- case DB_UNDERFLOW:
- case DB_OVERFLOW:
- break;
- default:
- err_exit:
- /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
- For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
- already reset by btr_cur_update_alloc_zip() if the
- page was recompressed. */
- if (page_zip
- && optim_err != DB_ZIP_OVERFLOW
- && !dict_index_is_clust(index)
- && page_is_leaf(page)) {
- ibuf_update_free_bits_zip(block, mtr);
- }
-
- return(err);
- }
-
- /* Do lock checking and undo logging */
- err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
- update, cmpl_info,
- thr, mtr, &roll_ptr);
- if (err != DB_SUCCESS) {
- goto err_exit;
- }
-
- if (optim_err == DB_OVERFLOW) {
- ulint reserve_flag;
- ulint n_extents;
-
- /* First reserve enough free space for the file segments
- of the index tree, so that the update will not fail because
- of lack of space */
- if (UNIV_UNLIKELY(cursor->tree_height == ULINT_UNDEFINED)) {
- /* When the tree height is uninitialized due to fake
- changes, reserve some hardcoded number of extents. */
- ut_a(thr_get_trx(thr)->fake_changes);
- n_extents = 3;
-		} else {
- n_extents = cursor->tree_height / 16 + 3;
- }
-
- if (flags & BTR_NO_UNDO_LOG_FLAG) {
- reserve_flag = FSP_CLEANING;
- } else {
- reserve_flag = FSP_NORMAL;
- }
-
- if (!fsp_reserve_free_extents(&n_reserved, index->space,
- n_extents, reserve_flag, mtr)) {
- err = DB_OUT_OF_FILE_SPACE;
- goto err_exit;
- }
- }
-
- rec = btr_cur_get_rec(cursor);
-
- *offsets = rec_get_offsets(
- rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
-
- dtuple_t* new_entry = row_rec_to_index_entry(
- rec, index, *offsets, &n_ext, entry_heap);
-
- /* The page containing the clustered index record
- corresponding to new_entry is latched in mtr. If the
- clustered index record is delete-marked, then its externally
- stored fields cannot have been purged yet, because then the
- purge would also have removed the clustered index record
- itself. Thus the following call is safe. */
- row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
- FALSE, entry_heap);
-
- trx = thr_get_trx(thr);
-
- if (!(flags & BTR_KEEP_SYS_FLAG) && UNIV_LIKELY(!trx->fake_changes)) {
- row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
- roll_ptr);
- row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
- trx_id);
- }
-
- if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(*offsets)) {
- /* We are in a transaction rollback undoing a row
- update: we must free possible externally stored fields
- which got new values in the update, if they are not
- inherited values. They can be inherited if we have
- updated the primary key to another value, and then
- update it back again. */
-
- ut_ad(big_rec_vec == NULL);
-
-		/* fake_changes should not cause undo logging, so we
-		should never reach here */
- ut_ad(!(trx->fake_changes));
-
- btr_rec_free_updated_extern_fields(
- index, rec, page_zip, *offsets, update,
- trx_is_recv(thr_get_trx(thr))
- ? RB_RECOVERY : RB_NORMAL, mtr);
- }
-
- /* We have to set appropriate extern storage bits in the new
- record to be inserted: we have to remember which fields were such */
-
- ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
- ut_ad(rec_offs_validate(rec, index, *offsets));
- n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap);
-
- if (page_zip) {
- ut_ad(page_is_comp(page));
- if (page_zip_rec_needs_ext(
- rec_get_converted_size(index, new_entry, n_ext),
- TRUE,
- dict_index_get_n_fields(index),
- page_zip_get_size(page_zip))) {
-
- goto make_external;
- }
- } else if (page_zip_rec_needs_ext(
- rec_get_converted_size(index, new_entry, n_ext),
- page_is_comp(page), 0, 0)) {
-make_external:
- big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
- if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
-
- /* We cannot goto return_after_reservations,
- because we may need to update the
- IBUF_BITMAP_FREE bits, which was suppressed by
- BTR_KEEP_IBUF_BITMAP. */
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip
- || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
- if (n_reserved > 0) {
- fil_space_release_free_extents(
- index->space, n_reserved);
- }
-
- err = DB_TOO_BIG_RECORD;
- goto err_exit;
- }
-
- ut_ad(page_is_leaf(page));
- ut_ad(dict_index_is_clust(index));
- ut_ad(flags & BTR_KEEP_POS_FLAG);
- }
-
- if (UNIV_UNLIKELY(trx->fake_changes)) {
- /* skip CHANGE, LOG */
- err = DB_SUCCESS;
- goto return_after_reservations;
- }
-
- if (big_rec_vec) {
-
- err = btr_check_blob_limit(big_rec_vec);
-
- if (err != DB_SUCCESS) {
- if (n_reserved > 0) {
- fil_space_release_free_extents(
- index->space, n_reserved);
- }
- goto err_exit;
- }
- }
-
-	if (!page_zip) {
-		max_ins_size = page_get_max_insert_size_after_reorganize(
-			page, 1);
-	}
-
- /* Store state of explicit locks on rec on the page infimum record,
- before deleting rec. The page infimum acts as a dummy carrier of the
- locks, taking care also of lock releases, before we can move the locks
- back on the actual record. There is a special case: if we are
- inserting on the root page and the insert causes a call of
- btr_root_raise_and_insert. Therefore we cannot in the lock system
- delete the lock structs set on the root page even if the root
- page carries just node pointers. */
-
- lock_rec_store_on_page_infimum(block, rec);
-
- btr_search_update_hash_on_delete(cursor);
-
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
- page_cursor = btr_cur_get_page_cur(cursor);
-
- page_cur_delete_rec(page_cursor, index, *offsets, mtr);
-
- page_cur_move_to_prev(page_cursor);
-
- rec = btr_cur_insert_if_possible(cursor, new_entry,
- offsets, offsets_heap, n_ext, mtr);
-
- if (rec) {
- page_cursor->rec = rec;
-
- lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
- rec, block);
-
- if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
- /* The new inserted record owns its possible externally
- stored fields */
- btr_cur_unmark_extern_fields(
- page_zip, rec, index, *offsets, mtr);
- }
-
- bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
-
- if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
- if (adjust) {
- rec_offs_make_valid(
- page_cursor->rec, index, *offsets);
- }
- } else if (!dict_index_is_clust(index)
- && page_is_leaf(page)) {
-
- /* Update the free bits in the insert buffer.
- This is the same block which was skipped by
- BTR_KEEP_IBUF_BITMAP. */
- if (page_zip) {
- ibuf_update_free_bits_zip(block, mtr);
- } else {
- ibuf_update_free_bits_low(block, max_ins_size,
- mtr);
- }
- }
-
- err = DB_SUCCESS;
- goto return_after_reservations;
- } else {
- /* If the page is compressed and it initially
- compresses very well, and there is a subsequent insert
- of a badly-compressing record, it is possible for
- btr_cur_optimistic_update() to return DB_UNDERFLOW and
-		btr_cur_insert_if_possible() to return NULL. */
- ut_a(page_zip || optim_err != DB_UNDERFLOW);
-
- /* Out of space: reset the free bits.
- This is the same block which was skipped by
- BTR_KEEP_IBUF_BITMAP. */
- if (!dict_index_is_clust(index) && page_is_leaf(page)) {
- ibuf_reset_free_bits(block);
- }
- }
-
- if (big_rec_vec) {
- ut_ad(page_is_leaf(page));
- ut_ad(dict_index_is_clust(index));
- ut_ad(flags & BTR_KEEP_POS_FLAG);
-
- /* btr_page_split_and_insert() in
- btr_cur_pessimistic_insert() invokes
- mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK).
- We must keep the index->lock when we created a
- big_rec, so that row_upd_clust_rec() can store the
- big_rec in the same mini-transaction. */
-
- mtr_x_lock(dict_index_get_lock(index), mtr);
- }
-
- /* Was the record to be updated positioned as the first user
- record on its page? */
- was_first = page_cur_is_before_first(page_cursor);
-
- /* Lock checks and undo logging were already performed by
- btr_cur_upd_lock_and_undo(). We do not try
- btr_cur_optimistic_insert() because
- btr_cur_insert_if_possible() already failed above. */
-
- err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
- | BTR_NO_LOCKING_FLAG
- | BTR_KEEP_SYS_FLAG,
- cursor, offsets, offsets_heap,
- new_entry, &rec,
- &dummy_big_rec, n_ext, NULL, mtr);
- ut_a(rec);
- ut_a(err == DB_SUCCESS);
- ut_a(dummy_big_rec == NULL);
- ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
- page_cursor->rec = rec;
-
- if (dict_index_is_sec_or_ibuf(index)) {
- /* Update PAGE_MAX_TRX_ID in the index page header.
- It was not updated by btr_cur_pessimistic_insert()
- because of BTR_NO_LOCKING_FLAG. */
- buf_block_t* rec_block;
-
- rec_block = btr_cur_get_block(cursor);
-
- page_update_max_trx_id(rec_block,
- buf_block_get_page_zip(rec_block),
- trx_id, mtr);
- }
-
- if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
- /* The new inserted record owns its possible externally
- stored fields */
- buf_block_t* rec_block = btr_cur_get_block(cursor);
-
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
- page = buf_block_get_frame(rec_block);
-#endif /* UNIV_ZIP_DEBUG */
- page_zip = buf_block_get_page_zip(rec_block);
-
- btr_cur_unmark_extern_fields(page_zip,
- rec, index, *offsets, mtr);
- }
-
- lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
- rec, block);
-
- /* If necessary, restore also the correct lock state for a new,
- preceding supremum record created in a page split. While the old
- record was nonexistent, the supremum might have inherited its locks
- from a wrong record. */
-
- if (!was_first) {
- btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
- rec, mtr);
- }
-
-return_after_reservations:
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
-
- if (n_reserved > 0) {
- fil_space_release_free_extents(index->space, n_reserved);
- }
-
- *big_rec = big_rec_vec;
-
- return(err);
-}
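-
-/* The overall flow of btr_cur_pessimistic_update() above, as a
-condensed sketch:
-
-	1. try btr_cur_optimistic_update(); return unless it fails
-	   with DB_OVERFLOW, DB_UNDERFLOW or DB_ZIP_OVERFLOW
-	2. on DB_OVERFLOW, reserve cursor->tree_height / 16 + 3
-	   extents (e.g. tree_height = 3 gives 3 extents) so that the
-	   page splits cannot run out of tablespace
-	3. build new_entry and, if it does not fit, convert the
-	   longest fields to the externally stored big_rec_vec
-	4. delete rec and try btr_cur_insert_if_possible(); if that
-	   fails, fall back to btr_cur_pessimistic_insert(), which
-	   may split the tree
-	5. restore the explicit locks from the page infimum and, if
-	   needed, fix the supremum of a new predecessor page */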
-
-/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
-
-/****************************************************************//**
-Writes the redo log record for delete marking or unmarking of an index
-record. */
-UNIV_INLINE
-void
-btr_cur_del_mark_set_clust_rec_log(
-/*===============================*/
- rec_t* rec, /*!< in: record */
- dict_index_t* index, /*!< in: index of the record */
- trx_id_t trx_id, /*!< in: transaction id */
- roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */
- mtr_t* mtr) /*!< in: mtr */
-{
- byte* log_ptr;
-
- ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
-
- log_ptr = mlog_open_and_write_index(mtr, rec, index,
- page_rec_is_comp(rec)
- ? MLOG_COMP_REC_CLUST_DELETE_MARK
- : MLOG_REC_CLUST_DELETE_MARK,
- 1 + 1 + DATA_ROLL_PTR_LEN
- + 14 + 2);
-
- if (!log_ptr) {
- /* Logging in mtr is switched off during crash recovery */
- return;
- }
-
- *log_ptr++ = 0;
- *log_ptr++ = 1;
-
- log_ptr = row_upd_write_sys_vals_to_log(
- index, trx_id, roll_ptr, log_ptr, mtr);
- mach_write_to_2(log_ptr, page_offset(rec));
- log_ptr += 2;
-
- mlog_close(mtr, log_ptr);
-}
-#endif /* !UNIV_HOTBACKUP */
-
-/****************************************************************//**
-Parses the redo log record for delete marking or unmarking of a clustered
-index record.
-@return end of log record or NULL */
-UNIV_INTERN
-byte*
-btr_cur_parse_del_mark_set_clust_rec(
-/*=================================*/
- byte* ptr, /*!< in: buffer */
- byte* end_ptr,/*!< in: buffer end */
- page_t* page, /*!< in/out: page or NULL */
- page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
- dict_index_t* index) /*!< in: index corresponding to page */
-{
- ulint flags;
- ulint val;
- ulint pos;
- trx_id_t trx_id;
- roll_ptr_t roll_ptr;
- ulint offset;
- rec_t* rec;
-
- ut_ad(!page
- || !!page_is_comp(page) == dict_table_is_comp(index->table));
-
- if (end_ptr < ptr + 2) {
-
- return(NULL);
- }
-
- flags = mach_read_from_1(ptr);
- ptr++;
- val = mach_read_from_1(ptr);
- ptr++;
-
- ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
-
- if (ptr == NULL) {
-
- return(NULL);
- }
-
- if (end_ptr < ptr + 2) {
-
- return(NULL);
- }
-
- offset = mach_read_from_2(ptr);
- ptr += 2;
-
- ut_a(offset <= UNIV_PAGE_SIZE);
-
- if (page) {
- rec = page + offset;
-
- /* We do not need to reserve btr_search_latch, as the page
- is only being recovered, and there cannot be a hash index to
- it. Besides, these fields are being updated in place
- and the adaptive hash index does not depend on them. */
-
- btr_rec_set_deleted_flag(rec, page_zip, val);
-
- if (!(flags & BTR_KEEP_SYS_FLAG)) {
- mem_heap_t* heap = NULL;
- ulint offsets_[REC_OFFS_NORMAL_SIZE];
- rec_offs_init(offsets_);
-
- row_upd_rec_sys_fields_in_recovery(
- rec, page_zip,
- rec_get_offsets(rec, index, offsets_,
- ULINT_UNDEFINED, &heap),
- pos, trx_id, roll_ptr);
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
- }
- }
-
- return(ptr);
-}
-
-#ifndef UNIV_HOTBACKUP
-/***********************************************************//**
-Marks a clustered index record deleted. Writes an undo log record to
-undo log on this delete marking. Writes in the trx id field the id
-of the deleting transaction, and in the roll ptr field pointer to the
-undo log record created.
-@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
-UNIV_INTERN
-dberr_t
-btr_cur_del_mark_set_clust_rec(
-/*===========================*/
- buf_block_t* block, /*!< in/out: buffer block of the record */
- rec_t* rec, /*!< in/out: record */
- dict_index_t* index, /*!< in: clustered index of the record */
- const ulint* offsets,/*!< in: rec_get_offsets(rec) */
- que_thr_t* thr, /*!< in: query thread */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- roll_ptr_t roll_ptr;
- dberr_t err;
- page_zip_des_t* page_zip;
- trx_t* trx;
-
- ut_ad(dict_index_is_clust(index));
- ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
- ut_ad(buf_block_get_frame(block) == page_align(rec));
- ut_ad(page_is_leaf(page_align(rec)));
-
-#ifdef UNIV_DEBUG
- if (btr_cur_print_record_ops) {
- btr_cur_trx_report(thr_get_trx(thr)->id, index, "del mark ");
- rec_print_new(stderr, rec, offsets);
- }
-#endif /* UNIV_DEBUG */
-
- ut_ad(dict_index_is_clust(index));
- ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
-
- if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
- /* skip LOCK, UNDO, CHANGE, LOG */
- return(DB_SUCCESS);
- }
-
- err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
- rec, index, offsets, thr);
-
- if (err != DB_SUCCESS) {
-
- return(err);
- }
-
- err = trx_undo_report_row_operation(thr,
- index, NULL, NULL, 0, rec, offsets,
- &roll_ptr);
- if (err != DB_SUCCESS) {
-
- return(err);
- }
-
- /* The btr_search_latch is not needed here, because
- the adaptive hash index does not depend on the delete-mark
- and the delete-mark is being updated in place. */
-
- page_zip = buf_block_get_page_zip(block);
-
- btr_blob_dbg_set_deleted_flag(rec, index, offsets, TRUE);
- btr_rec_set_deleted_flag(rec, page_zip, TRUE);
-
- trx = thr_get_trx(thr);
-
- if (dict_index_is_online_ddl(index)) {
- row_log_table_delete(rec, index, offsets, NULL);
- }
-
- row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
-
- btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
- roll_ptr, mtr);
-
- return(err);
-}
-
-/****************************************************************//**
-Writes the redo log record for a delete mark setting of a secondary
-index record. */
-UNIV_INLINE
-void
-btr_cur_del_mark_set_sec_rec_log(
-/*=============================*/
- rec_t* rec, /*!< in: record */
- ibool val, /*!< in: value to set */
- mtr_t* mtr) /*!< in: mtr */
-{
- byte* log_ptr;
- ut_ad(val <= 1);
-
- log_ptr = mlog_open(mtr, 11 + 1 + 2);
-
- if (!log_ptr) {
- /* Logging in mtr is switched off during crash recovery:
- in that case mlog_open returns NULL */
- return;
- }
-
- log_ptr = mlog_write_initial_log_record_fast(
- rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
- mach_write_to_1(log_ptr, val);
- log_ptr++;
-
- mach_write_to_2(log_ptr, page_offset(rec));
- log_ptr += 2;
-
- mlog_close(mtr, log_ptr);
-}
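-
-/* The MLOG_REC_SEC_DELETE_MARK record written above occupies at
-most 11 + 1 + 2 bytes (a sketch of the layout):
-
-	<= 11 bytes	initial log record: type, space id, page no
-			(mlog_write_initial_log_record_fast())
-	1 byte		val, the delete-mark value to set
-	2 bytes		page offset of rec */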
-#endif /* !UNIV_HOTBACKUP */
-
-/****************************************************************//**
-Parses the redo log record for delete marking or unmarking of a secondary
-index record.
-@return end of log record or NULL */
-UNIV_INTERN
-byte*
-btr_cur_parse_del_mark_set_sec_rec(
-/*===============================*/
- byte* ptr, /*!< in: buffer */
- byte* end_ptr,/*!< in: buffer end */
- page_t* page, /*!< in/out: page or NULL */
- page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */
-{
- ulint val;
- ulint offset;
- rec_t* rec;
-
- if (end_ptr < ptr + 3) {
-
- return(NULL);
- }
-
- val = mach_read_from_1(ptr);
- ptr++;
-
- offset = mach_read_from_2(ptr);
- ptr += 2;
-
- ut_a(offset <= UNIV_PAGE_SIZE);
-
- if (page) {
- rec = page + offset;
-
- /* We do not need to reserve btr_search_latch, as the page
- is only being recovered, and there cannot be a hash index to
- it. Besides, the delete-mark flag is being updated in place
- and the adaptive hash index does not depend on it. */
-
- btr_rec_set_deleted_flag(rec, page_zip, val);
- }
-
- return(ptr);
-}
-
-#ifndef UNIV_HOTBACKUP
-/***********************************************************//**
-Sets a secondary index record delete mark to TRUE or FALSE.
-@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
-UNIV_INTERN
-dberr_t
-btr_cur_del_mark_set_sec_rec(
-/*=========================*/
- ulint flags, /*!< in: locking flag */
- btr_cur_t* cursor, /*!< in: cursor */
- ibool val, /*!< in: value to set */
- que_thr_t* thr, /*!< in: query thread */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- buf_block_t* block;
- rec_t* rec;
- dberr_t err;
-
- if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
- /* skip LOCK, CHANGE, LOG */
- return(DB_SUCCESS);
- }
-
- block = btr_cur_get_block(cursor);
- rec = btr_cur_get_rec(cursor);
-
-#ifdef UNIV_DEBUG
- if (btr_cur_print_record_ops) {
- btr_cur_trx_report(thr_get_trx(thr)->id, cursor->index,
- "del mark ");
- rec_print(stderr, rec, cursor->index);
- }
-#endif /* UNIV_DEBUG */
-
- err = lock_sec_rec_modify_check_and_lock(flags,
- btr_cur_get_block(cursor),
- rec, cursor->index, thr, mtr);
- if (err != DB_SUCCESS) {
-
- return(err);
- }
-
- ut_ad(!!page_rec_is_comp(rec)
- == dict_table_is_comp(cursor->index->table));
-
- /* We do not need to reserve btr_search_latch, as the
- delete-mark flag is being updated in place and the adaptive
- hash index does not depend on it. */
- btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
-
- btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
-
- return(DB_SUCCESS);
-}
-
-/***********************************************************//**
-Sets a secondary index record's delete mark to the given value. This
-function is only used by the insert buffer merge mechanism. */
-UNIV_INTERN
-void
-btr_cur_set_deleted_flag_for_ibuf(
-/*==============================*/
- rec_t* rec, /*!< in/out: record */
- page_zip_des_t* page_zip, /*!< in/out: compressed page
- corresponding to rec, or NULL
- when the tablespace is
- uncompressed */
- ibool val, /*!< in: value to set */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- /* We do not need to reserve btr_search_latch, as the page
- has just been read to the buffer pool and there cannot be
- a hash index to it. Besides, the delete-mark flag is being
- updated in place and the adaptive hash index does not depend
- on it. */
-
- btr_rec_set_deleted_flag(rec, page_zip, val);
-
- btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
-}
-
-/*==================== B-TREE RECORD REMOVE =========================*/
-
-/*************************************************************//**
-Tries to compress a page of the tree if it seems useful. It is assumed
-that mtr holds an x-latch on the tree and on the cursor page. To avoid
-deadlocks, mtr must also own x-latches to brothers of page, if those
-brothers exist. NOTE: it is assumed that the caller has reserved enough
-free extents so that the compression will always succeed if done!
-@return TRUE if compression occurred */
-UNIV_INTERN
-ibool
-btr_cur_compress_if_useful(
-/*=======================*/
- btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
- cursor does not stay valid if !adjust and
- compression occurs */
- ibool adjust, /*!< in: TRUE if should adjust the
- cursor position even if compression occurs */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ut_ad(mtr_memo_contains(mtr,
- dict_index_get_lock(btr_cur_get_index(cursor)),
- MTR_MEMO_X_LOCK));
- ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
- MTR_MEMO_PAGE_X_FIX));
-
- return(btr_cur_compress_recommendation(cursor, mtr)
- && btr_compress(cursor, adjust, mtr));
-}
-
-/*******************************************************//**
-Removes the record on which the tree cursor is positioned on a leaf page.
-It is assumed that the mtr has an x-latch on the page where the cursor is
-positioned, but no latch on the whole tree.
-@return TRUE if success, i.e., the page did not become too empty */
-UNIV_INTERN
-ibool
-btr_cur_optimistic_delete_func(
-/*===========================*/
- btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
- delete; cursor stays valid: if deletion
- succeeds, on function exit it points to the
- successor of the deleted record */
-#ifdef UNIV_DEBUG
- ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
-#endif /* UNIV_DEBUG */
- mtr_t* mtr) /*!< in: mtr; if this function returns
- TRUE on a leaf page of a secondary
- index, the mtr must be committed
- before latching any further pages */
-{
- buf_block_t* block;
- rec_t* rec;
- mem_heap_t* heap = NULL;
- ulint offsets_[REC_OFFS_NORMAL_SIZE];
- ulint* offsets = offsets_;
- ibool no_compress_needed;
- rec_offs_init(offsets_);
-
- ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
- ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
- MTR_MEMO_PAGE_X_FIX));
- /* This is intended only for leaf page deletions */
-
- block = btr_cur_get_block(cursor);
-
- SRV_CORRUPT_TABLE_CHECK(block, return(DB_CORRUPTION););
-
- ut_ad(page_is_leaf(buf_block_get_frame(block)));
- ut_ad(!dict_index_is_online_ddl(cursor->index)
- || dict_index_is_clust(cursor->index)
- || (flags & BTR_CREATE_FLAG));
-
- rec = btr_cur_get_rec(cursor);
- offsets = rec_get_offsets(rec, cursor->index, offsets,
- ULINT_UNDEFINED, &heap);
-
- no_compress_needed = !rec_offs_any_extern(offsets)
- && btr_cur_can_delete_without_compress(
- cursor, rec_offs_size(offsets), mtr);
-
- if (no_compress_needed) {
-
- page_t* page = buf_block_get_frame(block);
-		page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
-
- lock_update_delete(block, rec);
-
- btr_search_update_hash_on_delete(cursor);
-
- if (page_zip) {
-#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(page_zip, page, cursor->index));
-#endif /* UNIV_ZIP_DEBUG */
- page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- cursor->index, offsets, mtr);
-#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(page_zip, page, cursor->index));
-#endif /* UNIV_ZIP_DEBUG */
-
- /* On compressed pages, the IBUF_BITMAP_FREE
- space is not affected by deleting (purging)
- records, because it is defined as the minimum
- of space available *without* reorganize, and
- space available in the modification log. */
- } else {
- const ulint max_ins
- = page_get_max_insert_size_after_reorganize(
- page, 1);
-
- page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- cursor->index, offsets, mtr);
-
- /* The change buffer does not handle inserts
- into non-leaf pages, into clustered indexes,
- or into the change buffer. */
- if (page_is_leaf(page)
- && !dict_index_is_clust(cursor->index)
- && !dict_index_is_ibuf(cursor->index)) {
- ibuf_update_free_bits_low(block, max_ins, mtr);
- }
- }
- }
-
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
-
- return(no_compress_needed);
-}
-
-/*************************************************************//**
-Removes the record on which the tree cursor is positioned. Tries
-to compress the page if its fillfactor drops below a threshold
-or if it is the only page on the level. It is assumed that mtr holds
-an x-latch on the tree and on the cursor page. To avoid deadlocks,
-mtr must also own x-latches to brothers of page, if those brothers
-exist.
-@return TRUE if compression occurred */
-UNIV_INTERN
-ibool
-btr_cur_pessimistic_delete(
-/*=======================*/
- dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
- the latter may occur because we may have
- to update node pointers on upper levels,
- and in the case of variable length keys
- these may actually grow in size */
- ibool has_reserved_extents, /*!< in: TRUE if the
- caller has already reserved enough free
-					extents, so that the operation is
-					known to succeed */
- btr_cur_t* cursor, /*!< in: cursor on the record to delete;
- if compression does not occur, the cursor
- stays valid: it points to successor of
- deleted record on function exit */
- ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
- enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
- mtr_t* mtr) /*!< in: mtr */
-{
- buf_block_t* block;
- page_t* page;
- page_zip_des_t* page_zip;
- dict_index_t* index;
- rec_t* rec;
- ulint n_reserved = 0;
- ibool success;
- ibool ret = FALSE;
- ulint level;
- mem_heap_t* heap;
- ulint* offsets;
-
- block = btr_cur_get_block(cursor);
- page = buf_block_get_frame(block);
- index = btr_cur_get_index(cursor);
-
- ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
- ut_ad(!dict_index_is_online_ddl(index)
- || dict_index_is_clust(index)
- || (flags & BTR_CREATE_FLAG));
- ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
- MTR_MEMO_X_LOCK));
- ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
- if (!has_reserved_extents) {
- /* First reserve enough free space for the file segments
- of the index tree, so that the node pointer updates will
- not fail because of lack of space */
-
- ut_a(cursor->tree_height != ULINT_UNDEFINED);
-
- ulint n_extents = cursor->tree_height / 32 + 1;
-
- success = fsp_reserve_free_extents(&n_reserved,
- index->space,
- n_extents,
- FSP_CLEANING, mtr);
- if (!success) {
- *err = DB_OUT_OF_FILE_SPACE;
-
- return(FALSE);
- }
- }
-
- heap = mem_heap_create(1024);
- rec = btr_cur_get_rec(cursor);
- page_zip = buf_block_get_page_zip(block);
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
-
- offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
-
- if (rec_offs_any_extern(offsets)) {
- btr_rec_free_externally_stored_fields(index,
- rec, offsets, page_zip,
- rb_ctx, mtr);
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
- }
-
- if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
- && UNIV_UNLIKELY(dict_index_get_page(index)
- != buf_block_get_page_no(block))) {
-
- /* If there is only one record, drop the whole page in
- btr_discard_page, if this is not the root page */
-
- btr_discard_page(cursor, mtr);
-
- ret = TRUE;
-
- goto return_after_reservations;
- }
-
- if (flags == 0) {
- lock_update_delete(block, rec);
- }
-
- level = btr_page_get_level(page, mtr);
-
- if (level > 0
- && UNIV_UNLIKELY(rec == page_rec_get_next(
- page_get_infimum_rec(page)))) {
-
- rec_t* next_rec = page_rec_get_next(rec);
-
- if (btr_page_get_prev(page, mtr) == FIL_NULL) {
-
- /* If we delete the leftmost node pointer on a
- non-leaf level, we must mark the new leftmost node
- pointer as the predefined minimum record */
-
- /* This will make page_zip_validate() fail until
- page_cur_delete_rec() completes. This is harmless,
- because everything will take place within a single
- mini-transaction and because writing to the redo log
- is an atomic operation (performed by mtr_commit()). */
- btr_set_min_rec_mark(next_rec, mtr);
- } else {
- /* Otherwise, if we delete the leftmost node pointer
- on a page, we have to change the father node pointer
- so that it is equal to the new leftmost node pointer
- on the page */
-
- btr_node_ptr_delete(index, block, mtr);
-
- dtuple_t* node_ptr = dict_index_build_node_ptr(
- index, next_rec, buf_block_get_page_no(block),
- heap, level);
-
- btr_insert_on_non_leaf_level(
- flags, index, level + 1, node_ptr, mtr);
- }
- }
-
- btr_search_update_hash_on_delete(cursor);
-
- page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
-#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
-
- ut_ad(btr_check_node_ptr(index, block, mtr));
-
-return_after_reservations:
- *err = DB_SUCCESS;
-
- mem_heap_free(heap);
-
- if (ret == FALSE) {
- ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
- }
-
- if (n_reserved > 0) {
- fil_space_release_free_extents(index->space, n_reserved);
- }
-
- return(ret);
-}
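-
-/* A worked example of the reservation above: for a cursor with
-tree_height = 3, n_extents = 3 / 32 + 1 = 1 extent is reserved
-with FSP_CLEANING, enough for the node pointer updates that a page
-merge or discard may trigger; the reservation is released at the
-end through fil_space_release_free_extents(). */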
-
-/*******************************************************************//**
-Adds path information to the cursor for the current page, for which
-the binary search has been performed. */
-static
-void
-btr_cur_add_path_info(
-/*==================*/
- btr_cur_t* cursor, /*!< in: cursor positioned on a page */
- ulint height, /*!< in: height of the page in tree;
- 0 means leaf node */
- ulint root_height) /*!< in: root node height in tree */
-{
- btr_path_t* slot;
- const rec_t* rec;
- const page_t* page;
-
- ut_a(cursor->path_arr);
-
- if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
- /* Do nothing; return empty path */
-
- slot = cursor->path_arr;
- slot->nth_rec = ULINT_UNDEFINED;
-
- return;
- }
-
- if (height == 0) {
- /* Mark end of slots for path */
- slot = cursor->path_arr + root_height + 1;
- slot->nth_rec = ULINT_UNDEFINED;
- }
-
- rec = btr_cur_get_rec(cursor);
-
- slot = cursor->path_arr + (root_height - height);
-
- page = page_align(rec);
-
- slot->nth_rec = page_rec_get_n_recs_before(rec);
- slot->n_recs = page_get_n_recs(page);
- slot->page_no = page_get_page_no(page);
- slot->page_level = btr_page_get_level_low(page);
-}
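-
-/* Example of the path array filled above for a three-level tree
-(root_height = 2), with made-up record counts:
-
-	path_arr[0]  root,   height 2: nth_rec = 4,  n_recs = 7
-	path_arr[1]  middle, height 1: nth_rec = 12, n_recs = 90
-	path_arr[2]  leaf,   height 0: nth_rec = 33, n_recs = 120
-	path_arr[3]  nth_rec = ULINT_UNDEFINED (end-of-path marker)
-
-The slot index is root_height - height, so the root always lands
-in slot 0. */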
-
-/*******************************************************************//**
-Estimate the number of rows between slot1 and slot2 for any level on a
-B-tree. This function starts from slot1->page and reads a few pages to
-the right, counting their records. If we reach slot2->page quickly then
-we know exactly how many records there are between slot1 and slot2 and
-we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
-then we calculate the average number of records in the pages scanned
-so far and assume that all pages that we did not scan up to slot2->page
-contain the same number of records, then we multiply that average by
-the number of pages between slot1->page and slot2->page (which is
-n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
-@return number of rows (exact or estimated) */
-static
-ib_int64_t
-btr_estimate_n_rows_in_range_on_level(
-/*==================================*/
- dict_index_t* index, /*!< in: index */
- btr_path_t* slot1, /*!< in: left border */
- btr_path_t* slot2, /*!< in: right border */
- ib_int64_t n_rows_on_prev_level, /*!< in: number of rows
- on the previous level for the
- same descend paths; used to
- determine the numbe of pages
- on this level */
- ibool* is_n_rows_exact) /*!< out: TRUE if the returned
- value is exact i.e. not an
- estimation */
-{
- ulint space;
- ib_int64_t n_rows;
- ulint n_pages_read;
- ulint page_no;
- ulint zip_size;
- ulint level;
-
- space = dict_index_get_space(index);
-
- n_rows = 0;
- n_pages_read = 0;
-
- /* Assume by default that we will scan all pages between
- slot1->page_no and slot2->page_no */
- *is_n_rows_exact = TRUE;
-
- /* add records from slot1->page_no which are to the right of
- the record which serves as a left border of the range, if any */
- if (slot1->nth_rec < slot1->n_recs) {
- n_rows += slot1->n_recs - slot1->nth_rec;
- }
-
- /* add records from slot2->page_no which are to the left of
-	the record which serves as a right border of the range, if any */
- if (slot2->nth_rec > 1) {
- n_rows += slot2->nth_rec - 1;
- }
-
- /* count the records in the pages between slot1->page_no and
- slot2->page_no (non inclusive), if any */
-
- zip_size = fil_space_get_zip_size(space);
-
-	/* Do not read more than this number of pages, so that this
-	code, which only computes an estimate, does not hurt
-	performance. If we read this many pages before reaching
-	slot2->page_no, we estimate the average from the pages scanned
-	so far. */
-# define N_PAGES_READ_LIMIT 10
-
- page_no = slot1->page_no;
- level = slot1->page_level;
-
- do {
- mtr_t mtr;
- page_t* page;
- buf_block_t* block;
-		dberr_t		err = DB_SUCCESS;
-
- mtr_start(&mtr);
-
- /* Fetch the page. Because we are not holding the
- index->lock, the tree may have changed and we may be
- attempting to read a page that is no longer part of
- the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
- silence a debug assertion about this. */
- block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
- NULL, BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, &mtr, &err);
-
- ut_ad((block != NULL) == (err == DB_SUCCESS));
-
- if (err != DB_SUCCESS) {
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name);
- index->table->file_unreadable = true;
- }
-
- mtr_commit(&mtr);
- goto inexact;
- }
-
- page = buf_block_get_frame(block);
-
- /* It is possible that the tree has been reorganized in the
- meantime and this is a different page. If this happens the
- calculated estimate will be bogus, which is not fatal as
- this is only an estimate. We are sure that a page with
- page_no exists because InnoDB never frees pages, only
- reuses them. */
- if (fil_page_get_type(page) != FIL_PAGE_INDEX
- || btr_page_get_index_id(page) != index->id
- || btr_page_get_level_low(page) != level) {
-
- /* The page got reused for something else */
- mtr_commit(&mtr);
- goto inexact;
- }
-
- /* It is possible but highly unlikely that the page was
- originally written by an old version of InnoDB that did
- not initialize FIL_PAGE_TYPE on other than B-tree pages.
- For example, this could be an almost-empty BLOB page
- that happens to contain the magic values in the fields
- that we checked above. */
-
- n_pages_read++;
-
- if (page_no != slot1->page_no) {
- /* Do not count the records on slot1->page_no,
- we already counted them before this loop. */
- n_rows += page_get_n_recs(page);
- }
-
- page_no = btr_page_get_next(page, &mtr);
-
- mtr_commit(&mtr);
-
- if (n_pages_read == N_PAGES_READ_LIMIT
- || page_no == FIL_NULL) {
-			/* Either we read too many pages, or
-			we reached the end of the level without passing
-			through slot2->page_no; the tree must have changed
-			in the meantime. */
- goto inexact;
- }
-
- } while (page_no != slot2->page_no);
-
- return(n_rows);
-
-inexact:
-
- *is_n_rows_exact = FALSE;
-
-	/* We interrupted the scan before reaching slot2->page_no */
-
- if (n_pages_read > 0) {
- /* The number of pages on this level is
- n_rows_on_prev_level, multiply it by the
- average number of recs per page so far */
- n_rows = n_rows_on_prev_level
- * n_rows / n_pages_read;
- } else {
- /* The tree changed before we could even
- start with slot1->page_no */
- n_rows = 10;
- }
-
- return(n_rows);
-}
-
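-/* Worked example (illustrative, not from the source): suppose the scan
-above hit N_PAGES_READ_LIMIT after reading 10 pages that contained 2000
-records in total, while n_rows_on_prev_level says the range spans 100
-pages on this level. The extrapolation in the "inexact" branch then
-returns 100 * 2000 / 10 = 20000 rows. */
-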
-/** If the tree gets changed too much between the two dives for the left
-and right boundary then btr_estimate_n_rows_in_range_low() will retry
-that many times before giving up and returning the value stored in
-rows_in_range_arbitrary_ret_val. */
-static const unsigned rows_in_range_max_retries = 4;
-
-/** We pretend that a range has that many records if the tree keeps changing
-for rows_in_range_max_retries retries while we try to estimate the records
-in a given range. */
-static const ib_int64_t rows_in_range_arbitrary_ret_val = 10;
-
-/** Estimates the number of rows in a given index range.
-@param[in] index index
-@param[in] tuple1 range start, may also be empty tuple
-@param[in] mode1 search mode for range start
-@param[in] tuple2 range end, may also be empty tuple
-@param[in] mode2 search mode for range end
-@param[in] trx trx
-@param[in] nth_attempt if the tree gets modified too much while
-we are trying to analyze it, then we will retry (this function will call
-itself, incrementing this parameter)
-@return estimated number of rows; if after rows_in_range_max_retries
-retries the tree keeps changing, then we will just return
-rows_in_range_arbitrary_ret_val as a result (if
-nth_attempt >= rows_in_range_max_retries and the tree is modified between
-the two dives). */
-static
-ib_int64_t
-btr_estimate_n_rows_in_range_low(
- dict_index_t* index,
- const dtuple_t* tuple1,
- ulint mode1,
- const dtuple_t* tuple2,
- ulint mode2,
- trx_t* trx,
- unsigned nth_attempt)
-{
- btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
- btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
- btr_cur_t cursor;
- btr_path_t* slot1;
- btr_path_t* slot2;
- ibool diverged;
- ibool diverged_lot;
- ulint divergence_level;
- ib_int64_t n_rows;
- ibool is_n_rows_exact;
- ulint i;
- mtr_t mtr;
- ib_int64_t table_n_rows;
-
- table_n_rows = dict_table_get_n_rows(index->table);
-
- mtr_start_trx(&mtr, trx);
-
- cursor.path_arr = path1;
-
- if (dtuple_get_n_fields(tuple1) > 0) {
-
- btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
- BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0,
- __FILE__, __LINE__, &mtr);
- } else {
- btr_cur_open_at_index_side(true, index,
- BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
- }
-
- mtr_commit(&mtr);
-
- if (index->table->file_unreadable) {
- return (0);
- }
-
- mtr_start_trx(&mtr, trx);
-
-#ifdef UNIV_DEBUG
- if (!strcmp(index->name, "iC")) {
- DEBUG_SYNC_C("btr_estimate_n_rows_in_range_between_dives");
- }
-#endif
-
- cursor.path_arr = path2;
-
- if (dtuple_get_n_fields(tuple2) > 0) {
-
- btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
- BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0,
- __FILE__, __LINE__, &mtr);
- } else {
- btr_cur_open_at_index_side(false, index,
- BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
- }
-
- mtr_commit(&mtr);
-
- /* We have the path information for the range in path1 and path2 */
-
- n_rows = 1;
- is_n_rows_exact = TRUE;
- diverged = FALSE; /* This becomes true when the path is not
- the same any more */
- diverged_lot = FALSE; /* This becomes true when the paths are
- not the same or adjacent any more */
- divergence_level = 1000000; /* This is the level where paths diverged
- a lot */
- for (i = 0; ; i++) {
- ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
-
- slot1 = path1 + i;
- slot2 = path2 + i;
-
- if (slot1->nth_rec == ULINT_UNDEFINED
- || slot2->nth_rec == ULINT_UNDEFINED) {
-
- if (i > divergence_level + 1 && !is_n_rows_exact) {
- /* In trees whose height is > 1 our algorithm
- tends to underestimate: multiply the estimate
- by 2: */
-
- n_rows = n_rows * 2;
- }
-
- DBUG_EXECUTE_IF("bug14007649", return(n_rows););
-
- /* Do not estimate the number of rows in the range
- to over 1 / 2 of the estimated rows in the whole
- table */
-
- if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
-
- n_rows = table_n_rows / 2;
-
- /* If there are just 0 or 1 rows in the table,
- then we estimate all rows are in the range */
-
- if (n_rows == 0) {
- n_rows = table_n_rows;
- }
- }
-
- return(n_rows);
- }
-
- if (!diverged && slot1->nth_rec != slot2->nth_rec) {
-
-			/* If the two slots do not point to the same page,
-			or if the paths have crossed and the page that both
-			point to apparently contains a different number of
-			records on each path, then the tree must have changed
-			between the dive for slot1 and the dive for slot2 at
-			the beginning of this function. */
- if (slot1->page_no != slot2->page_no
- || slot1->page_level != slot2->page_level
- || (slot1->nth_rec >= slot2->nth_rec
- && slot1->n_recs != slot2->n_recs)) {
-
- /* If the tree keeps changing even after a
- few attempts, then just return some arbitrary
- number. */
- if (nth_attempt >= rows_in_range_max_retries) {
- return(rows_in_range_arbitrary_ret_val);
- }
-
- const ib_int64_t ret =
- btr_estimate_n_rows_in_range_low(
- index, tuple1, mode1,
- tuple2, mode2, trx,
- nth_attempt + 1);
-
- return(ret);
- }
-
- diverged = TRUE;
-
- if (slot1->nth_rec < slot2->nth_rec) {
- n_rows = slot2->nth_rec - slot1->nth_rec;
-
- if (n_rows > 1) {
- diverged_lot = TRUE;
- divergence_level = i;
- }
- } else {
- /* It is possible that
- slot1->nth_rec >= slot2->nth_rec
- if, for example, we have a single page
- tree which contains (inf, 5, 6, supr)
- and we select where x > 20 and x < 30;
- in this case slot1->nth_rec will point
- to the supr record and slot2->nth_rec
- will point to 6 */
- return(0);
- }
-
- } else if (diverged && !diverged_lot) {
-
- if (slot1->nth_rec < slot1->n_recs
- || slot2->nth_rec > 1) {
-
- diverged_lot = TRUE;
- divergence_level = i;
-
- n_rows = 0;
-
- if (slot1->nth_rec < slot1->n_recs) {
- n_rows += slot1->n_recs
- - slot1->nth_rec;
- }
-
- if (slot2->nth_rec > 1) {
- n_rows += slot2->nth_rec - 1;
- }
- }
- } else if (diverged_lot) {
-
- n_rows = btr_estimate_n_rows_in_range_on_level(
- index, slot1, slot2, n_rows,
- &is_n_rows_exact);
- }
- }
-}
-
-/** Estimates the number of rows in a given index range.
-@param[in] index index
-@param[in] tuple1 range start, may also be empty tuple
-@param[in] mode1 search mode for range start
-@param[in] tuple2 range end, may also be empty tuple
-@param[in] mode2 search mode for range end
-@param[in] trx trx
-@return estimated number of rows */
-ib_int64_t
-btr_estimate_n_rows_in_range(
- dict_index_t* index,
- const dtuple_t* tuple1,
- ulint mode1,
- const dtuple_t* tuple2,
- ulint mode2,
- trx_t* trx)
-{
- const ib_int64_t ret = btr_estimate_n_rows_in_range_low(
- index, tuple1, mode1, tuple2, mode2, trx,
- 1 /* first attempt */);
-
- return(ret);
-}
-
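-/* Usage sketch (hypothetical, for illustration only): a caller that has
-built search tuples for a range [low, high] might estimate the row count
-as follows, assuming PAGE_CUR_GE / PAGE_CUR_LE search modes:
-
-	ib_int64_t	n_rows = btr_estimate_n_rows_in_range(
-		index, low_tuple, PAGE_CUR_GE, high_tuple, PAGE_CUR_LE,
-		trx);
-*/
-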
-/*******************************************************************//**
-Record the number of non-null key values in a given index for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are eventually stored in the array:
-index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
-static
-void
-btr_record_not_null_field_in_rec(
-/*=============================*/
-	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
-					number of columns that uniquely
-					determine an index entry */
- const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
- its size could be for all fields or
- that of "n_unique" */
- ib_uint64_t* n_not_null) /*!< in/out: array to record number of
- not null rows for n-column prefix */
-{
- ulint i;
-
- ut_ad(rec_offs_n_fields(offsets) >= n_unique);
-
- if (n_not_null == NULL) {
- return;
- }
-
- for (i = 0; i < n_unique; i++) {
- if (rec_offs_nth_sql_null(offsets, i)) {
- break;
- }
-
- n_not_null[i]++;
- }
-}
-
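-/* Example (illustrative): with a 3-column unique prefix, a record whose
-key is (1, NULL, 3) increments n_not_null[0] only, because the loop
-above stops at the first SQL NULL; a record (1, 2, 3) increments
-n_not_null[0], n_not_null[1] and n_not_null[2]. */
-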
-/*******************************************************************//**
-Estimates the number of different key values in a given index, for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
-0..n_uniq-1) and the number of pages that were sampled is saved in
-index->stat_n_sample_sizes[].
-If innodb_stats_method is nulls_ignored, we also record the number of
-non-null values for each prefix and store the estimates in the
-array index->stat_n_non_null_key_vals[]. */
-UNIV_INTERN
-void
-btr_estimate_number_of_different_key_vals(
-/*======================================*/
- dict_index_t* index) /*!< in: index */
-{
- btr_cur_t cursor;
- page_t* page;
- rec_t* rec;
- ulint n_cols;
- ulint matched_fields;
- ulint matched_bytes;
- ib_uint64_t* n_diff;
- ib_uint64_t* n_not_null;
- ibool stats_null_not_equal;
-	ullint		n_sample_pages = 1; /* number of pages to sample */
- ulint not_empty_flag = 0;
- ulint total_external_size = 0;
- ulint i;
- ulint j;
- ullint add_on;
- mtr_t mtr;
- mem_heap_t* heap = NULL;
- ulint* offsets_rec = NULL;
- ulint* offsets_next_rec = NULL;
-
- n_cols = dict_index_get_n_unique(index);
-
- heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
- * n_cols
- + dict_index_get_n_fields(index)
- * (sizeof *offsets_rec
- + sizeof *offsets_next_rec));
-
-	n_diff = (ib_uint64_t*) mem_heap_zalloc(
-		heap, n_cols * sizeof *n_diff);
-
- n_not_null = NULL;
-
- /* Check srv_innodb_stats_method setting, and decide whether we
- need to record non-null value and also decide if NULL is
- considered equal (by setting stats_null_not_equal value) */
- switch (srv_innodb_stats_method) {
- case SRV_STATS_NULLS_IGNORED:
- n_not_null = (ib_uint64_t*) mem_heap_zalloc(
- heap, n_cols * sizeof *n_not_null);
- /* fall through */
-
- case SRV_STATS_NULLS_UNEQUAL:
- /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
- case, we will treat NULLs as unequal value */
- stats_null_not_equal = TRUE;
- break;
-
- case SRV_STATS_NULLS_EQUAL:
- stats_null_not_equal = FALSE;
- break;
-
- default:
- ut_error;
- }
-
- if (srv_stats_sample_traditional) {
- /* It makes no sense to test more pages than are contained
- in the index, thus we lower the number if it is too high */
- if (srv_stats_transient_sample_pages > index->stat_index_size) {
- if (index->stat_index_size > 0) {
- n_sample_pages = index->stat_index_size;
- }
- } else {
- n_sample_pages = srv_stats_transient_sample_pages;
- }
- } else {
-		/* New logarithmic number of pages to estimate.
-		The number of pages estimated should be between 1 and
-		index->stat_index_size.
-
-		If we have only 0 or 1 index pages then we can only take 1
-		sample. We have already initialized n_sample_pages to 1.
-
-		Writing the index size as I, the sample size as S and
-		log2(I) * S as L:
-
-		requirement 1) the result of the expression must not exceed I;
-		requirement 2) the number of sampled pages should be at least S;
-		so the expression is min(I, max(min(S, I), L)).
-
-		Looking for simplifications:
-
-		case 1: assume S < I
-		min(I, max(min(S, I), L)) -> min(I, max(S, L))
-
-		but since L = log2(I) * S and log2(I) >= 1, we always have
-		L >= S, so max(S, L) = L and we are left with min(I, L).
-
-		case 2: assume I <= S
-		min(I, max(min(S, I), L)) -> min(I, max(I, L))
-
-		case 2a: L > I
-		min(I, max(I, L)) -> min(I, L) -> I
-
-		case 2b: L < I
-		min(I, max(I, L)) -> min(I, I) -> I
-
-		so all case 2 paths yield I, and the expression becomes:
-		n_pages = S < I ? min(I, L) : I
-		*/
- if (index->stat_index_size > 1) {
- n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size) ?
- (ulint) ut_min((double) index->stat_index_size,
- log2(index->stat_index_size)*srv_stats_transient_sample_pages)
- : index->stat_index_size;
-
- }
- }
-
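-	/* Worked example (illustrative, not from the source): with
-	I = index->stat_index_size = 1024 and S =
-	srv_stats_transient_sample_pages = 20, S < I and
-	L = log2(1024) * 20 = 200, so n_sample_pages = min(1024, 200)
-	= 200. With I = 8 and S = 20, S >= I, so n_sample_pages = I = 8. */
-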
- /* Sanity check */
- ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size < 1 ? 1 : index->stat_index_size));
-
- /* We sample some pages in the index to get an estimate */
-
- for (i = 0; i < n_sample_pages; i++) {
- mtr_start(&mtr);
-
- btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
-
- /* Count the number of different key values for each prefix of
- the key on this index page. If the prefix does not determine
- the index record uniquely in the B-tree, then we subtract one
- because otherwise our algorithm would give a wrong estimate
- for an index where there is just one key value. */
-
- if (index->table->file_unreadable) {
- mtr_commit(&mtr);
- goto exit_loop;
- }
-
- page = btr_cur_get_page(&cursor);
-
- DBUG_EXECUTE_IF("ib_corrupt_page_while_stats_calc",
- page = NULL;);
-
- SRV_CORRUPT_TABLE_CHECK(page,
- {
- mtr_commit(&mtr);
- goto exit_loop;
- });
-
- rec = page_rec_get_next(page_get_infimum_rec(page));
-
- if (!page_rec_is_supremum(rec)) {
- not_empty_flag = 1;
- offsets_rec = rec_get_offsets(rec, index, offsets_rec,
- ULINT_UNDEFINED, &heap);
-
- if (n_not_null != NULL) {
- btr_record_not_null_field_in_rec(
- n_cols, offsets_rec, n_not_null);
- }
- }
-
- while (!page_rec_is_supremum(rec)) {
- rec_t* next_rec = page_rec_get_next(rec);
- if (page_rec_is_supremum(next_rec)) {
- total_external_size +=
- btr_rec_get_externally_stored_len(
- rec, offsets_rec);
- break;
- }
-
- matched_fields = 0;
- matched_bytes = 0;
- offsets_next_rec = rec_get_offsets(next_rec, index,
- offsets_next_rec,
- ULINT_UNDEFINED,
- &heap);
-
- cmp_rec_rec_with_match(rec, next_rec,
- offsets_rec, offsets_next_rec,
- index, stats_null_not_equal,
- &matched_fields,
- &matched_bytes);
-
- for (j = matched_fields; j < n_cols; j++) {
- /* We add one if this index record has
- a different prefix from the previous */
-
- n_diff[j]++;
- }
-
- if (n_not_null != NULL) {
- btr_record_not_null_field_in_rec(
- n_cols, offsets_next_rec, n_not_null);
- }
-
- total_external_size
- += btr_rec_get_externally_stored_len(
- rec, offsets_rec);
-
- rec = next_rec;
- /* Initialize offsets_rec for the next round
- and assign the old offsets_rec buffer to
- offsets_next_rec. */
- {
- ulint* offsets_tmp = offsets_rec;
- offsets_rec = offsets_next_rec;
- offsets_next_rec = offsets_tmp;
- }
- }
-
- if (n_cols == dict_index_get_n_unique_in_tree(index)) {
-
- /* If there is more than one leaf page in the tree,
- we add one because we know that the first record
- on the page certainly had a different prefix than the
- last record on the previous index page in the
- alphabetical order. Before this fix, if there was
- just one big record on each clustered index page, the
- algorithm grossly underestimated the number of rows
- in the table. */
-
- if (btr_page_get_prev(page, &mtr) != FIL_NULL
- || btr_page_get_next(page, &mtr) != FIL_NULL) {
-
- n_diff[n_cols - 1]++;
- }
- }
-
- mtr_commit(&mtr);
- }
-
-exit_loop:
- /* If we saw k borders between different key values on
- n_sample_pages leaf pages, we can estimate how many
- there will be in index->stat_n_leaf_pages */
-
- /* We must take into account that our sample actually represents
- also the pages used for external storage of fields (those pages are
- included in index->stat_n_leaf_pages) */
-
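-	/* Worked example (illustrative): BTR_TABLE_STATS_FROM_SAMPLE()
-	essentially scales the sampled border count to the whole leaf
-	level; if we saw n_diff[j] = 50 borders on n_sample_pages = 10
-	sampled pages and the index has 1000 leaf pages, the scaled
-	estimate is on the order of 50 * 1000 / 10 = 5000 distinct values,
-	before the add_on correction applied below. */
-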
- for (j = 0; j < n_cols; j++) {
- index->stat_n_diff_key_vals[j]
- = BTR_TABLE_STATS_FROM_SAMPLE(
- n_diff[j], index, n_sample_pages,
- total_external_size, not_empty_flag);
-
- /* If the tree is small, smaller than
- 10 * n_sample_pages + total_external_size, then
- the above estimate is ok. For bigger trees it is common that we
- do not see any borders between key values in the few pages
- we pick. But still there may be n_sample_pages
- different key values, or even more. Let us try to approximate
- that: */
-
- add_on = index->stat_n_leaf_pages
- / (10 * (n_sample_pages
- + total_external_size));
-
- if (add_on > n_sample_pages) {
- add_on = n_sample_pages;
- }
-
- index->stat_n_diff_key_vals[j] += add_on;
-
- index->stat_n_sample_sizes[j] = n_sample_pages;
-
- /* Update the stat_n_non_null_key_vals[] with our
- sampled result. stat_n_non_null_key_vals[] is created
- and initialized to zero in dict_index_add_to_cache(),
- along with stat_n_diff_key_vals[] array */
- if (n_not_null != NULL) {
- index->stat_n_non_null_key_vals[j] =
- BTR_TABLE_STATS_FROM_SAMPLE(
- n_not_null[j], index, n_sample_pages,
- total_external_size, not_empty_flag);
- }
- }
-
- mem_heap_free(heap);
-}
-
-/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
-
-/***********************************************************//**
-Gets the offset of the pointer to the externally stored part of a field.
-@return offset of the pointer to the externally stored part */
-static
-ulint
-btr_rec_get_field_ref_offs(
-/*=======================*/
- const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
- ulint n) /*!< in: index of the external field */
-{
- ulint field_ref_offs;
- ulint local_len;
-
- ut_a(rec_offs_nth_extern(offsets, n));
- field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
- ut_a(local_len != UNIV_SQL_NULL);
- ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
-
- return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
-}
-
-/** Gets a pointer to the externally stored part of a field.
-@param rec record
-@param offsets rec_get_offsets(rec)
-@param n index of the externally stored field
-@return pointer to the externally stored part */
-#define btr_rec_get_field_ref(rec, offsets, n) \
- ((rec) + btr_rec_get_field_ref_offs(offsets, n))
-
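-/* Layout sketch of the 20-byte BLOB pointer (illustrative; the
-authoritative definitions are the BTR_EXTERN_* constants in btr0cur.h).
-The field reference stored at the end of the local column prefix is:
-
-	bytes 0..3	BTR_EXTERN_SPACE_ID	space id of the first BLOB page
-	bytes 4..7	BTR_EXTERN_PAGE_NO	page number of the first BLOB page
-	bytes 8..11	BTR_EXTERN_OFFSET	offset of the BLOB header on that page
-	bytes 12..19	BTR_EXTERN_LEN		length of the externally stored part;
-						the two highest bits of byte 12 are
-						the owner and inherited flags
-*/
-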
-/** Gets the externally stored size of a record, in units of a database page.
-@param[in] rec record
-@param[in] offsets array returned by rec_get_offsets()
-@return externally stored part, in units of a database page */
-
-ulint
-btr_rec_get_externally_stored_len(
- const rec_t* rec,
- const ulint* offsets)
-{
- ulint n_fields;
- ulint total_extern_len = 0;
- ulint i;
-
- ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
-
- if (!rec_offs_any_extern(offsets)) {
- return(0);
- }
-
- n_fields = rec_offs_n_fields(offsets);
-
- for (i = 0; i < n_fields; i++) {
- if (rec_offs_nth_extern(offsets, i)) {
-
- ulint extern_len = mach_read_from_4(
- btr_rec_get_field_ref(rec, offsets, i)
- + BTR_EXTERN_LEN + 4);
-
- total_extern_len += ut_calc_align(extern_len,
- UNIV_PAGE_SIZE);
- }
- }
-
- return(total_extern_len / UNIV_PAGE_SIZE);
-}
-
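-/* Worked example (illustrative): with UNIV_PAGE_SIZE = 16384 and a
-single externally stored column of extern_len = 100000 bytes,
-ut_calc_align(100000, 16384) = 114688, so the function above returns
-114688 / 16384 = 7 pages. */
-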
-/*******************************************************************//**
-Sets the ownership bit of an externally stored field in a record. */
-static
-void
-btr_cur_set_ownership_of_extern_field(
-/*==================================*/
- page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
- part will be updated, or NULL */
- rec_t* rec, /*!< in/out: clustered index record */
- dict_index_t* index, /*!< in: index of the page */
- const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
- ulint i, /*!< in: field number */
- ibool val, /*!< in: value to set */
- mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
-{
- byte* data;
- ulint local_len;
- ulint byte_val;
-
- data = rec_get_nth_field(rec, offsets, i, &local_len);
- ut_ad(rec_offs_nth_extern(offsets, i));
- ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
-
- local_len -= BTR_EXTERN_FIELD_REF_SIZE;
-
- byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
-
- if (val) {
- byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
- } else {
-#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
- ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
-#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
- byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
- }
-
- if (page_zip) {
- mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
- page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
- } else if (mtr != NULL) {
-
- mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
- MLOG_1BYTE, mtr);
- } else {
- mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
- }
-
- btr_blob_dbg_owner(rec, index, offsets, i, val);
-}
-
-/*******************************************************************//**
-Marks non-updated off-page fields as disowned by this record. The ownership
-must be transferred to the updated record which is inserted elsewhere in the
-index tree. In purge only the owner of externally stored field is allowed
-to free the field. */
-UNIV_INTERN
-void
-btr_cur_disown_inherited_fields(
-/*============================*/
- page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
- part will be updated, or NULL */
- rec_t* rec, /*!< in/out: record in a clustered index */
- dict_index_t* index, /*!< in: index of the page */
- const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
- const upd_t* update, /*!< in: update vector */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ulint i;
-
- ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
- ut_ad(rec_offs_any_extern(offsets));
-
- for (i = 0; i < rec_offs_n_fields(offsets); i++) {
- if (rec_offs_nth_extern(offsets, i)
- && !upd_get_field_by_field_no(update, i)) {
- btr_cur_set_ownership_of_extern_field(
- page_zip, rec, index, offsets, i, FALSE, mtr);
- }
- }
-}
-
-/*******************************************************************//**
-Marks all extern fields in a record as owned by the record. This function
-should be called if the delete mark of a record is removed: a not delete
-marked record always owns all its extern fields. */
-static
-void
-btr_cur_unmark_extern_fields(
-/*=========================*/
- page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
- part will be updated, or NULL */
- rec_t* rec, /*!< in/out: record in a clustered index */
- dict_index_t* index, /*!< in: index of the page */
- const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
- mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
-{
- ulint n;
- ulint i;
-
- ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
- n = rec_offs_n_fields(offsets);
-
- if (!rec_offs_any_extern(offsets)) {
-
- return;
- }
-
- for (i = 0; i < n; i++) {
- if (rec_offs_nth_extern(offsets, i)) {
-
- btr_cur_set_ownership_of_extern_field(
- page_zip, rec, index, offsets, i, TRUE, mtr);
- }
- }
-}
-
-/*******************************************************************//**
-Flags the data tuple fields that are marked as extern storage in the
-update vector. We use this function to remember which fields we must
-mark as extern storage in a record inserted for an update.
-@return number of flagged external columns */
-UNIV_INTERN
-ulint
-btr_push_update_extern_fields(
-/*==========================*/
- dtuple_t* tuple, /*!< in/out: data tuple */
- const upd_t* update, /*!< in: update vector */
- mem_heap_t* heap) /*!< in: memory heap */
-{
- ulint n_pushed = 0;
- ulint n;
- const upd_field_t* uf;
-
- uf = update->fields;
- n = upd_get_n_fields(update);
-
- for (; n--; uf++) {
- if (dfield_is_ext(&uf->new_val)) {
- dfield_t* field
- = dtuple_get_nth_field(tuple, uf->field_no);
-
- if (!dfield_is_ext(field)) {
- dfield_set_ext(field);
- n_pushed++;
- }
-
- switch (uf->orig_len) {
- byte* data;
- ulint len;
- byte* buf;
- case 0:
- break;
- case BTR_EXTERN_FIELD_REF_SIZE:
- /* Restore the original locally stored
- part of the column. In the undo log,
- InnoDB writes a longer prefix of externally
- stored columns, so that column prefixes
- in secondary indexes can be reconstructed. */
- dfield_set_data(field, (byte*) dfield_get_data(field)
- + dfield_get_len(field)
- - BTR_EXTERN_FIELD_REF_SIZE,
- BTR_EXTERN_FIELD_REF_SIZE);
- dfield_set_ext(field);
- break;
- default:
- /* Reconstruct the original locally
- stored part of the column. The data
- will have to be copied. */
- ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
-
- data = (byte*) dfield_get_data(field);
- len = dfield_get_len(field);
-
- buf = (byte*) mem_heap_alloc(heap,
- uf->orig_len);
- /* Copy the locally stored prefix. */
- memcpy(buf, data,
- uf->orig_len
- - BTR_EXTERN_FIELD_REF_SIZE);
- /* Copy the BLOB pointer. */
- memcpy(buf + uf->orig_len
- - BTR_EXTERN_FIELD_REF_SIZE,
- data + len - BTR_EXTERN_FIELD_REF_SIZE,
- BTR_EXTERN_FIELD_REF_SIZE);
-
- dfield_set_data(field, buf, uf->orig_len);
- dfield_set_ext(field);
- }
- }
- }
-
- return(n_pushed);
-}
-
-/*******************************************************************//**
-Returns the length of a BLOB part stored on the header page.
-@return part length */
-static
-ulint
-btr_blob_get_part_len(
-/*==================*/
- const byte* blob_header) /*!< in: blob header */
-{
- return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
-}
-
-/*******************************************************************//**
-Returns the page number where the next BLOB part is stored.
-@return page number or FIL_NULL if no more pages */
-static
-ulint
-btr_blob_get_next_page_no(
-/*======================*/
- const byte* blob_header) /*!< in: blob header */
-{
- return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
-}
-
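-/* Layout sketch of an uncompressed BLOB page (illustrative; the
-authoritative offsets are the BTR_BLOB_HDR_* constants): each BLOB page
-carries an 8-byte header at FIL_PAGE_DATA, consisting of the 4-byte
-part length (BTR_BLOB_HDR_PART_LEN) followed by the 4-byte next page
-number (BTR_BLOB_HDR_NEXT_PAGE_NO, FIL_NULL on the last page); the
-BLOB data itself follows the header. */
-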
-/*******************************************************************//**
-Deallocate a buffer block that was reserved for a BLOB part. */
-static
-void
-btr_blob_free(
-/*==========*/
- buf_block_t* block, /*!< in: buffer block */
- ibool all, /*!< in: TRUE=remove also the compressed page
- if there is one */
- mtr_t* mtr) /*!< in: mini-transaction to commit */
-{
- buf_pool_t* buf_pool = buf_pool_from_block(block);
- ulint space = buf_block_get_space(block);
- ulint page_no = buf_block_get_page_no(block);
- bool freed = false;
-
- ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-
- mtr_commit(mtr);
-
- mutex_enter(&buf_pool->LRU_list_mutex);
- mutex_enter(&block->mutex);
-
- /* Only free the block if it is still allocated to
- the same file page. */
-
- if (buf_block_get_state(block)
- == BUF_BLOCK_FILE_PAGE
- && buf_block_get_space(block) == space
- && buf_block_get_page_no(block) == page_no) {
-
- freed = buf_LRU_free_page(&block->page, all);
-
- if (!freed && all && block->page.zip.data
- /* Now, buf_LRU_free_page() may release mutexes
- temporarily */
- && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
- && buf_block_get_space(block) == space
- && buf_block_get_page_no(block) == page_no) {
-
-			/* Attempt to deallocate the uncompressed page
-			if the whole block cannot be deallocated. */
- freed = buf_LRU_free_page(&block->page, false);
- }
- }
-
- if (!freed) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- }
-
- mutex_exit(&block->mutex);
-}
-
-/*******************************************************************//**
-Stores the fields in big_rec_vec to the tablespace and puts pointers to
-them in rec. The extern flags in rec will have to be set beforehand.
-The fields are stored on pages allocated from the leaf node
-file segment of the index tree.
-@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE or DB_TOO_BIG_FOR_REDO */
-UNIV_INTERN
-dberr_t
-btr_store_big_rec_extern_fields(
-/*============================*/
- dict_index_t* index, /*!< in: index of rec; the index tree
- MUST be X-latched */
- buf_block_t* rec_block, /*!< in/out: block containing rec */
- rec_t* rec, /*!< in/out: record */
- const ulint* offsets, /*!< in: rec_get_offsets(rec, index);
- the "external storage" flags in offsets
- will not correspond to rec when
- this function returns */
- const big_rec_t*big_rec_vec, /*!< in: vector containing fields
- to be stored externally */
- mtr_t* btr_mtr, /*!< in: mtr containing the
- latches to the clustered index */
-	enum blob_op	op)	/*!< in: operation code */
-{
- ulint rec_page_no;
- byte* field_ref;
- ulint extern_len;
- ulint store_len;
- ulint page_no;
- ulint space_id;
- ulint zip_size;
- ulint prev_page_no;
- ulint hint_page_no;
- ulint i;
- mtr_t mtr;
- mtr_t* alloc_mtr;
- mem_heap_t* heap = NULL;
- page_zip_des_t* page_zip;
- z_stream c_stream;
- buf_block_t** freed_pages = NULL;
- ulint n_freed_pages = 0;
- dberr_t error = DB_SUCCESS;
-
- ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(rec_offs_any_extern(offsets));
- ut_ad(mtr_memo_contains(btr_mtr, dict_index_get_lock(index),
- MTR_MEMO_X_LOCK));
- ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
- ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
- ut_a(dict_index_is_clust(index));
-
- page_zip = buf_block_get_page_zip(rec_block);
- ut_a(dict_table_zip_size(index->table)
- == buf_block_get_zip_size(rec_block));
-
- space_id = buf_block_get_space(rec_block);
- zip_size = buf_block_get_zip_size(rec_block);
- rec_page_no = buf_block_get_page_no(rec_block);
- ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
-
- error = btr_check_blob_limit(big_rec_vec);
-
- if (error != DB_SUCCESS) {
- ut_ad(op == BTR_STORE_INSERT);
- return(error);
- }
-
- if (page_zip) {
- int err;
-
- /* Zlib deflate needs 128 kilobytes for the default
- window size, plus 512 << memLevel, plus a few
- kilobytes for small objects. We use reduced memLevel
- to limit the memory consumption, and preallocate the
- heap, hoping to avoid memory fragmentation. */
- heap = mem_heap_create(250000);
- page_zip_set_alloc(&c_stream, heap);
-
- err = deflateInit2(&c_stream, page_zip_level,
- Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
- ut_a(err == Z_OK);
- }
-
- if (btr_blob_op_is_update(op)) {
- /* Avoid reusing pages that have been previously freed
- in btr_mtr. */
- if (btr_mtr->n_freed_pages) {
- if (heap == NULL) {
- heap = mem_heap_create(
- btr_mtr->n_freed_pages
- * sizeof *freed_pages);
- }
-
- freed_pages = static_cast<buf_block_t**>(
- mem_heap_alloc(
- heap,
- btr_mtr->n_freed_pages
- * sizeof *freed_pages));
- n_freed_pages = 0;
- }
-
- /* Because btr_mtr will be committed after mtr, it is
- possible that the tablespace has been extended when
- the B-tree record was updated or inserted, or it will
- be extended while allocating pages for big_rec.
-
- TODO: In mtr (not btr_mtr), write a redo log record
- about extending the tablespace to its current size,
- and remember the current size. Whenever the tablespace
- grows as pages are allocated, write further redo log
- records to mtr. (Currently tablespace extension is not
- covered by the redo log. If it were, the record would
- only be written to btr_mtr, which is committed after
- mtr.) */
- alloc_mtr = btr_mtr;
- } else {
- /* Use the local mtr for allocations. */
- alloc_mtr = &mtr;
- }
-
-#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
- /* All pointers to externally stored columns in the record
- must either be zero or they must be pointers to inherited
- columns, owned by this record or an earlier record version. */
- for (i = 0; i < rec_offs_n_fields(offsets); i++) {
- if (!rec_offs_nth_extern(offsets, i)) {
- continue;
- }
- field_ref = btr_rec_get_field_ref(rec, offsets, i);
-
- ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
- /* Either this must be an update in place,
- or the BLOB must be inherited, or the BLOB pointer
- must be zero (will be written in this function). */
- ut_a(op == BTR_STORE_UPDATE
- || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
- || !memcmp(field_ref, field_ref_zero,
- BTR_EXTERN_FIELD_REF_SIZE));
- }
-#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
- /* We have to create a file segment to the tablespace
- for each field and put the pointer to the field in rec */
-
- for (i = 0; i < big_rec_vec->n_fields; i++) {
- field_ref = btr_rec_get_field_ref(
- rec, offsets, big_rec_vec->fields[i].field_no);
-#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
- /* A zero BLOB pointer should have been initially inserted. */
- ut_a(!memcmp(field_ref, field_ref_zero,
- BTR_EXTERN_FIELD_REF_SIZE));
-#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
- extern_len = big_rec_vec->fields[i].len;
- UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
- extern_len);
-
- ut_a(extern_len > 0);
-
- prev_page_no = FIL_NULL;
-
- if (page_zip) {
- int err = deflateReset(&c_stream);
- ut_a(err == Z_OK);
-
- c_stream.next_in = (Bytef*)
- big_rec_vec->fields[i].data;
- c_stream.avail_in = static_cast<uInt>(extern_len);
- }
-
- for (;;) {
- buf_block_t* block;
- page_t* page;
-
- mtr_start(&mtr);
-
- if (prev_page_no == FIL_NULL) {
- hint_page_no = 1 + rec_page_no;
- } else {
- hint_page_no = prev_page_no + 1;
- }
-
-alloc_another:
- block = btr_page_alloc(index, hint_page_no,
- FSP_NO_DIR, 0, alloc_mtr, &mtr);
- if (UNIV_UNLIKELY(block == NULL)) {
- mtr_commit(&mtr);
- error = DB_OUT_OF_FILE_SPACE;
- goto func_exit;
- }
-
- if (rw_lock_get_x_lock_count(&block->lock) > 1) {
- /* This page must have been freed in
- btr_mtr previously. Put it aside, and
- allocate another page for the BLOB data. */
- ut_ad(alloc_mtr == btr_mtr);
- ut_ad(btr_blob_op_is_update(op));
- ut_ad(n_freed_pages < btr_mtr->n_freed_pages);
- freed_pages[n_freed_pages++] = block;
- goto alloc_another;
- }
-
- page_no = buf_block_get_page_no(block);
- page = buf_block_get_frame(block);
-
- if (prev_page_no != FIL_NULL) {
- buf_block_t* prev_block;
- page_t* prev_page;
-
- prev_block = buf_page_get(space_id, zip_size,
- prev_page_no,
- RW_X_LATCH, &mtr);
- buf_block_dbg_add_level(prev_block,
- SYNC_EXTERN_STORAGE);
- prev_page = buf_block_get_frame(prev_block);
-
- if (page_zip) {
- mlog_write_ulint(
- prev_page + FIL_PAGE_NEXT,
- page_no, MLOG_4BYTES, &mtr);
- memcpy(buf_block_get_page_zip(
- prev_block)
- ->data + FIL_PAGE_NEXT,
- prev_page + FIL_PAGE_NEXT, 4);
- } else {
- mlog_write_ulint(
- prev_page + FIL_PAGE_DATA
- + BTR_BLOB_HDR_NEXT_PAGE_NO,
- page_no, MLOG_4BYTES, &mtr);
- }
-
- } else if (dict_index_is_online_ddl(index)) {
- row_log_table_blob_alloc(index, page_no);
- }
-
- if (page_zip) {
- int err;
- page_zip_des_t* blob_page_zip;
-
- /* Write FIL_PAGE_TYPE to the redo log
- separately, before logging any other
- changes to the page, so that the debug
- assertions in
- recv_parse_or_apply_log_rec_body() can
- be made simpler. Before InnoDB Plugin
- 1.0.4, the initialization of
- FIL_PAGE_TYPE was logged as part of
- the mlog_log_string() below. */
-
- mlog_write_ulint(page + FIL_PAGE_TYPE,
- prev_page_no == FIL_NULL
- ? FIL_PAGE_TYPE_ZBLOB
- : FIL_PAGE_TYPE_ZBLOB2,
- MLOG_2BYTES, &mtr);
-
- c_stream.next_out = page
- + FIL_PAGE_DATA;
- c_stream.avail_out
- = static_cast<uInt>(page_zip_get_size(page_zip))
- - FIL_PAGE_DATA;
-
- err = deflate(&c_stream, Z_FINISH);
- ut_a(err == Z_OK || err == Z_STREAM_END);
- ut_a(err == Z_STREAM_END
- || c_stream.avail_out == 0);
-
- /* Write the "next BLOB page" pointer */
- mlog_write_ulint(page + FIL_PAGE_NEXT,
- FIL_NULL, MLOG_4BYTES, &mtr);
- /* Initialize the unused "prev page" pointer */
- mlog_write_ulint(page + FIL_PAGE_PREV,
- FIL_NULL, MLOG_4BYTES, &mtr);
- /* Write a back pointer to the record
- into the otherwise unused area. This
- information could be useful in
- debugging. Later, we might want to
- implement the possibility to relocate
- BLOB pages. Then, we would need to be
- able to adjust the BLOB pointer in the
- record. We do not store the heap
- number of the record, because it can
- change in page_zip_reorganize() or
- btr_page_reorganize(). However, also
- the page number of the record may
- change when B-tree nodes are split or
- merged. */
- mlog_write_ulint(page
- + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
- space_id,
- MLOG_4BYTES, &mtr);
- mlog_write_ulint(page
- + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
- rec_page_no,
- MLOG_4BYTES, &mtr);
-
- /* Zero out the unused part of the page. */
- memset(page + page_zip_get_size(page_zip)
- - c_stream.avail_out,
- 0, c_stream.avail_out);
- mlog_log_string(page
- + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
- page_zip_get_size(page_zip)
- - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
- &mtr);
- /* Copy the page to compressed storage,
- because it will be flushed to disk
- from there. */
- blob_page_zip = buf_block_get_page_zip(block);
- ut_ad(blob_page_zip);
- ut_ad(page_zip_get_size(blob_page_zip)
- == page_zip_get_size(page_zip));
- memcpy(blob_page_zip->data, page,
- page_zip_get_size(page_zip));
-
- if (err == Z_OK && prev_page_no != FIL_NULL) {
-
- goto next_zip_page;
- }
-
- if (alloc_mtr == &mtr) {
- rec_block = buf_page_get(
- space_id, zip_size,
- rec_page_no,
- RW_X_LATCH, &mtr);
- buf_block_dbg_add_level(
- rec_block,
- SYNC_NO_ORDER_CHECK);
- }
-
- if (err == Z_STREAM_END) {
- mach_write_to_4(field_ref
- + BTR_EXTERN_LEN, 0);
- mach_write_to_4(field_ref
- + BTR_EXTERN_LEN + 4,
- c_stream.total_in);
- } else {
- memset(field_ref + BTR_EXTERN_LEN,
- 0, 8);
- }
-
- if (prev_page_no == FIL_NULL) {
- btr_blob_dbg_add_blob(
- rec, big_rec_vec->fields[i]
- .field_no, page_no, index,
- "store");
-
- mach_write_to_4(field_ref
- + BTR_EXTERN_SPACE_ID,
- space_id);
-
- mach_write_to_4(field_ref
- + BTR_EXTERN_PAGE_NO,
- page_no);
-
- mach_write_to_4(field_ref
- + BTR_EXTERN_OFFSET,
- FIL_PAGE_NEXT);
- }
-
- page_zip_write_blob_ptr(
- page_zip, rec, index, offsets,
- big_rec_vec->fields[i].field_no,
- alloc_mtr);
-
-next_zip_page:
- prev_page_no = page_no;
-
- /* Commit mtr and release the
- uncompressed page frame to save memory. */
- btr_blob_free(block, FALSE, &mtr);
-
- if (err == Z_STREAM_END) {
- break;
- }
- } else {
- mlog_write_ulint(page + FIL_PAGE_TYPE,
- FIL_PAGE_TYPE_BLOB,
- MLOG_2BYTES, &mtr);
-
- if (extern_len > (UNIV_PAGE_SIZE
- - FIL_PAGE_DATA
- - BTR_BLOB_HDR_SIZE
- - FIL_PAGE_DATA_END)) {
- store_len = UNIV_PAGE_SIZE
- - FIL_PAGE_DATA
- - BTR_BLOB_HDR_SIZE
- - FIL_PAGE_DATA_END;
- } else {
- store_len = extern_len;
- }
-
- mlog_write_string(page + FIL_PAGE_DATA
- + BTR_BLOB_HDR_SIZE,
- (const byte*)
- big_rec_vec->fields[i].data
- + big_rec_vec->fields[i].len
- - extern_len,
- store_len, &mtr);
- mlog_write_ulint(page + FIL_PAGE_DATA
- + BTR_BLOB_HDR_PART_LEN,
- store_len, MLOG_4BYTES, &mtr);
- mlog_write_ulint(page + FIL_PAGE_DATA
- + BTR_BLOB_HDR_NEXT_PAGE_NO,
- FIL_NULL, MLOG_4BYTES, &mtr);
-
- extern_len -= store_len;
-
- if (alloc_mtr == &mtr) {
- rec_block = buf_page_get(
- space_id, zip_size,
- rec_page_no,
- RW_X_LATCH, &mtr);
- buf_block_dbg_add_level(
- rec_block,
- SYNC_NO_ORDER_CHECK);
- }
-
- mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
- MLOG_4BYTES, alloc_mtr);
- mlog_write_ulint(field_ref
- + BTR_EXTERN_LEN + 4,
- big_rec_vec->fields[i].len
- - extern_len,
- MLOG_4BYTES, alloc_mtr);
-
- if (prev_page_no == FIL_NULL) {
- btr_blob_dbg_add_blob(
- rec, big_rec_vec->fields[i]
- .field_no, page_no, index,
- "store");
-
- mlog_write_ulint(field_ref
- + BTR_EXTERN_SPACE_ID,
- space_id, MLOG_4BYTES,
- alloc_mtr);
-
- mlog_write_ulint(field_ref
- + BTR_EXTERN_PAGE_NO,
- page_no, MLOG_4BYTES,
- alloc_mtr);
-
- mlog_write_ulint(field_ref
- + BTR_EXTERN_OFFSET,
- FIL_PAGE_DATA,
- MLOG_4BYTES,
- alloc_mtr);
- }
-
- prev_page_no = page_no;
-
- mtr_commit(&mtr);
-
- if (extern_len == 0) {
- break;
- }
- }
- }
-
- DBUG_EXECUTE_IF("btr_store_big_rec_extern",
- error = DB_OUT_OF_FILE_SPACE;
- goto func_exit;);
- }
-
-func_exit:
- if (page_zip) {
- deflateEnd(&c_stream);
- }
-
- if (n_freed_pages) {
- ulint i;
-
- ut_ad(alloc_mtr == btr_mtr);
- ut_ad(btr_blob_op_is_update(op));
-
- for (i = 0; i < n_freed_pages; i++) {
- btr_page_free_low(index, freed_pages[i], 0, true, alloc_mtr);
- }
- }
-
- if (heap != NULL) {
- mem_heap_free(heap);
- }
-
-#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
- /* All pointers to externally stored columns in the record
- must be valid. */
- for (i = 0; i < rec_offs_n_fields(offsets); i++) {
- if (!rec_offs_nth_extern(offsets, i)) {
- continue;
- }
-
- field_ref = btr_rec_get_field_ref(rec, offsets, i);
-
- /* The pointer must not be zero if the operation
- succeeded. */
- ut_a(0 != memcmp(field_ref, field_ref_zero,
- BTR_EXTERN_FIELD_REF_SIZE)
- || error != DB_SUCCESS);
- /* The column must not be disowned by this record. */
- ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
- }
-#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
- return(error);
-}
-
-/*******************************************************************//**
-Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
-static
-void
-btr_check_blob_fil_page_type(
-/*=========================*/
- ulint space_id, /*!< in: space id */
- ulint page_no, /*!< in: page number */
- const page_t* page, /*!< in: page */
- ibool read) /*!< in: TRUE=read, FALSE=purge */
-{
- ulint type = fil_page_get_type(page);
-
- ut_a(space_id == page_get_space_id(page));
- ut_a(page_no == page_get_page_no(page));
-
- if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
- ulint flags = fil_space_get_flags(space_id);
-
-#ifndef UNIV_DEBUG /* Improve debug test coverage */
- if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
- /* Old versions of InnoDB did not initialize
- FIL_PAGE_TYPE on BLOB pages. Do not print
- anything about the type mismatch when reading
- a BLOB page that is in Antelope format.*/
- return;
- }
-#endif /* !UNIV_DEBUG */
-
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: FIL_PAGE_TYPE=%lu"
- " on BLOB %s space %lu page %lu flags %lx\n",
- (ulong) type, read ? "read" : "purge",
- (ulong) space_id, (ulong) page_no, (ulong) flags);
- ut_error;
- }
-}
-
-/*******************************************************************//**
-Frees the space in an externally stored field to the file space
-management if the field in data is owned by the externally stored field,
-in a rollback we may have the additional condition that the field must
-not be inherited. */
-UNIV_INTERN
-void
-btr_free_externally_stored_field(
-/*=============================*/
- dict_index_t* index, /*!< in: index of the data, the index
- tree MUST be X-latched; if the tree
- height is 1, then also the root page
- must be X-latched! (this is relevant
- in the case this function is called
- from purge where 'data' is located on
- an undo log page, not an index
- page) */
- byte* field_ref, /*!< in/out: field reference */
- const rec_t* rec, /*!< in: record containing field_ref, for
- page_zip_write_blob_ptr(), or NULL */
- const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
- or NULL */
- page_zip_des_t* page_zip, /*!< in: compressed page corresponding
- to rec, or NULL if rec == NULL */
- ulint i, /*!< in: field number of field_ref;
- ignored if rec == NULL */
- enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
-	mtr_t*		local_mtr MY_ATTRIBUTE((unused))) /*!< in: mtr
-					containing the latch to the data and
-					an X-latch to the index tree */
-{
- page_t* page;
- const ulint space_id = mach_read_from_4(
- field_ref + BTR_EXTERN_SPACE_ID);
- const ulint start_page = mach_read_from_4(
- field_ref + BTR_EXTERN_PAGE_NO);
- ulint rec_zip_size = dict_table_zip_size(index->table);
- ulint ext_zip_size;
- ulint page_no;
- ulint next_page_no;
- mtr_t mtr;
-
- ut_ad(dict_index_is_clust(index));
- ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
- MTR_MEMO_X_LOCK));
- ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
- MTR_MEMO_PAGE_X_FIX));
- ut_ad(!rec || rec_offs_validate(rec, index, offsets));
- ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
-
- if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
- BTR_EXTERN_FIELD_REF_SIZE))) {
- /* In the rollback, we may encounter a clustered index
- record with some unwritten off-page columns. There is
- nothing to free then. */
- if (rb_ctx == RB_NONE) {
- char buf[3 * 512];
- char *bufend;
- ulint ispace = dict_index_get_space(index);
- bufend = innobase_convert_name(buf, sizeof buf,
- index->name, strlen(index->name),
- NULL,
- FALSE);
-			buf[bufend - buf] = '\0';
- ib_logf(IB_LOG_LEVEL_ERROR, "Unwritten off-page columns in "
- "rollback context %d. Table %s index %s space_id %lu "
- "index space %lu.",
- rb_ctx, index->table->name, buf, space_id, ispace);
- }
-
- ut_a(rb_ctx != RB_NONE);
- return;
- }
-
- ut_ad(space_id == index->space);
-
- if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
- ext_zip_size = fil_space_get_zip_size(space_id);
- /* This must be an undo log record in the system tablespace,
- that is, in row_purge_upd_exist_or_extern().
- Currently, externally stored records are stored in the
- same tablespace as the referring records. */
- ut_ad(!page_get_space_id(page_align(field_ref)));
- ut_ad(!rec);
- ut_ad(!page_zip);
- } else {
- ext_zip_size = rec_zip_size;
- }
-
- if (!rec) {
- /* This is a call from row_purge_upd_exist_or_extern(). */
- ut_ad(!page_zip);
- rec_zip_size = 0;
- }
-
-#ifdef UNIV_BLOB_DEBUG
- if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)
- && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
- && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) {
- /* This off-page column will be freed.
- Check that no references remain. */
-
- btr_blob_dbg_t b;
-
- b.blob_page_no = start_page;
-
- if (rec) {
- /* Remove the reference from the record to the
- BLOB. If the BLOB were not freed, the
- reference would be removed when the record is
- removed. Freeing the BLOB will overwrite the
- BTR_EXTERN_PAGE_NO in the field_ref of the
- record with FIL_NULL, which would make the
- btr_blob_dbg information inconsistent with the
- record. */
- b.ref_page_no = page_get_page_no(page_align(rec));
- b.ref_heap_no = page_rec_get_heap_no(rec);
- b.ref_field_no = i;
- btr_blob_dbg_rbt_delete(index, &b, "free");
- }
-
- btr_blob_dbg_assert_empty(index, b.blob_page_no);
- }
-#endif /* UNIV_BLOB_DEBUG */
-
- for (;;) {
-#ifdef UNIV_SYNC_DEBUG
- buf_block_t* rec_block;
-#endif /* UNIV_SYNC_DEBUG */
- buf_block_t* ext_block;
-
- mtr_start(&mtr);
-
-#ifdef UNIV_SYNC_DEBUG
- rec_block =
-#endif /* UNIV_SYNC_DEBUG */
- buf_page_get(page_get_space_id(page_align(field_ref)),
- rec_zip_size,
- page_get_page_no(page_align(field_ref)),
- RW_X_LATCH, &mtr);
- buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
- page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
-
- if (/* There is no external storage data */
- page_no == FIL_NULL
- /* This field does not own the externally stored field */
- || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
- & BTR_EXTERN_OWNER_FLAG)
- /* Rollback and inherited field */
- || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
- && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
- & BTR_EXTERN_INHERITED_FLAG))) {
-
- /* Do not free */
- mtr_commit(&mtr);
-
- return;
- }
-
- if (page_no == start_page && dict_index_is_online_ddl(index)) {
- row_log_table_blob_free(index, start_page);
- }
-
- ext_block = buf_page_get(space_id, ext_zip_size, page_no,
- RW_X_LATCH, &mtr);
- buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
- page = buf_block_get_frame(ext_block);
-
- if (ext_zip_size) {
- /* Note that page_zip will be NULL
- in row_purge_upd_exist_or_extern(). */
- switch (fil_page_get_type(page)) {
- case FIL_PAGE_TYPE_ZBLOB:
- case FIL_PAGE_TYPE_ZBLOB2:
- break;
- default:
- ut_error;
- }
- next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
-
- btr_page_free_low(index, ext_block, 0, true, &mtr);
-
- if (page_zip != NULL) {
- mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
- next_page_no);
- mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
- 0);
- page_zip_write_blob_ptr(page_zip, rec, index,
- offsets, i, &mtr);
- } else {
- mlog_write_ulint(field_ref
- + BTR_EXTERN_PAGE_NO,
- next_page_no,
- MLOG_4BYTES, &mtr);
- mlog_write_ulint(field_ref
- + BTR_EXTERN_LEN + 4, 0,
- MLOG_4BYTES, &mtr);
- }
- } else {
- ut_a(!page_zip);
- btr_check_blob_fil_page_type(space_id, page_no, page,
- FALSE);
-
- next_page_no = mach_read_from_4(
- page + FIL_PAGE_DATA
- + BTR_BLOB_HDR_NEXT_PAGE_NO);
-
-			/* We must supply the page level (= 0) as an argument
-			because we did not store it on the page (we save the
-			space overhead of an index page header). */
-
- btr_page_free_low(index, ext_block, 0, true, &mtr);
-
- mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
- next_page_no,
- MLOG_4BYTES, &mtr);
- /* Zero out the BLOB length. If the server
- crashes during the execution of this function,
- trx_rollback_or_clean_all_recovered() could
- dereference the half-deleted BLOB, fetching a
- wrong prefix for the BLOB. */
- mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
- 0,
- MLOG_4BYTES, &mtr);
- }
-
- /* Commit mtr and release the BLOB block to save memory. */
- btr_blob_free(ext_block, TRUE, &mtr);
- }
-}
-
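-/* Note (illustrative summary): the loop above frees the BLOB chain one
-page per mini-transaction, releasing the head page and then rewriting
-BTR_EXTERN_PAGE_NO in the field reference to point to the next page, so
-that a crash in the middle leaves a shorter but still consistent chain
-that a later purge or rollback can continue to free. */
-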
-/***********************************************************//**
-Frees the externally stored fields for a record. */
-static
-void
-btr_rec_free_externally_stored_fields(
-/*==================================*/
- dict_index_t* index, /*!< in: index of the data, the index
- tree MUST be X-latched */
- rec_t* rec, /*!< in/out: record */
- const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
- page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
- part will be updated, or NULL */
- enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
- mtr_t* mtr) /*!< in: mini-transaction handle which contains
- an X-latch to record page and to the index
- tree */
-{
- ulint n_fields;
- ulint i;
-
- ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
- /* Free possible externally stored fields in the record */
-
- ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
- n_fields = rec_offs_n_fields(offsets);
-
- for (i = 0; i < n_fields; i++) {
- if (rec_offs_nth_extern(offsets, i)) {
- btr_free_externally_stored_field(
- index, btr_rec_get_field_ref(rec, offsets, i),
- rec, offsets, page_zip, i, rb_ctx, mtr);
- }
- }
-}
-
-/***********************************************************//**
-Frees the externally stored fields for a record, if the field is mentioned
-in the update vector. */
-static
-void
-btr_rec_free_updated_extern_fields(
-/*===============================*/
- dict_index_t* index, /*!< in: index of rec; the index tree MUST be
- X-latched */
- rec_t* rec, /*!< in/out: record */
- page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
- part will be updated, or NULL */
- const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
- const upd_t* update, /*!< in: update vector */
- enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
- mtr_t* mtr) /*!< in: mini-transaction handle which contains
- an X-latch to record page and to the tree */
-{
- ulint n_fields;
- ulint i;
-
- ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
-
- /* Free possible externally stored fields in the record */
-
- n_fields = upd_get_n_fields(update);
-
- for (i = 0; i < n_fields; i++) {
- const upd_field_t* ufield = upd_get_nth_field(update, i);
-
- if (rec_offs_nth_extern(offsets, ufield->field_no)) {
- ulint len;
- byte* data = rec_get_nth_field(
- rec, offsets, ufield->field_no, &len);
- ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
-
- btr_free_externally_stored_field(
- index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
- rec, offsets, page_zip,
- ufield->field_no, rb_ctx, mtr);
- }
- }
-}
-
-/*******************************************************************//**
-Copies the prefix of an uncompressed BLOB. The clustered index record
-that points to this BLOB must be protected by a lock or a page latch.
-@return number of bytes written to buf */
-static
-ulint
-btr_copy_blob_prefix(
-/*=================*/
- byte* buf, /*!< out: the externally stored part of
- the field, or a prefix of it */
- ulint len, /*!< in: length of buf, in bytes */
- ulint space_id,/*!< in: space id of the BLOB pages */
- ulint page_no,/*!< in: page number of the first BLOB page */
- ulint offset, /*!< in: offset on the first BLOB page */
- trx_t* trx) /*!< in: transaction handle */
-{
- ulint copied_len = 0;
-
- for (;;) {
- mtr_t mtr;
- buf_block_t* block;
- const page_t* page;
- const byte* blob_header;
- ulint part_len;
- ulint copy_len;
-
- mtr_start_trx(&mtr, trx);
-
- block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
- buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
- page = buf_block_get_frame(block);
-
- btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
-
- blob_header = page + offset;
- part_len = btr_blob_get_part_len(blob_header);
- copy_len = ut_min(part_len, len - copied_len);
-
- memcpy(buf + copied_len,
- blob_header + BTR_BLOB_HDR_SIZE, copy_len);
- copied_len += copy_len;
-
- page_no = btr_blob_get_next_page_no(blob_header);
-
- mtr_commit(&mtr);
-
- if (page_no == FIL_NULL || copy_len != part_len) {
- UNIV_MEM_ASSERT_RW(buf, copied_len);
- return(copied_len);
- }
-
-		/* On BLOB pages other than the first, the BLOB header
-		is always at the start of the page data: */
-
- offset = FIL_PAGE_DATA;
-
- ut_ad(copied_len <= len);
- }
-}
-
-/*******************************************************************//**
-Copies the prefix of a compressed BLOB. The clustered index record
-that points to this BLOB must be protected by a lock or a page latch.
-@return number of bytes written to buf */
-static
-ulint
-btr_copy_zblob_prefix(
-/*==================*/
- byte* buf, /*!< out: the externally stored part of
- the field, or a prefix of it */
- ulint len, /*!< in: length of buf, in bytes */
- ulint zip_size,/*!< in: compressed BLOB page size */
- ulint space_id,/*!< in: space id of the BLOB pages */
- ulint page_no,/*!< in: page number of the first BLOB page */
- ulint offset) /*!< in: offset on the first BLOB page */
-{
- ulint page_type = FIL_PAGE_TYPE_ZBLOB;
- mem_heap_t* heap;
- int err;
- z_stream d_stream;
-
- d_stream.next_out = buf;
- d_stream.avail_out = static_cast<uInt>(len);
- d_stream.next_in = Z_NULL;
- d_stream.avail_in = 0;
-
- /* Zlib inflate needs 32 kilobytes for the default
- window size, plus a few kilobytes for small objects. */
- heap = mem_heap_create(40000);
- page_zip_set_alloc(&d_stream, heap);
-
- ut_ad(ut_is_2pow(zip_size));
- ut_ad(zip_size >= UNIV_ZIP_SIZE_MIN);
- ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
- ut_ad(space_id);
-
- err = inflateInit(&d_stream);
- ut_a(err == Z_OK);
-
- for (;;) {
- buf_page_t* bpage;
- ulint next_page_no;
-
- /* There is no latch on bpage directly. Instead,
- bpage is protected by the B-tree page latch that
- is being held on the clustered index record, or,
- in row_merge_copy_blobs(), by an exclusive table lock. */
- bpage = buf_page_get_zip(space_id, zip_size, page_no);
-
- if (UNIV_UNLIKELY(!bpage)) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Cannot load"
- " compressed BLOB"
- " page %lu space %lu\n",
- (ulong) page_no, (ulong) space_id);
- goto func_exit;
- }
-
- if (UNIV_UNLIKELY
- (fil_page_get_type(bpage->zip.data) != page_type)) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Unexpected type %lu of"
- " compressed BLOB"
- " page %lu space %lu\n",
- (ulong) fil_page_get_type(bpage->zip.data),
- (ulong) page_no, (ulong) space_id);
- ut_ad(0);
- goto end_of_blob;
- }
-
- next_page_no = mach_read_from_4(bpage->zip.data + offset);
-
- if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
- /* When the BLOB begins at page header,
- the compressed data payload does not
- immediately follow the next page pointer. */
- offset = FIL_PAGE_DATA;
- } else {
- offset += 4;
- }
-
- d_stream.next_in = bpage->zip.data + offset;
- d_stream.avail_in = static_cast<uInt>(zip_size - offset);
-
- err = inflate(&d_stream, Z_NO_FLUSH);
- switch (err) {
- case Z_OK:
- if (!d_stream.avail_out) {
- goto end_of_blob;
- }
- break;
- case Z_STREAM_END:
- if (next_page_no == FIL_NULL) {
- goto end_of_blob;
- }
- /* fall through */
- default:
-inflate_error:
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: inflate() of"
- " compressed BLOB"
- " page %lu space %lu returned %d (%s)\n",
- (ulong) page_no, (ulong) space_id,
- err, d_stream.msg);
- case Z_BUF_ERROR:
- goto end_of_blob;
- }
-
- if (next_page_no == FIL_NULL) {
- if (!d_stream.avail_in) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: unexpected end of"
- " compressed BLOB"
- " page %lu space %lu\n",
- (ulong) page_no,
- (ulong) space_id);
- } else {
- err = inflate(&d_stream, Z_FINISH);
- switch (err) {
- case Z_STREAM_END:
- case Z_BUF_ERROR:
- break;
- default:
- goto inflate_error;
- }
- }
-
-end_of_blob:
- buf_page_release_zip(bpage);
- goto func_exit;
- }
-
- buf_page_release_zip(bpage);
-
-		/* On all BLOB pages except the first, the next-page
-		pointer is at FIL_PAGE_NEXT in the page header: */
-
- page_no = next_page_no;
- offset = FIL_PAGE_NEXT;
- page_type = FIL_PAGE_TYPE_ZBLOB2;
- }
-
-func_exit:
- inflateEnd(&d_stream);
- mem_heap_free(heap);
- UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
- return(d_stream.total_out);
-}
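-
-/* Editorial sketch, not InnoDB code: the streaming zlib pattern used
-by btr_copy_zblob_prefix() above, in isolation. A single z_stream is
-fed a sequence of compressed pieces (standing in for the pieces
-stored on successive BLOB pages) and inflate() is called once per
-piece with Z_NO_FLUSH; the Z_FINISH tail step on the last page is
-omitted for brevity. sketch_inflate_chunks() is a hypothetical
-helper; only the zlib calls (inflateInit, inflate, inflateEnd) are
-real API. */
-
-#include <zlib.h>
-#include <string.h>
-
-/* Inflates n_chunks compressed pieces into buf; returns the number
-of bytes written, or 0 on a zlib error. */
-static unsigned long sketch_inflate_chunks(
-	unsigned char*			buf,
-	unsigned long			len,
-	const unsigned char* const*	chunks,
-	const unsigned int*		sizes,
-	int				n_chunks)
-{
-	z_stream	strm;
-	unsigned long	total;
-
-	memset(&strm, 0, sizeof(strm)); /* Z_NULL zalloc: default allocator */
-	strm.next_out = buf;
-	strm.avail_out = (uInt) len;
-
-	if (inflateInit(&strm) != Z_OK) {
-		return(0);
-	}
-
-	for (int i = 0; i < n_chunks; i++) {
-		strm.next_in = (Bytef*) chunks[i];
-		strm.avail_in = sizes[i];
-
-		int	err = inflate(&strm, Z_NO_FLUSH);
-
-		if (err == Z_STREAM_END
-		    || (err == Z_OK && !strm.avail_out)) {
-			break;	/* stream complete, or buf is full */
-		} else if (err != Z_OK) {
-			inflateEnd(&strm);
-			return(0);
-		}
-	}
-
-	total = strm.total_out;
-	inflateEnd(&strm);
-	return(total);
-}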
-
-/*******************************************************************//**
-Copies the prefix of an externally stored field of a record. The
-clustered index record that points to this BLOB must be protected by a
-lock or a page latch.
-@return number of bytes written to buf */
-static
-ulint
-btr_copy_externally_stored_field_prefix_low(
-/*========================================*/
- byte* buf, /*!< out: the externally stored part of
- the field, or a prefix of it */
- ulint len, /*!< in: length of buf, in bytes */
- ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
- zero for uncompressed BLOBs */
- ulint space_id,/*!< in: space id of the first BLOB page */
- ulint page_no,/*!< in: page number of the first BLOB page */
- ulint offset, /*!< in: offset on the first BLOB page */
- trx_t* trx) /*!< in: transaction handle */
-{
- if (UNIV_UNLIKELY(len == 0)) {
- return(0);
- }
-
- if (zip_size) {
- return(btr_copy_zblob_prefix(buf, len, zip_size,
- space_id, page_no, offset));
- } else {
- return(btr_copy_blob_prefix(buf, len, space_id,
- page_no, offset, trx));
- }
-}
-
-/*******************************************************************//**
-Copies the prefix of an externally stored field of a record. The
-clustered index record must be protected by a lock or a page latch.
-@return the length of the copied field, or 0 if the column is being
-or has been deleted */
-UNIV_INTERN
-ulint
-btr_copy_externally_stored_field_prefix(
-/*====================================*/
- byte* buf, /*!< out: the field, or a prefix of it */
- ulint len, /*!< in: length of buf, in bytes */
- ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
- zero for uncompressed BLOBs */
- const byte* data, /*!< in: 'internally' stored part of the
- field containing also the reference to
- the external part; must be protected by
- a lock or a page latch */
- ulint local_len,/*!< in: length of data, in bytes */
- trx_t* trx) /*!< in: transaction handle */
-{
- ulint space_id;
- ulint page_no;
- ulint offset;
-
- ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
-
- local_len -= BTR_EXTERN_FIELD_REF_SIZE;
-
- if (UNIV_UNLIKELY(local_len >= len)) {
- memcpy(buf, data, len);
- return(len);
- }
-
- memcpy(buf, data, local_len);
- data += local_len;
-
- ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
-
- if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
- /* The externally stored part of the column has been
- (partially) deleted. Signal the half-deleted BLOB
- to the caller. */
-
- return(0);
- }
-
- space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
-
- page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
-
- offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
-
- return(local_len
- + btr_copy_externally_stored_field_prefix_low(buf + local_len,
- len - local_len,
- zip_size,
- space_id, page_no,
- offset, trx));
-}
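-
-/* Editorial sketch, not InnoDB code: decoding the 20-byte external
-field reference that the functions above read field by field with
-mach_read_from_4(). The layout assumed here matches the BTR_EXTERN_*
-offsets used in this file: a big-endian space id, page number and
-page offset, followed by an 8-byte length of which only the low
-4 bytes are used (a BLOB cannot currently exceed 4 GB). */
-
-#include <stdint.h>
-
-struct sketch_extern_ref {
-	uint32_t	space_id;	/* space id of the first BLOB page */
-	uint32_t	page_no;	/* first BLOB page number */
-	uint32_t	offset;		/* byte offset on the first page */
-	uint32_t	extern_len;	/* externally stored length */
-};
-
-static uint32_t sketch_ref_read_be32(const unsigned char* p)
-{
-	return(((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16)
-	       | ((uint32_t) p[2] << 8) | (uint32_t) p[3]);
-}
-
-/* ref points to the last BTR_EXTERN_FIELD_REF_SIZE (20) bytes of
-the locally stored part of the field. */
-static struct sketch_extern_ref sketch_decode_extern_ref(
-	const unsigned char*	ref)
-{
-	struct sketch_extern_ref	r;
-
-	r.space_id = sketch_ref_read_be32(ref + 0);	/* BTR_EXTERN_SPACE_ID */
-	r.page_no = sketch_ref_read_be32(ref + 4);	/* BTR_EXTERN_PAGE_NO */
-	r.offset = sketch_ref_read_be32(ref + 8);	/* BTR_EXTERN_OFFSET */
-	r.extern_len = sketch_ref_read_be32(ref + 16);	/* BTR_EXTERN_LEN + 4 */
-	return(r);
-}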
-
-/*******************************************************************//**
-Copies an externally stored field of a record to mem heap. The
-clustered index record must be protected by a lock or a page latch.
-@return the whole field copied to heap */
-UNIV_INTERN
-byte*
-btr_copy_externally_stored_field(
-/*=============================*/
- ulint* len, /*!< out: length of the whole field */
- const byte* data, /*!< in: 'internally' stored part of the
- field containing also the reference to
- the external part; must be protected by
- a lock or a page latch */
- ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
- zero for uncompressed BLOBs */
- ulint local_len,/*!< in: length of data */
- mem_heap_t* heap, /*!< in: mem heap */
- trx_t* trx) /*!< in: transaction handle */
-{
- ulint space_id;
- ulint page_no;
- ulint offset;
- ulint extern_len;
- byte* buf;
-
- ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
-
- local_len -= BTR_EXTERN_FIELD_REF_SIZE;
-
- space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
-
- page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
-
- offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
-
- /* Currently a BLOB cannot be bigger than 4 GB; we
- leave the 4 upper bytes in the length field unused */
-
- extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
-
- buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
-
- memcpy(buf, data, local_len);
- *len = local_len
- + btr_copy_externally_stored_field_prefix_low(buf + local_len,
- extern_len,
- zip_size,
- space_id,
- page_no, offset,
- trx);
-
- return(buf);
-}
-
-/*******************************************************************//**
-Copies an externally stored field of a record to mem heap.
-@return the field copied to heap, or NULL if the field is incomplete */
-UNIV_INTERN
-byte*
-btr_rec_copy_externally_stored_field(
-/*=================================*/
- const rec_t* rec, /*!< in: record in a clustered index;
- must be protected by a lock or a page latch */
- const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
- ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
- zero for uncompressed BLOBs */
- ulint no, /*!< in: field number */
- ulint* len, /*!< out: length of the field */
- mem_heap_t* heap, /*!< in: mem heap */
- trx_t* trx) /*!< in: transaction handle */
-{
- ulint local_len;
- const byte* data;
-
- ut_a(rec_offs_nth_extern(offsets, no));
-
-	/* An externally stored field can contain an initial prefix
-	of the field data stored locally. Its last 20 bytes are a
-	field reference: the space id, page number, and offset where
-	the rest of the field data is stored, plus the length of that
-	external part. Some data may have to be stored locally to
-	push the local record length above the 128-byte limit, so
-	that field offsets are stored in two bytes and the extern bit
-	is available in those two bytes. */
-
- data = rec_get_nth_field(rec, offsets, no, &local_len);
-
- ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
-
- if (UNIV_UNLIKELY
- (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
- field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
- /* The externally stored field was not written yet.
- This record should only be seen by
- recv_recovery_rollback_active() or any
- TRX_ISO_READ_UNCOMMITTED transactions. */
- return(NULL);
- }
-
- return(btr_copy_externally_stored_field(len, data,
- zip_size, local_len, heap,
- trx));
-}
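-
-/* Editorial sketch, not InnoDB code: the "BLOB not yet written"
-check performed above, in isolation. A field reference that is still
-all zero bytes means the BLOB pages have not been written yet, so
-the field must be reported as unavailable instead of dereferenced.
-sketch_extern_ref_is_unset() is a hypothetical helper mirroring the
-memcmp() against field_ref_zero. */
-
-#include <string.h>
-
-#define SKETCH_FIELD_REF_SIZE	20	/* BTR_EXTERN_FIELD_REF_SIZE */
-
-/* data/local_len: the locally stored part of the field including
-the trailing 20-byte reference, as returned by rec_get_nth_field(). */
-static bool sketch_extern_ref_is_unset(
-	const unsigned char*	data,
-	size_t			local_len)
-{
-	static const unsigned char	zero_ref[SKETCH_FIELD_REF_SIZE] = { 0 };
-
-	return(local_len >= SKETCH_FIELD_REF_SIZE
-	       && !memcmp(data + local_len - SKETCH_FIELD_REF_SIZE,
-			  zero_ref, SKETCH_FIELD_REF_SIZE));
-}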
-#endif /* !UNIV_HOTBACKUP */