summaryrefslogtreecommitdiff
path: root/storage/innobase/include
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/include')
-rw-r--r--storage/innobase/include/btr0btr.h358
-rw-r--r--storage/innobase/include/btr0btr.inl46
-rw-r--r--storage/innobase/include/btr0bulk.h4
-rw-r--r--storage/innobase/include/btr0cur.h344
-rw-r--r--storage/innobase/include/btr0cur.inl53
-rw-r--r--storage/innobase/include/btr0defragment.h24
-rw-r--r--storage/innobase/include/btr0pcur.h461
-rw-r--r--storage/innobase/include/btr0pcur.inl355
-rw-r--r--storage/innobase/include/btr0sea.h89
-rw-r--r--storage/innobase/include/btr0sea.inl61
-rw-r--r--storage/innobase/include/btr0types.h103
-rw-r--r--storage/innobase/include/buf0block_hint.h2
-rw-r--r--storage/innobase/include/buf0buf.h1366
-rw-r--r--storage/innobase/include/buf0buf.inl271
-rw-r--r--storage/innobase/include/buf0checksum.h18
-rw-r--r--storage/innobase/include/buf0dblwr.h29
-rw-r--r--storage/innobase/include/buf0flu.h64
-rw-r--r--storage/innobase/include/buf0lru.h17
-rw-r--r--storage/innobase/include/buf0rea.h9
-rw-r--r--storage/innobase/include/buf0types.h140
-rw-r--r--storage/innobase/include/data0data.h6
-rw-r--r--storage/innobase/include/data0type.h31
-rw-r--r--storage/innobase/include/data0type.inl131
-rw-r--r--storage/innobase/include/db0err.h5
-rw-r--r--storage/innobase/include/dict0boot.h99
-rw-r--r--storage/innobase/include/dict0crea.h75
-rw-r--r--storage/innobase/include/dict0defrag_bg.h15
-rw-r--r--storage/innobase/include/dict0dict.h534
-rw-r--r--storage/innobase/include/dict0dict.inl39
-rw-r--r--storage/innobase/include/dict0load.h135
-rw-r--r--storage/innobase/include/dict0mem.h391
-rw-r--r--storage/innobase/include/dict0mem.inl7
-rw-r--r--storage/innobase/include/dict0stats.h115
-rw-r--r--storage/innobase/include/dict0stats.inl16
-rw-r--r--storage/innobase/include/dict0stats_bg.h58
-rw-r--r--storage/innobase/include/dict0types.h33
-rw-r--r--storage/innobase/include/dyn0buf.h56
-rw-r--r--storage/innobase/include/fil0crypt.h115
-rw-r--r--storage/innobase/include/fil0fil.h541
-rw-r--r--storage/innobase/include/fsp0file.h134
-rw-r--r--storage/innobase/include/fsp0fsp.h158
-rw-r--r--storage/innobase/include/fsp0space.h11
-rw-r--r--storage/innobase/include/fsp0types.h4
-rw-r--r--storage/innobase/include/fts0fts.h88
-rw-r--r--storage/innobase/include/fts0priv.h23
-rw-r--r--storage/innobase/include/fts0types.h56
-rw-r--r--storage/innobase/include/fut0lst.h65
-rw-r--r--storage/innobase/include/gis0rtree.h108
-rw-r--r--storage/innobase/include/gis0rtree.inl9
-rw-r--r--storage/innobase/include/gis0type.h14
-rw-r--r--storage/innobase/include/ha_prototypes.h95
-rw-r--r--storage/innobase/include/hash0hash.h105
-rw-r--r--storage/innobase/include/ibuf0ibuf.h16
-rw-r--r--storage/innobase/include/ibuf0ibuf.inl3
-rw-r--r--storage/innobase/include/lock0lock.h1081
-rw-r--r--storage/innobase/include/lock0lock.inl35
-rw-r--r--storage/innobase/include/lock0prdt.h16
-rw-r--r--storage/innobase/include/lock0priv.h203
-rw-r--r--storage/innobase/include/lock0priv.inl104
-rw-r--r--storage/innobase/include/lock0types.h79
-rw-r--r--storage/innobase/include/log0crypt.h16
-rw-r--r--storage/innobase/include/log0log.h20
-rw-r--r--storage/innobase/include/log0log.inl18
-rw-r--r--storage/innobase/include/log0recv.h78
-rw-r--r--storage/innobase/include/mach0data.inl1
-rw-r--r--storage/innobase/include/mem0mem.inl2
-rw-r--r--storage/innobase/include/mtr0log.h83
-rw-r--r--storage/innobase/include/mtr0mtr.h454
-rw-r--r--storage/innobase/include/mtr0types.h43
-rw-r--r--storage/innobase/include/os0file.h149
-rw-r--r--storage/innobase/include/os0file.inl46
-rw-r--r--storage/innobase/include/page0cur.h104
-rw-r--r--storage/innobase/include/page0cur.inl139
-rw-r--r--storage/innobase/include/page0page.h198
-rw-r--r--storage/innobase/include/page0page.inl206
-rw-r--r--storage/innobase/include/page0types.h47
-rw-r--r--storage/innobase/include/page0zip.h29
-rw-r--r--storage/innobase/include/page0zip.inl22
-rw-r--r--storage/innobase/include/pars0grm.h174
-rw-r--r--storage/innobase/include/pars0pars.h25
-rw-r--r--storage/innobase/include/que0que.h125
-rw-r--r--storage/innobase/include/que0que.inl50
-rw-r--r--storage/innobase/include/read0types.h70
-rw-r--r--storage/innobase/include/rem0rec.h31
-rw-r--r--storage/innobase/include/rem0rec.inl72
-rw-r--r--storage/innobase/include/row0ftsort.h7
-rw-r--r--storage/innobase/include/row0ins.h8
-rw-r--r--storage/innobase/include/row0log.h81
-rw-r--r--storage/innobase/include/row0merge.h67
-rw-r--r--storage/innobase/include/row0mysql.h209
-rw-r--r--storage/innobase/include/row0purge.h34
-rw-r--r--storage/innobase/include/row0row.h13
-rw-r--r--storage/innobase/include/row0sel.h87
-rw-r--r--storage/innobase/include/row0upd.h10
-rw-r--r--storage/innobase/include/row0vers.h10
-rw-r--r--storage/innobase/include/rw_lock.h50
-rw-r--r--storage/innobase/include/small_vector.h100
-rw-r--r--storage/innobase/include/srv0mon.h26
-rw-r--r--storage/innobase/include/srv0srv.h175
-rw-r--r--storage/innobase/include/srv0start.h2
-rw-r--r--storage/innobase/include/srw_lock.h554
-rw-r--r--storage/innobase/include/sux_lock.h472
-rw-r--r--storage/innobase/include/transactional_lock_guard.h174
-rw-r--r--storage/innobase/include/trx0i_s.h5
-rw-r--r--storage/innobase/include/trx0purge.h143
-rw-r--r--storage/innobase/include/trx0rec.h167
-rw-r--r--storage/innobase/include/trx0roll.h27
-rw-r--r--storage/innobase/include/trx0rseg.h285
-rw-r--r--storage/innobase/include/trx0sys.h224
-rw-r--r--storage/innobase/include/trx0trx.h703
-rw-r--r--storage/innobase/include/trx0trx.inl120
-rw-r--r--storage/innobase/include/trx0types.h44
-rw-r--r--storage/innobase/include/trx0undo.h166
-rw-r--r--storage/innobase/include/trx0undo.inl39
-rw-r--r--storage/innobase/include/univ.i144
-rw-r--r--storage/innobase/include/ut0counter.h64
-rw-r--r--storage/innobase/include/ut0new.h20
-rw-r--r--storage/innobase/include/ut0pool.h21
-rw-r--r--storage/innobase/include/ut0rnd.h11
-rw-r--r--storage/innobase/include/ut0rnd.inl24
-rw-r--r--storage/innobase/include/ut0ut.h21
-rw-r--r--storage/innobase/include/ut0wqueue.h17
122 files changed, 6776 insertions, 8373 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index c0dcc6f39d3..a56598d3620 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -2,7 +2,7 @@
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2014, 2021, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,8 +25,7 @@ The B-tree
Created 6/2/1994 Heikki Tuuri
*******************************************************/
-#ifndef btr0btr_h
-#define btr0btr_h
+#pragma once
#include "dict0dict.h"
#include "data0data.h"
@@ -56,146 +55,20 @@ not acceptable for it to lead to mysterious memory corruption, but it
is acceptable for the program to die with a clear assert failure. */
#define BTR_MAX_LEVELS 100
-/** Latching modes for btr_cur_search_to_nth_level(). */
-enum btr_latch_mode {
- /** Search a record on a leaf page and S-latch it. */
- BTR_SEARCH_LEAF = RW_S_LATCH,
- /** (Prepare to) modify a record on a leaf page and X-latch it. */
- BTR_MODIFY_LEAF = RW_X_LATCH,
- /** Obtain no latches. */
- BTR_NO_LATCHES = RW_NO_LATCH,
- /** Start modifying the entire B-tree. */
- BTR_MODIFY_TREE = 33,
- /** Continue modifying the entire B-tree. */
- BTR_CONT_MODIFY_TREE = 34,
- /** Search the previous record. */
- BTR_SEARCH_PREV = 35,
- /** Modify the previous record. */
- BTR_MODIFY_PREV = 36,
- /** Start searching the entire B-tree. */
- BTR_SEARCH_TREE = 37,
- /** Continue searching the entire B-tree. */
- BTR_CONT_SEARCH_TREE = 38,
-
- /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
- exclusive. */
- /** The search tuple will be inserted to the secondary index
- at the searched position. When the leaf page is not in the
- buffer pool, try to use the change buffer. */
- BTR_INSERT = 512,
-
- /** Try to delete mark a secondary index leaf page record at
- the searched position using the change buffer when the page is
- not in the buffer pool. */
- BTR_DELETE_MARK = 4096,
-
- /** Try to purge the record using the change buffer when the
- secondary index leaf page is not in the buffer pool. */
- BTR_DELETE = 8192,
-
- /** The caller is already holding dict_index_t::lock S-latch. */
- BTR_ALREADY_S_LATCHED = 16384,
- /** Search and S-latch a leaf page, assuming that the
- dict_index_t::lock S-latch is being held. */
- BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
- | BTR_ALREADY_S_LATCHED,
- /** Search the entire index tree, assuming that the
- dict_index_t::lock S-latch is being held. */
- BTR_SEARCH_TREE_ALREADY_S_LATCHED = BTR_SEARCH_TREE
- | BTR_ALREADY_S_LATCHED,
- /** Search and X-latch a leaf page, assuming that the
- dict_index_t::lock S-latch is being held. */
- BTR_MODIFY_LEAF_ALREADY_S_LATCHED = BTR_MODIFY_LEAF
- | BTR_ALREADY_S_LATCHED,
-
- /** Attempt to delete-mark a secondary index record. */
- BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
- /** Attempt to delete-mark a secondary index record
- while holding the dict_index_t::lock S-latch. */
- BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
- | BTR_ALREADY_S_LATCHED,
- /** Attempt to purge a secondary index record. */
- BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
- /** Attempt to purge a secondary index record
- while holding the dict_index_t::lock S-latch. */
- BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
- | BTR_ALREADY_S_LATCHED,
-
- /** In the case of BTR_MODIFY_TREE, the caller specifies
- the intention to delete record only. It is used to optimize
- block->lock range.*/
- BTR_LATCH_FOR_DELETE = 65536,
-
- /** Attempt to purge a secondary index record in the tree. */
- BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE
-};
-
-/** This flag ORed to btr_latch_mode says that we do the search in query
-optimization */
-#define BTR_ESTIMATE 1024U
-
-/** This flag ORed to BTR_INSERT says that we can ignore possible
-UNIQUE definition on secondary indexes when we decide if we can use
-the insert buffer to speed up inserts */
-#define BTR_IGNORE_SEC_UNIQUE 2048U
-
-/** In the case of BTR_MODIFY_TREE, the caller specifies the intention
-to insert record only. It is used to optimize block->lock range.*/
-#define BTR_LATCH_FOR_INSERT 32768U
-
-/** This flag is for undo insert of rtree. For rtree, we need this flag
-to find proper rec to undo insert.*/
-#define BTR_RTREE_UNDO_INS 131072U
-
-/** In the case of BTR_MODIFY_LEAF, the caller intends to allocate or
-free the pages of externally stored fields. */
-#define BTR_MODIFY_EXTERNAL 262144U
-
-/** Try to delete mark the record at the searched position when the
-record is in spatial index */
-#define BTR_RTREE_DELETE_MARK 524288U
-
#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \
- ((latch_mode) & ulint(~(BTR_INSERT \
+ btr_latch_mode((latch_mode) & ~(BTR_INSERT \
| BTR_DELETE_MARK \
| BTR_RTREE_UNDO_INS \
| BTR_RTREE_DELETE_MARK \
| BTR_DELETE \
- | BTR_ESTIMATE \
| BTR_IGNORE_SEC_UNIQUE \
| BTR_ALREADY_S_LATCHED \
| BTR_LATCH_FOR_INSERT \
- | BTR_LATCH_FOR_DELETE \
- | BTR_MODIFY_EXTERNAL)))
-
-#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \
- ((latch_mode) & ulint(~(BTR_LATCH_FOR_INSERT \
- | BTR_LATCH_FOR_DELETE \
- | BTR_MODIFY_EXTERNAL)))
-
-/** Report that an index page is corrupted.
-@param[in] buffer block
-@param[in] index tree */
-ATTRIBUTE_COLD ATTRIBUTE_NORETURN __attribute__((nonnull))
-void btr_corruption_report(const buf_block_t* block,const dict_index_t* index);
-
-/** Assert that a B-tree page is not corrupted.
-@param block buffer block containing a B-tree page
-@param index the B-tree index */
-#define btr_assert_not_corrupted(block, index) \
- if (!!page_is_comp(buf_block_get_frame(block)) \
- != index->table->not_redundant()) \
- btr_corruption_report(block, index)
+ | BTR_LATCH_FOR_DELETE))
-/**************************************************************//**
-Gets the root node of a tree and sx-latches it for segment access.
-@return root page, sx-latched */
-page_t*
-btr_root_get(
-/*=========*/
- const dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
+#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \
+ btr_latch_mode((latch_mode) \
+ & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE))
/**************************************************************//**
Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
@@ -206,67 +79,21 @@ btr_root_adjust_on_import(
const dict_index_t* index) /*!< in: index tree */
MY_ATTRIBUTE((warn_unused_result));
-/**************************************************************//**
-Gets the height of the B-tree (the level of the root, when the leaf
-level is assumed to be 0). The caller must hold an S or X latch on
-the index.
-@return tree height (level of the root) */
-ulint
-btr_height_get(
-/*===========*/
- const dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((warn_unused_result));
+/** Report a decryption failure. */
+ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
/** Get an index page and declare its latching order level.
@param[in] index index tree
@param[in] page page number
@param[in] mode latch mode
@param[in] merge whether change buffer merge should be attempted
-@param[in] file file name
-@param[in] line line where called
@param[in,out] mtr mini-transaction
+@param[out] err error code
@return block */
-inline buf_block_t* btr_block_get_func(const dict_index_t& index,
- uint32_t page, ulint mode, bool merge,
- const char* file, unsigned line,
- mtr_t* mtr)
-{
- dberr_t err;
-
- if (buf_block_t* block = buf_page_get_gen(
- page_id_t(index.table->space->id, page),
- index.table->space->zip_size(), mode, NULL, BUF_GET,
- file, line, mtr, &err, merge && !index.is_clust())) {
- ut_ad(err == DB_SUCCESS);
- if (mode != RW_NO_LATCH) {
- buf_block_dbg_add_level(block, index.is_ibuf()
- ? SYNC_IBUF_TREE_NODE
- : SYNC_TREE_NODE);
- }
- return block;
- } else {
- ut_ad(err != DB_SUCCESS);
-
- if (err == DB_DECRYPTION_FAILED) {
- if (index.table) {
- index.table->file_unreadable = true;
- }
- }
-
- return NULL;
- }
-}
+buf_block_t *btr_block_get(const dict_index_t &index,
+ uint32_t page, ulint mode, bool merge,
+ mtr_t *mtr, dberr_t *err= nullptr);
-/** Gets a buffer page and declares its latching order level.
-@param index index tree
-@param page page number
-@param mode latch mode
-@param merge whether change buffer merge should be attempted
-@param mtr mini-transaction handle
-@return the block descriptor */
-# define btr_block_get(index, page, mode, merge, mtr) \
- btr_block_get_func(index, page, mode, merge, __FILE__, __LINE__, mtr)
/**************************************************************//**
Gets the index id field of a page.
@return index id */
@@ -305,17 +132,6 @@ inline uint32_t btr_page_get_prev(const page_t* page)
}
/**************************************************************//**
-Releases the latch on a leaf page and bufferunfixes it. */
-UNIV_INLINE
-void
-btr_leaf_page_release(
-/*==================*/
- buf_block_t* block, /*!< in: buffer block */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
- BTR_MODIFY_LEAF */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
-/**************************************************************//**
Gets the child node file address in a node pointer.
NOTE: the offsets array must contain all offsets for the record since
we read the last field according to offsets and assume that it contains
@@ -336,6 +152,7 @@ btr_node_ptr_get_child_page_no(
@param[in] index_id index id
@param[in] index index, or NULL to create a system table
@param[in,out] mtr mini-transaction
+@param[out] err error code
@return page number of the created root
@retval FIL_NULL if did not succeed */
uint32_t
@@ -344,23 +161,21 @@ btr_create(
fil_space_t* space,
index_id_t index_id,
dict_index_t* index,
- mtr_t* mtr);
+ mtr_t* mtr,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull(2,5,6), warn_unused_result));
/** Free a persistent index tree if it exists.
-@param[in] page_id root page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] space tablespce
+@param[in] page root page number
@param[in] index_id PAGE_INDEX_ID contents
@param[in,out] mtr mini-transaction */
-void
-btr_free_if_exists(
- const page_id_t page_id,
- ulint zip_size,
- index_id_t index_id,
- mtr_t* mtr);
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+ index_id_t index_id, mtr_t *mtr);
-/** Free an index tree in a temporary tablespace.
-@param[in] page_id root page id */
-void btr_free(const page_id_t page_id);
+/** Drop a temporary table
+@param table temporary table */
+void btr_drop_temporary_table(const dict_table_t &table);
/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
@param[in,out] index clustered index
@@ -396,11 +211,11 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
@param[in,out] mtr mini-transaction */
void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
+ATTRIBUTE_COLD __attribute__((nonnull))
/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
@param[in] index clustered index with instant ALTER TABLE
@param[in] all whether to reset FIL_PAGE_TYPE as well
@param[in,out] mtr mini-transaction */
-ATTRIBUTE_COLD __attribute__((nonnull))
void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
/*************************************************************//**
@@ -423,8 +238,9 @@ btr_root_raise_and_insert(
that can be emptied, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((warn_unused_result));
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Reorganizes an index page.
@@ -434,15 +250,12 @@ be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.
-@retval true if the operation was successful
-@retval false if it is a compressed page, and recompression failed */
-bool
-btr_page_reorganize(
-/*================*/
- page_cur_t* cursor, /*!< in/out: page cursor */
- dict_index_t* index, /*!< in: the index tree of the page */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
+@param cursor page cursor
+@param mtr mini-transaction
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Decide if the page should be split at the convergence point of inserts
converging to the left.
@param[in] cursor insert position
@@ -481,23 +294,20 @@ btr_page_split_and_insert(
that can be emptied, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((warn_unused_result));
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*******************************************************//**
Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
-void
-btr_insert_on_non_leaf_level_func(
-/*==============================*/
+dberr_t
+btr_insert_on_non_leaf_level(
ulint flags, /*!< in: undo logging and locking flags */
dict_index_t* index, /*!< in: index */
ulint level, /*!< in: level, must be > 0 */
dtuple_t* tuple, /*!< in: the record to be inserted */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_insert_on_non_leaf_level(f,i,l,t,m) \
- btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m)
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Set a child page pointer record as the predefined minimum record.
@tparam has_prev whether the page is supposed to have a left sibling
@@ -508,9 +318,9 @@ template<bool has_prev= false>
inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
mtr_t *mtr)
{
- ut_ad(block.frame == page_align(rec));
- ut_ad(!page_is_leaf(block.frame));
- ut_ad(has_prev == page_has_prev(block.frame));
+ ut_ad(block.page.frame == page_align(rec));
+ ut_ad(!page_is_leaf(block.page.frame));
+ ut_ad(has_prev == page_has_prev(block.page.frame));
rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS;
@@ -523,13 +333,11 @@ inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
}
/** Seek to the parent page of a B-tree page.
-@param[in,out] index b-tree
-@param[in] block child page
@param[in,out] mtr mini-transaction
-@param[out] cursor cursor pointing to the x-latched parent page */
-void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
- btr_cur_t* cursor)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] cursor cursor pointing to the x-latched parent page
+@return whether the cursor was successfully positioned */
+bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
+ MY_ATTRIBUTE((nonnull,warn_unused_result));
#ifdef UNIV_DEBUG
/************************************************************//**
Checks that the node pointer to a page is appropriate.
@@ -551,53 +359,29 @@ level lifts the records of the page to the father page, thus reducing the
tree height. It is assumed that mtr holds an x-latch on the tree and on the
page. If cursor is on the leaf level, mtr must also hold x-latches to
the brothers, if they exist.
-@return TRUE on success */
-ibool
+@return error code
+@retval DB_FAIL if the tree could not be merged */
+dberr_t
btr_compress(
/*=========*/
btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
or lift; the page must not be empty:
when deleting records, use btr_discard_page()
if the page would become empty */
- ibool adjust, /*!< in: TRUE if should adjust the
- cursor position even if compression occurs */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Discards a page from a B-tree. This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time. This cannot
be used for the root page, which is allowed to be empty. */
-void
+dberr_t
btr_discard_page(
/*=============*/
btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
the root page */
mtr_t* mtr); /*!< in: mtr */
-/**************************************************************//**
-Gets the number of pages in a B-tree.
-@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
-ulint
-btr_get_size(
-/*=========*/
- const dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
- MY_ATTRIBUTE((warn_unused_result));
-/**************************************************************//**
-Gets the number of reserved and used pages in a B-tree.
-@return number of pages reserved, or ULINT_UNDEFINED if the index
-is unavailable */
-UNIV_INTERN
-ulint
-btr_get_size_and_reserved(
-/*======================*/
- dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- ulint* used, /*!< out: number of pages used (<= reserved) */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
- __attribute__((nonnull));
/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
@@ -614,9 +398,10 @@ btr_page_alloc(
in the tree */
mtr_t* mtr, /*!< in/out: mini-transaction
for the allocation */
- mtr_t* init_mtr) /*!< in/out: mini-transaction
+ mtr_t* init_mtr, /*!< in/out: mini-transaction
for x-latching and initializing
the page */
+ dberr_t* err) /*!< out: error code */
MY_ATTRIBUTE((warn_unused_result));
/** Empty an index page (possibly the root page). @see btr_page_create().
@param[in,out] block page to be emptied
@@ -648,10 +433,11 @@ btr_page_create(
@param[in,out] index index tree
@param[in,out] block block to be freed
@param[in,out] mtr mini-transaction
-@param[in] blob whether this is freeing a BLOB page */
+@param[in] blob whether this is freeing a BLOB page
+@param[in] latched whether index->table->space->x_lock() was called */
MY_ATTRIBUTE((nonnull))
-void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
- bool blob = false);
+dberr_t btr_page_free(dict_index_t *index, buf_block_t *block, mtr_t *mtr,
+ bool blob= false, bool space_latched= false);
/**************************************************************//**
Gets the root node of a tree and x- or s-latches it.
@@ -659,11 +445,11 @@ Gets the root node of a tree and x- or s-latches it.
buf_block_t*
btr_root_block_get(
/*===============*/
- const dict_index_t* index, /*!< in: index tree */
+ dict_index_t* index, /*!< in: index tree */
rw_lock_type_t mode, /*!< in: either RW_S_LATCH
or RW_X_LATCH */
- mtr_t* mtr); /*!< in: mtr */
-
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err); /*!< out: error code */
/*************************************************************//**
Reorganizes an index page.
@@ -673,15 +459,15 @@ be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.
-@retval true if the operation was successful
-@retval false if it is a compressed page, and recompression failed */
-bool btr_page_reorganize_block(
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize_block(
ulint z_level,/*!< in: compression level to be used
if dealing with compressed page */
buf_block_t* block, /*!< in/out: B-tree page */
dict_index_t* index, /*!< in: the index tree of the page */
mtr_t* mtr) /*!< in/out: mini-transaction */
- __attribute__((nonnull));
+ __attribute__((nonnull, warn_unused_result));
#ifdef UNIV_BTR_PRINT
/*************************************************************//**
@@ -736,16 +522,15 @@ dberr_t btr_level_list_remove(const buf_block_t& block,
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
-UNIV_INTERN
buf_block_t*
btr_lift_page_up(
-/*=============*/
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: page which is the only on its level;
must not be empty: use
btr_discard_only_page_on_level if the last
record from the page should be removed */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
__attribute__((nonnull));
#define BTR_N_LEAF_PAGES 1
@@ -756,6 +541,3 @@ btr_lift_page_up(
/****************************************************************
Global variable controlling if scrubbing should be performed */
extern my_bool srv_immediate_scrub_data_uncompressed;
-extern Atomic_counter<uint32_t> btr_validate_index_running;
-
-#endif
diff --git a/storage/innobase/include/btr0btr.inl b/storage/innobase/include/btr0btr.inl
index 89826e8f214..9a9e39b6b4c 100644
--- a/storage/innobase/include/btr0btr.inl
+++ b/storage/innobase/include/btr0btr.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,10 +24,7 @@ The B-tree
Created 6/2/1994 Heikki Tuuri
*******************************************************/
-#include "mach0data.h"
-#include "mtr0mtr.h"
#include "mtr0log.h"
-#include "page0zip.h"
/**************************************************************//**
Gets the index id field of a page.
@@ -50,7 +47,7 @@ void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
{
ut_ad(level <= BTR_MAX_NODE_LEVEL);
constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL;
- byte *b= my_assume_aligned<2>(&block->frame[field]);
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, level) &&
UNIV_LIKELY_NULL(block->page.zip.data))
memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
@@ -63,7 +60,7 @@ void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
{
constexpr uint16_t field= FIL_PAGE_NEXT;
- byte *b= my_assume_aligned<4>(&block->frame[field]);
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, next) &&
UNIV_LIKELY_NULL(block->page.zip.data))
memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
@@ -76,7 +73,7 @@ inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
{
constexpr uint16_t field= FIL_PAGE_PREV;
- byte *b= my_assume_aligned<4>(&block->frame[field]);
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, prev) &&
UNIV_LIKELY_NULL(block->page.zip.data))
memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
@@ -112,38 +109,3 @@ btr_node_ptr_get_child_page_no(
return(page_no);
}
-
-/**************************************************************//**
-Releases the latches on a leaf page and bufferunfixes it. */
-UNIV_INLINE
-void
-btr_leaf_page_release(
-/*==================*/
- buf_block_t* block, /*!< in: buffer block */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
- BTR_MODIFY_LEAF */
- mtr_t* mtr) /*!< in: mtr */
-{
- ut_ad(latch_mode == BTR_SEARCH_LEAF
- || latch_mode == BTR_MODIFY_LEAF
- || latch_mode == BTR_NO_LATCHES);
-
- ut_ad(!mtr->memo_contains_flagged(block, MTR_MEMO_MODIFY));
-
- mtr_memo_type_t mode;
- switch (latch_mode) {
- case BTR_SEARCH_LEAF:
- mode = MTR_MEMO_PAGE_S_FIX;
- break;
- case BTR_MODIFY_LEAF:
- mode = MTR_MEMO_PAGE_X_FIX;
- break;
- case BTR_NO_LATCHES:
- mode = MTR_MEMO_BUF_FIX;
- break;
- default:
- ut_a(0);
- }
-
- mtr->memo_release(block, mode);
-}
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
index 943836f8759..9fcea86d95d 100644
--- a/storage/innobase/include/btr0bulk.h
+++ b/storage/innobase/include/btr0bulk.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -170,7 +170,7 @@ public:
inline void release();
/** Start mtr and latch block */
- inline dberr_t latch();
+ inline void latch();
/** Check if required space is available in the page for the rec
to be inserted. We check fill factor & padding here.
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index 2cc7eb726a4..f6abc9f5e52 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,6 +33,9 @@ Created 10/16/1994 Heikki Tuuri
#include "rem0types.h"
#include "gis0type.h"
#include "my_base.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "srw_lock.h"
+#endif
/** Mode flags for btr_cur operations; these can be ORed */
enum {
@@ -60,46 +63,13 @@ enum {
BTR_KEEP_IBUF_BITMAP = 32
};
-/* btr_cur_latch_leaves() returns latched blocks and savepoints. */
-struct btr_latch_leaves_t {
- /* left block, target block and right block */
- buf_block_t* blocks[3];
- ulint savepoints[3];
-};
-
#include "que0types.h"
#include "row0types.h"
-#ifdef UNIV_DEBUG
-/*********************************************************//**
-Returns the page cursor component of a tree cursor.
-@return pointer to page cursor component */
-UNIV_INLINE
-page_cur_t*
-btr_cur_get_page_cur(
-/*=================*/
- const btr_cur_t* cursor);/*!< in: tree cursor */
-/*********************************************************//**
-Returns the buffer block on which the tree cursor is positioned.
-@return pointer to buffer block */
-UNIV_INLINE
-buf_block_t*
-btr_cur_get_block(
-/*==============*/
- const btr_cur_t* cursor);/*!< in: tree cursor */
-/*********************************************************//**
-Returns the record pointer of a tree cursor.
-@return pointer to record */
-UNIV_INLINE
-rec_t*
-btr_cur_get_rec(
-/*============*/
- const btr_cur_t* cursor);/*!< in: tree cursor */
-#else /* UNIV_DEBUG */
-# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
-# define btr_cur_get_block(cursor) ((cursor)->page_cur.block)
-# define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec)
-#endif /* UNIV_DEBUG */
+#define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
+#define btr_cur_get_block(cursor) ((cursor)->page_cur.block)
+#define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec)
+
/*********************************************************//**
Returns the compressed page on which the tree cursor is positioned.
@return pointer to compressed page, or NULL if the page is not compressed */
@@ -120,7 +90,7 @@ btr_cur_get_page(
Returns the index of a cursor.
@param cursor b-tree cursor
@return index */
-#define btr_cur_get_index(cursor) ((cursor)->index)
+#define btr_cur_get_index(cursor) ((cursor)->index())
/*********************************************************//**
Positions a tree cursor at a given record. */
UNIV_INLINE
@@ -150,104 +120,36 @@ bool
btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result));
-/** Optimistically latches the leaf page or pages requested.
-@param[in] block guessed buffer block
-@param[in] modify_clock modify clock value
-@param[in,out] latch_mode BTR_SEARCH_LEAF, ...
-@param[in,out] cursor cursor
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
-@return true if success */
-bool
-btr_cur_optimistic_latch_leaves(
- buf_block_t* block,
- ib_uint64_t modify_clock,
- ulint* latch_mode,
- btr_cur_t* cursor,
- const char* file,
- unsigned line,
- mtr_t* mtr);
-
-/** Searches an index tree and positions a tree cursor on a given level.
+MY_ATTRIBUTE((warn_unused_result))
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given non-leaf level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
-Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
-If mode is PAGE_CUR_GE, then up_match will a have a sensible value.
-@param index index
+Cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record.
@param level the tree level of search
@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that
it cannot get compared to the node ptr page number field!
-@param mode PAGE_CUR_L, NOTE that if the search is made using a unique
- prefix of a record, mode should be PAGE_CUR_LE, not
- PAGE_CUR_GE, as the latter may end up on the previous page of
- the record! Inserts should always be made using PAGE_CUR_LE
- to search the position!
-@param latch_mode BTR_SEARCH_LEAF, ..., ORed with at most one of BTR_INSERT,
- BTR_DELETE_MARK, BTR_DELETE, or BTR_ESTIMATE;
- cursor->left_block is used to store a pointer to the left
- neighbor page, in the cases BTR_SEARCH_PREV and
- BTR_MODIFY_PREV; NOTE that if ahi_latch, we might not have a
- cursor page latch, we assume that ahi_latch protects the
- record!
+@param latch RW_S_LATCH or RW_X_LATCH
@param cursor tree cursor; the cursor page is s- or x-latched, but see also
above!
-@param file file name
-@param line line where called
@param mtr mini-transaction
-@param autoinc PAGE_ROOT_AUTO_INC to be written (0 if none)
@return DB_SUCCESS on success or error code otherwise */
-dberr_t btr_cur_search_to_nth_level(dict_index_t *index, ulint level,
+dberr_t btr_cur_search_to_nth_level(ulint level,
const dtuple_t *tuple,
- page_cur_mode_t mode, ulint latch_mode,
- btr_cur_t *cursor, const char *file,
- unsigned line, mtr_t *mtr,
- ib_uint64_t autoinc= 0);
-
-/*****************************************************************//**
-Opens a cursor at either end of an index.
-@return DB_SUCCESS or error code */
-dberr_t
-btr_cur_open_at_index_side_func(
-/*============================*/
- bool from_left, /*!< in: true if open to the low end,
- false if to the high end */
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: latch mode */
- btr_cur_t* cursor, /*!< in/out: cursor */
- ulint level, /*!< in: level to search for
- (0=leaf) */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
-
-#define btr_cur_open_at_index_side(f,i,l,c,lv,m) \
- btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m)
+ rw_lock_type_t rw_latch,
+ btr_cur_t *cursor, mtr_t *mtr);
-/**********************************************************************//**
-Positions a cursor at a randomly chosen position within a B-tree.
-@return true if the index is available and we have put the cursor, false
-if the index is unavailable */
-bool
-btr_cur_open_at_rnd_pos_func(
-/*=========================*/
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_cur_t* cursor, /*!< in/out: B-tree cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_cur_open_at_rnd_pos(i,l,c,m) \
- btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record.
-@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
dberr_t
btr_cur_optimistic_insert(
/*======================*/
@@ -324,7 +226,6 @@ btr_cur_update_alloc_zip_func(
/*==========================*/
page_zip_des_t* page_zip,/*!< in/out: compressed page */
page_cur_t* cursor, /*!< in/out: B-tree page cursor */
- dict_index_t* index, /*!< in: the index corresponding to cursor */
#ifdef UNIV_DEBUG
rec_offs* offsets,/*!< in/out: offsets of the cursor record */
#endif /* UNIV_DEBUG */
@@ -334,11 +235,11 @@ btr_cur_update_alloc_zip_func(
mtr_t* mtr) /*!< in/out: mini-transaction */
MY_ATTRIBUTE((nonnull, warn_unused_result));
#ifdef UNIV_DEBUG
-# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
- btr_cur_update_alloc_zip_func(page_zip,cursor,index,offsets,len,cr,mtr)
+# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,offsets,len,cr,mtr)
#else /* UNIV_DEBUG */
-# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
- btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr)
+# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,len,cr,mtr)
#endif /* UNIV_DEBUG */
/** Apply an update vector to a record. No field size changes are allowed.
@@ -468,44 +369,36 @@ that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches to brothers of page, if those
brothers exist. NOTE: it is assumed that the caller has reserved enough
free extents so that the compression will always succeed if done!
-@return TRUE if compression occurred */
-ibool
+@return whether compression occurred */
+bool
btr_cur_compress_if_useful(
/*=======================*/
btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
- cursor does not stay valid if compression
- occurs */
- ibool adjust, /*!< in: TRUE if should adjust the
- cursor position even if compression occurs */
+ cursor does not stay valid if !adjust and
+ compression occurs */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
mtr_t* mtr) /*!< in/out: mini-transaction */
MY_ATTRIBUTE((nonnull));
/*******************************************************//**
Removes the record on which the tree cursor is positioned. It is assumed
that the mtr has an x-latch on the page where the cursor is positioned,
but no latch on the whole tree.
-@return TRUE if success, i.e., the page did not become too empty */
-ibool
-btr_cur_optimistic_delete_func(
-/*===========================*/
+@return error code
+@retval DB_FAIL if the page would become too empty */
+dberr_t
+btr_cur_optimistic_delete(
+/*======================*/
btr_cur_t* cursor, /*!< in: cursor on the record to delete;
cursor stays valid: if deletion succeeds,
on function exit it points to the successor
of the deleted record */
-# ifdef UNIV_DEBUG
ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
-# endif /* UNIV_DEBUG */
mtr_t* mtr) /*!< in: mtr; if this function returns
TRUE on a leaf page of a secondary
index, the mtr must be committed
before latching any further pages */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-# ifdef UNIV_DEBUG
-# define btr_cur_optimistic_delete(cursor, flags, mtr) \
- btr_cur_optimistic_delete_func(cursor, flags, mtr)
-# else /* UNIV_DEBUG */
-# define btr_cur_optimistic_delete(cursor, flags, mtr) \
- btr_cur_optimistic_delete_func(cursor, mtr)
-# endif /* UNIV_DEBUG */
/*************************************************************//**
Removes the record on which the tree cursor is positioned. Tries
to compress the page if its fillfactor drops below a threshold
@@ -537,8 +430,8 @@ btr_cur_pessimistic_delete(
/** Delete the node pointer in a parent page.
@param[in,out] parent cursor pointing to parent record
@param[in,out] mtr mini-transaction */
-void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull));
+dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/***********************************************************//**
Parses a redo log record of updating a record in-place.
@return end of log record or NULL */
@@ -564,47 +457,20 @@ struct btr_pos_t
page_id_t page_id; /* Out: Page where we found the tuple */
};
-/** Estimates the number of rows in a given index range.
-@param[in] index index
-@param[in/out] range_start
-@param[in/out] range_ end
-@return estimated number of rows */
-ha_rows
-btr_estimate_n_rows_in_range(
- dict_index_t* index,
- btr_pos_t* range_start,
- btr_pos_t* range_end);
-
-
-/** Statistics for one field of an index. */
-struct index_field_stats_t
-{
- ib_uint64_t n_diff_key_vals;
- ib_uint64_t n_sample_sizes;
- ib_uint64_t n_non_null_key_vals;
-
- index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
- ib_uint64_t n_sample_sizes= 0,
- ib_uint64_t n_non_null_key_vals= 0)
- : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
- n_non_null_key_vals(n_non_null_key_vals)
- {
- }
-};
-
-/** Estimates the number of different key values in a given index, for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
-0..n_uniq-1) and the number of pages that were sampled is saved in
-index->stat_n_sample_sizes[].
-If innodb_stats_method is nulls_ignored, we also record the number of
-non-null values for each prefix and stored the estimates in
-array index->stat_n_non_null_key_vals.
-@param[in] index index
-@return stat vector if the index is available and we get the estimated numbers,
-empty vector if the index is unavailable. */
-std::vector<index_field_stats_t>
-btr_estimate_number_of_different_key_vals(dict_index_t* index);
+/** Estimates the number of rows in a given index range. Do search in the
+left page, then if there are pages between left and right ones, read a few
+pages to the right, if the right page is reached, fetch it and count the exact
+number of rows, otherwise count the estimated(see
+btr_estimate_n_rows_in_range_on_level() for details) number if rows, and
+fetch the right page. If leaves are reached, unlatch non-leaf pages except
+the right leaf parent. After the right leaf page is fetched, commit mtr.
+@param[in] index index
+@param[in] range_start range start
+@param[in] range_end range end
+@return estimated number of rows */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+ btr_pos_t *range_start,
+ btr_pos_t *range_end);
/** Gets the externally stored size of a record, in units of a database page.
@param[in] rec record
@@ -758,19 +624,6 @@ btr_rec_copy_externally_stored_field(
ulint* len,
mem_heap_t* heap);
-/** Latches the leaf page or pages requested.
-@param[in] block leaf page where the search converged
-@param[in] latch_mode BTR_SEARCH_LEAF, ...
-@param[in] cursor cursor
-@param[in] mtr mini-transaction
-@return blocks and savepoints which actually latched. */
-btr_latch_leaves_t
-btr_cur_latch_leaves(
- buf_block_t* block,
- ulint latch_mode,
- btr_cur_t* cursor,
- mtr_t* mtr);
-
/*######################################################################*/
/** In the pessimistic delete, if the page data size drops below this
@@ -829,24 +682,18 @@ enum btr_cur_method {
/** The tree cursor: the definition appears here only for the compiler
to know struct size! */
struct btr_cur_t {
- dict_index_t* index; /*!< index where positioned */
page_cur_t page_cur; /*!< page cursor */
purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */
- buf_block_t* left_block; /*!< this field is used to store
- a pointer to the left neighbor
- page, in the cases
- BTR_SEARCH_PREV and
- BTR_MODIFY_PREV */
/*------------------------------*/
que_thr_t* thr; /*!< this field is only used
- when btr_cur_search_to_nth_level
+ when search_leaf()
is called for an index entry
insertion: the calling query
thread is passed here to be
used in the insert buffer */
/*------------------------------*/
/** The following fields are used in
- btr_cur_search_to_nth_level to pass information: */
+ search_leaf() to pass information: */
/* @{ */
enum btr_cur_method flag; /*!< Search method used */
ulint tree_height; /*!< Tree height if the search is done
@@ -855,8 +702,7 @@ struct btr_cur_t {
ulint up_match; /*!< If the search mode was PAGE_CUR_LE,
the number of matched fields to the
the first user record to the right of
- the cursor record after
- btr_cur_search_to_nth_level;
+ the cursor record after search_leaf();
for the mode PAGE_CUR_GE, the matched
fields to the first user record AT THE
CURSOR or to the right of it;
@@ -873,8 +719,7 @@ struct btr_cur_t {
ulint low_match; /*!< if search mode was PAGE_CUR_LE,
the number of matched fields to the
first user record AT THE CURSOR or
- to the left of it after
- btr_cur_search_to_nth_level;
+ to the left of it after search_leaf();
NOT defined for PAGE_CUR_GE or any
other search modes; see also the NOTE
in up_match! */
@@ -894,28 +739,45 @@ struct btr_cur_t {
information of the path through
the tree */
rtr_info_t* rtr_info; /*!< rtree search info */
- btr_cur_t():thr(NULL), rtr_info(NULL) {}
- /* default values */
- /** Zero-initialize all fields */
- void init()
- {
- index = NULL;
- memset(&page_cur, 0, sizeof page_cur);
- purge_node = NULL;
- left_block = NULL;
- thr = NULL;
- flag = btr_cur_method(0);
- tree_height = 0;
- up_match = 0;
- up_bytes = 0;
- low_match = 0;
- low_bytes = 0;
- n_fields = 0;
- n_bytes = 0;
- fold = 0;
- path_arr = NULL;
- rtr_info = NULL;
- }
+ btr_cur_t() { memset((void*) this, 0, sizeof *this); }
+
+ dict_index_t *index() const { return page_cur.index; }
+ buf_block_t *block() const { return page_cur.block; }
+
+ /** Open the cursor on the first or last record.
+ @param first true=first record, false=last record
+ @param index B-tree
+ @param latch_mode which latches to acquire
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+ mtr_t *mtr);
+
+ /** Search the leaf page record corresponding to a key.
+ @param tuple key to search for, with correct n_fields_cmp
+ @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+ @param latch_mode latch mode
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode, mtr_t *mtr);
+
+ /** Search the leaf page record corresponding to a key, exclusively latching
+ all sibling pages on the way.
+ @param tuple key to search for, with correct n_fields_cmp
+ @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+ mtr_t *mtr);
+
+ /** Open the cursor at a random leaf page record.
+ @param offsets temporary memory for rec_get_offsets()
+ @param heap memory heap for rec_get_offsets()
+ @param mtr mini-transaction
+ @return error code */
+ inline dberr_t open_random_leaf(rec_offs *&offsets, mem_heap_t *& heap,
+ mtr_t &mtr);
};
/** Modify the delete-mark flag of a record.
@@ -932,9 +794,9 @@ is still a good change of success a little later. Try this many
times. */
#define BTR_CUR_RETRY_DELETE_N_TIMES 100
/** If pessimistic delete fails because of lack of file space, there
-is still a good change of success a little later. Sleep this many
-microseconds between retries. */
-#define BTR_CUR_RETRY_SLEEP_TIME 50000
+is still a good chance of success a little later. Sleep this time
+between retries. */
+static const std::chrono::milliseconds BTR_CUR_RETRY_SLEEP_TIME(50);
/** The reference in a field for which data is stored on a different page.
The reference is at the end of the 'locally' stored part of the field.
@@ -967,16 +829,16 @@ earlier version of the row. In rollback we are not allowed to free an
inherited external field. */
#define BTR_EXTERN_INHERITED_FLAG 64U
-/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
-extern Atomic_counter<ulint> btr_cur_n_non_sea;
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */
+extern ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_non_sea;
/** Old value of btr_cur_n_non_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
extern ulint btr_cur_n_non_sea_old;
-#ifdef BTR_CUR_HASH_ADAPT
/** Number of successful adaptive hash index lookups in
-btr_cur_search_to_nth_level(). */
-extern ulint btr_cur_n_sea;
+btr_cur_t::search_leaf(). */
+extern ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_sea;
/** Old value of btr_cur_n_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl
index 8a45b714936..955cf34288e 100644
--- a/storage/innobase/include/btr0cur.inl
+++ b/storage/innobase/include/btr0cur.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,44 +36,6 @@ if (btr_cur_limit_optimistic_insert_debug > 1\
# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)
#endif /* UNIV_DEBUG */
-#ifdef UNIV_DEBUG
-/*********************************************************//**
-Returns the page cursor component of a tree cursor.
-@return pointer to page cursor component */
-UNIV_INLINE
-page_cur_t*
-btr_cur_get_page_cur(
-/*=================*/
- const btr_cur_t* cursor) /*!< in: tree cursor */
-{
- return(&((btr_cur_t*) cursor)->page_cur);
-}
-
-/*********************************************************//**
-Returns the buffer block on which the tree cursor is positioned.
-@return pointer to buffer block */
-UNIV_INLINE
-buf_block_t*
-btr_cur_get_block(
-/*==============*/
- const btr_cur_t* cursor) /*!< in: tree cursor */
-{
- return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
-}
-
-/*********************************************************//**
-Returns the record pointer of a tree cursor.
-@return pointer to record */
-UNIV_INLINE
-rec_t*
-btr_cur_get_rec(
-/*============*/
- const btr_cur_t* cursor) /*!< in: tree cursor */
-{
- return(page_cur_get_rec(btr_cur_get_page_cur(cursor)));
-}
-#endif /* UNIV_DEBUG */
-
/*********************************************************//**
Returns the compressed page on which the tree cursor is positioned.
@return pointer to compressed page, or NULL if the page is not compressed */
@@ -109,11 +71,8 @@ btr_cur_position(
buf_block_t* block, /*!< in: buffer block of rec */
btr_cur_t* cursor) /*!< out: cursor */
{
- ut_ad(page_align(rec) == block->frame);
-
page_cur_position(rec, block, btr_cur_get_page_cur(cursor));
-
- cursor->index = index;
+ cursor->page_cur.index = index;
}
/*********************************************************************//**
@@ -139,14 +98,14 @@ btr_cur_compress_recommendation(
if (!page_has_siblings(page)
|| page_get_data_size(page)
- < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) {
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) {
/* The page fillfactor has dropped below a predefined
minimum value OR the level in the B-tree contains just
one page: we recommend compression if this is not the
root page. */
- return cursor->index->page
+ return cursor->index()->page
!= btr_cur_get_block(cursor)->page.id().page_no();
}
@@ -174,14 +133,14 @@ btr_cur_can_delete_without_compress(
if (!page_has_siblings(page) || page_get_n_recs(page) < 2
|| page_get_data_size(page) - rec_size
- < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) {
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) {
/* The page fillfactor will drop below a predefined
minimum value, OR the level in the B-tree contains just
one page, OR the page will become empty: we recommend
compression if this is not the root page. */
- return cursor->index->page
+ return cursor->index()->page
== btr_cur_get_block(cursor)->page.id().page_no();
}
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
index a9212db0e04..0523829bdc3 100644
--- a/storage/innobase/include/btr0defragment.h
+++ b/storage/innobase/include/btr0defragment.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
-Copyright (C) 2014, 2020, MariaDB Corporation.
+Copyright (C) 2014, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -43,13 +43,11 @@ Check whether the given index is in btr_defragment_wq. */
bool
btr_defragment_find_index(
dict_index_t* index); /*!< Index to find. */
-/******************************************************************//**
-Add an index to btr_defragment_wq. Return a pointer to os_event if this
-is a synchronized defragmentation. */
-os_event_t
-btr_defragment_add_index(
- dict_index_t* index, /*!< index to be added */
- dberr_t* err); /*!< out: error code */
+/** Defragment an index.
+@param pcur persistent cursor
+@param thd current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd);
/******************************************************************//**
When table is dropped, this function is called to mark a table as removed in
btr_efragment_wq. The difference between this function and the remove_index
@@ -57,17 +55,9 @@ function is this will not NULL the event. */
void
btr_defragment_remove_table(
dict_table_t* table); /*!< Index to be removed. */
-/******************************************************************//**
-Mark an index as removed from btr_defragment_wq. */
-void
-btr_defragment_remove_index(
- dict_index_t* index); /*!< Index to be removed. */
/*********************************************************************//**
Check whether we should save defragmentation statistics to persistent storage.*/
-UNIV_INTERN
-void
-btr_defragment_save_defrag_stats_if_needed(
- dict_index_t* index); /*!< in: index */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index);
/* Stop defragmentation.*/
void btr_defragment_end();
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
index 584cc143359..c66a3bfa329 100644
--- a/storage/innobase/include/btr0pcur.h
+++ b/storage/innobase/include/btr0pcur.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,8 +24,7 @@ The index tree persistent cursor
Created 2/23/1996 Heikki Tuuri
*******************************************************/
-#ifndef btr0pcur_h
-#define btr0pcur_h
+#pragma once
#include "dict0dict.h"
#include "btr0cur.h"
@@ -47,13 +46,6 @@ of a scroll cursor easier */
};
/**************************************************************//**
-Allocates memory for a persistent cursor object and initializes the cursor.
-@return own: persistent cursor */
-btr_pcur_t*
-btr_pcur_create_for_mysql(void);
-/*============================*/
-
-/**************************************************************//**
Resets a persistent cursor object, freeing ::old_rec_buf if it is
allocated and resetting the other members to their initial values. */
void
@@ -62,12 +54,6 @@ btr_pcur_reset(
btr_pcur_t* cursor);/*!< in, out: persistent cursor */
/**************************************************************//**
-Frees the memory for a persistent cursor object. */
-void
-btr_pcur_free_for_mysql(
-/*====================*/
- btr_pcur_t* cursor); /*!< in, own: persistent cursor */
-/**************************************************************//**
Copies the stored position of a pcur to another pcur. */
void
btr_pcur_copy_stored_position(
@@ -84,79 +70,22 @@ btr_pcur_init(
/*==========*/
btr_pcur_t* pcur); /*!< in: persistent cursor */
-/** Free old_rec_buf.
-@param[in] pcur Persistent cursor holding old_rec to be freed. */
-UNIV_INLINE
-void
-btr_pcur_free(
- btr_pcur_t* pcur);
-
-/**************************************************************//**
-Initializes and opens a persistent cursor to an index tree. It should be
-closed with btr_pcur_close. */
-UNIV_INLINE
-dberr_t
-btr_pcur_open_low(
-/*==============*/
- dict_index_t* index, /*!< in: index */
- ulint level, /*!< in: level in the btree */
- const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
- NOTE that if the search is made using a unique
- prefix of a record, mode should be
- PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
- may end up on the previous page from the
- record! */
- ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
- btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- ib_uint64_t autoinc,/*!< in: PAGE_ROOT_AUTO_INC to be written
- (0 if none) */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_pcur_open(i,t,md,l,c,m) \
- btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,0,m)
/** Opens an persistent cursor to an index tree without initializing the
cursor.
-@param index index
@param tuple tuple on which search done
@param mode PAGE_CUR_L, ...; NOTE that if the search is made using a
unique prefix of a record, mode should be PAGE_CUR_LE, not
PAGE_CUR_GE, as the latter may end up on the previous page of
the record!
-@param latch_mode BTR_SEARCH_LEAF, ...; NOTE that if ahi_latch then we might
- not acquire a cursor page latch, but assume that the
- ahi_latch protects the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
@param cursor memory buffer for persistent cursor
-@param file file name
-@param line line where called
-@param mtr mtr
+@param mtr mini-transaction
@return DB_SUCCESS on success or error code otherwise. */
-UNIV_INLINE
-dberr_t btr_pcur_open_with_no_init_func(dict_index_t *index,
- const dtuple_t *tuple,
- page_cur_mode_t mode, ulint latch_mode,
- btr_pcur_t *cursor, const char *file,
- unsigned line, mtr_t *mtr);
-# define btr_pcur_open_with_no_init(ix,t,md,l,cur,m) \
- btr_pcur_open_with_no_init_func(ix,t,md,l,cur,__FILE__,__LINE__,m)
-
-/*****************************************************************//**
-Opens a persistent cursor at either end of an index. */
-UNIV_INLINE
-dberr_t
-btr_pcur_open_at_index_side(
-/*========================*/
- bool from_left, /*!< in: true if open to the low end,
- false if to the high end */
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: latch mode */
- btr_pcur_t* pcur, /*!< in/out: cursor */
- bool init_pcur, /*!< in: whether to initialize pcur */
- ulint level, /*!< in: level to search for
- (0=leaf) */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *cursor, mtr_t *mtr);
+
/**************************************************************//**
Gets the up_match value for a pcur after a search.
@return number of matched fields at the cursor or to the right if
@@ -175,44 +104,7 @@ ulint
btr_pcur_get_low_match(
/*===================*/
const btr_pcur_t* cursor); /*!< in: persistent cursor */
-/**************************************************************//**
-If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
-user record satisfying the search condition, in the case PAGE_CUR_L or
-PAGE_CUR_LE, on the last user record. If no such user record exists, then
-in the first case sets the cursor after last in tree, and in the latter case
-before first in tree. The latching mode must be BTR_SEARCH_LEAF or
-BTR_MODIFY_LEAF. */
-void
-btr_pcur_open_on_user_rec_func(
-/*===========================*/
- dict_index_t* index, /*!< in: index */
- const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ... */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
- BTR_MODIFY_LEAF */
- btr_pcur_t* cursor, /*!< in: memory buffer for persistent
- cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \
- btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
-/**********************************************************************//**
-Positions a cursor at a randomly chosen position within a B-tree.
-@return true if the index is available and we have put the cursor, false
-if the index is unavailable */
-UNIV_INLINE
-bool
-btr_pcur_open_at_rnd_pos_func(
-/*==========================*/
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_pcur_open_at_rnd_pos(i,l,c,m) \
- btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+
/**************************************************************//**
Frees the possible memory heap of a persistent cursor and sets the latch
mode of the persistent cursor to BTR_NO_LATCHES.
@@ -222,9 +114,7 @@ cursor is currently positioned. The latch is acquired by the
are not allowed, you must take care (if using the cursor in S-mode) to
manually release the latch by either calling
btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
-or by committing the mini-transaction right after btr_pcur_close().
-A subsequent attempt to crawl the same page in the same mtr would cause
-an assertion failure. */
+or by mtr_t::commit(). */
UNIV_INLINE
void
btr_pcur_close(
@@ -242,9 +132,6 @@ btr_pcur_store_position(
/*====================*/
btr_pcur_t* cursor, /*!< in: persistent cursor */
mtr_t* mtr); /*!< in: mtr */
-
-#define btr_pcur_restore_position(l,cur,mtr) \
- (cur)->restore_position(l,__FILE__,__LINE__,mtr)
/*********************************************************//**
Gets the rel_pos field for a cursor whose position has been stored.
@return BTR_PCUR_ON, ... */
@@ -293,13 +180,14 @@ btr_pcur_move_to_next(
/*********************************************************//**
Moves the persistent cursor to the previous record in the tree. If no records
are left, the cursor stays 'before first in tree'.
-@return TRUE if the cursor was not before first in tree */
-ibool
+@return true if the cursor was not before first in tree */
+bool
btr_pcur_move_to_prev(
/*==================*/
btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
function may release the page latch */
- mtr_t* mtr); /*!< in: mtr */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*********************************************************//**
Moves the persistent cursor to the next user record in the tree. If no user
records are left, the cursor ends up 'after last in tree'.
@@ -316,60 +204,18 @@ Moves the persistent cursor to the first record on the next page.
Releases the latch on the current page, and bufferunfixes it.
Note that there must not be modifications on the current page,
as then the x-latch can be released only in mtr_commit. */
-void
+dberr_t
btr_pcur_move_to_next_page(
/*=======================*/
btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
last record of the current page */
- mtr_t* mtr); /*!< in: mtr */
-#ifdef UNIV_DEBUG
-/*********************************************************//**
-Returns the btr cursor component of a persistent cursor.
-@return pointer to btr cursor component */
-UNIV_INLINE
-btr_cur_t*
-btr_pcur_get_btr_cur(
-/*=================*/
- const btr_pcur_t* cursor); /*!< in: persistent cursor */
-/*********************************************************//**
-Returns the page cursor component of a persistent cursor.
-@return pointer to page cursor component */
-UNIV_INLINE
-page_cur_t*
-btr_pcur_get_page_cur(
-/*==================*/
- const btr_pcur_t* cursor); /*!< in: persistent cursor */
-/*********************************************************//**
-Returns the page of a persistent cursor.
-@return pointer to the page */
-UNIV_INLINE
-page_t*
-btr_pcur_get_page(
-/*==============*/
- const btr_pcur_t* cursor);/*!< in: persistent cursor */
-/*********************************************************//**
-Returns the buffer block of a persistent cursor.
-@return pointer to the block */
-UNIV_INLINE
-buf_block_t*
-btr_pcur_get_block(
-/*===============*/
- const btr_pcur_t* cursor);/*!< in: persistent cursor */
-/*********************************************************//**
-Returns the record of a persistent cursor.
-@return pointer to the record */
-UNIV_INLINE
-rec_t*
-btr_pcur_get_rec(
-/*=============*/
- const btr_pcur_t* cursor);/*!< in: persistent cursor */
-#else /* UNIV_DEBUG */
-# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
-# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
-# define btr_pcur_get_page(cursor) ((cursor)->btr_cur.page_cur.block->frame)
-# define btr_pcur_get_block(cursor) ((cursor)->btr_cur.page_cur.block)
-# define btr_pcur_get_rec(cursor) ((cursor)->btr_cur.page_cur.rec)
-#endif /* UNIV_DEBUG */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+#define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#define btr_pcur_get_page(cursor) btr_pcur_get_block(cursor)->page.frame
+
/*********************************************************//**
Checks if the persistent cursor is on a user record. */
UNIV_INLINE
@@ -401,17 +247,19 @@ static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor);
Checks if the persistent cursor is after the last user record in
the index tree. */
static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor);
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/*********************************************************//**
Moves the persistent cursor to the next record on the same page. */
UNIV_INLINE
-void
+rec_t*
btr_pcur_move_to_next_on_page(
/*==========================*/
btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/*********************************************************//**
Moves the persistent cursor to the previous record on the same page. */
UNIV_INLINE
-void
+rec_t*
btr_pcur_move_to_prev_on_page(
/*==========================*/
btr_pcur_t* cursor);/*!< in/out: persistent cursor */
@@ -448,103 +296,164 @@ enum pcur_pos_t {
/* The persistent B-tree cursor structure. This is used mainly for SQL
selects, updates, and deletes. */
-struct btr_pcur_t{
- /** Return value of restore_position() */
- enum restore_status {
- /** cursor position on user rec and points on the record with
- the same field values as in the stored record */
- SAME_ALL,
- /** cursor position is on user rec and points on the record with
- the same unique field values as in the stored record */
- SAME_UNIQ,
- /** cursor position is not on user rec or points on the record
- with not the same uniq field values as in the stored record */
- NOT_SAME
- };
- /** a B-tree cursor */
- btr_cur_t btr_cur;
- /** see TODO note below!
- BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
- depending on the latching state of the page and tree where the cursor
- is positioned; BTR_NO_LATCHES means that the cursor is not currently
- positioned:
- we say then that the cursor is detached; it can be restored to
- attached if the old position was stored in old_rec */
- ulint latch_mode;
- /** true if old_rec is stored */
- bool old_stored;
- /** if cursor position is stored, contains an initial segment of the
- latest record cursor was positioned either on, before or after */
- rec_t* old_rec;
- /** btr_cur.index->n_core_fields when old_rec was copied */
- uint16 old_n_core_fields;
- /** number of fields in old_rec */
- uint16 old_n_fields;
- /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
- whether cursor was on, before, or after the old_rec record */
- enum btr_pcur_pos_t rel_pos;
- /** buffer block when the position was stored */
- buf::Block_hint block_when_stored;
- /** the modify clock value of the buffer block when the cursor position
- was stored */
- ib_uint64_t modify_clock;
- /** btr_pcur_store_position() and btr_pcur_restore_position() state. */
- enum pcur_pos_t pos_state;
- /** PAGE_CUR_G, ... */
- page_cur_mode_t search_mode;
- /** the transaction, if we know it; otherwise this field is not defined;
- can ONLY BE USED in error prints in fatal assertion failures! */
- trx_t* trx_if_known;
- /*-----------------------------*/
- /* NOTE that the following fields may possess dynamically allocated
- memory which should be freed if not needed anymore! */
-
- /** NULL, or a dynamically allocated buffer for old_rec */
- byte* old_rec_buf;
- /** old_rec_buf size if old_rec_buf is not NULL */
- ulint buf_size;
-
- btr_pcur_t() :
- btr_cur(), latch_mode(RW_NO_LATCH),
- old_stored(false), old_rec(NULL),
- old_n_fields(0), rel_pos(btr_pcur_pos_t(0)),
- block_when_stored(),
- modify_clock(0), pos_state(BTR_PCUR_NOT_POSITIONED),
- search_mode(PAGE_CUR_UNSUPP), trx_if_known(NULL),
- old_rec_buf(NULL), buf_size(0)
- {
- btr_cur.init();
- }
-
- /** Return the index of this persistent cursor */
- dict_index_t* index() const { return(btr_cur.index); }
- /** Restores the stored position of a persistent cursor bufferfixing
- the page and obtaining the specified latches. If the cursor position
- was saved when the
- (1) cursor was positioned on a user record: this function restores the
- position to the last record LESS OR EQUAL to the stored record;
- (2) cursor was positioned on a page infimum record: restores the
- position to the last record LESS than the user record which was the
- successor of the page infimum;
- (3) cursor was positioned on the page supremum: restores to the first
- record GREATER than the user record which was the predecessor of the
- supremum.
- (4) cursor was positioned before the first or after the last in an
- empty tree: restores to before first or after the last in the tree.
- @param latch_mode BTR_SEARCH_LEAF, ...
- @param file file name
- @param line line where called
- @param mtr mtr
- @return btr_pcur_t::SAME_ALL cursor position on user rec and points on
- the record with the same field values as in the stored record,
- btr_pcur_t::SAME_UNIQ cursor position is on user rec and points on the
- record with the same unique field values as in the stored record,
- btr_pcur_t::NOT_SAME cursor position is not on user rec or points on
- the record with not the samebuniq field values as in the stored */
- restore_status restore_position(ulint latch_mode, const char *file,
- unsigned line, mtr_t *mtr);
+struct btr_pcur_t
+{
+ /** Return value of restore_position() */
+ enum restore_status {
+ /** cursor position on user rec and points on the record with
+ the same field values as in the stored record */
+ SAME_ALL,
+ /** cursor position is on user rec and points on the record with
+ the same unique field values as in the stored record */
+ SAME_UNIQ,
+ /** cursor position is not on user rec or points on the record
+  with not the same unique field values as in the stored record */
+ NOT_SAME,
+ /** the index tree is corrupted */
+ CORRUPTED
+ };
+ /** a B-tree cursor */
+ btr_cur_t btr_cur;
+ /** @see BTR_PCUR_WAS_POSITIONED
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
+ depending on the latching state of the page and tree where the cursor
+ is positioned; BTR_NO_LATCHES means that the cursor is not currently
+ positioned:
+ we say then that the cursor is detached; it can be restored to
+ attached if the old position was stored in old_rec */
+ btr_latch_mode latch_mode= BTR_NO_LATCHES;
+ /** if cursor position is stored, contains an initial segment of the
+ latest record cursor was positioned either on, before or after */
+ rec_t *old_rec= nullptr;
+ /** btr_cur.index()->n_core_fields when old_rec was copied */
+ uint16 old_n_core_fields= 0;
+ /** number of fields in old_rec */
+ uint16 old_n_fields= 0;
+ /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
+ whether cursor was on, before, or after the old_rec record */
+ btr_pcur_pos_t rel_pos= btr_pcur_pos_t(0);
+ /** buffer block when the position was stored */
+ buf::Block_hint block_when_stored;
+ /** the modify clock value of the buffer block when the cursor position
+ was stored */
+ ib_uint64_t modify_clock= 0;
+ /** btr_pcur_store_position() and restore_position() state. */
+ enum pcur_pos_t pos_state= BTR_PCUR_NOT_POSITIONED;
+ page_cur_mode_t search_mode= PAGE_CUR_UNSUPP;
+ /** the transaction, if we know it; otherwise this field is not defined;
+ can ONLY BE USED in error prints in fatal assertion failures! */
+ trx_t *trx_if_known= nullptr;
+ /** a dynamically allocated buffer for old_rec */
+ byte *old_rec_buf= nullptr;
+ /** old_rec_buf size if old_rec_buf is not NULL */
+ ulint buf_size= 0;
+
+ /** Return the index of this persistent cursor */
+ dict_index_t *index() const { return(btr_cur.index()); }
+ MY_ATTRIBUTE((nonnull, warn_unused_result))
+ /** Restores the stored position of a persistent cursor bufferfixing
+ the page and obtaining the specified latches. If the cursor position
+ was saved when the
+ (1) cursor was positioned on a user record: this function restores the
+ position to the last record LESS OR EQUAL to the stored record;
+ (2) cursor was positioned on a page infimum record: restores the
+ position to the last record LESS than the user record which was the
+ successor of the page infimum;
+ (3) cursor was positioned on the page supremum: restores to the first
+ record GREATER than the user record which was the predecessor of the
+ supremum.
+ (4) cursor was positioned before the first or after the last in an
+ empty tree: restores to before first or after the last in the tree.
+ @param latch_mode BTR_SEARCH_LEAF, ...
+ @param mtr mini-transaction
+ @retval SAME_ALL cursor position on user rec and points on
+ the record with the same field values as in the stored record,
+ @retval SAME_UNIQ cursor position is on user rec and points on the
+ record with the same unique field values as in the stored record,
+ @retval NOT_SAME cursor position is not on user rec or points on
+  the record with not the same unique field values as in the stored
+ @retval CORRUPTED if the index is corrupted */
+ restore_status restore_position(btr_latch_mode latch_mode, mtr_t *mtr);
+
+ /** Open the cursor on the first or last record.
+ @param first true=first record, false=last record
+ @param index B-tree
+ @param latch_mode which latches to acquire
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+ mtr_t *mtr)
+
+ {
+ this->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ search_mode= first ? PAGE_CUR_G : PAGE_CUR_L;
+ pos_state= BTR_PCUR_IS_POSITIONED;
+ old_rec= nullptr;
+
+ return btr_cur.open_leaf(first, index, this->latch_mode, mtr);
+ }
};
-#include "btr0pcur.inl"
+inline buf_block_t *btr_pcur_get_block(btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ return cursor->btr_cur.page_cur.block;
+}
-#endif
+inline const buf_block_t *btr_pcur_get_block(const btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ return cursor->btr_cur.page_cur.block;
+}
+
+inline rec_t *btr_pcur_get_rec(const btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ return cursor->btr_cur.page_cur.rec;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. */
+inline
+dberr_t
+btr_pcur_open(
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_LE, ... */
+ btr_latch_mode latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ cursor->search_mode= mode;
+ cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+ cursor->trx_if_known= nullptr;
+ return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+}
+
+/** Open a cursor on the first user record satisfying the search condition;
+in case of no match, after the last index record. */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline
+dberr_t
+btr_pcur_open_on_user_rec(
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ if (dberr_t err=
+ btr_pcur_open(tuple, PAGE_CUR_GE, latch_mode, cursor, mtr))
+ return err;
+ if (!btr_pcur_is_after_last_on_page(cursor) ||
+ btr_pcur_is_after_last_in_tree(cursor))
+ return DB_SUCCESS;
+ if (dberr_t err= btr_pcur_move_to_next_page(cursor, mtr))
+ return err;
+ return btr_pcur_move_to_next_on_page(cursor) ? DB_SUCCESS : DB_CORRUPTION;
+}
+
+#include "btr0pcur.inl"
diff --git a/storage/innobase/include/btr0pcur.inl b/storage/innobase/include/btr0pcur.inl
index 05f61b903ff..b827d70dc47 100644
--- a/storage/innobase/include/btr0pcur.inl
+++ b/storage/innobase/include/btr0pcur.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,83 +36,12 @@ btr_pcur_get_rel_pos(
{
ut_ad(cursor);
ut_ad(cursor->old_rec);
- ut_ad(cursor->old_stored);
ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
|| cursor->pos_state == BTR_PCUR_IS_POSITIONED);
return(cursor->rel_pos);
}
-#ifdef UNIV_DEBUG
-/*********************************************************//**
-Returns the btr cursor component of a persistent cursor.
-@return pointer to btr cursor component */
-UNIV_INLINE
-btr_cur_t*
-btr_pcur_get_btr_cur(
-/*=================*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- const btr_cur_t* btr_cur = &cursor->btr_cur;
- return((btr_cur_t*) btr_cur);
-}
-
-/*********************************************************//**
-Returns the page cursor component of a persistent cursor.
-@return pointer to page cursor component */
-UNIV_INLINE
-page_cur_t*
-btr_pcur_get_page_cur(
-/*==================*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor)));
-}
-
-/*********************************************************//**
-Returns the page of a persistent cursor.
-@return pointer to the page */
-UNIV_INLINE
-page_t*
-btr_pcur_get_page(
-/*==============*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
-
- return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor)));
-}
-
-/*********************************************************//**
-Returns the buffer block of a persistent cursor.
-@return pointer to the block */
-UNIV_INLINE
-buf_block_t*
-btr_pcur_get_block(
-/*===============*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
-
- return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor)));
-}
-
-/*********************************************************//**
-Returns the record of a persistent cursor.
-@return pointer to the record */
-UNIV_INLINE
-rec_t*
-btr_pcur_get_rec(
-/*=============*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
- ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
-
- return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor)));
-}
-#endif /* UNIV_DEBUG */
-
/**************************************************************//**
Gets the up_match value for a pcur after a search.
@return number of matched fields at the cursor or to the right if
@@ -194,16 +123,8 @@ btr_pcur_is_on_user_rec(
/*====================*/
const btr_pcur_t* cursor) /*!< in: persistent cursor */
{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
- ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
-
- if (btr_pcur_is_before_first_on_page(cursor)
- || btr_pcur_is_after_last_on_page(cursor)) {
-
- return(FALSE);
- }
-
- return(TRUE);
+ return !btr_pcur_is_before_first_on_page(cursor) &&
+ !btr_pcur_is_after_last_on_page(cursor);
}
/*********************************************************//**
@@ -233,7 +154,7 @@ static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor)
/*********************************************************//**
Moves the persistent cursor to the next record on the same page. */
UNIV_INLINE
-void
+rec_t*
btr_pcur_move_to_next_on_page(
/*==========================*/
btr_pcur_t* cursor) /*!< in/out: persistent cursor */
@@ -241,25 +162,23 @@ btr_pcur_move_to_next_on_page(
ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
- page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
-
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
+ return page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
}
/*********************************************************//**
Moves the persistent cursor to the previous record on the same page. */
UNIV_INLINE
-void
+rec_t*
btr_pcur_move_to_prev_on_page(
/*==========================*/
btr_pcur_t* cursor) /*!< in/out: persistent cursor */
{
ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_rec = nullptr;
- page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
-
- cursor->old_stored = false;
+ return page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
}
/*********************************************************//**
@@ -276,16 +195,15 @@ btr_pcur_move_to_next_user_rec(
{
ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
loop:
if (btr_pcur_is_after_last_on_page(cursor)) {
- if (btr_pcur_is_after_last_in_tree(cursor)) {
+ if (btr_pcur_is_after_last_in_tree(cursor)
+ || btr_pcur_move_to_next_page(cursor, mtr) != DB_SUCCESS) {
return(FALSE);
}
-
- btr_pcur_move_to_next_page(cursor, mtr);
- } else {
- btr_pcur_move_to_next_on_page(cursor);
+ } else if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(cursor))) {
+ return false;
}
if (btr_pcur_is_on_user_rec(cursor)) {
@@ -308,22 +226,16 @@ btr_pcur_move_to_next(
function may release the page latch */
mtr_t* mtr) /*!< in: mtr */
{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
- ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
- cursor->old_stored = false;
+ cursor->old_rec= nullptr;
- if (btr_pcur_is_after_last_on_page(cursor)) {
- if (btr_pcur_is_after_last_in_tree(cursor)) {
- return(FALSE);
- }
-
- btr_pcur_move_to_next_page(cursor, mtr);
- return(TRUE);
- }
-
- btr_pcur_move_to_next_on_page(cursor);
- return(TRUE);
+ if (btr_pcur_is_after_last_on_page(cursor))
+ return !btr_pcur_is_after_last_in_tree(cursor) &&
+ btr_pcur_move_to_next_page(cursor, mtr) == DB_SUCCESS;
+ else
+ return !!btr_pcur_move_to_next_on_page(cursor);
}
/**************************************************************//**
@@ -381,200 +293,33 @@ btr_pcur_init(
/*==========*/
btr_pcur_t* pcur) /*!< in: persistent cursor */
{
- pcur->old_stored = false;
pcur->old_rec_buf = NULL;
pcur->old_rec = NULL;
pcur->btr_cur.rtr_info = NULL;
}
-/** Free old_rec_buf.
-@param[in] pcur Persistent cursor holding old_rec to be freed. */
-UNIV_INLINE
-void
-btr_pcur_free(
- btr_pcur_t* pcur)
-{
- ut_free(pcur->old_rec_buf);
-}
-
-/**************************************************************//**
-Initializes and opens a persistent cursor to an index tree. It should be
-closed with btr_pcur_close. */
-UNIV_INLINE
-dberr_t
-btr_pcur_open_low(
-/*==============*/
- dict_index_t* index, /*!< in: index */
- ulint level, /*!< in: level in the btree */
- const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
- NOTE that if the search is made using a unique
- prefix of a record, mode should be
- PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
- may end up on the previous page from the
- record! */
- ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
- btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- ib_uint64_t autoinc,/*!< in: PAGE_ROOT_AUTO_INC to be written
- (0 if none) */
- mtr_t* mtr) /*!< in: mtr */
-{
- btr_cur_t* btr_cursor;
- dberr_t err = DB_SUCCESS;
-
- /* Initialize the cursor */
-
- btr_pcur_init(cursor);
-
- cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
- cursor->search_mode = mode;
-
- /* Search with the tree cursor */
-
- btr_cursor = btr_pcur_get_btr_cur(cursor);
-
- ut_ad(!dict_index_is_spatial(index));
-
- err = btr_cur_search_to_nth_level(
- index, level, tuple, mode, latch_mode, btr_cursor,
- file, line, mtr, autoinc);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- ib::warn() << "btr_pcur_open_low"
- << " level: " << level
- << " called from file: "
- << file << " line: " << line
- << " table: " << index->table->name
- << " index: " << index->name
- << " error: " << err;
- }
-
- cursor->pos_state = BTR_PCUR_IS_POSITIONED;
-
- cursor->trx_if_known = NULL;
-
- return(err);
-}
-
/** Opens an persistent cursor to an index tree without initializing the
cursor.
-@param index index
@param tuple tuple on which search done
-@param mode PAGE_CUR_L, ...; NOTE that if the search is made using a
+@param mode search mode; NOTE that if the search is made using a
unique prefix of a record, mode should be PAGE_CUR_LE, not
PAGE_CUR_GE, as the latter may end up on the previous page of
the record!
-@param latch_mode BTR_SEARCH_LEAF, ...; NOTE that if ahi_latch then we might
- not acquire a cursor page latch, but assume that the
- ahi_latch protects the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
@param cursor memory buffer for persistent cursor
-@param file file name
-@param line line where called
-@param mtr mtr
+@param mtr mini-transaction
@return DB_SUCCESS on success or error code otherwise. */
-UNIV_INLINE
-dberr_t btr_pcur_open_with_no_init_func(dict_index_t *index,
- const dtuple_t *tuple,
- page_cur_mode_t mode, ulint latch_mode,
- btr_pcur_t *cursor, const char *file,
- unsigned line, mtr_t *mtr)
-{
- btr_cur_t* btr_cursor;
- dberr_t err = DB_SUCCESS;
-
- cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
- cursor->search_mode = mode;
-
- /* Search with the tree cursor */
-
- btr_cursor = btr_pcur_get_btr_cur(cursor);
-
- err = btr_cur_search_to_nth_level(
- index, 0, tuple, mode, latch_mode, btr_cursor,
- file, line, mtr);
-
- cursor->pos_state = BTR_PCUR_IS_POSITIONED;
-
- cursor->old_stored = false;
-
- cursor->trx_if_known = NULL;
- return err;
-}
-
-/*****************************************************************//**
-Opens a persistent cursor at either end of an index. */
-UNIV_INLINE
-dberr_t
-btr_pcur_open_at_index_side(
-/*========================*/
- bool from_left, /*!< in: true if open to the low end,
- false if to the high end */
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: latch mode */
- btr_pcur_t* pcur, /*!< in/out: cursor */
- bool init_pcur, /*!< in: whether to initialize pcur */
- ulint level, /*!< in: level to search for
- (0=leaf) */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- dberr_t err = DB_SUCCESS;
-
- pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
-
- pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L;
-
- if (init_pcur) {
- btr_pcur_init(pcur);
- }
-
- err = btr_cur_open_at_index_side(
- from_left, index, latch_mode,
- btr_pcur_get_btr_cur(pcur), level, mtr);
- pcur->pos_state = BTR_PCUR_IS_POSITIONED;
-
- pcur->old_stored = false;
-
- pcur->trx_if_known = NULL;
-
- return (err);
-}
-
-/**********************************************************************//**
-Positions a cursor at a randomly chosen position within a B-tree.
-@return true if the index is available and we have put the cursor, false
-if the index is unavailable */
-UNIV_INLINE
-bool
-btr_pcur_open_at_rnd_pos_func(
-/*==========================*/
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr) /*!< in: mtr */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *cursor, mtr_t *mtr)
{
- /* Initialize the cursor */
-
- cursor->latch_mode = latch_mode;
- cursor->search_mode = PAGE_CUR_G;
-
- btr_pcur_init(cursor);
-
- bool available;
-
- available = btr_cur_open_at_rnd_pos_func(index, latch_mode,
- btr_pcur_get_btr_cur(cursor),
- file, line, mtr);
- cursor->pos_state = BTR_PCUR_IS_POSITIONED;
- cursor->old_stored = false;
-
- cursor->trx_if_known = NULL;
-
- return(available);
+ cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
+ cursor->search_mode= mode;
+ cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+ cursor->trx_if_known= nullptr;
+ return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
}
/**************************************************************//**
@@ -586,34 +331,28 @@ cursor is currently positioned. The latch is acquired by the
are not allowed, you must take care (if using the cursor in S-mode) to
manually release the latch by either calling
btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
-or by committing the mini-transaction right after btr_pcur_close().
-A subsequent attempt to crawl the same page in the same mtr would cause
-an assertion failure. */
+or by mtr_t::commit(). */
UNIV_INLINE
void
btr_pcur_close(
/*===========*/
btr_pcur_t* cursor) /*!< in: persistent cursor */
{
- ut_free(cursor->old_rec_buf);
-
- if (cursor->btr_cur.rtr_info) {
- rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
- cursor->btr_cur.rtr_info = NULL;
- }
+ ut_free(cursor->old_rec_buf);
- cursor->old_rec = NULL;
- cursor->old_rec_buf = NULL;
- cursor->btr_cur.page_cur.rec = NULL;
- cursor->btr_cur.page_cur.block = NULL;
+ if (cursor->btr_cur.rtr_info)
+ rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
- cursor->old_rec = NULL;
- cursor->old_stored = false;
+ cursor->btr_cur.rtr_info= nullptr;
+ cursor->old_rec = nullptr;
+ cursor->old_rec_buf = nullptr;
+ cursor->btr_cur.page_cur.rec = nullptr;
+ cursor->btr_cur.page_cur.block = nullptr;
- cursor->latch_mode = BTR_NO_LATCHES;
- cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
- cursor->trx_if_known = NULL;
+ cursor->trx_if_known = nullptr;
}
/*********************************************************//**
@@ -629,5 +368,5 @@ btr_pcur_move_before_first_on_page(
page_cur_set_before_first(btr_pcur_get_block(cursor),
btr_pcur_get_page_cur(cursor));
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
}
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
index cd29e13f5bd..48e4fadab9b 100644
--- a/storage/innobase/include/btr0sea.h
+++ b/storage/innobase/include/btr0sea.h
@@ -30,7 +30,11 @@ Created 2/17/1996 Heikki Tuuri
#include "dict0dict.h"
#ifdef BTR_CUR_HASH_ADAPT
#include "ha0ha.h"
-#include "sync0sync.h"
+#include "srw_lock.h"
+
+#ifdef UNIV_PFS_RWLOCK
+extern mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
#define btr_search_sys_create() btr_search_sys.create()
#define btr_search_sys_free() btr_search_sys.free()
@@ -59,15 +63,9 @@ both have sensible values.
@param[in,out] info index search info
@param[in] tuple logical record
@param[in] mode PAGE_CUR_L, ....
-@param[in] latch_mode BTR_SEARCH_LEAF, ...;
- NOTE that only if has_search_latch is 0, we will
- have a latch set on the cursor page, otherwise
- we assume the caller uses his search latch
- to protect the record!
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
@param[out] cursor tree cursor
-@param[in] ahi_latch the adaptive hash index latch being held,
- or NULL
-@param[in] mtr mini transaction
+@param[in] mtr mini-transaction
@return whether the search succeeded */
bool
btr_search_guess_on_hash(
@@ -111,8 +109,8 @@ void btr_search_drop_page_hash_when_freed(const page_id_t page_id);
using btr_cur_search_, and the new record has been
inserted next to the cursor.
@param[in] ahi_latch the adaptive hash index latch */
-void
-btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch);
+void btr_search_update_hash_node_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch);
/** Updates the page hash index when a single record is inserted on a page.
@param[in,out] cursor cursor which was positioned to the
@@ -120,13 +118,13 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch);
and the new record has been inserted next
to the cursor
@param[in] ahi_latch the adaptive hash index latch */
-void
-btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch);
+void btr_search_update_hash_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch);
/** Updates the page hash index when a single record is deleted from a page.
@param[in] cursor cursor which was positioned on the record to delete
using btr_cur_search_, the record is not yet deleted.*/
-void btr_search_update_hash_on_delete(btr_cur_t* cursor);
+void btr_search_update_hash_on_delete(btr_cur_t *cursor);
/** Validates the search system.
@return true if ok */
@@ -141,28 +139,13 @@ static inline void btr_search_x_unlock_all();
/** Lock all search latches in shared mode. */
static inline void btr_search_s_lock_all();
-#ifdef UNIV_DEBUG
-/** Check if thread owns all the search latches.
-@param[in] mode lock mode check
-@retval true if owns all of them
-@retval false if does not own some of them */
-static inline bool btr_search_own_all(ulint mode);
-
-/** Check if thread owns any of the search latches.
-@param[in] mode lock mode check
-@retval true if owns any of them
-@retval false if owns no search latch */
-static inline bool btr_search_own_any(ulint mode);
-
-/** @return whether this thread holds any of the search latches */
-static inline bool btr_search_own_any();
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all();
+# ifdef UNIV_DEBUG
/** @return if the index is marked as freed */
bool btr_search_check_marked_free_index(const buf_block_t *block);
-#endif /* UNIV_DEBUG */
-
-/** Unlock all search latches from shared mode. */
-static inline void btr_search_s_unlock_all();
+# endif /* UNIV_DEBUG */
#else /* BTR_CUR_HASH_ADAPT */
# define btr_search_sys_create()
# define btr_search_sys_free()
@@ -257,20 +240,30 @@ struct btr_search_sys_t
struct partition
{
/** latches protecting hash_table */
- rw_lock_t latch;
+ srw_spin_lock latch;
/** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */
hash_table_t table;
/** memory heap for table */
mem_heap_t *heap;
- char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof(rw_lock_t) -
- sizeof(hash_table_t) - sizeof(mem_heap_t)) &
+#ifdef _MSC_VER
+#pragma warning(push)
+// nonstandard extension - zero sized array, if perfschema is not compiled
+#pragma warning(disable : 4200)
+#endif
+
+ char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof latch -
+ sizeof table - sizeof heap) &
(CPU_LEVEL1_DCACHE_LINESIZE - 1)];
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
void init()
{
memset((void*) this, 0, sizeof *this);
- rw_lock_create(btr_search_latch_key, &latch, SYNC_SEARCH_SYS);
+ latch.SRW_LOCK_INIT(btr_search_latch_key);
}
void alloc(ulint hash_size)
@@ -292,7 +285,7 @@ struct btr_search_sys_t
void free()
{
- rw_lock_free(&latch);
+ latch.destroy();
if (heap)
clear();
}
@@ -316,7 +309,7 @@ struct btr_search_sys_t
}
/** Get the search latch for the adaptive hash index partition */
- rw_lock_t *get_latch(const dict_index_t &index) const
+ srw_spin_lock *get_latch(const dict_index_t &index) const
{ return &get_part(index)->latch; }
/** Create and initialize at startup */
@@ -357,14 +350,24 @@ struct btr_search_sys_t
extern btr_search_sys_t btr_search_sys;
/** @return number of leaf pages pointed to by the adaptive hash index */
-inline ulint dict_index_t::n_ahi_pages() const
+TRANSACTIONAL_INLINE inline ulint dict_index_t::n_ahi_pages() const
{
if (!btr_search_enabled)
return 0;
- rw_lock_t *latch = &btr_search_sys.get_part(*this)->latch;
- rw_lock_s_lock(latch);
+ srw_spin_lock *latch= &btr_search_sys.get_part(*this)->latch;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (latch->is_locked())
+ xabort();
+ ulint ref_count= search_info->ref_count;
+ xend();
+ return ref_count;
+ }
+#endif
+ latch->rd_lock(SRW_LOCK_CALL);
ulint ref_count= search_info->ref_count;
- rw_lock_s_unlock(latch);
+ latch->rd_unlock();
return ref_count;
}
diff --git a/storage/innobase/include/btr0sea.inl b/storage/innobase/include/btr0sea.inl
index 40eb5d86ead..5a8d648029a 100644
--- a/storage/innobase/include/btr0sea.inl
+++ b/storage/innobase/include/btr0sea.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -47,8 +47,7 @@ static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
/** Updates the search info.
@param[in,out] info search info
@param[in,out] cursor cursor which was just positioned */
-void
-btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor);
+void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor);
/*********************************************************************//**
Updates the search info. */
@@ -59,10 +58,10 @@ btr_search_info_update(
dict_index_t* index, /*!< in: index of the cursor */
btr_cur_t* cursor) /*!< in: cursor which was just positioned */
{
- ut_ad(!btr_search_own_any(RW_LOCK_S));
- ut_ad(!btr_search_own_any(RW_LOCK_X));
+ ut_ad(!index->is_spatial());
+ ut_ad(!index->table->is_temporary());
- if (dict_index_is_spatial(index) || !btr_search_enabled) {
+ if (!btr_search_enabled) {
return;
}
@@ -88,7 +87,7 @@ btr_search_info_update(
static inline void btr_search_x_lock_all()
{
for (ulint i = 0; i < btr_ahi_parts; ++i) {
- rw_lock_x_lock(&btr_search_sys.parts[i].latch);
+ btr_search_sys.parts[i].latch.wr_lock(SRW_LOCK_CALL);
}
}
@@ -96,7 +95,7 @@ static inline void btr_search_x_lock_all()
static inline void btr_search_x_unlock_all()
{
for (ulint i = 0; i < btr_ahi_parts; ++i) {
- rw_lock_x_unlock(&btr_search_sys.parts[i].latch);
+ btr_search_sys.parts[i].latch.wr_unlock();
}
}
@@ -104,7 +103,7 @@ static inline void btr_search_x_unlock_all()
static inline void btr_search_s_lock_all()
{
for (ulint i = 0; i < btr_ahi_parts; ++i) {
- rw_lock_s_lock(&btr_search_sys.parts[i].latch);
+ btr_search_sys.parts[i].latch.rd_lock(SRW_LOCK_CALL);
}
}
@@ -112,49 +111,7 @@ static inline void btr_search_s_lock_all()
static inline void btr_search_s_unlock_all()
{
for (ulint i = 0; i < btr_ahi_parts; ++i) {
- rw_lock_s_unlock(&btr_search_sys.parts[i].latch);
- }
-}
-
-#ifdef UNIV_DEBUG
-/** Check if thread owns all the search latches.
-@param[in] mode lock mode check
-@retval true if owns all of them
-@retval false if does not own some of them */
-static inline bool btr_search_own_all(ulint mode)
-{
- for (ulint i = 0; i < btr_ahi_parts; ++i) {
- if (!rw_lock_own(&btr_search_sys.parts[i].latch, mode)) {
- return(false);
- }
- }
- return(true);
-}
-
-/** Check if thread owns any of the search latches.
-@param[in] mode lock mode check
-@retval true if owns any of them
-@retval false if owns no search latch */
-static inline bool btr_search_own_any(ulint mode)
-{
- for (ulint i = 0; i < btr_ahi_parts; ++i) {
- if (rw_lock_own(&btr_search_sys.parts[i].latch, mode)) {
- return(true);
- }
- }
- return(false);
-}
-
-/** @return whether this thread holds any of the search latches */
-static inline bool btr_search_own_any()
-{
- for (ulint i = btr_ahi_parts; i--; ) {
- if (rw_lock_own_flagged(&btr_search_sys.parts[i].latch,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)) {
- return true;
- }
+ btr_search_sys.parts[i].latch.rd_unlock();
}
- return false;
}
-#endif /* UNIV_DEBUG */
#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
index 83c374e2561..fc829e7857a 100644
--- a/storage/innobase/include/btr0types.h
+++ b/storage/innobase/include/btr0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2019, MariaDB Corporation.
+Copyright (c) 2018, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,8 +24,7 @@ The index tree general types
Created 2/17/1996 Heikki Tuuri
*************************************************************************/
-#ifndef btr0types_h
-#define btr0types_h
+#pragma once
#include "page0types.h"
#include "rem0types.h"
@@ -56,4 +55,100 @@ in the index record. */
#define BTR_EXTERN_LOCAL_STORED_MAX_SIZE \
(BTR_EXTERN_FIELD_REF_SIZE * 2)
-#endif
+/** Latching modes for btr_cur_t::search_leaf(). */
+enum btr_latch_mode {
+ /** Search a record on a leaf page and S-latch it. */
+ BTR_SEARCH_LEAF = RW_S_LATCH,
+ /** (Prepare to) modify a record on a leaf page and X-latch it. */
+ BTR_MODIFY_LEAF = RW_X_LATCH,
+ /** U-latch root and X-latch a leaf page */
+ BTR_MODIFY_ROOT_AND_LEAF = RW_SX_LATCH,
+ /** Obtain no latches. */
+ BTR_NO_LATCHES = RW_NO_LATCH,
+ /** Search the previous record.
+ Used in btr_pcur_move_backward_from_page(). */
+ BTR_SEARCH_PREV = 4 | BTR_SEARCH_LEAF,
+ /** Modify the previous record.
+ Used in btr_pcur_move_backward_from_page() and ibuf_insert(). */
+ BTR_MODIFY_PREV = 4 | BTR_MODIFY_LEAF,
+ /** Start modifying the entire B-tree. */
+ BTR_MODIFY_TREE = 8 | BTR_MODIFY_LEAF,
+ /** Continue modifying the entire R-tree.
+ Only used by rtr_search_to_nth_level(). */
+ BTR_CONT_MODIFY_TREE = 4 | BTR_MODIFY_TREE,
+
+ /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
+ exclusive. */
+ /** The search tuple will be inserted to the secondary index
+ at the searched position. When the leaf page is not in the
+ buffer pool, try to use the change buffer. */
+ BTR_INSERT = 64,
+
+ /** Try to delete mark a secondary index leaf page record at
+ the searched position using the change buffer when the page is
+ not in the buffer pool. */
+ BTR_DELETE_MARK = 128,
+
+ /** Try to purge the record using the change buffer when the
+ secondary index leaf page is not in the buffer pool. */
+ BTR_DELETE = BTR_INSERT | BTR_DELETE_MARK,
+
+ /** The caller is already holding dict_index_t::lock S-latch. */
+ BTR_ALREADY_S_LATCHED = 256,
+ /** Search and S-latch a leaf page, assuming that the
+ dict_index_t::lock S-latch is being held. */
+ BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Search and X-latch a leaf page, assuming that the
+ dict_index_t::lock is being held in non-exclusive mode. */
+ BTR_MODIFY_LEAF_ALREADY_LATCHED = BTR_MODIFY_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Attempt to modify records in an x-latched tree. */
+ BTR_MODIFY_TREE_ALREADY_LATCHED = BTR_MODIFY_TREE
+ | BTR_ALREADY_S_LATCHED,
+ /** U-latch root and X-latch a leaf page, assuming that
+ dict_index_t::lock is being held in U mode. */
+ BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF
+ | BTR_ALREADY_S_LATCHED,
+
+ /** Attempt to delete-mark a secondary index record. */
+ BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
+ /** Attempt to delete-mark a secondary index record
+ while holding the dict_index_t::lock S-latch. */
+ BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Attempt to purge a secondary index record. */
+ BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
+ /** Attempt to purge a secondary index record
+ while holding the dict_index_t::lock S-latch. */
+ BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
+ | BTR_ALREADY_S_LATCHED,
+
+ /** In the case of BTR_MODIFY_TREE, the caller specifies
+ the intention to delete record only. It is used to optimize
+ block->lock range.*/
+ BTR_LATCH_FOR_DELETE = 512,
+
+  /** In the case of BTR_MODIFY_TREE, the caller specifies
+  the intention to insert record only. It is used to optimize
+  block->lock range.*/
+ BTR_LATCH_FOR_INSERT = 1024,
+
+ /** Attempt to delete a record in the tree. */
+ BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ /** Attempt to delete a record in an x-latched tree. */
+ BTR_PURGE_TREE_ALREADY_LATCHED = BTR_PURGE_TREE
+ | BTR_ALREADY_S_LATCHED,
+
+ /** Attempt to insert a record into the tree. */
+ BTR_INSERT_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
+
+ /** This flag ORed to BTR_INSERT says that we can ignore possible
+ UNIQUE definition on secondary indexes when we decide if we can use
+ the insert buffer to speed up inserts */
+ BTR_IGNORE_SEC_UNIQUE = 2048,
+ /** Rollback in spatial index */
+ BTR_RTREE_UNDO_INS = 4096,
+ /** Try to delete mark a spatial index record */
+ BTR_RTREE_DELETE_MARK = 8192
+};
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
index ee48e7ce6d2..d4fee7c1e99 100644
--- a/storage/innobase/include/buf0block_hint.h
+++ b/storage/innobase/include/buf0block_hint.h
@@ -56,7 +56,7 @@ public:
buf_block_t *block= m_block;
bool res= f(block);
if (block)
- buf_block_buf_fix_dec(block);
+ block->page.unfix();
return res;
}
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index a84ea047a54..2b4732a64a0 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -24,40 +24,30 @@ The database buffer pool high-level routines
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0buf_h
-#define buf0buf_h
+#pragma once
/** Magic value to use instead of checksums when they are disabled */
#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
#include "fil0fil.h"
#include "mtr0types.h"
-#include "buf0types.h"
#include "span.h"
#include "assume_aligned.h"
+#include "buf0types.h"
#ifndef UNIV_INNOCHECKSUM
-#include "hash0hash.h"
#include "ut0byte.h"
#include "page0types.h"
#include "log0log.h"
#include "srv0srv.h"
+#include "transactional_lock_guard.h"
#include <ostream>
-// Forward declaration
-struct fil_addr_t;
-
/** @name Modes for buf_page_get_gen */
/* @{ */
#define BUF_GET 10 /*!< get always */
#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make
the block young in the LRU list */
-#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but
- set no latch; we have
- separated this case, because
- it is error-prone programming
- not to set a latch, and it
- should be used with care */
#define BUF_GET_IF_IN_POOL_OR_WATCH 15
/*!< Get the page only if it's in the
buffer pool, if not then set a watch
@@ -65,7 +55,6 @@ struct fil_addr_t;
#define BUF_GET_POSSIBLY_FREED 16
/*!< Like BUF_GET, but do not mind
if the file page has been freed. */
-#define BUF_EVICT_IF_IN_POOL 20 /*!< evict a clean block if found */
/* @} */
/** If LRU list of a buf_pool is less than this size then LRU eviction
@@ -74,22 +63,6 @@ the blocks on free list. If LRU list is very small then we can end up
in thrashing. */
#define BUF_LRU_MIN_LEN 256
-/** buf_page_t::state() values, distinguishing buf_page_t and buf_block_t */
-enum buf_page_state
-{
- /** available in buf_pool.free or buf_pool.watch */
- BUF_BLOCK_NOT_USED,
- /** allocated for something else than a file page */
- BUF_BLOCK_MEMORY,
- /** a previously allocated file page, in transit to NOT_USED */
- BUF_BLOCK_REMOVE_HASH,
- /** a buf_block_t that is also in buf_pool.LRU */
- BUF_BLOCK_FILE_PAGE,
- /** the buf_page_t of a ROW_FORMAT=COMPRESSED page
- whose uncompressed page frame has been evicted */
- BUF_BLOCK_ZIP_PAGE
-};
-
/** This structure defines information we will fetch from each buffer pool. It
will be used to print table IO stats */
struct buf_pool_info_t
@@ -170,33 +143,10 @@ operator<<(
const page_id_t page_id);
#ifndef UNIV_INNOCHECKSUM
-/*********************************************************************//**
-Gets the current size of buffer buf_pool in bytes.
-@return size in bytes */
-UNIV_INLINE
-ulint
-buf_pool_get_curr_size(void);
-/*========================*/
-
-/********************************************************************//**
-Allocates a buf_page_t descriptor. This function must succeed. In case
-of failure we assert in this function. */
-UNIV_INLINE
-buf_page_t*
-buf_page_alloc_descriptor(void)
-/*===========================*/
- MY_ATTRIBUTE((malloc));
-/********************************************************************//**
-Free a buf_page_t descriptor. */
-UNIV_INLINE
-void
-buf_page_free_descriptor(
-/*=====================*/
- buf_page_t* bpage) /*!< in: bpage descriptor to free. */
- MY_ATTRIBUTE((nonnull));
+# define buf_pool_get_curr_size() srv_buf_pool_curr_size
/** Allocate a buffer block.
-@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+@return own: the allocated block, state()==MEMORY */
inline buf_block_t *buf_block_alloc();
/********************************************************************//**
Frees a buffer block which does not contain a file page. */
@@ -206,71 +156,37 @@ buf_block_free(
/*===========*/
buf_block_t* block); /*!< in, own: block to be freed */
-/**************************************************************//**
-NOTE! The following macros should be used instead of buf_page_get_gen,
-to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
-in LA! */
#define buf_page_get(ID, SIZE, LA, MTR) \
- buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, __FILE__, __LINE__, MTR)
-
-/**************************************************************//**
-Use these macros to bufferfix a page with no latching. Remember not to
-read the contents of the page unless you know it is safe. Do not modify
-the contents of the page! We have separated this case, because it is
-error-prone programming not to set a latch, and it should be used
-with care. */
-#define buf_page_get_with_no_latch(ID, SIZE, MTR) \
- buf_page_get_gen(ID, SIZE, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, \
- __FILE__, __LINE__, MTR)
-/********************************************************************//**
-This is the general function used to get optimistic access to a database
-page.
-@return TRUE if success */
-ibool
-buf_page_optimistic_get(
-/*====================*/
- ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
- buf_block_t* block, /*!< in: guessed block */
- ib_uint64_t modify_clock,/*!< in: modify clock value */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mini-transaction */
-
-/** Given a tablespace id and page number tries to get that page. If the
-page is not in the buffer pool it is not loaded and NULL is returned.
-Suitable for using when holding the lock_sys_t::mutex.
-@param[in] page_id page id
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
-@return pointer to a page or NULL */
-buf_block_t*
-buf_page_try_get_func(
- const page_id_t page_id,
- const char* file,
- unsigned line,
- mtr_t* mtr);
-
-/** Tries to get a page.
-If the page is not in the buffer pool it is not loaded. Suitable for using
-when holding the lock_sys_t::mutex.
+ buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR)
+
+/** Try to acquire a page latch.
+@param rw_latch RW_S_LATCH or RW_X_LATCH
+@param block guessed block
+@param modify_clock expected value of block->modify_clock
+@param mtr mini-transaction
+@return whether the latch was acquired (the page is an allocated file page) */
+bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
+ uint64_t modify_clock, mtr_t *mtr);
+
+/** Try to S-latch a page.
+Suitable for using when holding the lock_sys latches (as it avoids deadlock).
@param[in] page_id page identifier
-@param[in] mtr mini-transaction
-@return the page if in buffer pool, NULL if not */
-#define buf_page_try_get(page_id, mtr) \
- buf_page_try_get_func((page_id), __FILE__, __LINE__, mtr);
+@param[in,out] mtr mini-transaction
+@return the block
+@retval nullptr if an S-latch cannot be granted immediately */
+buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr);
/** Get read access to a compressed page (usually of type
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
-The page must be released with buf_page_release_zip().
+The page must be released with unfix().
NOTE: the page is not protected by any latch. Mutual exclusion has to
be implemented at a higher level. In other words, all possible
accesses to a given page through this function must be protected by
the same set of mutexes or latches.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size
-@return pointer to the block */
-buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size);
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size in bytes
+@return pointer to the block, s-latched */
+buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size);
/** Get access to a database page. Buffered redo log may be applied.
@param[in] page_id page id
@@ -278,10 +194,8 @@ buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size);
@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in] guess guessed block or NULL
@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out] mtr mini-transaction
@param[out] err DB_SUCCESS or error code
@param[in] allow_ibuf_merge Allow change buffer merge while
reading the pages from file.
@@ -293,11 +207,10 @@ buf_page_get_gen(
ulint rw_latch,
buf_block_t* guess,
ulint mode,
- const char* file,
- unsigned line,
mtr_t* mtr,
dberr_t* err = NULL,
- bool allow_ibuf_merge = false);
+ bool allow_ibuf_merge = false)
+ MY_ATTRIBUTE((nonnull(6), warn_unused_result));
/** This is the low level function used to get access to a database page.
@param[in] page_id page id
@@ -305,10 +218,9 @@ buf_page_get_gen(
@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in] guess guessed block or NULL
@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out] mtr mini-transaction, or NULL if a
+ block with page_id is to be evicted
@param[out] err DB_SUCCESS or error code
@param[in] allow_ibuf_merge Allow change buffer merge to happen
while reading the page from file
@@ -322,16 +234,14 @@ buf_page_get_low(
ulint rw_latch,
buf_block_t* guess,
ulint mode,
- const char* file,
- unsigned line,
mtr_t* mtr,
dberr_t* err,
bool allow_ibuf_merge);
/** Initialize a page in the buffer pool. The page is usually not read
from a file even if it cannot be found in the buffer buf_pool. This is one
-of the functions which perform to a block a state transition NOT_USED =>
-FILE_PAGE (the other is buf_page_get_gen).
+of the functions which perform a state transition on a block, NOT_USED => LRU
+(the other is buf_page_get_low()).
@param[in,out] space space object
@param[in] offset offset of the tablespace
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@@ -342,52 +252,24 @@ buf_block_t*
buf_page_create(fil_space_t *space, uint32_t offset,
ulint zip_size, mtr_t *mtr, buf_block_t *free_block);
-/********************************************************************//**
-Releases a compressed-only page acquired with buf_page_get_zip(). */
-UNIV_INLINE
-void
-buf_page_release_zip(
-/*=================*/
- buf_page_t* bpage); /*!< in: buffer block */
-/********************************************************************//**
-Releases a latch, if specified. */
-UNIV_INLINE
-void
-buf_page_release_latch(
-/*=====================*/
- buf_block_t* block, /*!< in: buffer block */
- ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH,
- RW_NO_LATCH */
+/** Initialize a page in buffer pool while initializing the
+deferred tablespace
+@param space_id space identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size or 0
+@param mtr mini-transaction
+@param free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create_deferred(uint32_t space_id, ulint zip_size, mtr_t *mtr,
+ buf_block_t *free_block);
+
/** Move a block to the start of the LRU list. */
void buf_page_make_young(buf_page_t *bpage);
-/** Mark the page status as FREED for the given tablespace id and
-page number. If the page is not in buffer pool then ignore it.
+/** Mark the page status as FREED for the given tablespace and page number.
@param[in,out] space tablespace
@param[in] page page number
-@param[in,out] mtr mini-transaction
-@param[in] file file name
-@param[in] line line where called */
-void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
- const char *file, unsigned line);
-
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_page_get_freed_page_clock(
-/*==========================*/
- const buf_page_t* bpage) /*!< in: block */
- MY_ATTRIBUTE((warn_unused_result));
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_block_get_freed_page_clock(
-/*===========================*/
- const buf_block_t* block) /*!< in: block */
- MY_ATTRIBUTE((warn_unused_result));
+@param[in,out] mtr mini-transaction */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr);
/** Determine if a block is still close enough to the MRU end of the LRU list
meaning that it is not in danger of getting evicted and also implying
@@ -431,32 +313,6 @@ ib_uint64_t
buf_block_get_modify_clock(
/*=======================*/
buf_block_t* block); /*!< in: block */
-/*******************************************************************//**
-Increments the bufferfix count. */
-UNIV_INLINE
-void
-buf_block_buf_fix_inc_func(
-/*=======================*/
-# ifdef UNIV_DEBUG
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line */
-# endif /* UNIV_DEBUG */
- buf_block_t* block) /*!< in/out: block to bufferfix */
- MY_ATTRIBUTE((nonnull));
-
-# ifdef UNIV_DEBUG
-/** Increments the bufferfix count.
-@param[in,out] b block to bufferfix
-@param[in] f file name where requested
-@param[in] l line number where requested */
-# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
-# else /* UNIV_DEBUG */
-/** Increments the bufferfix count.
-@param[in,out] b block to bufferfix
-@param[in] f file name where requested
-@param[in] l line number where requested */
-# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
-# endif /* UNIV_DEBUG */
#endif /* !UNIV_INNOCHECKSUM */
/** Check if a buffer is all zeroes.
@@ -464,42 +320,6 @@ buf_block_buf_fix_inc_func(
@return whether the buffer is all zeroes */
bool buf_is_zeroes(st_::span<const byte> buf);
-/** Checks if the page is in crc32 checksum format.
-@param[in] read_buf database page
-@param[in] checksum_field1 new checksum field
-@param[in] checksum_field2 old checksum field
-@return true if the page is in crc32 checksum format. */
-bool
-buf_page_is_checksum_valid_crc32(
- const byte* read_buf,
- ulint checksum_field1,
- ulint checksum_field2)
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-
-/** Checks if the page is in innodb checksum format.
-@param[in] read_buf database page
-@param[in] checksum_field1 new checksum field
-@param[in] checksum_field2 old checksum field
-@return true if the page is in innodb checksum format. */
-bool
-buf_page_is_checksum_valid_innodb(
- const byte* read_buf,
- ulint checksum_field1,
- ulint checksum_field2)
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-
-/** Checks if the page is in none checksum format.
-@param[in] read_buf database page
-@param[in] checksum_field1 new checksum field
-@param[in] checksum_field2 old checksum field
-@return true if the page is in none checksum format. */
-bool
-buf_page_is_checksum_valid_none(
- const byte* read_buf,
- ulint checksum_field1,
- ulint checksum_field2)
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-
/** Check if a page is corrupt.
@param[in] check_lsn whether the LSN should be checked
@param[in] read_buf database page
@@ -512,27 +332,6 @@ buf_page_is_corrupted(
ulint fsp_flags)
MY_ATTRIBUTE((warn_unused_result));
-inline void *aligned_malloc(size_t size, size_t align)
-{
-#ifdef _MSC_VER
- return _aligned_malloc(size, align);
-#else
- void *result;
- if (posix_memalign(&result, align, size))
- result= NULL;
- return result;
-#endif
-}
-
-inline void aligned_free(void *ptr)
-{
-#ifdef _MSC_VER
- _aligned_free(ptr);
-#else
- free(ptr);
-#endif
-}
-
/** Read the key version from the page. In full crc32 format,
key version is stored at {0-3th} bytes. In other format, it is
stored in 26th position.
@@ -631,35 +430,7 @@ void buf_pool_invalidate();
--------------------------- LOWER LEVEL ROUTINES -------------------------
=========================================================================*/
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Adds latch level info for the rw-lock protecting the buffer frame. This
-should be called in the debug version after a successful latching of a
-page if we know the latching order level of the acquired latch. */
-UNIV_INLINE
-void
-buf_block_dbg_add_level(
-/*====================*/
- buf_block_t* block, /*!< in: buffer page
- where we have acquired latch */
- latch_level_t level); /*!< in: latching order level */
-#else /* UNIV_DEBUG */
-# define buf_block_dbg_add_level(block, level) /* nothing */
-#endif /* UNIV_DEBUG */
-
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Gets a pointer to the memory frame of a block.
-@return pointer to the frame */
-UNIV_INLINE
-buf_frame_t*
-buf_block_get_frame(
-/*================*/
- const buf_block_t* block) /*!< in: pointer to the control block */
- MY_ATTRIBUTE((warn_unused_result));
-#else /* UNIV_DEBUG */
-# define buf_block_get_frame(block) (block)->frame
-#endif /* UNIV_DEBUG */
+#define buf_block_get_frame(block) (block)->page.frame
/*********************************************************************//**
Gets the compressed page descriptor corresponding to an uncompressed page
@@ -672,18 +443,8 @@ if applicable. */
/** Monitor the buffer page read/write activity, and increment corresponding
counter value in MONITOR_MODULE_BUF_PAGE.
@param bpage buffer page whose read or write was completed
-@param io_type BUF_IO_READ or BUF_IO_WRITE */
-ATTRIBUTE_COLD __attribute__((nonnull))
-void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type);
-
-/** Complete a read request of a file page to buf_pool.
-@param bpage recently read page
-@param node data file
-@return whether the operation succeeded
-@retval DB_SUCCESS always when writing, or if a read page was OK
-@retval DB_PAGE_CORRUPTED if the checksum fails on a page read
-@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */
-dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node);
+@param read true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read);
/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
if needed.
@@ -752,17 +513,17 @@ class buf_page_t
{
friend buf_pool_t;
friend buf_block_t;
+
/** @name General fields */
/* @{ */
public: // FIXME: fix fil_iterate()
- /** Page id. Protected by buf_pool.hash_lock_get(id) when
+ /** Page id. Protected by buf_pool.page_hash.lock_get() when
the page is in buf_pool.page_hash. */
page_id_t id_;
+ /** buf_pool.page_hash link; protected by buf_pool.page_hash.lock_get() */
+ buf_page_t *hash;
private:
- /** Count of how manyfold this block is currently bufferfixed. */
- Atomic_counter<uint32_t> buf_fix_count_;
-
/** log sequence number of the START of the log entry written of the
oldest modification to this block which has not yet been written
to the data file;
@@ -773,53 +534,64 @@ private:
(because id().space() is the temporary tablespace). */
Atomic_relaxed<lsn_t> oldest_modification_;
- /** type of pending I/O operation; protected by buf_pool.mutex
- if in_LRU_list */
- Atomic_relaxed<buf_io_fix> io_fix_;
- /** Block state. @see in_file().
- State transitions between in_file() states and to
- BUF_BLOCK_REMOVE_HASH are protected by buf_pool.hash_lock_get(id)
- when the block is in buf_pool.page_hash.
- Other transitions when in_LRU_list are protected by buf_pool.mutex. */
- buf_page_state state_;
-
public:
- /** buf_pool.page_hash link; protected by buf_pool.hash_lock_get(id) */
- buf_page_t *hash;
+ /** state() of unused block (in buf_pool.free list) */
+ static constexpr uint32_t NOT_USED= 0;
+ /** state() of block allocated as general-purpose memory */
+ static constexpr uint32_t MEMORY= 1;
+ /** state() of block that is being freed */
+ static constexpr uint32_t REMOVE_HASH= 2;
+ /** smallest state() of a buffer page that is freed in the tablespace */
+ static constexpr uint32_t FREED= 3;
+ /** smallest state() for a block that belongs to buf_pool.LRU */
+ static constexpr uint32_t UNFIXED= 1U << 29;
+ /** smallest state() of a block for which buffered changes may exist */
+ static constexpr uint32_t IBUF_EXIST= 2U << 29;
+ /** smallest state() of a (re)initialized page (no doublewrite needed) */
+ static constexpr uint32_t REINIT= 3U << 29;
+ /** smallest state() for an io-fixed block */
+ static constexpr uint32_t READ_FIX= 4U << 29;
+ /** smallest state() for a write-fixed block */
+ static constexpr uint32_t WRITE_FIX= 5U << 29;
+ /** smallest state() for a write-fixed block with buffered changes */
+ static constexpr uint32_t WRITE_FIX_IBUF= 6U << 29;
+ /** smallest state() for a write-fixed block (no doublewrite was used) */
+ static constexpr uint32_t WRITE_FIX_REINIT= 7U << 29;
+ /** buf_pool.LRU status mask in state() */
+ static constexpr uint32_t LRU_MASK= 7U << 29;
+
+ /** lock covering the contents of frame */
+ block_lock lock;
+ /** pointer to aligned, uncompressed page frame of innodb_page_size */
+ byte *frame;
/* @} */
- page_zip_des_t zip; /*!< compressed page; zip.data
- (but not the data it points to) is
- also protected by buf_pool.mutex;
- state == BUF_BLOCK_ZIP_PAGE and
- zip.data == NULL means an active
- buf_pool.watch */
-
- buf_tmp_buffer_t* slot; /*!< Slot for temporary memory
- used for encryption/compression
- or NULL */
+ /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to)
+ is also protected by buf_pool.mutex;
+ !frame && !zip.data means an active buf_pool.watch */
+ page_zip_des_t zip;
#ifdef UNIV_DEBUG
/** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
bool in_zip_hash;
- /** whether this->LRU is in buf_pool.LRU (in_file() holds);
+ /** whether this->LRU is in buf_pool.LRU (in_file());
protected by buf_pool.mutex */
bool in_LRU_list;
- /** whether this is in buf_pool.page_hash (in_file() holds);
+ /** whether this is in buf_pool.page_hash (in_file());
protected by buf_pool.mutex */
bool in_page_hash;
- /** whether this->list is in buf_pool.free (state() == BUF_BLOCK_NOT_USED);
+ /** whether this->list is in buf_pool.free (state() == NOT_USED);
protected by buf_pool.flush_list_mutex */
bool in_free_list;
#endif /* UNIV_DEBUG */
/** list member in one of the lists of buf_pool; protected by
buf_pool.mutex or buf_pool.flush_list_mutex
- state() == BUF_BLOCK_NOT_USED: buf_pool.free or buf_pool.withdraw
+ state() == NOT_USED: buf_pool.free or buf_pool.withdraw
in_file() && oldest_modification():
buf_pool.flush_list (protected by buf_pool.flush_list_mutex)
The contents is undefined if in_file() && !oldest_modification(),
- or if state() is BUF_BLOCK_MEMORY or BUF_BLOCK_REMOVE_HASH. */
+ or if state() == MEMORY or state() == REMOVE_HASH. */
UT_LIST_NODE_T(buf_page_t) list;
/** @name LRU replacement algorithm fields.
@@ -843,7 +615,7 @@ public:
0 if the block was never accessed
in the buffer pool.
- For state==BUF_BLOCK_MEMORY
+ For state() == MEMORY
blocks, this field can be repurposed
for something else.
@@ -851,89 +623,127 @@ public:
and bytes allocated for recv_sys.pages,
the field is protected by
recv_sys_t::mutex. */
- /** Change buffer entries for the page exist.
- Protected by io_fix()==BUF_IO_READ or by buf_block_t::lock. */
- bool ibuf_exist;
-
- /** Block initialization status. Can be modified while holding io_fix()
- or buf_block_t::lock X-latch */
- enum {
- /** the page was read normally and should be flushed normally */
- NORMAL = 0,
- /** the page was (re)initialized, and the doublewrite buffer can be
- skipped on the next flush */
- INIT_ON_FLUSH,
- /** the page was freed and need to be flushed.
- For page_compressed, page flush will punch a hole to free space.
- Else if innodb_immediate_scrub_data_uncompressed, the page will
- be overwritten with zeroes. */
- FREED
- } status;
-
- buf_page_t() : id_(0)
+ buf_page_t() : id_{0}
{
- static_assert(BUF_BLOCK_NOT_USED == 0, "compatibility");
+ static_assert(NOT_USED == 0, "compatibility");
memset((void*) this, 0, sizeof *this);
}
- /** Initialize some fields */
- void init()
+ buf_page_t(const buf_page_t &b) :
+ id_(b.id_), hash(b.hash),
+ oldest_modification_(b.oldest_modification_),
+ lock() /* not copied */,
+ frame(b.frame), zip(b.zip),
+#ifdef UNIV_DEBUG
+ in_zip_hash(b.in_zip_hash), in_LRU_list(b.in_LRU_list),
+ in_page_hash(b.in_page_hash), in_free_list(b.in_free_list),
+#endif /* UNIV_DEBUG */
+ list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock),
+ access_time(b.access_time)
{
- io_fix_= BUF_IO_NONE;
- buf_fix_count_= 0;
- old= 0;
- freed_page_clock= 0;
- access_time= 0;
+ lock.init();
+ }
+
+ /** Initialize some more fields */
+ void init(uint32_t state, page_id_t id)
+ {
+ ut_ad(state < REMOVE_HASH || state >= UNFIXED);
+ id_= id;
+ zip.fix= state;
oldest_modification_= 0;
- slot= nullptr;
- ibuf_exist= false;
- status= NORMAL;
+ lock.init();
ut_d(in_zip_hash= false);
ut_d(in_free_list= false);
ut_d(in_LRU_list= false);
ut_d(in_page_hash= false);
- HASH_INVALIDATE(this, hash);
+ old= 0;
+ freed_page_clock= 0;
+ access_time= 0;
}
- /** Initialize some more fields */
- void init(buf_page_state state, page_id_t id, uint32_t buf_fix_count= 0)
+public:
+ const page_id_t &id() const { return id_; }
+ uint32_t state() const { return zip.fix; }
+ uint32_t buf_fix_count() const
{
- init();
- state_= state;
- id_= id;
- buf_fix_count_= buf_fix_count;
+ uint32_t f= state();
+ ut_ad(f >= FREED);
+ return f < UNFIXED ? (f - FREED) : (~LRU_MASK & f);
}
+ /** @return whether this block is read or write fixed;
+ read_complete() or write_complete() will always release
+ the io-fix before releasing U-lock or X-lock */
+ bool is_io_fixed() const
+ { const auto s= state(); ut_ad(s >= FREED); return s >= READ_FIX; }
+ /** @return whether this block is write fixed;
+ write_complete() will always release the write-fix before releasing U-lock */
+ bool is_write_fixed() const { return state() >= WRITE_FIX; }
+ /** @return whether this block is read fixed; this should never hold
+ when a thread is holding the block lock in any mode */
+ bool is_read_fixed() const { return is_io_fixed() && !is_write_fixed(); }
- /** Initialize some more fields */
- void init(page_id_t id, uint32_t buf_fix_count= 0)
+ /** @return if this belongs to buf_pool.unzip_LRU */
+ bool belongs_to_unzip_LRU() const
+ { return UNIV_LIKELY_NULL(zip.data) && frame; }
+
+ bool is_freed() const
+ { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; }
+ bool is_ibuf_exist() const
{
- init();
- id_= id;
- buf_fix_count_= buf_fix_count;
+ const auto s= state();
+ ut_ad(s >= UNFIXED);
+ ut_ad(s < READ_FIX);
+ return (s & LRU_MASK) == IBUF_EXIST;
}
+ bool is_reinit() const { return !(~state() & REINIT); }
-public:
- const page_id_t &id() const { return id_; }
- buf_page_state state() const { return state_; }
- uint32_t buf_fix_count() const { return buf_fix_count_; }
- buf_io_fix io_fix() const { return io_fix_; }
- void io_unfix()
+ void set_reinit(uint32_t prev_state)
{
- ut_d(const auto old_io_fix= io_fix());
- ut_ad(old_io_fix == BUF_IO_READ || old_io_fix == BUF_IO_PIN);
- io_fix_= BUF_IO_NONE;
+ ut_ad(prev_state < READ_FIX);
+ ut_d(const auto s=) zip.fix.fetch_add(REINIT - prev_state);
+ ut_ad(s > prev_state);
+ ut_ad(s < prev_state + UNFIXED);
}
- /** @return if this belongs to buf_pool.unzip_LRU */
- bool belongs_to_unzip_LRU() const
+ void set_ibuf_exist()
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+ const auto s= state();
+ ut_ad(s >= UNFIXED);
+ ut_ad(s < READ_FIX);
+ ut_ad(s < IBUF_EXIST || s >= REINIT);
+ zip.fix.fetch_add(IBUF_EXIST - (LRU_MASK & s));
+ }
+ void clear_ibuf_exist()
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+ ut_d(const auto s=) zip.fix.fetch_sub(IBUF_EXIST - UNFIXED);
+ ut_ad(s >= IBUF_EXIST);
+ ut_ad(s < REINIT);
+ }
+
+ uint32_t read_unfix(uint32_t s)
{
- return zip.data && state() != BUF_BLOCK_ZIP_PAGE;
+ ut_ad(lock.is_write_locked());
+ ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1);
+ uint32_t old_state= zip.fix.fetch_add(s - READ_FIX);
+ ut_ad(old_state >= READ_FIX);
+ ut_ad(old_state < WRITE_FIX);
+ return old_state + (s - READ_FIX);
}
- inline void add_buf_fix_count(uint32_t count);
- inline void set_buf_fix_count(uint32_t count);
- inline void set_state(buf_page_state state);
- inline void set_io_fix(buf_io_fix io_fix);
+ void set_freed(uint32_t prev_state, uint32_t count= 0)
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(prev_state >= UNFIXED);
+ ut_ad(prev_state < READ_FIX);
+ ut_d(auto s=) zip.fix.fetch_sub((prev_state & LRU_MASK) - FREED - count);
+ ut_ad(!((prev_state ^ s) & LRU_MASK));
+ }
+
+ inline void set_state(uint32_t s);
inline void set_corrupt_id();
/** @return the log sequence number of the oldest pending modification
@@ -953,35 +763,72 @@ public:
inline void set_oldest_modification(lsn_t lsn);
/** Clear oldest_modification after removing from buf_pool.flush_list */
inline void clear_oldest_modification();
+ /** Reset the oldest_modification when marking a persistent page freed */
+ void reset_oldest_modification()
+ {
+ ut_ad(oldest_modification() > 2);
+ oldest_modification_.store(1, std::memory_order_release);
+ }
+
+ /** Complete a read of a page.
+ @param node data file
+ @return whether the operation succeeded
+ @retval DB_PAGE_CORRUPTED if the checksum fails
+ @retval DB_DECRYPTION_FAILED if the page cannot be decrypted
+ @retval DB_FAIL if the page contains the wrong ID */
+ dberr_t read_complete(const fil_node_t &node);
+
/** Note that a block is no longer dirty, while not removing
it from buf_pool.flush_list */
- inline void clear_oldest_modification(bool temporary);
+ inline void write_complete(bool temporary);
+
+ /** Write a flushable page to a file or free a freeable block.
+ @param evict whether to evict the page on write completion
+ @param space tablespace
+ @return whether a page write was initiated and buf_pool.mutex released */
+ bool flush(bool evict, fil_space_t *space);
/** Notify that a page in a temporary tablespace has been modified. */
void set_temp_modified()
{
ut_ad(fsp_is_system_temporary(id().space()));
- ut_ad(state() == BUF_BLOCK_FILE_PAGE);
- ut_ad(!oldest_modification());
+ ut_ad(in_file());
+ ut_ad((oldest_modification() | 2) == 2);
oldest_modification_= 2;
}
/** Prepare to release a file page to buf_pool.free. */
void free_file_page()
{
- ut_ad(state() == BUF_BLOCK_REMOVE_HASH);
+ ut_ad((zip.fix.fetch_sub(REMOVE_HASH - MEMORY)) == REMOVE_HASH);
/* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
ut_d(oldest_modification_= 0;)
- set_corrupt_id();
- ut_d(set_state(BUF_BLOCK_MEMORY));
+ id_= page_id_t(~0ULL);
+ }
+
+ void fix_on_recovery()
+ {
+ ut_d(const auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED - 1);
+ ut_ad(f >= READ_FIX);
+ ut_ad(f < WRITE_FIX);
+ }
+
+ uint32_t fix(uint32_t count= 1)
+ {
+ ut_ad(count);
+ ut_ad(count < IBUF_EXIST);
+ uint32_t f= zip.fix.fetch_add(count);
+ ut_ad(f >= FREED);
+ ut_ad(!((f ^ (f + 1)) & LRU_MASK));
+ return f;
}
- void fix() { buf_fix_count_++; }
uint32_t unfix()
{
- uint32_t count= buf_fix_count_--;
- ut_ad(count != 0);
- return count - 1;
+ uint32_t f= zip.fix.fetch_sub(1);
+ ut_ad(f > FREED);
+ ut_ad(!((f ^ (f - 1)) & LRU_MASK));
+ return f - 1;
}
/** @return the physical size, in bytes */
@@ -1007,27 +854,8 @@ public:
}
/** @return whether the block is mapped to a data file */
- bool in_file() const
- {
- switch (state_) {
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_FILE_PAGE:
- return true;
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- return false;
- }
+ bool in_file() const { return state() >= FREED; }
- ut_error;
- return false;
- }
-
- /** @return whether the block is modified and ready for flushing */
- inline bool ready_for_flush() const;
- /** @return whether the state can be changed to BUF_BLOCK_NOT_USED */
- bool ready_for_replace() const
- { return !oldest_modification() && can_relocate(); }
/** @return whether the block can be relocated in memory.
The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
inline bool can_relocate() const;
@@ -1059,27 +887,18 @@ struct buf_block_t{
be the first field, so that
buf_pool.page_hash can point
to buf_page_t or buf_block_t */
- byte* frame; /*!< pointer to buffer frame which
- is of size srv_page_size, and
- aligned to an address divisible by
- srv_page_size */
- rw_lock_t lock; /*!< read-write lock of the buffer
- frame */
#ifdef UNIV_DEBUG
/** whether page.list is in buf_pool.withdraw
- ((state() == BUF_BLOCK_NOT_USED)) and the buffer pool is being shrunk;
+ ((state() == NOT_USED)) and the buffer pool is being shrunk;
protected by buf_pool.mutex */
bool in_withdraw_list;
/** whether unzip_LRU is in buf_pool.unzip_LRU
- (state() == BUF_BLOCK_FILE_PAGE and zip.data != nullptr);
+ (in_file() && frame && zip.data);
protected by buf_pool.mutex */
bool in_unzip_LRU_list;
#endif
- UT_LIST_NODE_T(buf_block_t) unzip_LRU;
- /*!< node of the decompressed LRU list;
- a block is in the unzip_LRU list
- if page.state() == BUF_BLOCK_FILE_PAGE
- and page.zip.data != NULL */
+ /** member of buf_pool.unzip_LRU (if belongs_to_unzip_LRU()) */
+ UT_LIST_NODE_T(buf_block_t) unzip_LRU;
/* @} */
/** @name Optimistic search field */
/* @{ */
@@ -1118,17 +937,15 @@ struct buf_block_t{
These 5 fields may only be modified when:
we are holding the appropriate x-latch in btr_search_latches[], and
one of the following holds:
- (1) the block state is BUF_BLOCK_FILE_PAGE, and
- we are holding an s-latch or x-latch on buf_block_t::lock, or
- (2) buf_block_t::buf_fix_count == 0, or
- (3) the block state is BUF_BLOCK_REMOVE_HASH.
+ (1) in_file(), and we are holding lock in any mode, or
+ (2) !is_read_fixed()&&(state()>=UNFIXED||state()==REMOVE_HASH).
An exception to this is when we init or create a page
in the buffer pool in buf0buf.cc.
Another exception for buf_pool_t::clear_hash_index() is that
assigning block->index = NULL (and block->n_pointers = 0)
- is allowed whenever btr_search_own_all(RW_LOCK_X).
+ is allowed whenever all AHI latches are exclusively locked.
Another exception is that ha_insert_for_fold() may
decrement n_pointers without holding the appropriate latch
@@ -1137,8 +954,8 @@ struct buf_block_t{
This implies that the fields may be read without race
condition whenever any of the following hold:
- - the btr_search_latches[] s-latch or x-latch is being held, or
- - the block state is not BUF_BLOCK_FILE_PAGE or BUF_BLOCK_REMOVE_HASH,
+ - the btr_search_sys.partition[].latch is being held, or
+ - state() == NOT_USED || state() == MEMORY,
and holding some latch prevents the state from changing to that.
Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
@@ -1152,9 +969,7 @@ struct buf_block_t{
Atomic_counter<ulint>
n_pointers; /*!< used in debugging: the number of
pointers in the adaptive hash index
- pointing to this frame;
- protected by atomic memory access
- or btr_search_own_all(). */
+ pointing to this frame */
# define assert_block_ahi_empty(block) \
ut_a((block)->n_pointers == 0)
# define assert_block_ahi_empty_on_init(block) do { \
@@ -1188,24 +1003,8 @@ struct buf_block_t{
# define assert_block_ahi_empty_on_init(block) /* nothing */
# define assert_block_ahi_valid(block) /* nothing */
#endif /* BTR_CUR_HASH_ADAPT */
-# ifdef UNIV_DEBUG
- /** @name Debug fields */
- /* @{ */
- rw_lock_t* debug_latch; /*!< in the debug version, each thread
- which bufferfixes the block acquires
- an s-latch here; so we can use the
- debug utilities in sync0rw */
- /* @} */
-# endif
void fix() { page.fix(); }
- uint32_t unfix()
- {
- ut_ad(page.buf_fix_count() || page.io_fix() != BUF_IO_NONE ||
- page.state() == BUF_BLOCK_ZIP_PAGE ||
- !rw_lock_own_flagged(&lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S |
- RW_LOCK_FLAG_SX));
- return page.unfix();
- }
+ uint32_t unfix() { return page.unfix(); }
/** @return the physical size, in bytes */
ulint physical_size() const { return page.physical_size(); }
@@ -1217,22 +1016,22 @@ struct buf_block_t{
/** Initialize the block.
@param page_id page identifier
@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
- @param fix initial buf_fix_count() */
- void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0);
+ @param state initial state() */
+ void initialise(const page_id_t page_id, ulint zip_size, uint32_t state);
};
/**********************************************************************//**
Compute the hash fold value for blocks in buf_pool.zip_hash. */
/* @{ */
#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
-#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->page.frame)
#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
/* @} */
-/** A "Hazard Pointer" class used to iterate over page lists
-inside the buffer pool. A hazard pointer is a buf_page_t pointer
+/** A "Hazard Pointer" class used to iterate over buf_pool.LRU or
+buf_pool.flush_list. A hazard pointer is a buf_page_t pointer
which we intend to iterate over next and we want it remain valid
-even after we release the buffer pool mutex. */
+even after we release the mutex that protects the list. */
class HazardPointer
{
public:
@@ -1347,14 +1146,18 @@ struct buf_buddy_free_t {
/*!< Node of zip_free list */
};
-/** @brief The buffer pool statistics structure. */
+/** @brief The buffer pool statistics structure;
+protected by buf_pool.mutex unless otherwise noted. */
struct buf_pool_stat_t{
- ulint n_page_gets; /*!< number of page gets performed;
+ /** Initialize the counters */
+ void init() { memset((void*) this, 0, sizeof *this); }
+
+ ib_counter_t<ulint, ib_counter_element_t> n_page_gets;
+ /*!< number of page gets performed;
also successful searches through
the adaptive hash index are
- counted as page gets; this field
- is NOT protected by the buffer
- pool mutex */
+ counted as page gets;
+ NOT protected by buf_pool.mutex */
ulint n_pages_read; /*!< number read operations */
ulint n_pages_written;/*!< number write operations */
ulint n_pages_created;/*!< number of pages created
@@ -1372,10 +1175,9 @@ struct buf_pool_stat_t{
young because the first access
was not long enough ago, in
buf_page_peek_if_too_old() */
- /** number of waits for eviction; writes protected by buf_pool.mutex */
+ /** number of waits for eviction */
ulint LRU_waits;
ulint LRU_bytes; /*!< LRU size in bytes */
- ulint flush_list_bytes;/*!< flush_list size in bytes */
};
/** Statistics of buddy blocks of a given size. */
@@ -1415,7 +1217,7 @@ class buf_pool_t
size_t mem_size() const { return mem_pfx.m_size; }
/** Register the chunk */
- void reg() { map_reg->emplace(map::value_type(blocks->frame, this)); }
+ void reg() { map_reg->emplace(map::value_type(blocks->page.frame, this)); }
/** Allocate a chunk of buffer frames.
@param bytes requested size
@@ -1442,7 +1244,14 @@ class buf_pool_t
inline const buf_block_t *not_freed() const;
#endif /* UNIV_DEBUG */
};
-
+public:
+ /** Hash cell chain in page_hash_table */
+ struct hash_chain
+ {
+ /** pointer to the first block */
+ buf_page_t *first;
+ };
+private:
/** Withdraw blocks from the buffer pool until meeting withdraw_target.
@return whether retry is needed */
inline bool withdraw_blocks();
@@ -1494,27 +1303,27 @@ public:
{
ut_ad(is_initialised());
size_t size= 0;
- for (auto j= n_chunks; j--; )
+ for (auto j= ut_min(n_chunks_new, n_chunks); j--; )
size+= chunks[j].size;
return size;
}
/** Determine whether a frame is intended to be withdrawn during resize().
- @param ptr pointer within a buf_block_t::frame
+ @param ptr pointer within a buf_page_t::frame
@return whether the frame will be withdrawn */
bool will_be_withdrawn(const byte *ptr) const
{
- ut_ad(curr_size < old_size);
+ ut_ad(n_chunks_new < n_chunks);
#ifdef SAFE_MUTEX
- if (resizing.load(std::memory_order_relaxed))
+ if (resize_in_progress())
mysql_mutex_assert_owner(&mutex);
#endif /* SAFE_MUTEX */
for (const chunk_t *chunk= chunks + n_chunks_new,
* const echunk= chunks + n_chunks;
chunk != echunk; chunk++)
- if (ptr >= chunk->blocks->frame &&
- ptr < (chunk->blocks + chunk->size - 1)->frame + srv_page_size)
+ if (ptr >= chunk->blocks->page.frame &&
+ ptr < (chunk->blocks + chunk->size - 1)->page.frame + srv_page_size)
return true;
return false;
}
@@ -1524,9 +1333,9 @@ public:
@return whether the frame will be withdrawn */
bool will_be_withdrawn(const buf_page_t &bpage) const
{
- ut_ad(curr_size < old_size);
+ ut_ad(n_chunks_new < n_chunks);
#ifdef SAFE_MUTEX
- if (resizing.load(std::memory_order_relaxed))
+ if (resize_in_progress())
mysql_mutex_assert_owner(&mutex);
#endif /* SAFE_MUTEX */
@@ -1540,8 +1349,9 @@ public:
}
/** Release and evict a corrupted page.
- @param bpage page that was being read */
- ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage);
+ @param bpage x-latched page that was found corrupted
+ @param state expected current state of the page */
+ ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state);
/** Release a memory block to the buffer pool. */
ATTRIBUTE_COLD void free_block(buf_block_t *block);
@@ -1576,9 +1386,6 @@ public:
inline buf_block_t *block_from_ahi(const byte *ptr) const;
#endif /* BTR_CUR_HASH_ADAPT */
- bool is_block_lock(const rw_lock_t *l) const
- { return is_block_field(static_cast<const void*>(l)); }
-
/**
@return the smallest oldest_modification lsn for any page
@retval empty_lsn if all modified persistent pages have been flushed */
@@ -1607,84 +1414,27 @@ public:
return is_block_field(reinterpret_cast<const void*>(block));
}
- /** Get the page_hash latch for a page */
- page_hash_latch *hash_lock_get(const page_id_t id) const
- {
- return page_hash.lock_get(id.fold());
- }
-
- /** Look up a block descriptor.
- @param id page identifier
- @param fold id.fold()
- @return block descriptor, possibly in watch[]
- @retval nullptr if not found*/
- buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold)
- {
- ut_ad(id.fold() == fold);
-#ifdef SAFE_MUTEX
- DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
- page_hash.lock_get(fold)->is_locked());
-#endif /* SAFE_MUTEX */
- buf_page_t *bpage;
- /* Look for the page in the hash table */
- HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage,
- ut_ad(bpage->in_page_hash), id == bpage->id());
- return bpage;
- }
-private:
- /** Look up a block descriptor.
- @tparam exclusive whether the latch is to be acquired exclusively
- @tparam watch whether to allow watch_is_sentinel()
- @param page_id page identifier
- @param fold page_id.fold()
- @param hash_lock pointer to the acquired latch (to be released by caller)
- @return pointer to the block
- @retval nullptr if no block was found; !lock || !*lock will also hold */
- template<bool exclusive,bool watch>
- buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
- page_hash_latch **hash_lock)
+public:
+ /** @return whether the buffer pool contains a page
+ @tparam allow_watch whether to allow watch_is_sentinel()
+ @param page_id page identifier
+ @param chain hash table chain for page_id.fold() */
+ template<bool allow_watch= false>
+ TRANSACTIONAL_INLINE
+ bool page_hash_contains(const page_id_t page_id, hash_chain &chain)
{
- ut_ad(hash_lock || !exclusive);
- page_hash_latch *latch= page_hash.lock<exclusive>(fold);
- buf_page_t *bpage= page_hash_get_low(page_id, fold);
- if (!bpage || watch_is_sentinel(*bpage))
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
+ buf_page_t *bpage= page_hash.get(page_id, chain);
+ if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)])
{
- latch->release<exclusive>();
- if (hash_lock)
- *hash_lock= nullptr;
- return watch ? bpage : nullptr;
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(!bpage->zip.data);
+ if (!allow_watch)
+ bpage= nullptr;
}
-
- ut_ad(bpage->in_file());
- ut_ad(page_id == bpage->id());
-
- if (hash_lock)
- *hash_lock= latch; /* to be released by the caller */
- else
- latch->release<exclusive>();
return bpage;
}
-public:
- /** Look up a block descriptor.
- @tparam exclusive whether the latch is to be acquired exclusively
- @param page_id page identifier
- @param fold page_id.fold()
- @param hash_lock pointer to the acquired latch (to be released by caller)
- @return pointer to the block
- @retval nullptr if no block was found; !lock || !*lock will also hold */
- template<bool exclusive>
- buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
- page_hash_latch **hash_lock)
- { return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); }
-
- /** @return whether the buffer pool contains a page
- @tparam watch whether to allow watch_is_sentinel()
- @param page_id page identifier */
- template<bool watch= false>
- bool page_hash_contains(const page_id_t page_id)
- {
- return page_hash_get_locked<false,watch>(page_id, page_id.fold(), nullptr);
- }
/** Determine if a block is a sentinel for a buffer pool watch.
@param bpage page descriptor
@@ -1693,17 +1443,12 @@ public:
{
#ifdef SAFE_MUTEX
DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
- hash_lock_get(bpage.id())->is_locked());
+ page_hash.lock_get(page_hash.cell_get(bpage.id().fold())).
+ is_locked());
#endif /* SAFE_MUTEX */
ut_ad(bpage.in_file());
-
- if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)])
- {
- ut_ad(bpage.state() != BUF_BLOCK_ZIP_PAGE || bpage.zip.data);
+ if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)])
return false;
- }
-
- ut_ad(bpage.state() == BUF_BLOCK_ZIP_PAGE);
ut_ad(!bpage.in_zip_hash);
ut_ad(!bpage.zip.data);
return true;
@@ -1713,44 +1458,55 @@ public:
This may only be called after !watch_set() and before invoking watch_unset().
@param id page identifier
@return whether the page was read to the buffer pool */
+ TRANSACTIONAL_INLINE
bool watch_occurred(const page_id_t id)
{
- const ulint fold= id.fold();
- page_hash_latch *hash_lock= page_hash.lock<false>(fold);
+ hash_chain &chain= page_hash.cell_get(id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
/* The page must exist because watch_set() increments buf_fix_count. */
- buf_page_t *bpage= page_hash_get_low(id, fold);
- const bool is_sentinel= watch_is_sentinel(*bpage);
- hash_lock->read_unlock();
- return !is_sentinel;
+ return !watch_is_sentinel(*page_hash.get(id, chain));
}
- /** Register a watch for a page identifier. The caller must hold an
- exclusive page hash latch. The *hash_lock may be released,
- relocated, and reacquired.
+ /** Register a watch for a page identifier.
@param id page identifier
- @param hash_lock exclusively held page_hash latch
- @return a buffer pool block corresponding to id
- @retval nullptr if the block was not present, and a watch was installed */
- inline buf_page_t *watch_set(const page_id_t id,
- page_hash_latch **hash_lock);
+ @param chain page_hash.cell_get(id.fold())
+ @return a buffer page corresponding to id
+ @retval nullptr if the block was not present in page_hash */
+ buf_page_t *watch_set(const page_id_t id, hash_chain &chain);
/** Stop watching whether a page has been read in.
watch_set(id) must have returned nullptr before.
- @param id page identifier */
- void watch_unset(const page_id_t id);
+ @param id page identifier
+ @param chain unlocked hash table chain */
+ void watch_unset(const page_id_t id, hash_chain &chain);
/** Remove the sentinel block for the watch before replacing it with a
real block. watch_unset() or watch_occurred() will notice
that the block has been replaced with the real block.
- @param watch sentinel */
- inline void watch_remove(buf_page_t *watch);
+ @param w sentinel
+ @param chain locked hash table chain
+ @return w->state() */
+ inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain);
/** @return whether less than 1/4 of the buffer pool is available */
+ TPOOL_SUPPRESS_TSAN
bool running_out() const
{
return !recv_recovery_is_on() &&
- UNIV_UNLIKELY(UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
- std::min(curr_size, old_size) / 4);
+ UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
+ n_chunks_new / 4 * chunks->size;
+ }
+
+ /** @return whether the buffer pool has run out */
+ TPOOL_SUPPRESS_TSAN
+ bool ran_out() const
+ { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); }
+
+ /** @return whether the buffer pool is shrinking */
+ inline bool is_shrinking() const
+ {
+ return n_chunks_new < n_chunks;
}
#ifdef UNIV_DEBUG
@@ -1783,18 +1539,11 @@ public:
static constexpr uint32_t READ_AHEAD_PAGES= 64;
/** Buffer pool mutex */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
- /** Number of pending LRU flush; protected by mutex. */
- ulint n_flush_LRU_;
- /** broadcast when n_flush_LRU reaches 0; protected by mutex */
- pthread_cond_t done_flush_LRU;
- /** Number of pending flush_list flush; protected by mutex */
- ulint n_flush_list_;
- /** broadcast when n_flush_list reaches 0; protected by mutex */
- pthread_cond_t done_flush_list;
-
- TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; }
- TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; }
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+ /** current statistics; protected by mutex */
+ buf_pool_stat_t stat;
+ /** old statistics; protected by mutex */
+ buf_pool_stat_t old_stat;
/** @name General fields */
/* @{ */
@@ -1809,30 +1558,35 @@ public:
ut_allocator<unsigned char> allocator; /*!< Allocator used for
allocating memory for the the "chunks"
member. */
- volatile ulint n_chunks; /*!< number of buffer pool chunks */
- volatile ulint n_chunks_new; /*!< new number of buffer pool chunks */
+ ulint n_chunks; /*!< number of buffer pool chunks */
+ ulint n_chunks_new; /*!< new number of buffer pool chunks.
+ both n_chunks{,new} are protected under
+ mutex */
chunk_t* chunks; /*!< buffer pool chunks */
chunk_t* chunks_old; /*!< old buffer pool chunks to be freed
after resizing buffer pool */
/** current pool size in pages */
Atomic_counter<ulint> curr_size;
- /** previous pool size in pages */
- Atomic_counter<ulint> old_size;
/** read-ahead request size in pages */
Atomic_counter<uint32_t> read_ahead_area;
- /** Hash table with singly-linked overflow lists. @see hash_table_t */
+ /** Hash table with singly-linked overflow lists */
struct page_hash_table
{
+ static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "less than 64 bytes");
+ static_assert(!(CPU_LEVEL1_DCACHE_LINESIZE & 63),
+ "not a multiple of 64 bytes");
+
/** Number of array[] elements per page_hash_latch.
Must be one less than a power of 2. */
- static constexpr size_t ELEMENTS_PER_LATCH= CPU_LEVEL1_DCACHE_LINESIZE /
- sizeof(void*) - 1;
+ static constexpr size_t ELEMENTS_PER_LATCH= 64 / sizeof(void*) - 1;
+ static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+ ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
/** number of payload elements in array[] */
Atomic_relaxed<ulint> n_cells;
/** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
- hash_cell_t *array;
+ hash_chain *array;
/** Create the hash table.
@param n the lower bound of n_cells */
@@ -1844,7 +1598,12 @@ public:
/** @return the index of an array element */
ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
/** @return raw array index converted to padded index */
- static ulint pad(ulint h) { return 1 + (h / ELEMENTS_PER_LATCH) + h; }
+ static ulint pad(ulint h)
+ {
+ ulint latches= h / ELEMENTS_PER_LATCH;
+ ulint empty_slots= latches * EMPTY_SLOTS_PER_LATCH;
+ return 1 + latches + empty_slots + h;
+ }
private:
/** @return the hash value before any ELEMENTS_PER_LATCH padding */
static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
@@ -1854,29 +1613,72 @@ public:
{
return pad(hash(fold, n_cells));
}
- /** Get a page_hash latch. */
- page_hash_latch *lock_get(ulint fold, ulint n) const
+ public:
+ /** @return the latch covering a hash table chain */
+ static page_hash_latch &lock_get(hash_chain &chain)
{
static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
"must be one less than a power of 2");
- return reinterpret_cast<page_hash_latch*>
- (&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]);
+ const size_t addr= reinterpret_cast<size_t>(&chain);
+ ut_ad(addr & (ELEMENTS_PER_LATCH * sizeof chain));
+ return *reinterpret_cast<page_hash_latch*>
+ (addr & ~(ELEMENTS_PER_LATCH * sizeof chain));
}
- public:
- /** Get a page_hash latch. */
- page_hash_latch *lock_get(ulint fold) const
- { return lock_get(fold, n_cells); }
-
- /** Acquire an array latch.
- @tparam exclusive whether the latch is to be acquired exclusively
- @param fold hash bucket key */
- template<bool exclusive> page_hash_latch *lock(ulint fold)
+
+ /** Get a hash table slot. */
+ hash_chain &cell_get(ulint fold) const
+ { return array[calc_hash(fold, n_cells)]; }
+
+ /** Append a block descriptor to a hash bucket chain. */
+ void append(hash_chain &chain, buf_page_t *bpage)
+ {
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(!bpage->hash);
+ ut_d(bpage->in_page_hash= true);
+ buf_page_t **prev= &chain.first;
+ while (*prev)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage;
+ }
+
+ /** Remove a block descriptor from a hash bucket chain. */
+ void remove(hash_chain &chain, buf_page_t *bpage)
{
- page_hash_latch *latch= lock_get(fold, n_cells);
- latch->acquire<exclusive>();
- return latch;
+ ut_ad(bpage->in_page_hash);
+ buf_page_t **prev= &chain.first;
+ while (*prev != bpage)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage->hash;
+ ut_d(bpage->in_page_hash= false);
+ bpage->hash= nullptr;
}
+ /** Replace a block descriptor with another. */
+ void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage)
+ {
+ ut_ad(old->in_page_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_d(old->in_page_hash= false);
+ ut_ad(bpage->hash == old->hash);
+ old->hash= nullptr;
+ buf_page_t **prev= &chain.first;
+ while (*prev != old)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage;
+ }
+
+ /** Look up a page in a hash bucket chain. */
+ inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const;
+
/** Exclusively aqcuire all latches */
inline void write_lock_all();
@@ -1891,8 +1693,6 @@ public:
/** map of block->frame to buf_block_t blocks that belong
to buf_buddy_alloc(); protected by buf_pool.mutex */
hash_table_t zip_hash;
- /** number of pending read operations */
- Atomic_counter<ulint> n_pend_reads;
Atomic_counter<ulint>
n_pend_unzip; /*!< number of pending decompressions */
@@ -1902,44 +1702,90 @@ public:
buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
/*!< Statistics of buddy system,
indexed by block size */
- buf_pool_stat_t stat; /*!< current statistics */
- buf_pool_stat_t old_stat; /*!< old statistics */
/* @} */
+ /** number of index page splits */
+ Atomic_counter<ulint> pages_split;
+
/** @name Page flushing algorithm fields */
/* @{ */
/** mutex protecting flush_list, buf_page_t::set_oldest_modification()
and buf_page_t::list pointers when !oldest_modification() */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
/** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
FlushHp flush_hp;
- /** modified blocks (a subset of LRU) */
+ /** flush_list size in bytes; protected by flush_list_mutex */
+ ulint flush_list_bytes;
+ /** possibly modified persistent pages (a subset of LRU);
+ os_aio_pending_writes() is approximately COUNT(is_write_fixed()) */
UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
private:
- /** whether the page cleaner needs wakeup from indefinite sleep */
- bool page_cleaner_is_idle;
+ static constexpr unsigned PAGE_CLEANER_IDLE= 1;
+ static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
+ static constexpr unsigned LRU_FLUSH= 4;
+
+ /** Number of pending LRU flush * LRU_FLUSH +
+ PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */
+ unsigned page_cleaner_status;
/** track server activity count for signaling idle flushing */
ulint last_activity_count;
public:
/** signalled to wake up the page_cleaner; protected by flush_list_mutex */
pthread_cond_t do_flush_list;
+ /** broadcast when !n_flush(); protected by flush_list_mutex */
+ pthread_cond_t done_flush_LRU;
+ /** broadcast when a batch completes; protected by flush_list_mutex */
+ pthread_cond_t done_flush_list;
+
+ /** @return number of pending LRU flush */
+ unsigned n_flush() const
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ return page_cleaner_status / LRU_FLUSH;
+ }
+
+ /** Increment the number of pending LRU flush */
+ inline void n_flush_inc();
+
+ /** Decrement the number of pending LRU flush */
+ inline void n_flush_dec();
+
+ /** @return whether flush_list flushing is active */
+ bool flush_list_active() const
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ return page_cleaner_status & FLUSH_LIST_ACTIVE;
+ }
+
+ void flush_list_set_active()
+ {
+ ut_ad(!flush_list_active());
+ page_cleaner_status+= FLUSH_LIST_ACTIVE;
+ }
+ void flush_list_set_inactive()
+ {
+ ut_ad(flush_list_active());
+ page_cleaner_status-= FLUSH_LIST_ACTIVE;
+ }
/** @return whether the page cleaner must sleep due to being idle */
bool page_cleaner_idle() const
{
mysql_mutex_assert_owner(&flush_list_mutex);
- return page_cleaner_is_idle;
+ return page_cleaner_status & PAGE_CLEANER_IDLE;
}
- /** Wake up the page cleaner if needed */
- void page_cleaner_wakeup();
+ /** Wake up the page cleaner if needed.
+ @param for_LRU whether to wake up for LRU eviction */
+ void page_cleaner_wakeup(bool for_LRU= false);
/** Register whether an explicit wakeup of the page cleaner is needed */
void page_cleaner_set_idle(bool deep_sleep)
{
mysql_mutex_assert_owner(&flush_list_mutex);
- page_cleaner_is_idle= deep_sleep;
+ page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) |
+ (PAGE_CLEANER_IDLE * deep_sleep);
}
/** Update server last activity count */
@@ -1949,9 +1795,6 @@ public:
last_activity_count= activity_count;
}
- // n_flush_LRU() + n_flush_list()
- // is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list
-
unsigned freed_page_clock;/*!< a sequence number used
to count the number of buffer
blocks removed from the end of
@@ -1961,16 +1804,10 @@ public:
to read this for heuristic
purposes without holding any
mutex or latch */
- bool try_LRU_scan; /*!< Cleared when an LRU
- scan for free block fails. This
- flag is used to avoid repeated
- scans of LRU list when we know
- that there is no free block
- available in the scan depth for
- eviction. Set whenever
- we flush a batch from the
- buffer pool. Protected by the
- buf_pool.mutex */
+ /** Cleared when buf_LRU_get_free_block() fails.
+ Set whenever the free list grows, along with a broadcast of done_free.
+ Protected by buf_pool.mutex. */
+ Atomic_relaxed<bool> try_LRU_scan;
/* @} */
/** @name LRU replacement algorithm fields */
@@ -1979,7 +1816,8 @@ public:
UT_LIST_BASE_NODE_T(buf_page_t) free;
/*!< base node of the free
block list */
- /** signaled each time when the free list grows; protected by mutex */
+ /** broadcast each time when the free list grows or try_LRU_scan is set;
+ protected by mutex */
pthread_cond_t done_free;
UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
@@ -2034,34 +1872,13 @@ public:
/** Reserve a buffer. */
buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
- /** @return whether any I/O is pending */
- bool any_io_pending()
- {
- if (n_pend_reads)
- return true;
- mysql_mutex_lock(&mutex);
- const bool any_pending{n_flush_LRU_ || n_flush_list_};
- mysql_mutex_unlock(&mutex);
- return any_pending;
- }
- /** @return total amount of pending I/O */
- ulint io_pending() const
- {
- return n_pend_reads + n_flush_LRU() + n_flush_list();
- }
-
private:
/** Remove a block from the flush list. */
inline void delete_from_flush_list_low(buf_page_t *bpage);
- /** Remove a block from flush_list.
- @param bpage buffer pool page
- @param clear whether to invoke buf_page_t::clear_oldest_modification() */
- void delete_from_flush_list(buf_page_t *bpage, bool clear);
public:
/** Remove a block from flush_list.
@param bpage buffer pool page */
- void delete_from_flush_list(buf_page_t *bpage)
- { delete_from_flush_list(bpage, true); }
+ void delete_from_flush_list(buf_page_t *bpage);
/** Insert a modified block into the flush list.
@param block modified block
@@ -2069,7 +1886,7 @@ public:
void insert_into_flush_list(buf_block_t *block, lsn_t lsn);
/** Free a page whose underlying file page has been freed. */
- inline void release_freed_page(buf_page_t *bpage);
+ ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage);
private:
/** Temporary memory for page_compressed and encrypted I/O */
@@ -2080,34 +1897,12 @@ private:
/** array of slots */
buf_tmp_buffer_t *slots;
- void create(ulint n_slots)
- {
- this->n_slots= n_slots;
- slots= static_cast<buf_tmp_buffer_t*>
- (ut_malloc_nokey(n_slots * sizeof *slots));
- memset((void*) slots, 0, n_slots * sizeof *slots);
- }
+ void create(ulint n_slots);
- void close()
- {
- for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
- {
- aligned_free(s->crypt_buf);
- aligned_free(s->comp_buf);
- }
- ut_free(slots);
- slots= nullptr;
- n_slots= 0;
- }
+ void close();
/** Reserve a buffer */
- buf_tmp_buffer_t *reserve()
- {
- for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
- if (s->acquire())
- return s;
- return nullptr;
- }
+ buf_tmp_buffer_t *reserve();
} io_buf;
/** whether resize() is in the critical path */
@@ -2117,64 +1912,46 @@ private:
/** The InnoDB buffer pool */
extern buf_pool_t buf_pool;
-inline void page_hash_latch::read_lock()
+inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id,
+ const hash_chain &chain)
+ const
+{
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(mysql_mutex_is_owner(&buf_pool.mutex) ||
+ lock_get(const_cast<hash_chain&>(chain)).is_locked());
+#endif /* SAFE_MUTEX */
+ for (buf_page_t *bpage= chain.first; bpage; bpage= bpage->hash)
+ {
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->in_file());
+ if (bpage->id() == id)
+ return bpage;
+ }
+ return nullptr;
+}
+
+#ifdef SUX_LOCK_GENERIC
+inline void page_hash_latch::lock_shared()
{
mysql_mutex_assert_not_owner(&buf_pool.mutex);
if (!read_trylock())
read_lock_wait();
}
-inline void page_hash_latch::write_lock()
+inline void page_hash_latch::lock()
{
if (!write_trylock())
write_lock_wait();
}
+#endif /* SUX_LOCK_GENERIC */
-inline void buf_page_t::add_buf_fix_count(uint32_t count)
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- buf_fix_count_+= count;
-}
-
-inline void buf_page_t::set_buf_fix_count(uint32_t count)
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- buf_fix_count_= count;
-}
-
-inline void buf_page_t::set_state(buf_page_state state)
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
-#ifdef UNIV_DEBUG
- switch (state) {
- case BUF_BLOCK_REMOVE_HASH:
- /* buf_pool_t::corrupted_evict() invokes set_corrupt_id()
- before buf_LRU_free_one_page(), so we cannot assert that
- we are holding the hash_lock. */
- break;
- case BUF_BLOCK_MEMORY:
- if (!in_file()) break;
- /* fall through */
- case BUF_BLOCK_FILE_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
- break;
- case BUF_BLOCK_NOT_USED:
- if (!in_file()) break;
- /* fall through */
- case BUF_BLOCK_ZIP_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() ||
- (this >= &buf_pool.watch[0] &&
- this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]));
- break;
- }
-#endif
- state_= state;
-}
-
-inline void buf_page_t::set_io_fix(buf_io_fix io_fix)
+inline void buf_page_t::set_state(uint32_t s)
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- io_fix_= io_fix;
+ ut_ad(s <= REMOVE_HASH || s >= UNFIXED);
+ ut_ad(s < WRITE_FIX);
+ ut_ad(s <= READ_FIX || zip.fix == READ_FIX);
+ zip.fix= s;
}
inline void buf_page_t::set_corrupt_id()
@@ -2191,19 +1968,15 @@ inline void buf_page_t::set_corrupt_id()
default:
ut_ad("block is dirty" == 0);
}
- switch (state()) {
- case BUF_BLOCK_REMOVE_HASH:
- break;
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_FILE_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
- break;
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- ut_ad("invalid state" == 0);
+ const auto f= state();
+ if (f != REMOVE_HASH)
+ {
+ ut_ad(f >= UNFIXED);
+ ut_ad(buf_pool.page_hash.lock_get(buf_pool.page_hash.cell_get(id_.fold())).
+ is_write_locked());
}
#endif
- id_= page_id_t(~0ULL);
+ id_.set_corrupted();
}
/** Set oldest_modification when adding to buf_pool.flush_list */
@@ -2218,10 +1991,12 @@ inline void buf_page_t::set_oldest_modification(lsn_t lsn)
/** Clear oldest_modification after removing from buf_pool.flush_list */
inline void buf_page_t::clear_oldest_modification()
{
- mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
- ut_d(const auto state= state_);
- ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_ZIP_PAGE ||
- state == BUF_BLOCK_REMOVE_HASH);
+#ifdef SAFE_MUTEX
+ if (oldest_modification() != 2)
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+#endif /* SAFE_MUTEX */
+ ut_d(const auto s= state());
+ ut_ad(s >= REMOVE_HASH);
ut_ad(oldest_modification());
ut_ad(!list.prev);
ut_ad(!list.next);
@@ -2231,46 +2006,16 @@ inline void buf_page_t::clear_oldest_modification()
oldest_modification_.store(0, std::memory_order_release);
}
-/** Note that a block is no longer dirty, while not removing
-it from buf_pool.flush_list */
-inline void buf_page_t::clear_oldest_modification(bool temporary)
-{
- ut_ad(temporary == fsp_is_system_temporary(id().space()));
- if (temporary)
- {
- ut_ad(oldest_modification() == 2);
- oldest_modification_= 0;
- }
- else
- {
- /* We use release memory order to guarantee that callers of
- oldest_modification_acquire() will observe the block as
- being detached from buf_pool.flush_list, after reading the value 0. */
- ut_ad(oldest_modification() > 2);
- oldest_modification_.store(1, std::memory_order_release);
- }
-}
-
-/** @return whether the block is modified and ready for flushing */
-inline bool buf_page_t::ready_for_flush() const
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(in_LRU_list);
- ut_a(in_file());
- ut_ad(fsp_is_system_temporary(id().space())
- ? oldest_modification() == 2
- : oldest_modification() > 2);
- return io_fix_ == BUF_IO_NONE;
-}
-
/** @return whether the block can be relocated in memory.
The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
inline bool buf_page_t::can_relocate() const
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(in_file());
+ const auto f= state();
+ ut_ad(f >= FREED);
ut_ad(in_LRU_list);
- return io_fix_ == BUF_IO_NONE && !buf_fix_count_;
+ return (f == FREED || (f < READ_FIX && !(f & ~LRU_MASK))) &&
+ !lock.is_locked_or_waiting();
}
/** @return whether the block has been flagged old in buf_pool.LRU */
@@ -2331,41 +2076,26 @@ inline void buf_page_t::set_old(bool old)
/**********************************************************************
Let us list the consistency conditions for different control block states.
-NOT_USED: is in free list, not in LRU list, not in flush list, nor
- page hash table
-MEMORY: is not in free list, LRU list, or flush list, nor page
- hash table
-FILE_PAGE: space and offset are defined, is in page hash table
- if io_fix == BUF_IO_WRITE,
- buf_pool.n_flush_LRU() || buf_pool.n_flush_list()
-
- (1) if buf_fix_count == 0, then
- is in LRU list, not in free list
- is in flush list,
- if and only if oldest_modification > 0
- is x-locked,
- if and only if io_fix == BUF_IO_READ
- is s-locked,
- if and only if io_fix == BUF_IO_WRITE
-
- (2) if buf_fix_count > 0, then
- is not in LRU list, not in free list
- is in flush list,
- if and only if oldest_modification > 0
- if io_fix == BUF_IO_READ,
- is x-locked
- if io_fix == BUF_IO_WRITE,
- is s-locked
+NOT_USED: is in free list, not LRU, not flush_list, nor page_hash
+MEMORY: is not in any of free, LRU, flush_list, page_hash
+in_file(): is not in free list, is in LRU list, id() is defined,
+ is in page_hash (not necessarily if is_read_fixed())
+
+ is in buf_pool.flush_list, if and only
+ if oldest_modification == 1 || oldest_modification > 2
+
+ (1) if is_write_fixed(): is u-locked
+ (2) if is_read_fixed(): is x-locked
State transitions:
NOT_USED => MEMORY
-MEMORY => FILE_PAGE
MEMORY => NOT_USED
-FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if
- (1) buf_fix_count == 0,
- (2) oldest_modification == 0, and
- (3) io_fix == 0.
+MEMORY => UNFIXED
+UNFIXED => in_file()
+in_file() => UNFIXED or FREED
+UNFIXED or FREED => REMOVE_HASH
+REMOVE_HASH => NOT_USED (if and only if !oldest_modification())
*/
/** Select from where to start a scan. If we have scanned
@@ -2427,5 +2157,3 @@ struct CheckUnzipLRUAndLRUList {
#include "buf0buf.inl"
#endif /* !UNIV_INNOCHECKSUM */
-
-#endif
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
index 364f04d3f69..3c4da98f83b 100644
--- a/storage/innobase/include/buf0buf.inl
+++ b/storage/innobase/include/buf0buf.inl
@@ -2,7 +2,7 @@
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
-Copyright (c) 2014, 2020, MariaDB Corporation.
+Copyright (c) 2014, 2021, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -37,42 +37,6 @@ Created 11/5/1995 Heikki Tuuri
#include "buf0rea.h"
#include "fsp0types.h"
-/*********************************************************************//**
-Gets the current size of buffer buf_pool in bytes.
-@return size in bytes */
-UNIV_INLINE
-ulint
-buf_pool_get_curr_size(void)
-/*========================*/
-{
- return(srv_buf_pool_curr_size);
-}
-
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_page_get_freed_page_clock(
-/*==========================*/
- const buf_page_t* bpage) /*!< in: block */
-{
- /* This is sometimes read without holding buf_pool.mutex. */
- return(bpage->freed_page_clock);
-}
-
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_block_get_freed_page_clock(
-/*===========================*/
- const buf_block_t* block) /*!< in: block */
-{
- return(buf_page_get_freed_page_clock(&block->page));
-}
-
/** Determine if a block is still close enough to the MRU end of the LRU list
meaning that it is not in danger of getting evicted and also implying
that it has been accessed recently.
@@ -122,67 +86,6 @@ inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
}
}
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Gets a pointer to the memory frame of a block.
-@return pointer to the frame */
-UNIV_INLINE
-buf_frame_t*
-buf_block_get_frame(
-/*================*/
- const buf_block_t* block) /*!< in: pointer to the control block */
-{
- if (!block) {
- return NULL;
- }
-
- switch (block->page.state()) {
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_NOT_USED:
- ut_error;
- break;
- case BUF_BLOCK_FILE_PAGE:
- ut_a(block->page.buf_fix_count());
- /* fall through */
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- goto ok;
- }
- ut_error;
-ok:
- return((buf_frame_t*) block->frame);
-}
-#endif /* UNIV_DEBUG */
-
-/********************************************************************//**
-Allocates a buf_page_t descriptor. This function must succeed. In case
-of failure we assert in this function.
-@return: the allocated descriptor. */
-UNIV_INLINE
-buf_page_t*
-buf_page_alloc_descriptor(void)
-/*===========================*/
-{
- buf_page_t* bpage;
-
- bpage = (buf_page_t*) ut_zalloc_nokey(sizeof *bpage);
- ut_ad(bpage);
- MEM_UNDEFINED(bpage, sizeof *bpage);
-
- return(bpage);
-}
-
-/********************************************************************//**
-Free a buf_page_t descriptor. */
-UNIV_INLINE
-void
-buf_page_free_descriptor(
-/*=====================*/
- buf_page_t* bpage) /*!< in: bpage descriptor to free. */
-{
- ut_free(bpage);
-}
-
/** Allocate a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
inline buf_block_t *buf_block_alloc()
@@ -214,18 +117,11 @@ buf_block_modify_clock_inc(
buf_block_t* block) /*!< in: block */
{
#ifdef SAFE_MUTEX
- /* No latch is acquired for the shared temporary tablespace. */
- ut_ad(fsp_is_system_temporary(block->page.id().space())
- || (mysql_mutex_is_owner(&buf_pool.mutex)
- && !block->page.buf_fix_count())
- || rw_lock_own_flagged(&block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+ ut_ad((mysql_mutex_is_owner(&buf_pool.mutex)
+ && !block->page.buf_fix_count())
+ || block->page.lock.have_u_or_x());
#else /* SAFE_MUTEX */
- /* No latch is acquired for the shared temporary tablespace. */
- ut_ad(fsp_is_system_temporary(block->page.id().space())
- || !block->page.buf_fix_count()
- || rw_lock_own_flagged(&block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+ ut_ad(!block->page.buf_fix_count() || block->page.lock.have_u_or_x());
#endif /* SAFE_MUTEX */
assert_block_ahi_valid(block);
@@ -242,162 +138,7 @@ buf_block_get_modify_clock(
/*=======================*/
buf_block_t* block) /*!< in: block */
{
-#ifdef UNIV_DEBUG
- /* No latch is acquired for the shared temporary tablespace. */
- if (!fsp_is_system_temporary(block->page.id().space())) {
- ut_ad(rw_lock_own(&(block->lock), RW_LOCK_S)
- || rw_lock_own(&(block->lock), RW_LOCK_X)
- || rw_lock_own(&(block->lock), RW_LOCK_SX));
- }
-#endif /* UNIV_DEBUG */
-
+ ut_ad(block->page.lock.have_any());
return(block->modify_clock);
}
-/*******************************************************************//**
-Increments the bufferfix count. */
-UNIV_INLINE
-void
-buf_block_buf_fix_inc_func(
-/*=======================*/
-#ifdef UNIV_DEBUG
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line */
-#endif /* UNIV_DEBUG */
- buf_block_t* block) /*!< in/out: block to bufferfix */
-{
-#ifdef UNIV_DEBUG
- /* No debug latch is acquired if block belongs to system temporary.
- Debug latch is not of much help if access to block is single
- threaded. */
- if (!fsp_is_system_temporary(block->page.id().space())) {
- ibool ret;
- ret = rw_lock_s_lock_nowait(block->debug_latch, file, line);
- ut_a(ret);
- }
-#endif /* UNIV_DEBUG */
-
- block->fix();
-}
-
-/*******************************************************************//**
-Decrements the bufferfix count. */
-UNIV_INLINE
-void
-buf_block_buf_fix_dec(
-/*==================*/
- buf_block_t* block) /*!< in/out: block to bufferunfix */
-{
-#ifdef UNIV_DEBUG
- /* No debug latch is acquired if block belongs to system temporary.
- Debug latch is not of much help if access to block is single
- threaded. */
- if (!fsp_is_system_temporary(block->page.id().space())) {
- rw_lock_s_unlock(block->debug_latch);
- }
-#endif /* UNIV_DEBUG */
-
- block->unfix();
-}
-
-/********************************************************************//**
-Releases a compressed-only page acquired with buf_page_get_zip(). */
-UNIV_INLINE
-void
-buf_page_release_zip(
-/*=================*/
- buf_page_t* bpage) /*!< in: buffer block */
-{
- ut_ad(bpage);
- ut_a(bpage->buf_fix_count());
-
- switch (bpage->state()) {
- case BUF_BLOCK_FILE_PAGE:
-#ifdef UNIV_DEBUG
- {
- /* No debug latch is acquired if block belongs to system
- temporary. Debug latch is not of much help if access to block
- is single threaded. */
- buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
- if (!fsp_is_system_temporary(block->page.id().space())) {
- rw_lock_s_unlock(block->debug_latch);
- }
- }
-#endif /* UNIV_DEBUG */
- /* Fall through */
- case BUF_BLOCK_ZIP_PAGE:
- reinterpret_cast<buf_block_t*>(bpage)->unfix();
- return;
-
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- break;
- }
-
- ut_error;
-}
-
-/********************************************************************//**
-Releases a latch, if specified. */
-UNIV_INLINE
-void
-buf_page_release_latch(
-/*===================*/
- buf_block_t* block, /*!< in: buffer block */
- ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH,
- RW_NO_LATCH */
-{
-#ifdef UNIV_DEBUG
- /* No debug latch is acquired if block belongs to system
- temporary. Debug latch is not of much help if access to block
- is single threaded. */
- if (!fsp_is_system_temporary(block->page.id().space())) {
- rw_lock_s_unlock(block->debug_latch);
- }
-#endif /* UNIV_DEBUG */
-
- if (rw_latch == RW_S_LATCH) {
- rw_lock_s_unlock(&block->lock);
- } else if (rw_latch == RW_SX_LATCH) {
- rw_lock_sx_unlock(&block->lock);
- } else if (rw_latch == RW_X_LATCH) {
- rw_lock_x_unlock(&block->lock);
- }
-}
-
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Adds latch level info for the rw-lock protecting the buffer frame. This
-should be called in the debug version after a successful latching of a
-page if we know the latching order level of the acquired latch. */
-UNIV_INLINE
-void
-buf_block_dbg_add_level(
-/*====================*/
- buf_block_t* block, /*!< in: buffer page
- where we have acquired latch */
- latch_level_t level) /*!< in: latching order level */
-{
- sync_check_lock(&block->lock, level);
-}
-#endif /* UNIV_DEBUG */
-
-/********************************************************************//**
-Get buf frame. */
-UNIV_INLINE
-void *
-buf_page_get_frame(
-/*===============*/
- const buf_page_t* bpage) /*!< in: buffer pool page */
-{
- /* In encryption/compression buffer pool page may contain extra
- buffer where result is stored. */
- if (bpage->slot && bpage->slot->out_buf) {
- return bpage->slot->out_buf;
- } else if (bpage->zip.data) {
- return bpage->zip.data;
- } else {
- return ((buf_block_t*) bpage)->frame;
- }
-}
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
index 8dc25f91d59..d9f03177812 100644
--- a/storage/innobase/include/buf0checksum.h
+++ b/storage/innobase/include/buf0checksum.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,9 +24,7 @@ Buffer pool checksum functions, also linked from /extra/innochecksum.cc
Created Aug 11, 2011 Vasil Dimov
*******************************************************/
-#ifndef buf0checksum_h
-#define buf0checksum_h
-
+#pragma once
#include "buf0types.h"
/** Calculate the CRC32 checksum of a page. The value is stored to the page
@@ -37,6 +35,7 @@ architectures.
@return CRC-32C */
uint32_t buf_calc_page_crc32(const byte* page);
+#ifndef UNIV_INNOCHECKSUM
/** Calculate a checksum which is stored to the page when it is written
to a file. Note that we must be careful to calculate the same value on
32-bit and 64-bit architectures.
@@ -55,13 +54,4 @@ because this takes that field as an input!
@return checksum */
uint32_t
buf_calc_page_old_checksum(const byte* page);
-
-/** Return a printable string describing the checksum algorithm.
-@param[in] algo algorithm
-@return algorithm name */
-const char*
-buf_checksum_algorithm_name(srv_checksum_algorithm_t algo);
-
-extern ulong srv_checksum_algorithm;
-
-#endif /* buf0checksum_h */
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
index fb9df55504c..92b840d2f4c 100644
--- a/storage/innobase/include/buf0dblwr.h
+++ b/storage/innobase/include/buf0dblwr.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -54,9 +54,9 @@ class buf_dblwr_t
};
/** the page number of the first doublewrite block (block_size() pages) */
- page_id_t block1= page_id_t(0, 0);
+ page_id_t block1{0, 0};
/** the page number of the second doublewrite block (block_size() pages) */
- page_id_t block2= page_id_t(0, 0);
+ page_id_t block2{0, 0};
/** mutex protecting the data members below */
mysql_mutex_t mutex;
@@ -74,9 +74,9 @@ class buf_dblwr_t
ulint pages_written;
slot slots[2];
- slot *active_slot= &slots[0];
+ slot *active_slot;
- /** Initialize the doublewrite buffer data structure.
+ /** Initialise the persistent storage of the doublewrite buffer.
@param header doublewrite page header in the TRX_SYS page */
inline void init(const byte *header);
@@ -84,6 +84,8 @@ class buf_dblwr_t
bool flush_buffered_writes(const ulint size);
public:
+ /** Initialise the doublewrite buffer data structures. */
+ void init();
/** Create or restore the doublewrite buffer in the TRX_SYS page.
@return whether the operation succeeded */
bool create();
@@ -137,14 +139,14 @@ public:
@param size payload size in bytes */
void add_to_batch(const IORequest &request, size_t size);
- /** Determine whether the doublewrite buffer is initialized */
- bool is_initialised() const
+ /** Determine whether the doublewrite buffer has been created */
+ bool is_created() const
{ return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
/** @return whether a page identifier is part of the doublewrite buffer */
bool is_inside(const page_id_t id) const
{
- if (!is_initialised())
+ if (!is_created())
return false;
ut_ad(block1 < block2);
if (id < block1)
@@ -156,13 +158,10 @@ public:
/** Wait for flush_buffered_writes() to be fully completed */
void wait_flush_buffered_writes()
{
- if (is_initialised())
- {
- mysql_mutex_lock(&mutex);
- while (batch_running)
- my_cond_wait(&cond, &mutex.m_mutex);
- mysql_mutex_unlock(&mutex);
- }
+ mysql_mutex_lock(&mutex);
+ while (batch_running)
+ my_cond_wait(&cond, &mutex.m_mutex);
+ mysql_mutex_unlock(&mutex);
}
};
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index c772f84147d..13a9363922b 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -24,21 +24,20 @@ The database buffer pool flush algorithm
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0flu_h
-#define buf0flu_h
+#pragma once
#include "ut0byte.h"
#include "log0log.h"
-#include "buf0types.h"
+#include "buf0buf.h"
-/** Number of pages flushed. Protected by buf_pool.mutex. */
-extern ulint buf_flush_page_count;
/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
-Also included in buf_flush_page_count. */
+Also included in buf_pool.stat.n_pages_written. */
extern ulint buf_lru_flush_page_count;
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+extern ulint buf_lru_freed_page_count;
/** Flag indicating if the page_cleaner is in active state. */
-extern bool buf_page_cleaner_is_active;
+extern Atomic_relaxed<bool> buf_page_cleaner_is_active;
/** Remove all dirty pages belonging to a given tablespace when we are
deleting the data file of that tablespace.
@@ -85,15 +84,18 @@ buf_flush_init_for_writing(
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
MY_ATTRIBUTE((warn_unused_result));
-/** Write out dirty blocks from buf_pool.LRU.
+/** Write out dirty blocks from buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
@param max_n wished maximum mumber of blocks flushed
-@return the number of processed pages
+@param evict whether to evict pages after flushing
+@return evict ? number of processed pages : number of pages written
@retval 0 if a buf_pool.LRU batch is already running */
-ulint buf_flush_LRU(ulint max_n);
+ulint buf_flush_LRU(ulint max_n, bool evict);
-/** Wait until a flush batch ends.
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
-void buf_flush_wait_batch_end(bool lru);
+/** Wait until a LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end();
/** Wait until all persistent pages are flushed up to a limit.
@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
@@ -106,22 +108,30 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
This function should be called at a mini-transaction commit, if a page was
modified in it. Puts the block to the list of modified blocks, if it not
already in it. */
-UNIV_INLINE
-void
-buf_flush_note_modification(
-/*========================*/
- buf_block_t* block, /*!< in: block which is modified */
- lsn_t start_lsn, /*!< in: start lsn of the first mtr in a
- set of mtr's */
- lsn_t end_lsn); /*!< in: end lsn of the last mtr in the
- set of mtr's */
+inline void buf_flush_note_modification(buf_block_t *b, lsn_t start, lsn_t end)
+{
+ ut_ad(!srv_read_only_mode);
+ ut_d(const auto s= b->page.state());
+ ut_ad(s > buf_page_t::FREED);
+ ut_ad(s < buf_page_t::READ_FIX);
+ ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= end);
+ mach_write_to_8(b->page.frame + FIL_PAGE_LSN, end);
+ if (UNIV_LIKELY_NULL(b->page.zip.data))
+ memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data,
+ FIL_PAGE_LSN + b->page.frame, 8);
+
+ const lsn_t oldest_modification= b->page.oldest_modification();
+
+ if (oldest_modification > 1)
+ ut_ad(oldest_modification <= start);
+ else
+ buf_pool.insert_into_flush_list(b, start);
+ srv_stats.buf_pool_write_requests.inc();
+}
/** Initialize page_cleaner. */
ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
-/** Wait for pending flushes to complete. */
-void buf_flush_wait_batch_end_acquiring_mutex(bool lru);
-
/** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool();
@@ -137,7 +147,3 @@ void buf_flush_sync_batch(lsn_t lsn);
/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync();
-
-#include "buf0flu.inl"
-
-#endif
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index 540c14a49c9..aec08e77f54 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,11 +24,10 @@ The database buffer pool LRU replacement algorithm
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0lru_h
-#define buf0lru_h
+#pragma once
-#include "ut0byte.h"
#include "buf0types.h"
+#include "hash0hash.h"
// Forward declaration
struct trx_t;
@@ -132,14 +131,6 @@ policy at the end of each interval. */
void
buf_LRU_stat_update();
-/** Remove one page from LRU list and put it to free list.
-@param bpage file page to be freed
-@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here) */
-void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock)
- MY_ATTRIBUTE((nonnull));
-
#ifdef UNIV_DEBUG
/** Validate the LRU list. */
void buf_LRU_validate();
@@ -200,5 +191,3 @@ Increments the I/O counter in buf_LRU_stat_cur. */
/********************************************************************//**
Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
-
-#endif
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
index 8d6b28194dc..d898c5efc63 100644
--- a/storage/innobase/include/buf0rea.h
+++ b/storage/innobase/include/buf0rea.h
@@ -33,10 +33,11 @@ Created 11/5/1995 Heikki Tuuri
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@retval DB_SUCCESS if the page was read and is not corrupted,
-@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
+@param page_id page id
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_SUCCESS_LOCKED_REC if the page was not read
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
index 327e2e2498e..c69c07d66e1 100644
--- a/storage/innobase/include/buf0types.h
+++ b/storage/innobase/include/buf0types.h
@@ -39,57 +39,29 @@ struct buf_buddy_stat_t;
/** A buffer frame. @see page_t */
typedef byte buf_frame_t;
-/** Flags for io_fix types */
-enum buf_io_fix {
- BUF_IO_NONE = 0, /**< no pending I/O */
- BUF_IO_READ, /**< read pending */
- BUF_IO_WRITE, /**< write pending */
- BUF_IO_PIN /**< disallow relocation of
- block and its removal of from
- the flush_list */
-};
-
/** Alternatives for srv_checksum_algorithm, which can be changed by
setting innodb_checksum_algorithm */
enum srv_checksum_algorithm_t {
- SRV_CHECKSUM_ALGORITHM_CRC32, /*!< Write crc32, allow crc32,
- innodb or none when reading */
- SRV_CHECKSUM_ALGORITHM_STRICT_CRC32, /*!< Write crc32, allow crc32
- when reading */
- SRV_CHECKSUM_ALGORITHM_INNODB, /*!< Write innodb, allow crc32,
- innodb or none when reading */
- SRV_CHECKSUM_ALGORITHM_STRICT_INNODB, /*!< Write innodb, allow
- innodb when reading */
- SRV_CHECKSUM_ALGORITHM_NONE, /*!< Write none, allow crc32,
- innodb or none when reading */
- SRV_CHECKSUM_ALGORITHM_STRICT_NONE, /*!< Write none, allow none
- when reading */
-
- /** For new files, always compute CRC-32C for the whole page.
- For old files, allow crc32, innodb or none when reading. */
- SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
-
- /** For new files, always compute CRC-32C for the whole page.
- For old files, allow crc32 when reading. */
- SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+ /** Write crc32; allow full_crc32,crc32,innodb,none when reading */
+ SRV_CHECKSUM_ALGORITHM_CRC32,
+ /** Write crc32; allow full_crc32,crc32 when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,
+ /** For new files, always compute CRC-32C for the whole page.
+ For old files, allow crc32, innodb or none when reading. */
+ SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+ /** For new files, always compute CRC-32C for the whole page.
+ For old files, allow crc32 when reading. */
+ SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
};
-inline
-bool
-is_checksum_strict(srv_checksum_algorithm_t algo)
+inline bool is_checksum_strict(srv_checksum_algorithm_t algo)
{
- return(algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32
- || algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
- || algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE);
+ return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
}
-inline
-bool
-is_checksum_strict(ulint algo)
+inline bool is_checksum_strict(ulint algo)
{
- return(algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32
- || algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
- || algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE);
+ return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
}
/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
@@ -176,6 +148,12 @@ public:
constexpr ulonglong raw() const { return m_id; }
+ /** Flag the page identifier as corrupted. */
+ void set_corrupted() { m_id= ~0ULL; }
+
+ /** @return whether the page identifier belongs to a corrupted page */
+ constexpr bool is_corrupted() const { return m_id == ~0ULL; }
+
private:
/** The page identifier */
uint64_t m_id;
@@ -189,39 +167,69 @@ extern const byte *field_ref_zero;
#ifndef UNIV_INNOCHECKSUM
-#include "ut0mutex.h"
-#include "sync0rw.h"
-#include "rw_lock.h"
+/** Latch types */
+enum rw_lock_type_t
+{
+ RW_S_LATCH= 1 << 0,
+ RW_X_LATCH= 1 << 1,
+ RW_SX_LATCH= 1 << 2,
+ RW_NO_LATCH= 1 << 3
+};
-class page_hash_latch : public rw_lock
+#include "sux_lock.h"
+
+#ifdef SUX_LOCK_GENERIC
+class page_hash_latch : private rw_lock
{
-public:
/** Wait for a shared lock */
void read_lock_wait();
/** Wait for an exclusive lock */
void write_lock_wait();
-
+public:
/** Acquire a shared lock */
- inline void read_lock();
+ inline void lock_shared();
/** Acquire an exclusive lock */
- inline void write_lock();
+ inline void lock();
- /** Acquire a lock */
- template<bool exclusive> void acquire()
- {
- if (exclusive)
- write_lock();
- else
- read_lock();
- }
- /** Release a lock */
- template<bool exclusive> void release()
- {
- if (exclusive)
- write_unlock();
- else
- read_unlock();
- }
+ /** @return whether an exclusive lock is being held by any thread */
+ bool is_write_locked() const { return rw_lock::is_write_locked(); }
+
+ /** @return whether any lock is being held by any thread */
+ bool is_locked() const { return rw_lock::is_locked(); }
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const { return rw_lock::is_locked_or_waiting(); }
+
+ /** Release a shared lock */
+ void unlock_shared() { read_unlock(); }
+ /** Release an exclusive lock */
+ void unlock() { write_unlock(); }
+};
+#elif defined _WIN32 || SIZEOF_SIZE_T >= 8
+class page_hash_latch
+{
+ srw_spin_lock_low lk;
+public:
+ void lock_shared() { lk.rd_lock(); }
+ void unlock_shared() { lk.rd_unlock(); }
+ void lock() { lk.wr_lock(); }
+ void unlock() { lk.wr_unlock(); }
+ bool is_write_locked() const { return lk.is_write_locked(); }
+ bool is_locked() const { return lk.is_locked(); }
+ bool is_locked_or_waiting() const { return lk.is_locked_or_waiting(); }
+};
+#else
+class page_hash_latch
+{
+ srw_spin_mutex lk;
+public:
+ void lock_shared() { lock(); }
+ void unlock_shared() { unlock(); }
+ void lock() { lk.wr_lock(); }
+ void unlock() { lk.wr_unlock(); }
+ bool is_locked() const { return lk.is_locked(); }
+ bool is_write_locked() const { return is_locked(); }
+ bool is_locked_or_waiting() const { return is_locked(); }
};
+#endif
#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
index c2b8c3e00b6..5eaad5bf552 100644
--- a/storage/innobase/include/data0data.h
+++ b/storage/innobase/include/data0data.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, 2020 MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -581,6 +581,10 @@ struct dtuple_t {
/** @return whether this is a hidden metadata record
for instant ADD COLUMN or ALTER TABLE */
bool is_metadata() const { return is_metadata(info_bits); }
+
+ /** Copy type information from index fields.
+ @param index index field to be copied */
+ inline void copy_field_types(const dict_index_t &index);
};
inline ulint dtuple_get_n_fields(const dtuple_t* tuple)
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
index 9528443e7a8..3d63ddb767c 100644
--- a/storage/innobase/include/data0type.h
+++ b/storage/innobase/include/data0type.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,9 +24,7 @@ Data types
Created 1/16/1996 Heikki Tuuri
*******************************************************/
-#ifndef data0type_h
-#define data0type_h
-
+#pragma once
#include "univ.i"
/** Special length indicating a missing instantly added column */
@@ -196,9 +194,6 @@ constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double));
/** system-versioned user data column */
#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END)
-/** Check whether locking is disabled (never). */
-#define dict_table_is_locking_disabled(table) false
-
/*-------------------------------------------*/
/* This many bytes we need to store the type information affecting the
@@ -325,7 +320,6 @@ dtype_get_prtype(
/*********************************************************************//**
Compute the mbminlen and mbmaxlen members of a data type structure. */
-UNIV_INLINE
void
dtype_get_mblen(
/*============*/
@@ -480,19 +474,6 @@ dtype_new_read_for_order_and_null_size(
const byte* buf); /*!< in: buffer for stored type order info */
/*********************************************************************//**
-Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len
-@return the SQL type name */
-UNIV_INLINE
-char*
-dtype_sql_name(
-/*===========*/
- unsigned mtype, /*!< in: mtype */
- unsigned prtype, /*!< in: prtype */
- unsigned len, /*!< in: len */
- char* name, /*!< out: SQL name */
- unsigned name_sz);/*!< in: size of the name buffer */
-
-/*********************************************************************//**
Validates a data type structure.
@return TRUE if ok */
ibool
@@ -507,6 +488,8 @@ dtype_print(
const dtype_t* type);
#endif /* UNIV_DEBUG */
+struct dict_col_t;
+
/* Structure for an SQL data type.
If you add fields to this structure, be sure to initialize them everywhere.
This structure is initialized in the following functions:
@@ -562,6 +545,10 @@ struct dtype_t{
mbminlen = 0;
mbmaxlen = 0;
}
+
+ /** Copy the type information from a column.
+ @param col column type to be copied */
+ void assign(const dict_col_t &col);
};
/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */
@@ -602,5 +589,3 @@ static const byte REC_INFO_METADATA_ALTER
= REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG;
#include "data0type.inl"
-
-#endif
diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl
index b81b68e69e9..329cee5d190 100644
--- a/storage/innobase/include/data0type.inl
+++ b/storage/innobase/include/data0type.inl
@@ -68,30 +68,6 @@ dtype_get_mysql_type(
Compute the mbminlen and mbmaxlen members of a data type structure. */
UNIV_INLINE
void
-dtype_get_mblen(
-/*============*/
- ulint mtype, /*!< in: main type */
- ulint prtype, /*!< in: precise type (and collation) */
- unsigned*mbminlen, /*!< out: minimum length of a
- multi-byte character */
- unsigned*mbmaxlen) /*!< out: maximum length of a
- multi-byte character */
-{
- if (dtype_is_string_type(mtype)) {
- innobase_get_cset_width(dtype_get_charset_coll(prtype),
- mbminlen, mbmaxlen);
- ut_ad(*mbminlen <= *mbmaxlen);
- ut_ad(*mbminlen < DATA_MBMAX);
- ut_ad(*mbmaxlen < DATA_MBMAX);
- } else {
- *mbminlen = *mbmaxlen = 0;
- }
-}
-
-/*********************************************************************//**
-Compute the mbminlen and mbmaxlen members of a data type structure. */
-UNIV_INLINE
-void
dtype_set_mblen(
/*============*/
dtype_t* type) /*!< in/out: type */
@@ -327,103 +303,6 @@ dtype_new_read_for_order_and_null_size(
dtype_set_mblen(type);
}
-/*********************************************************************//**
-Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len
-@return the SQL type name */
-UNIV_INLINE
-char*
-dtype_sql_name(
-/*===========*/
- unsigned mtype, /*!< in: mtype */
- unsigned prtype, /*!< in: prtype */
- unsigned len, /*!< in: len */
- char* name, /*!< out: SQL name */
- unsigned name_sz)/*!< in: size of the name buffer */
-{
-
-#define APPEND_UNSIGNED() \
- do { \
- if (prtype & DATA_UNSIGNED) { \
- snprintf(name + strlen(name), \
- name_sz - strlen(name), \
- " UNSIGNED"); \
- } \
- } while (0)
-
- snprintf(name, name_sz, "UNKNOWN");
-
- switch (mtype) {
- case DATA_INT:
- switch (len) {
- case 1:
- snprintf(name, name_sz, "TINYINT");
- break;
- case 2:
- snprintf(name, name_sz, "SMALLINT");
- break;
- case 3:
- snprintf(name, name_sz, "MEDIUMINT");
- break;
- case 4:
- snprintf(name, name_sz, "INT");
- break;
- case 8:
- snprintf(name, name_sz, "BIGINT");
- break;
- }
- APPEND_UNSIGNED();
- break;
- case DATA_FLOAT:
- snprintf(name, name_sz, "FLOAT");
- APPEND_UNSIGNED();
- break;
- case DATA_DOUBLE:
- snprintf(name, name_sz, "DOUBLE");
- APPEND_UNSIGNED();
- break;
- case DATA_FIXBINARY:
- snprintf(name, name_sz, "BINARY(%u)", len);
- break;
- case DATA_CHAR:
- case DATA_MYSQL:
- snprintf(name, name_sz, "CHAR(%u)", len);
- break;
- case DATA_VARCHAR:
- case DATA_VARMYSQL:
- snprintf(name, name_sz, "VARCHAR(%u)", len);
- break;
- case DATA_BINARY:
- snprintf(name, name_sz, "VARBINARY(%u)", len);
- break;
- case DATA_GEOMETRY:
- snprintf(name, name_sz, "GEOMETRY");
- break;
- case DATA_BLOB:
- switch (len) {
- case 9:
- snprintf(name, name_sz, "TINYBLOB");
- break;
- case 10:
- snprintf(name, name_sz, "BLOB");
- break;
- case 11:
- snprintf(name, name_sz, "MEDIUMBLOB");
- break;
- case 12:
- snprintf(name, name_sz, "LONGBLOB");
- break;
- }
- }
-
- if (prtype & DATA_NOT_NULL) {
- snprintf(name + strlen(name),
- name_sz - strlen(name),
- " NOT NULL");
- }
-
- return(name);
-}
-
/***********************************************************************//**
Returns the size of a fixed size data type, 0 if not a fixed size type.
@return fixed size, or 0 */
@@ -471,16 +350,6 @@ dtype_get_fixed_size_low(
} else if (!comp) {
return static_cast<unsigned>(len);
} else {
-#ifdef UNIV_DEBUG
- unsigned i_mbminlen, i_mbmaxlen;
-
- innobase_get_cset_width(
- dtype_get_charset_coll(prtype),
- &i_mbminlen, &i_mbmaxlen);
-
- ut_ad(i_mbminlen == mbminlen);
- ut_ad(i_mbmaxlen == mbmaxlen);
-#endif /* UNIV_DEBUG */
if (mbminlen == mbmaxlen) {
return static_cast<unsigned>(len);
}
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
index 51d116d5ede..64182aabc38 100644
--- a/storage/innobase/include/db0err.h
+++ b/storage/innobase/include/db0err.h
@@ -49,9 +49,6 @@ enum dberr_t {
rollback segment */
DB_CLUSTER_NOT_FOUND = 30,
DB_TABLE_NOT_FOUND,
- DB_MUST_GET_MORE_FILE_SPACE, /*!< the database has to be stopped
- and restarted with more file space */
- DB_TABLE_IS_BEING_USED,
DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
on a compressed page, or it would
become bigger than 1/2 free space in
@@ -121,8 +118,6 @@ enum dberr_t {
DB_READ_ONLY, /*!< Update operation attempted in
a read-only transaction */
DB_FTS_INVALID_DOCID, /* FTS Doc ID cannot be zero */
- DB_TABLE_IN_FK_CHECK, /* table is being used in foreign
- key check */
DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big
during online index creation */
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
index 186fd30f89f..3e14e0ace69 100644
--- a/storage/innobase/include/dict0boot.h
+++ b/storage/innobase/include/dict0boot.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,8 +33,6 @@ Created 4/18/1996 Heikki Tuuri
#include "buf0buf.h"
#include "dict0dict.h"
-/** @return the DICT_HDR block, x-latched */
-buf_block_t *dict_hdr_get(mtr_t* mtr);
/**********************************************************************//**
Returns a new table, index, or space id. */
void
@@ -46,27 +44,39 @@ dict_hdr_get_new_id(
(not assigned if NULL) */
ulint* space_id); /*!< out: space id
(not assigned if NULL) */
-/**********************************************************************//**
-Writes the current value of the row id counter to the dictionary header file
-page. */
-void
-dict_hdr_flush_row_id(void);
-/*=======================*/
-/**********************************************************************//**
-Returns a new row id.
-@return the new id */
-UNIV_INLINE
-row_id_t
-dict_sys_get_new_row_id(void);
-/*=========================*/
+/** Update dict_sys.row_id in the dictionary header file page. */
+void dict_hdr_flush_row_id(row_id_t id);
+/** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+inline row_id_t dict_sys_t::get_new_row_id()
+{
+ row_id_t id= row_id.fetch_add(1);
+ if (!(id % ROW_ID_WRITE_MARGIN))
+ dict_hdr_flush_row_id(id);
+ return id;
+}
+
+/** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+inline void dict_sys_t::update_row_id(row_id_t id)
+{
+ row_id_t sys_id= row_id;
+ while (id >= sys_id)
+ {
+ if (!row_id.compare_exchange_strong(sys_id, id))
+ continue;
+ if (!(id % ROW_ID_WRITE_MARGIN))
+ dict_hdr_flush_row_id(id);
+ break;
+ }
+}
+
/**********************************************************************//**
Writes a row id to a record or other 6-byte stored form. */
-UNIV_INLINE
-void
-dict_sys_write_row_id(
-/*==================*/
- byte* field, /*!< in: record field */
- row_id_t row_id);/*!< in: row id */
+inline void dict_sys_write_row_id(byte *field, row_id_t row_id)
+{
+ static_assert(DATA_ROW_ID_LEN == 6, "compatibility");
+ mach_write_to_6(field, row_id);
+}
+
/*****************************************************************//**
Initializes the data dictionary memory structures when the database is
started. This function is also called when the data dictionary is created.
@@ -87,12 +97,7 @@ dict_create(void)
/*********************************************************************//**
Check if a table id belongs to system table.
@return true if the table id belongs to a system table. */
-UNIV_INLINE
-bool
-dict_is_sys_table(
-/*==============*/
- table_id_t id) /*!< in: table id to check */
- MY_ATTRIBUTE((warn_unused_result));
+inline bool dict_is_sys_table(table_id_t id) { return id < DICT_HDR_FIRST_ID; }
/* Space id and page no where the dictionary header resides */
#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
@@ -267,37 +272,6 @@ enum dict_fld_sys_foreign_cols_enum {
DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5,
DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6
};
-/* The columns in SYS_TABLESPACES */
-enum dict_col_sys_tablespaces_enum {
- DICT_COL__SYS_TABLESPACES__SPACE = 0,
- DICT_COL__SYS_TABLESPACES__NAME = 1,
- DICT_COL__SYS_TABLESPACES__FLAGS = 2,
- DICT_NUM_COLS__SYS_TABLESPACES = 3
-};
-/* The field numbers in the SYS_TABLESPACES clustered index */
-enum dict_fld_sys_tablespaces_enum {
- DICT_FLD__SYS_TABLESPACES__SPACE = 0,
- DICT_FLD__SYS_TABLESPACES__DB_TRX_ID = 1,
- DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR = 2,
- DICT_FLD__SYS_TABLESPACES__NAME = 3,
- DICT_FLD__SYS_TABLESPACES__FLAGS = 4,
- DICT_NUM_FIELDS__SYS_TABLESPACES = 5
-};
-/* The columns in SYS_DATAFILES */
-enum dict_col_sys_datafiles_enum {
- DICT_COL__SYS_DATAFILES__SPACE = 0,
- DICT_COL__SYS_DATAFILES__PATH = 1,
- DICT_NUM_COLS__SYS_DATAFILES = 2
-};
-/* The field numbers in the SYS_DATAFILES clustered index */
-enum dict_fld_sys_datafiles_enum {
- DICT_FLD__SYS_DATAFILES__SPACE = 0,
- DICT_FLD__SYS_DATAFILES__DB_TRX_ID = 1,
- DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR = 2,
- DICT_FLD__SYS_DATAFILES__PATH = 3,
- DICT_NUM_FIELDS__SYS_DATAFILES = 4
-};
-
/* The columns in SYS_VIRTUAL */
enum dict_col_sys_virtual_enum {
DICT_COL__SYS_VIRTUAL__TABLE_ID = 0,
@@ -320,11 +294,4 @@ length of thos fields. */
#define DICT_FLD_LEN_SPACE 4
#define DICT_FLD_LEN_FLAGS 4
-/* When a row id which is zero modulo this number (which must be a power of
-two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
-updated */
-#define DICT_HDR_ROW_ID_WRITE_MARGIN 256
-
-#include "dict0boot.inl"
-
#endif
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
index 50f7f34a8e8..c40df12babe 100644
--- a/storage/innobase/include/dict0crea.h
+++ b/storage/innobase/include/dict0crea.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -41,14 +41,14 @@ tab_create_graph_create(
/*====================*/
dict_table_t* table, /*!< in: table to create, built as
a memory data structure */
- mem_heap_t* heap, /*!< in: heap where created */
- fil_encryption_t mode, /*!< in: encryption mode */
- uint32_t key_id); /*!< in: encryption key_id */
+ mem_heap_t* heap); /*!< in: heap where created */
/** Creates an index create graph.
@param[in] index index to create, built as a memory data structure
@param[in] table table name
@param[in,out] heap heap where created
+@param[in] mode encryption mode (for creating a table)
+@param[in] key_id encryption key identifier (for creating a table)
@param[in] add_v new virtual columns added in the same clause with
add index
@return own: index create node */
@@ -57,6 +57,8 @@ ind_create_graph_create(
dict_index_t* index,
const char* table,
mem_heap_t* heap,
+ fil_encryption_t mode,
+ uint32_t key_id,
const dict_add_v_col_t* add_v = NULL);
/***********************************************************//**
@@ -99,29 +101,22 @@ dict_create_index_tree(
/** Drop the index tree associated with a row in SYS_INDEXES table.
@param[in,out] pcur persistent cursor on rec
@param[in,out] trx dictionary transaction
-@param[in,out] mtr mini-transaction */
-void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] mtr mini-transaction
+@return tablespace ID to drop (if this is the clustered index)
+@retval 0 if no tablespace is to be dropped */
+uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
/***************************************************************//**
Creates an index tree for the index if it is not a member of a cluster.
Don't update SYSTEM TABLES.
-@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+@return error code */
dberr_t
dict_create_index_tree_in_mem(
/*==========================*/
dict_index_t* index, /*!< in/out: index */
const trx_t* trx); /*!< in: InnoDB transaction handle */
-/****************************************************************//**
-Creates the foreign key constraints system tables inside InnoDB
-at server bootstrap or server start if they are not found or are
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_foreign_constraint_tables(void);
-/*================================================*/
-
/********************************************************************//**
Generate a foreign key constraint name when it was not named by the user.
A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
@@ -167,37 +162,6 @@ dict_foreigns_has_s_base_col(
const dict_foreign_set& local_fk_set,
const dict_table_t* table);
-/****************************************************************//**
-Creates the tablespaces and datafiles system tables inside InnoDB
-at server bootstrap or server start if they are not found or are
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_sys_tablespace(void);
-/*=====================================*/
-/** Creates the virtual column system tables inside InnoDB
-at server bootstrap or server start if they are not found or are
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_sys_virtual();
-
-/** Put a tablespace definition into the data dictionary,
-replacing what was there previously.
-@param[in] space Tablespace id
-@param[in] name Tablespace name
-@param[in] flags Tablespace flags
-@param[in] path Tablespace path
-@param[in] trx Transaction
-@return error code or DB_SUCCESS */
-dberr_t
-dict_replace_tablespace_in_dictionary(
- ulint space_id,
- const char* name,
- ulint flags,
- const char* path,
- trx_t* trx);
-
/********************************************************************//**
Add a foreign key definition to the data dictionary tables.
@return error code or DB_SUCCESS */
@@ -209,16 +173,6 @@ dict_create_add_foreign_to_dictionary(
trx_t* trx) /*!< in/out: dictionary transaction */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/********************************************************************//**
-Construct foreign key constraint defintion from data dictionary information.
-*/
-UNIV_INTERN
-char*
-dict_foreign_def_get(
-/*=================*/
- dict_foreign_t* foreign,/*!< in: foreign */
- trx_t* trx); /*!< in: trx */
-
/* Table create node structure */
struct tab_node_t{
que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */
@@ -240,8 +194,6 @@ struct tab_node_t{
/* Local storage for this graph node */
ulint state; /*!< node execution state */
ulint col_no; /*!< next column definition to insert */
- uint key_id; /*!< encryption key_id */
- fil_encryption_t mode; /*!< encryption mode */
ulint base_col_no; /*!< next base column to insert */
mem_heap_t* heap; /*!< memory heap used as auxiliary
storage */
@@ -273,11 +225,12 @@ struct ind_node_t{
/* Local storage for this graph node */
ulint state; /*!< node execution state */
uint32_t page_no; /* root page number of the index */
- dict_table_t* table; /*!< table which owns the index */
dtuple_t* ind_row; /* index definition row built */
ulint field_no; /* next field definition to insert */
mem_heap_t* heap; /*!< memory heap used as auxiliary
storage */
+ uint key_id; /*!< encryption key_id */
+ fil_encryption_t mode; /*!< encryption mode */
const dict_add_v_col_t*
add_v; /*!< new virtual columns that being
added along with an add index call */
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h
index 3aea41b0bb8..679484ad64e 100644
--- a/storage/innobase/include/dict0defrag_bg.h
+++ b/storage/innobase/include/dict0defrag_bg.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2016, 2020, MariaDB Corporation.
+Copyright (c) 2016, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -80,21 +80,16 @@ dict_stats_defrag_pool_del(
all entries for the table */
const dict_index_t* index); /*!< in: index to remove */
-/*****************************************************************//**
+/**
Get the first index that has been added for updating persistent defrag
stats and eventually save its stats. */
-void
-dict_defrag_process_entries_from_defrag_pool();
-/*===========================================*/
+void dict_defrag_process_entries_from_defrag_pool(THD *thd);
/*********************************************************************//**
Save defragmentation result.
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_save_defrag_summary(
-/*============================*/
- dict_index_t* index) /*!< in: index */
- MY_ATTRIBUTE((warn_unused_result));
+dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*********************************************************************//**
Save defragmentation stats for a given index.
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 65b88a65185..e54a138cc02 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -31,11 +31,11 @@ Created 1/8/1996 Heikki Tuuri
#include "data0data.h"
#include "dict0mem.h"
#include "fsp0fsp.h"
+#include "srw_lock.h"
+#include <my_sys.h>
#include <deque>
class MDL_ticket;
-extern bool innodb_table_stats_not_found;
-extern bool innodb_index_stats_not_found;
/** the first table or index ID for other than hard-coded system tables */
constexpr uint8_t DICT_HDR_FIRST_ID= 10;
@@ -132,7 +132,7 @@ enum dict_table_op_t {
@param[in] table_op operation to perform when opening
@return table object after locking MDL shared
@retval NULL if the table is not readable, or if trylock && MDL blocked */
-template<bool trylock>
+template<bool trylock, bool purge_thd= false>
dict_table_t*
dict_acquire_mdl_shared(dict_table_t *table,
THD *thd,
@@ -140,7 +140,6 @@ dict_acquire_mdl_shared(dict_table_t *table,
dict_table_op_t table_op= DICT_TABLE_OP_NORMAL);
/** Look up a table by numeric identifier.
-@tparam purge_thd Whether the function is called by purge thread
@param[in] table_id table identifier
@param[in] dict_locked data dictionary locked
@param[in] table_op operation to perform when opening
@@ -154,11 +153,12 @@ dict_table_open_on_id(table_id_t table_id, bool dict_locked,
MDL_ticket **mdl= nullptr)
MY_ATTRIBUTE((warn_unused_result));
+/** Decrement the count of open handles */
+void dict_table_close(dict_table_t *table);
+
/** Decrements the count of open handles of a table.
@param[in,out] table table
-@param[in] dict_locked data dictionary locked
-@param[in] try_drop try to drop any orphan indexes after
- an aborted online index creation
+@param[in] dict_locked whether dict_sys.latch is being held
@param[in] thd thread to release MDL
@param[in] mdl metadata lock or NULL if the thread is a
foreground one. */
@@ -166,22 +166,10 @@ void
dict_table_close(
dict_table_t* table,
bool dict_locked,
- bool try_drop,
THD* thd = NULL,
MDL_ticket* mdl = NULL);
/*********************************************************************//**
-Closes the only open handle to a table and drops a table while assuring
-that dict_sys.mutex is held the whole time. This assures that the table
-is not evicted after the close when the count of open handles goes to zero.
-Because dict_sys.mutex is held, we do not need to call prevent_eviction(). */
-void
-dict_table_close_and_drop(
-/*======================*/
- trx_t* trx, /*!< in: data dictionary transaction */
- dict_table_t* table); /*!< in/out: table */
-
-/*********************************************************************//**
Gets the minimum number of bytes per character.
@return minimum multi-byte char size, in bytes */
UNIV_INLINE
@@ -381,12 +369,8 @@ dberr_t
dict_table_rename_in_cache(
/*=======================*/
dict_table_t* table, /*!< in/out: table */
- const char* new_name, /*!< in: new name */
- bool rename_also_foreigns,
- /*!< in: in ALTER TABLE we want
- to preserve the original table name
- in constraints which reference it */
- bool replace_new_file = false)
+ span<const char> new_name, /*!< in: new name */
+ bool replace_new_file)
/*!< in: whether to replace the
file with the new name
(as part of rolling back TRUNCATE) */
@@ -437,14 +421,6 @@ dict_foreign_add_to_cache(
dict_err_ignore_t ignore_err)
/*!< in: error to be ignored */
MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-/*********************************************************************//**
-Checks if a table is referenced by foreign keys.
-@return TRUE if table is referenced by a foreign key */
-ibool
-dict_table_is_referenced_by_foreign_key(
-/*====================================*/
- const dict_table_t* table) /*!< in: InnoDB table */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
/**********************************************************************//**
Replace the index passed in with another equivalent index in the
foreign key lists of the table.
@@ -480,16 +456,14 @@ NOTE! This is a high-level function to be used mainly from outside the
'dict' directory. Inside this directory dict_table_get_low
is usually the appropriate function.
@param[in] table_name Table name
-@param[in] dict_locked TRUE=data dictionary locked
-@param[in] try_drop TRUE=try to drop any orphan indexes after
- an aborted online index creation
+@param[in] dict_locked whether dict_sys.latch is being held exclusively
@param[in] ignore_err error to be ignored when loading the table
-@return table, NULL if does not exist */
+@return table
+@retval nullptr if does not exist */
dict_table_t*
dict_table_open_on_name(
const char* table_name,
- ibool dict_locked,
- ibool try_drop,
+ bool dict_locked,
dict_err_ignore_t ignore_err)
MY_ATTRIBUTE((warn_unused_result));
@@ -656,19 +630,6 @@ dict_table_get_next_index(
# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
#endif /* UNIV_DEBUG */
-/* Skip corrupted index */
-#define dict_table_skip_corrupt_index(index) \
- while (index && index->is_corrupted()) { \
- index = dict_table_get_next_index(index); \
- }
-
-/* Get the next non-corrupt index */
-#define dict_table_next_uncorrupted_index(index) \
-do { \
- index = dict_table_get_next_index(index); \
- dict_table_skip_corrupt_index(index); \
-} while (0)
-
#define dict_index_is_clust(index) (index)->is_clust()
#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust()
#define dict_index_is_unique(index) (index)->is_unique()
@@ -946,17 +907,6 @@ dict_table_copy_types(
dtuple_t* tuple, /*!< in/out: data tuple */
const dict_table_t* table) /*!< in: table */
MY_ATTRIBUTE((nonnull));
-/**********************************************************************//**
-Make room in the table cache by evicting an unused table. The unused table
-should not be part of FK relationship and currently not used in any user
-transaction. There is no guarantee that it will remove a table.
-@return number of tables evicted. */
-ulint
-dict_make_room_in_cache(
-/*====================*/
- ulint max_tables, /*!< in: max tables allowed in cache */
- ulint pct_check); /*!< in: max percent to check */
-
/** Adds an index to the dictionary cache, with possible indexing newly
added column.
@param[in,out] index index; NOTE! The index memory
@@ -1159,7 +1109,6 @@ dict_field_get_col(
/**********************************************************************//**
Returns an index object if it is found in the dictionary cache.
-Assumes that dict_sys.mutex is already being held.
@return index, NULL if not found */
dict_index_t*
dict_index_get_if_in_cache_low(
@@ -1246,15 +1195,6 @@ dict_index_get_page(
/*================*/
const dict_index_t* tree) /*!< in: index */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*********************************************************************//**
-Gets the read-write lock of the index tree.
-@return read-write lock */
-UNIV_INLINE
-rw_lock_t*
-dict_index_get_lock(
-/*================*/
- const dict_index_t* index) /*!< in: index */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
/********************************************************************//**
Returns free space reserved for future updates of records. This is
relevant only in the case of many consecutive inserts, as updates
@@ -1306,9 +1246,6 @@ dict_index_calc_min_rec_len(
const dict_index_t* index) /*!< in: index */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-#define dict_mutex_enter_for_mysql() mutex_enter(&dict_sys.mutex)
-#define dict_mutex_exit_for_mysql() mutex_exit(&dict_sys.mutex)
-
/********************************************************************//**
Checks if the database name in two table names is the same.
@return TRUE if same db name */
@@ -1372,105 +1309,134 @@ constraint */
/* Buffers for storing detailed information about the latest foreign key
and unique key errors */
extern FILE* dict_foreign_err_file;
-extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the
- foreign key error messages */
+extern mysql_mutex_t dict_foreign_err_mutex;
/** InnoDB data dictionary cache */
class dict_sys_t
{
+ /** The my_hrtime_coarse().val of the oldest lock_wait() start, or 0 */
+ std::atomic<ulonglong> latch_ex_wait_start;
+
+ /** the rw-latch protecting the data dictionary cache */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch;
+#ifdef UNIV_DEBUG
+ /** whether latch is being held in exclusive mode (by any thread) */
+ Atomic_relaxed<pthread_t> latch_ex;
+ /** number of S-latch holders */
+ Atomic_counter<uint32_t> latch_readers;
+#endif
public:
- DictSysMutex mutex; /*!< mutex protecting the data
- dictionary; protects also the
- disk-based dictionary system tables;
- this mutex serializes CREATE TABLE
- and DROP TABLE, as well as reading
- the dictionary data for a table from
- system tables */
- /** @brief the data dictionary rw-latch protecting dict_sys
-
- Table create, drop, etc. reserve this in X-mode; implicit or
- backround operations purge, rollback, foreign key checks reserve this
- in S-mode; not all internal InnoDB operations are covered by MDL.
-
- This latch also prevents lock waits when accessing the InnoDB
- data dictionary tables. @see trx_t::dict_operation_lock_mode */
- rw_lock_t latch;
- row_id_t row_id; /*!< the next row id to assign;
- NOTE that at a checkpoint this
- must be written to the dict system
- header and flushed to a file; in
- recovery this must be derived from
- the log records */
- hash_table_t table_hash; /*!< hash table of the tables, based
- on name */
- /** hash table of persistent table IDs */
- hash_table_t table_id_hash;
- dict_table_t* sys_tables; /*!< SYS_TABLES table */
- dict_table_t* sys_columns; /*!< SYS_COLUMNS table */
- dict_table_t* sys_indexes; /*!< SYS_INDEXES table */
- dict_table_t* sys_fields; /*!< SYS_FIELDS table */
- dict_table_t* sys_virtual; /*!< SYS_VIRTUAL table */
-
- /*=============================*/
- UT_LIST_BASE_NODE_T(dict_table_t)
- table_LRU; /*!< List of tables that can be evicted
- from the cache */
- UT_LIST_BASE_NODE_T(dict_table_t)
- table_non_LRU; /*!< List of tables that can't be
- evicted from the cache */
+ /** Indexes of SYS_TABLE[] */
+ enum
+ {
+ SYS_TABLES= 0,
+ SYS_INDEXES,
+ SYS_COLUMNS,
+ SYS_FIELDS,
+ SYS_FOREIGN,
+ SYS_FOREIGN_COLS,
+ SYS_VIRTUAL
+ };
+ /** System table names */
+ static const span<const char> SYS_TABLE[];
+
+ /** all tables (persistent and temporary), hashed by name */
+ hash_table_t table_hash;
+ /** hash table of persistent table IDs */
+ hash_table_t table_id_hash;
+
+ /** the SYS_TABLES table */
+ dict_table_t *sys_tables;
+ /** the SYS_COLUMNS table */
+ dict_table_t *sys_columns;
+ /** the SYS_INDEXES table */
+ dict_table_t *sys_indexes;
+ /** the SYS_FIELDS table */
+ dict_table_t *sys_fields;
+ /** the SYS_FOREIGN table */
+ dict_table_t *sys_foreign;
+ /** the SYS_FOREIGN_COLS table */
+ dict_table_t *sys_foreign_cols;
+ /** the SYS_VIRTUAL table */
+ dict_table_t *sys_virtual;
+
+ /** @return whether all non-hard-coded system tables exist */
+ bool sys_tables_exist() const
+ { return UNIV_LIKELY(sys_foreign && sys_foreign_cols && sys_virtual); }
+
+ /** list of persistent tables that can be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_LRU;
+ /** list of persistent tables that cannot be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_non_LRU;
+
private:
- bool m_initialised;
- /** the sequence of temporary table IDs */
- std::atomic<table_id_t> temp_table_id;
- /** hash table of temporary table IDs */
- hash_table_t temp_id_hash;
+ bool m_initialised= false;
+ /** the sequence of temporary table IDs */
+ std::atomic<table_id_t> temp_table_id{DICT_HDR_FIRST_ID};
+ /** hash table of temporary table IDs */
+ hash_table_t temp_id_hash;
+ /** the next value of DB_ROW_ID, backed by DICT_HDR_ROW_ID
+ (FIXME: remove this, and move to dict_table_t) */
+ Atomic_relaxed<row_id_t> row_id;
+ /** The synchronization interval of row_id */
+ static constexpr size_t ROW_ID_WRITE_MARGIN= 256;
public:
- /** @return a new temporary table ID */
- table_id_t get_temporary_table_id() {
- return temp_table_id.fetch_add(1, std::memory_order_relaxed);
- }
+ /** Diagnostic message for exceeding the lock_wait() timeout */
+ static const char fatal_msg[];
- /** Look up a temporary table.
- @param id temporary table ID
- @return temporary table
- @retval NULL if the table does not exist
- (should only happen during the rollback of CREATE...SELECT) */
- dict_table_t* get_temporary_table(table_id_t id)
- {
- ut_ad(mutex_own(&mutex));
- dict_table_t* table;
- ulint fold = ut_fold_ull(id);
- HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
- ut_ad(table->cached), table->id == id);
- if (UNIV_LIKELY(table != NULL)) {
- DBUG_ASSERT(table->is_temporary());
- DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID);
- table->acquire();
- }
- return table;
- }
+ /** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+ inline row_id_t get_new_row_id();
- /** Look up a persistent table.
- @param id table ID
- @return table
- @retval NULL if not cached */
- dict_table_t* get_table(table_id_t id)
- {
- ut_ad(mutex_own(&mutex));
- dict_table_t* table;
- ulint fold = ut_fold_ull(id);
- HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*,
- table,
- ut_ad(table->cached), table->id == id);
- DBUG_ASSERT(!table || !table->is_temporary());
- return table;
- }
+ /** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+ inline void update_row_id(row_id_t id);
- /**
- Constructor. Further initialisation happens in create().
- */
+ /** Recover the global DB_ROW_ID sequence on database startup */
+ void recover_row_id(row_id_t id)
+ {
+ row_id= ut_uint64_align_up(id, ROW_ID_WRITE_MARGIN) + ROW_ID_WRITE_MARGIN;
+ }
- dict_sys_t() : m_initialised(false), temp_table_id(DICT_HDR_FIRST_ID) {}
+ /** @return a new temporary table ID */
+ table_id_t acquire_temporary_table_id()
+ {
+ return temp_table_id.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ /** Look up a temporary table.
+ @param id temporary table ID
+ @return temporary table
+ @retval nullptr if the table does not exist
+ (should only happen during the rollback of CREATE...SELECT) */
+ dict_table_t *acquire_temporary_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold = ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ if (UNIV_LIKELY(table != nullptr))
+ {
+ DBUG_ASSERT(table->is_temporary());
+ DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID);
+ table->acquire();
+ }
+ return table;
+ }
+
+ /** Look up a persistent table.
+ @param id table ID
+ @return table
+ @retval nullptr if not cached */
+ dict_table_t *find_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold= ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ DBUG_ASSERT(!table || !table->is_temporary());
+ return table;
+ }
bool is_initialised() const { return m_initialised; }
@@ -1493,14 +1459,13 @@ public:
#ifdef UNIV_DEBUG
/** Find a table */
- template <bool in_lru> bool find(dict_table_t* table)
+ template <bool in_lru> bool find(const dict_table_t *table)
{
ut_ad(table);
ut_ad(table->can_be_evicted == in_lru);
- ut_ad(mutex_own(&mutex));
- for (const dict_table_t* t = UT_LIST_GET_FIRST(in_lru
- ? table_LRU : table_non_LRU);
- t; t = UT_LIST_GET_NEXT(table_LRU, t))
+ ut_ad(frozen());
+ for (const dict_table_t* t= in_lru ? table_LRU.start : table_non_LRU.start;
+ t; t = UT_LIST_GET_NEXT(table_LRU, t))
{
if (t == table) return true;
ut_ad(t->can_be_evicted == in_lru);
@@ -1508,128 +1473,146 @@ public:
return false;
}
/** Find a table */
- bool find(dict_table_t* table)
+ bool find(const dict_table_t *table)
{
return table->can_be_evicted ? find<true>(table) : find<false>(table);
}
#endif
/** Move a table to the non-LRU list from the LRU list. */
- void prevent_eviction(dict_table_t* table)
+ void prevent_eviction(dict_table_t *table)
{
+ ut_d(locked());
ut_ad(find(table));
- if (table->can_be_evicted)
- {
- table->can_be_evicted = FALSE;
- UT_LIST_REMOVE(table_LRU, table);
- UT_LIST_ADD_LAST(table_non_LRU, table);
- }
+ if (!table->can_be_evicted)
+ return;
+ table->can_be_evicted= false;
+ UT_LIST_REMOVE(table_LRU, table);
+ UT_LIST_ADD_LAST(table_non_LRU, table);
}
- /** Acquire a reference to a cached table. */
- inline void acquire(dict_table_t* table);
#ifdef UNIV_DEBUG
- /** Assert that the data dictionary is locked */
- void assert_locked()
- {
- ut_ad(mutex_own(&mutex));
- ut_ad(rw_lock_own(&latch, RW_LOCK_X));
- }
+ /** @return whether any thread (not necessarily the current thread)
+ is holding the latch; that is, this check may return false
+ positives */
+ bool frozen() const { return latch_readers || latch_ex; }
+ /** @return whether any thread (not necessarily the current thread)
+ is holding a shared latch */
+ bool frozen_not_locked() const { return latch_readers; }
+ /** @return whether the current thread holds the exclusive latch */
+ bool locked() const { return latch_ex == pthread_self(); }
#endif
- /** Lock the data dictionary cache. */
- void lock(const char* file, unsigned line)
+private:
+ /** Acquire the exclusive latch */
+ ATTRIBUTE_NOINLINE
+ void lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line));
+public:
+ /** @return the my_hrtime_coarse().val of the oldest lock_wait() start,
+ assuming that requests are served on a FIFO basis */
+ ulonglong oldest_wait() const
+ { return latch_ex_wait_start.load(std::memory_order_relaxed); }
+
+ /** Exclusively lock the dictionary cache. */
+ void lock(SRW_LOCK_ARGS(const char *file, unsigned line))
{
- rw_lock_x_lock_func(&latch, 0, file, line);
- mutex_enter_loc(&mutex, file, line);
+ if (latch.wr_lock_try())
+ {
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+ }
+ else
+ lock_wait(SRW_LOCK_ARGS(file, line));
}
+#ifdef UNIV_PFS_RWLOCK
+ /** Unlock the data dictionary cache. */
+ ATTRIBUTE_NOINLINE void unlock();
+ /** Acquire a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void freeze(const char *file, unsigned line);
+ /** Release a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void unfreeze();
+#else
/** Unlock the data dictionary cache. */
void unlock()
{
- mutex_exit(&mutex);
- rw_lock_x_unlock(&latch);
+ ut_ad(latch_ex == pthread_self());
+ ut_ad(!latch_readers);
+ ut_d(latch_ex= 0);
+ latch.wr_unlock();
}
+ /** Acquire a shared lock on the dictionary cache. */
+ void freeze()
+ {
+ latch.rd_lock();
+ ut_ad(!latch_ex);
+ ut_d(latch_readers++);
+ }
+ /** Release a shared lock on the dictionary cache. */
+ void unfreeze()
+ {
+ ut_ad(!latch_ex);
+ ut_ad(latch_readers--);
+ latch.rd_unlock();
+ }
+#endif
/** Estimate the used memory occupied by the data dictionary
table and index objects.
@return number of bytes occupied */
- ulint rough_size() const
+ TPOOL_SUPPRESS_TSAN ulint rough_size() const
{
- /* No mutex; this is a very crude approximation anyway */
+ /* No latch; this is a very crude approximation anyway */
ulint size = UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU);
size *= sizeof(dict_table_t)
+ sizeof(dict_index_t) * 2
+ (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
+ sizeof(dict_field_t) * 5 /* total number of key fields */
+ 200; /* arbitrary, covering names and overhead */
- size += (table_hash.n_cells + table_id_hash.n_cells
- + temp_id_hash.n_cells) * sizeof(hash_cell_t);
+ size += (table_hash.n_cells + table_id_hash.n_cells +
+ temp_id_hash.n_cells) * sizeof(hash_cell_t);
return size;
}
-};
-/** the data dictionary cache */
-extern dict_sys_t dict_sys;
+ /** Evict unused, unlocked tables from table_LRU.
+ @param half whether to consider half the tables only (instead of all)
+ @return number of tables evicted */
+ ulint evict_table_LRU(bool half);
-#define dict_table_prevent_eviction(table) dict_sys.prevent_eviction(table)
-#define dict_sys_lock() dict_sys.lock(__FILE__, __LINE__)
-#define dict_sys_unlock() dict_sys.unlock()
-
-/* Auxiliary structs for checking a table definition @{ */
-
-/* This struct is used to specify the name and type that a column must
-have when checking a table's schema. */
-struct dict_col_meta_t {
- const char* name; /* column name */
- ulint mtype; /* required column main type */
- ulint prtype_mask; /* required column precise type mask;
- if this is non-zero then all the
- bits it has set must also be set
- in the column's prtype */
- ulint len; /* required column length */
-};
+ /** Look up a table in the dictionary cache.
+ @param name table name
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *find_table(const span<const char> &name) const
+ {
+ ut_ad(frozen());
+ for (dict_table_t *table= static_cast<dict_table_t*>
+ (HASH_GET_FIRST(&table_hash, table_hash.calc_hash
+ (my_crc32c(0, name.data(), name.size()))));
+ table; table= table->name_hash)
+ if (strlen(table->name.m_name) == name.size() &&
+ !memcmp(table->name.m_name, name.data(), name.size()))
+ return table;
+ return nullptr;
+ }
-/* This struct is used for checking whether a given table exists and
-whether it has a predefined schema (number of columns and column names
-and types) */
-struct dict_table_schema_t {
- const char* table_name; /* the name of the table whose
- structure we are checking */
- ulint n_cols; /* the number of columns the
- table must have */
- dict_col_meta_t* columns; /* metadata for the columns;
- this array has n_cols
- elements */
- ulint n_foreign; /* number of foreign keys this
- table has, pointing to other
- tables (where this table is
- FK child) */
- ulint n_referenced; /* number of foreign keys other
- tables have, pointing to this
- table (where this table is
- parent) */
+ /** Look up or load a table definition
+ @param name table name
+ @param ignore errors to ignore when loading the table definition
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *load_table(const span<const char> &name,
+ dict_err_ignore_t ignore= DICT_ERR_IGNORE_NONE);
+
+ /** Attempt to load the system tables on startup
+ @return whether any discrepancy with the expected definition was found */
+ bool load_sys_tables();
+ /** Create or check system tables on startup */
+ dberr_t create_or_check_sys_tables();
};
-/* @} */
-/*********************************************************************//**
-Checks whether a table exists and whether it has the given structure.
-The table must have the same number of columns with the same names and
-types. The order of the columns does not matter.
-The caller must own the dictionary mutex.
-dict_table_schema_check() @{
-@return DB_SUCCESS if the table exists and contains the necessary columns */
-dberr_t
-dict_table_schema_check(
-/*====================*/
- dict_table_schema_t* req_schema, /*!< in/out: required table
- schema */
- char* errstr, /*!< out: human readable error
- message if != DB_SUCCESS and
- != DB_TABLE_NOT_FOUND is
- returned */
- size_t errstr_sz) /*!< in: errstr size */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-/* @} */
+/** the data dictionary cache */
+extern dict_sys_t dict_sys;
/*********************************************************************//**
Converts a database and table name from filesystem encoding
@@ -1647,43 +1630,12 @@ dict_fs2utf8(
size_t table_utf8_size)/*!< in: table_utf8 size */
MY_ATTRIBUTE((nonnull));
-/**********************************************************************//**
-Check whether the table is corrupted.
-@return nonzero for corrupted table, zero for valid tables */
-UNIV_INLINE
-ulint
-dict_table_is_corrupted(
-/*====================*/
- const dict_table_t* table) /*!< in: table */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-
-/**********************************************************************//**
-Flags an index and table corrupted both in the data dictionary cache
-and in the system table SYS_INDEXES. */
-void
-dict_set_corrupted(
-/*===============*/
- dict_index_t* index, /*!< in/out: index */
- trx_t* trx, /*!< in/out: transaction */
- const char* ctx) /*!< in: context */
- ATTRIBUTE_COLD __attribute__((nonnull));
-
-/** Flags an index corrupted in the data dictionary cache only. This
-is used mostly to mark a corrupted index when index's own dictionary
-is corrupted, and we force to load such index for repair purpose
-@param[in,out] index index that is corrupted */
-void
-dict_set_corrupted_index_cache_only(
- dict_index_t* index);
-
-/**********************************************************************//**
-Flags a table with specified space_id corrupted in the table dictionary
-cache.
-@return TRUE if successful */
-bool dict_set_corrupted_by_space(const fil_space_t* space);
-
-/** Flag a table encrypted in the data dictionary cache. */
-void dict_set_encrypted_by_space(const fil_space_t* space);
+/** Flag an index corrupted both in the data dictionary cache
+and in the system table SYS_INDEXES.
+@param index index to be flagged as corrupted
+@param ctx context (for error log reporting) */
+void dict_set_corrupted(dict_index_t *index, const char *ctx)
+ ATTRIBUTE_COLD __attribute__((nonnull));
/** Sets merge_threshold in the SYS_INDEXES
@param[in,out] index index
diff --git a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl
index eda639ba7c1..a210c839020 100644
--- a/storage/innobase/include/dict0dict.inl
+++ b/storage/innobase/include/dict0dict.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2020, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -907,20 +907,6 @@ dict_index_get_page(
return(index->page);
}
-/*********************************************************************//**
-Gets the read-write lock of the index tree.
-@return read-write lock */
-UNIV_INLINE
-rw_lock_t*
-dict_index_get_lock(
-/*================*/
- const dict_index_t* index) /*!< in: index */
-{
- ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
-
- return(&(index->lock));
-}
-
/********************************************************************//**
Returns free space reserved for future updates of records. This is
relevant only in the case of many consecutive inserts, as updates
@@ -977,7 +963,7 @@ dict_index_set_online_status(
enum online_index_status status) /*!< in: status */
{
ut_ad(!(index->type & DICT_FTS));
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->lock.have_x());
#ifdef UNIV_DEBUG
switch (dict_index_get_online_status(index)) {
@@ -1114,19 +1100,6 @@ dict_max_v_field_len_store_undo(
return(max_log_len);
}
-/********************************************************************//**
-Check whether the table is corrupted.
-@return nonzero for corrupted table, zero for valid tables */
-UNIV_INLINE
-ulint
-dict_table_is_corrupted(
-/*====================*/
- const dict_table_t* table) /*!< in: table */
-{
- ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
- return(table->corrupted);
-}
-
/** Check if the table is found is a file_per_table tablespace.
This test does not use table flags2 since some REDUNDANT tables in the
system tablespace may have garbage in the MIX_LEN field where flags2 is
@@ -1153,12 +1126,10 @@ dict_table_is_file_per_table(
}
/** Acquire the table handle. */
-inline
-void
-dict_table_t::acquire()
+inline void dict_table_t::acquire()
{
- ut_ad(mutex_own(&dict_sys.mutex));
- n_ref_count++;
+ ut_ad(dict_sys.frozen());
+ n_ref_count++;
}
/** Release the table handle.
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
index f067571ca5b..f7d33d5b43b 100644
--- a/storage/innobase/include/dict0load.h
+++ b/storage/innobase/include/dict0load.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -39,30 +39,12 @@ Created 4/24/1996 Heikki Tuuri
/** A stack of table names related through foreign key constraints */
typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t;
-/** enum that defines all system table IDs. @see SYSTEM_TABLE_NAME[] */
-enum dict_system_id_t {
- SYS_TABLES = 0,
- SYS_INDEXES,
- SYS_COLUMNS,
- SYS_FIELDS,
- SYS_FOREIGN,
- SYS_FOREIGN_COLS,
- SYS_TABLESPACES,
- SYS_DATAFILES,
- SYS_VIRTUAL,
-
- /* This must be last item. Defines the number of system tables. */
- SYS_NUM_SYSTEM_TABLES
-};
-
/** Check each tablespace found in the data dictionary.
-Look at each table defined in SYS_TABLES that has a space_id > 0.
-If the tablespace is not yet in the fil_system cache, look up the
-tablespace in SYS_DATAFILES to ensure the correct path.
+Then look at each table defined in SYS_TABLES that has a space_id > 0
+to find all the file-per-table tablespaces.
In a crash recovery we already have some tablespace objects created from
-processing the REDO log. Any other tablespace in SYS_TABLESPACES not
-previously used in recovery will be opened here. We will compare the
+processing the REDO log. We will compare the
space_id information in the data dictionary to what we find in the
tablespace file. In addition, more validation will be done if recovery
was needed and force_recovery is not set.
@@ -70,35 +52,9 @@ was needed and force_recovery is not set.
We also scan the biggest space id, and store it to fil_system. */
void dict_check_tablespaces_and_store_max_id();
-/********************************************************************//**
-Finds the first table name in the given database.
-@return own: table name, NULL if does not exist; the caller must free
-the memory in the string! */
-char*
-dict_get_first_table_name_in_db(
-/*============================*/
- const char* name); /*!< in: database name which ends to '/' */
-
/** Make sure the data_file_name is saved in dict_table_t if needed.
-Try to read it from the fil_system first, then from SYS_DATAFILES.
-@param[in] table Table object
-@param[in] dict_mutex_own true if dict_sys.mutex is owned already */
-void
-dict_get_and_save_data_dir_path(
- dict_table_t* table,
- bool dict_mutex_own);
-
-/** Loads a table definition and also all its index definitions, and also
-the cluster definition if the table is a member in a cluster. Also loads
-all foreign key constraints where the foreign key is in the table or where
-a foreign key references columns in this table.
-@param[in] name Table name in the dbname/tablename format
-@param[in] ignore_err Error to be ignored when loading
- table and its index definition
-@return table, NULL if does not exist; if the table is stored in an
-.ibd file, but the file does not exist, then we set the file_unreadable
-flag in the table object we return. */
-dict_table_t* dict_load_table(const char* name, dict_err_ignore_t ignore_err);
+@param[in,out] table Table object */
+void dict_get_and_save_data_dir_path(dict_table_t* table);
/***********************************************************************//**
Loads a table object based on the table id.
@@ -133,7 +89,8 @@ dict_load_foreigns(
const char* table_name, /*!< in: table name */
const char** col_names, /*!< in: column names, or NULL
to use table->col_names */
- bool check_recursive,/*!< in: Whether to check
+ trx_id_t trx_id, /*!< in: DDL transaction id,
+ or 0 to check
recursive load of tables
chained by FK */
bool check_charsets, /*!< in: whether to check
@@ -143,7 +100,7 @@ dict_load_foreigns(
which must be loaded
subsequently to load all the
foreign key constraints. */
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+ MY_ATTRIBUTE((nonnull(1)));
/********************************************************************//**
This function opens a system table, and return the first record.
@@ -154,7 +111,7 @@ dict_startscan_system(
btr_pcur_t* pcur, /*!< out: persistent cursor to
the record */
mtr_t* mtr, /*!< in: the mini-transaction */
- dict_system_id_t system_id); /*!< in: which system table to open */
+ dict_table_t* table); /*!< in: system table */
/********************************************************************//**
This function get the next system table record as we scan the table.
@return the record if found, NULL if end of scan. */
@@ -164,19 +121,19 @@ dict_getnext_system(
btr_pcur_t* pcur, /*!< in/out: persistent cursor
to the record */
mtr_t* mtr); /*!< in: the mini-transaction */
-/********************************************************************//**
-This function processes one SYS_TABLES record and populate the dict_table_t
-struct for the table.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_tables_rec_and_mtr_commit(
-/*=======================================*/
- mem_heap_t* heap, /*!< in: temporary memory heap */
- const rec_t* rec, /*!< in: SYS_TABLES record */
- dict_table_t** table, /*!< out: dict_table_t to fill */
- bool cached, /*!< in: whether to load from cache */
- mtr_t* mtr); /*!< in/out: mini-transaction,
- will be committed */
+
+/** Load a table definition from a SYS_TABLES record to dict_table_t.
+Do not load any columns or indexes.
+@param[in,out] mtr mini-transaction
+@param[in] uncommitted whether to use READ UNCOMMITTED isolation level
+@param[in] rec SYS_TABLES record
+@param[out,own] table table, or nullptr
+@return error message
+@retval nullptr on success */
+const char *dict_load_table_low(mtr_t *mtr, bool uncommitted,
+ const rec_t *rec, dict_table_t **table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
/********************************************************************//**
This function parses a SYS_INDEXES record and populate a dict_index_t
structure with the information from the record. For detail information
@@ -259,51 +216,5 @@ dict_process_sys_foreign_col_rec(
const char** ref_col_name, /*!< out: referenced column name
in referenced table */
ulint* pos); /*!< out: column position */
-/********************************************************************//**
-This function parses a SYS_TABLESPACES record, extracts necessary
-information from the record and returns to caller.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_tablespaces(
-/*=========================*/
- mem_heap_t* heap, /*!< in/out: heap memory */
- const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */
- uint32_t* space, /*!< out: tablespace identifier */
- const char** name, /*!< out: tablespace name */
- ulint* flags); /*!< out: tablespace flags */
-/********************************************************************//**
-This function parses a SYS_DATAFILES record, extracts necessary
-information from the record and returns to caller.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_datafiles(
-/*=======================*/
- mem_heap_t* heap, /*!< in/out: heap memory */
- const rec_t* rec, /*!< in: current SYS_DATAFILES rec */
- uint32_t* space, /*!< out: tablespace identifier */
- const char** path); /*!< out: datafile path */
-
-/** Update the record for space_id in SYS_TABLESPACES to this filepath.
-@param[in] space_id Tablespace ID
-@param[in] filepath Tablespace filepath
-@return DB_SUCCESS if OK, dberr_t if the insert failed */
-dberr_t
-dict_update_filepath(
- ulint space_id,
- const char* filepath);
-
-/** Replace records in SYS_TABLESPACES and SYS_DATAFILES associated with
-the given space_id using an independent transaction.
-@param[in] space_id Tablespace ID
-@param[in] name Tablespace name
-@param[in] filepath First filepath
-@param[in] fsp_flags Tablespace flags
-@return DB_SUCCESS if OK, dberr_t if the insert failed */
-dberr_t
-dict_replace_tablespace_and_filepath(
- ulint space_id,
- const char* name,
- const char* filepath,
- ulint fsp_flags);
#endif
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 0a28a6a9868..c469b9da1c2 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -2,7 +2,7 @@
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,14 +28,14 @@ Created 1/8/1996 Heikki Tuuri
#ifndef dict0mem_h
#define dict0mem_h
+#include "dict0types.h"
#include "data0type.h"
#include "mem0mem.h"
#include "row0types.h"
-#include "rem0types.h"
#include "btr0types.h"
#include "lock0types.h"
#include "que0types.h"
-#include "sync0rw.h"
+#include "sux_lock.h"
#include "ut0mem.h"
#include "ut0rnd.h"
#include "ut0byte.h"
@@ -298,17 +298,6 @@ parent table will fail, and user has to drop excessive foreign constraint
before proceeds. */
#define FK_MAX_CASCADE_DEL 15
-/** Create a table memory object.
-@param name table name
-@param space tablespace
-@param n_cols total number of columns (both virtual and non-virtual)
-@param n_v_cols number of virtual columns
-@param flags table flags
-@param flags2 table flags2
-@return own: table object */
-dict_table_t *dict_mem_table_create(const char *name, fil_space_t *space,
- ulint n_cols, ulint n_v_cols, ulint flags,
- ulint flags2);
/****************************************************************/ /**
Free a table memory object. */
void
@@ -977,6 +966,26 @@ const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
/** Data structure for an index. Most fields will be
initialized to 0, NULL or FALSE in dict_mem_index_create(). */
struct dict_index_t {
+ /** Columns whose character-set collation is being changed */
+ struct col_info
+ {
+ /** number of columns whose charset-collation is being changed */
+ unsigned n_cols;
+ /** columns with changed charset-collation */
+ dict_col_t *cols;
+
+ /** Add a column with changed collation. */
+ dict_col_t *add(mem_heap_t *heap, const dict_col_t &col, unsigned offset)
+ {
+ ut_ad(offset < n_cols);
+ if (!cols)
+ cols= static_cast<dict_col_t*>
+ (mem_heap_alloc(heap, n_cols * sizeof col));
+ new (&cols[offset]) dict_col_t(col);
+ return &cols[offset];
+ }
+ };
+
/** Maximum number of fields */
static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1;
@@ -1011,15 +1020,6 @@ struct dict_index_t {
representation we add more columns */
unsigned nulls_equal:1;
/*!< if true, SQL NULL == SQL NULL */
-#ifdef BTR_CUR_HASH_ADAPT
-#ifdef MYSQL_INDEX_DISABLE_AHI
- unsigned disable_ahi:1;
- /*!< whether to disable the
- adaptive hash index.
- Maybe this could be disabled for
- temporary tables? */
-#endif
-#endif /* BTR_CUR_HASH_ADAPT */
unsigned n_uniq:10;/*!< number of fields from the beginning
which are enough to determine an index
entry uniquely */
@@ -1046,8 +1046,7 @@ struct dict_index_t {
/*!< enum online_index_status.
Transitions from ONLINE_INDEX_COMPLETE (to
ONLINE_INDEX_CREATION) are protected
- by dict_sys.latch and
- dict_sys.mutex. Other changes are
+ by dict_sys.latch. Other changes are
protected by index->lock. */
unsigned uncommitted:1;
/*!< a flag that is set for secondary indexes
@@ -1072,6 +1071,16 @@ struct dict_index_t {
It should use heap from dict_index_t. It should be freed
while removing the index from table. */
dict_add_v_col_info* new_vcol_info;
+
+ /** During ALTER TABLE, columns that a being-added index depends on
+ and whose encoding or collation is being changed to something
+ that is compatible with the clustered index.
+ Allocated from dict_index_t::heap.
+
+ @see rollback_inplace_alter_table()
+ @see ha_innobase_inplace_ctx::col_collations */
+ col_info* change_col_info;
+
UT_LIST_NODE_T(dict_index_t)
indexes;/*!< list of indexes of the table */
#ifdef BTR_CUR_ADAPT
@@ -1148,8 +1157,8 @@ public:
when InnoDB was started up */
zip_pad_info_t zip_pad;/*!< Information about state of
compression failures and successes */
- mutable rw_lock_t lock; /*!< read-write lock protecting the
- upper levels of the index tree */
+ /** lock protecting the non-leaf index pages */
+ mutable index_lock lock;
/** Determine if the index has been committed to the
data dictionary.
@@ -1166,6 +1175,7 @@ public:
{
ut_ad(!to_be_dropped);
ut_ad(committed || !(type & DICT_CLUSTERED));
+ ut_ad(!committed || !change_col_info);
uncommitted = !committed;
}
@@ -1205,6 +1215,16 @@ public:
/** @return whether this is the change buffer */
bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
+ /** @return whether this index requires locking */
+ bool has_locking() const { return !is_ibuf(); }
+
+ /** @return whether this is a normal B-tree index
+ (not the change buffer, not SPATIAL or FULLTEXT) */
+ bool is_btree() const {
+ return UNIV_LIKELY(!(type & (DICT_IBUF | DICT_SPATIAL
+ | DICT_FTS | DICT_CORRUPT)));
+ }
+
/** @return whether the index includes virtual columns */
bool has_virtual() const { return type & DICT_VIRTUAL; }
@@ -1321,6 +1341,16 @@ public:
ulint get_new_n_vcol() const
{ return new_vcol_info ? new_vcol_info->n_v_col : 0; }
+ /** Assign the number of collation change fields as a part of the index
+ @param n_cols number of columns whose collation is changing */
+ void init_change_cols(unsigned n_cols)
+ {
+ ut_ad(n_fields > n_cols || type & DICT_FTS);
+ change_col_info= static_cast<col_info*>
+ (mem_heap_zalloc(heap, sizeof(col_info)));
+ change_col_info->n_cols= n_cols;
+ }
+
/** Reconstruct the clustered index fields.
@return whether metadata is incorrect */
inline bool reconstruct_fields();
@@ -1415,6 +1445,26 @@ public:
everything in overflow) size of the longest possible row and index
of a field which made index records too big to fit on a page.*/
inline record_size_info_t record_size_info() const;
+
+ /** Clear the index tree and reinitialize the root page, in the
+ rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
+ @param thr query thread
+ @return error code */
+ dberr_t clear(que_thr_t *thr);
+
+ /** Check whether the online log is a dummy value that indicates
+ whether the table is undergoing active DDL.
+ @retval true if the online log is a dummy value */
+ bool online_log_is_dummy() const
+ {
+ return online_log == reinterpret_cast<const row_log_t*>(this);
+ }
+
+ /** Assign clustered index online log to dummy value */
+ void online_log_make_dummy()
+ {
+ online_log= reinterpret_cast<row_log_t*>(this);
+ }
};
/** Detach a virtual column from an index.
@@ -1540,24 +1590,6 @@ struct dict_foreign_with_index {
const dict_index_t* m_index;
};
-#ifdef WITH_WSREP
-/** A function object to find a foreign key with the given index as the
-foreign index. Return the foreign key with matching criteria or NULL */
-struct dict_foreign_with_foreign_index {
-
- dict_foreign_with_foreign_index(const dict_index_t* index)
- : m_index(index)
- {}
-
- bool operator()(const dict_foreign_t* foreign) const
- {
- return(foreign->foreign_index == m_index);
- }
-
- const dict_index_t* m_index;
-};
-#endif
-
/* A function object to check if the foreign constraint is between different
tables. Returns true if foreign key constraint is between different tables,
false otherwise. */
@@ -1812,7 +1844,7 @@ typedef enum {
} dict_frm_t;
/** Data structure for a database table. Most fields will be
-initialized to 0, NULL or FALSE in dict_mem_table_create(). */
+zero-initialized in dict_table_t::create(). */
struct dict_table_t {
/** Get reference count.
@@ -1862,7 +1894,7 @@ struct dict_table_t {
which denotes temporary or intermediate tables in MariaDB. */
static bool is_temporary_name(const char* name)
{
- return strstr(name, "/" TEMP_FILE_PREFIX) != NULL;
+ return strstr(name, "/#sql");
}
/** @return whether instant ALTER TABLE is in effect */
@@ -1965,37 +1997,75 @@ struct dict_table_t {
return versioned() && cols[vers_start].mtype == DATA_INT;
}
- void inc_fk_checks()
- {
-#ifdef UNIV_DEBUG
- int32_t fk_checks=
-#endif
- n_foreign_key_checks_running++;
- ut_ad(fk_checks >= 0);
- }
- void dec_fk_checks()
- {
-#ifdef UNIV_DEBUG
- int32_t fk_checks=
-#endif
- n_foreign_key_checks_running--;
- ut_ad(fk_checks > 0);
- }
-
/** For overflow fields returns potential max length stored inline */
inline size_t get_overflow_field_local_len() const;
- /** Parse the table file name into table name and database name.
- @tparam dict_locked whether dict_sys.mutex is being held
- @param[in,out] db_name database name buffer
- @param[in,out] tbl_name table name buffer
- @param[out] db_name_len database name length
- @param[out] tbl_name_len table name length
- @return whether the table name is visible to SQL */
- template<bool dict_locked= false>
- bool parse_name(char (&db_name)[NAME_LEN + 1],
- char (&tbl_name)[NAME_LEN + 1],
- size_t *db_name_len, size_t *tbl_name_len) const;
+ /** Parse the table file name into table name and database name.
+ @tparam dict_frozen whether the caller holds dict_sys.latch
+ @param[in,out] db_name database name buffer
+ @param[in,out] tbl_name table name buffer
+ @param[out] db_name_len database name length
+ @param[out] tbl_name_len table name length
+ @return whether the table name is visible to SQL */
+ template<bool dict_frozen= false>
+ bool parse_name(char (&db_name)[NAME_LEN + 1],
+ char (&tbl_name)[NAME_LEN + 1],
+ size_t *db_name_len, size_t *tbl_name_len) const;
+
+ /** Clear the table when rolling back TRX_UNDO_EMPTY
+ @return error code */
+ dberr_t clear(que_thr_t *thr);
+
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread holds the lock_mutex */
+ bool lock_mutex_is_owner() const
+ { return lock_mutex_owner == pthread_self(); }
+ /** @return whether the current thread holds the stats_mutex (lock_mutex) */
+ bool stats_mutex_is_owner() const
+ { return lock_mutex_owner == pthread_self(); }
+#endif /* UNIV_DEBUG */
+ void lock_mutex_init() { lock_mutex.init(); }
+ void lock_mutex_destroy() { lock_mutex.destroy(); }
+ /** Acquire lock_mutex */
+ void lock_mutex_lock()
+ {
+ ut_ad(!lock_mutex_is_owner());
+ lock_mutex.wr_lock();
+ ut_ad(!lock_mutex_owner.exchange(pthread_self()));
+ }
+ /** Try to acquire lock_mutex */
+ bool lock_mutex_trylock()
+ {
+ ut_ad(!lock_mutex_is_owner());
+ bool acquired= lock_mutex.wr_lock_try();
+ ut_ad(!acquired || !lock_mutex_owner.exchange(pthread_self()));
+ return acquired;
+ }
+ /** Release lock_mutex */
+ void lock_mutex_unlock()
+ {
+ ut_ad(lock_mutex_owner.exchange(0) == pthread_self());
+ lock_mutex.wr_unlock();
+ }
+#ifndef SUX_LOCK_GENERIC
+ /** @return whether the lock mutex is held by some thread */
+ bool lock_mutex_is_locked() const noexcept { return lock_mutex.is_locked(); }
+#endif
+
+ /* The stats mutex currently defaults to lock_mutex, but in the future
+ there could be a use case for a separate mutex for stats.
+ The extra indirection (inline, so no performance hit) should
+ help simplify the code and improve long-term maintainability. */
+ void stats_mutex_init() { lock_mutex_init(); }
+ void stats_mutex_destroy() { lock_mutex_destroy(); }
+ void stats_mutex_lock() { lock_mutex_lock(); }
+ void stats_mutex_unlock() { lock_mutex_unlock(); }
+
+ /** Rename the data file.
+ @param new_name name of the table
+ @param replace whether to replace the file with the new name
+ (as part of rolling back TRUNCATE) */
+ dberr_t rename_tablespace(span<const char> new_name, bool replace) const;
private:
/** Initialize instant->field_map.
@@ -2004,12 +2074,12 @@ private:
public:
/** Id of the table. */
table_id_t id;
- /** Hash chain node. */
- hash_node_t id_hash;
- /** Table name. */
+ /** dict_sys.id_hash chain node */
+ dict_table_t* id_hash;
+ /** Table name in name_hash */
table_name_t name;
- /** Hash chain node. */
- hash_node_t name_hash;
+ /** dict_sys.name_hash chain node */
+ dict_table_t* name_hash;
/** Memory heap */
mem_heap_t* heap;
@@ -2057,12 +2127,6 @@ public:
/** TRUE if the table object has been added to the dictionary cache. */
unsigned cached:1;
- /** TRUE if the table is to be dropped, but not yet actually dropped
- (could in the background drop list). It is turned on at the beginning
- of row_drop_table_for_mysql() and turned off just before we start to
- update system tables for the drop. It is protected by dict_sys.latch. */
- unsigned to_be_dropped:1;
-
/** Number of non-virtual columns defined so far. */
unsigned n_def:10;
@@ -2158,23 +2222,24 @@ public:
/** Maximum recursive level we support when loading tables chained
together with FK constraints. If exceeds this level, we will stop
loading child table into memory along with its parent table. */
- unsigned fk_max_recusive_level:8;
+ byte fk_max_recusive_level;
- /** Count of how many foreign key check operations are currently being
- performed on the table. We cannot drop the table while there are
- foreign key checks running on it. */
- Atomic_counter<int32_t> n_foreign_key_checks_running;
+ /** DDL transaction that last touched the table definition, or 0 if
+ no history is available. This includes possible changes in
+ ha_innobase::prepare_inplace_alter_table() and
+ ha_innobase::commit_inplace_alter_table(). */
+ trx_id_t def_trx_id;
- /** Transactions whose view low limit is greater than this number are
- not allowed to store to the MySQL query cache or retrieve from it.
- When a trx with undo logs commits, it sets this to the value of the
- transaction id. */
- trx_id_t query_cache_inv_trx_id;
+ /** Last transaction that inserted into an empty table.
+ Updated while holding exclusive table lock and an exclusive
+ latch on the clustered index root page (which must also be
+ an empty leaf page), and an ahi_latch (if btr_search_enabled). */
+ Atomic_relaxed<trx_id_t> bulk_trx_id;
- /** Transaction id that last touched the table definition. Either when
- loading the definition or CREATE TABLE, or ALTER TABLE (prepare,
- commit, and rollback phases). */
- trx_id_t def_trx_id;
+ /** Original table name, for MDL acquisition in purge. Normally,
+ this points to the same as name. When is_temporary_name(name.m_name) holds,
+ this should be a copy of the original table name, allocated from heap. */
+ table_name_t mdl_name;
/*!< set of foreign key constraints in the table; these refer to
columns in other tables */
@@ -2184,7 +2249,7 @@ public:
dict_foreign_set referenced_set;
/** Statistics for query optimization. Mostly protected by
- dict_sys.mutex. @{ */
+ dict_sys.latch and stats_mutex_lock(). @{ */
/** TRUE if statistics have been calculated the first time after
database startup or table creation. */
@@ -2251,24 +2316,6 @@ public:
any latch, because this is only used for heuristics. */
ib_uint64_t stat_modified_counter;
- /** Background stats thread is not working on this table. */
- #define BG_STAT_NONE 0
-
- /** Set in 'stats_bg_flag' when the background stats code is working
- on this table. The DROP TABLE code waits for this to be cleared before
- proceeding. */
- #define BG_STAT_IN_PROGRESS (1 << 0)
-
- /** Set in 'stats_bg_flag' when DROP TABLE starts waiting on
- BG_STAT_IN_PROGRESS to be cleared. The background stats thread will
- detect this and will eventually quit sooner. */
- #define BG_STAT_SHOULD_QUIT (1 << 1)
-
- /** The state of the background stats thread wrt this table.
- See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT.
- Writes are covered by dict_sys.mutex. Dirty reads are possible. */
- byte stats_bg_flag;
-
bool stats_error_printed;
/*!< Has persistent stats error beein
already printed for this table ? */
@@ -2280,7 +2327,7 @@ public:
kept in trx_t. In order to quickly determine whether a transaction has
locked the AUTOINC lock we keep a pointer to the transaction here in
the 'autoinc_trx' member. This is to avoid acquiring the
- lock_sys_t::mutex and scanning the vector in trx_t.
+ lock_sys.latch and scanning the vector in trx_t.
When an AUTOINC lock has to wait, the corresponding lock instance is
created on the trx lock heap rather than use the pre-allocated instance
in autoinc_lock below. */
@@ -2292,26 +2339,41 @@ public:
from a select. */
lock_t* autoinc_lock;
- /** Mutex protecting the autoinc counter and freed_indexes. */
- std::mutex autoinc_mutex;
-
- /** Autoinc counter value to give to the next inserted row. */
- ib_uint64_t autoinc;
-
- /** This counter is used to track the number of granted and pending
- autoinc locks on this table. This value is set after acquiring the
- lock_sys_t::mutex but we peek the contents to determine whether other
- transactions have acquired the AUTOINC lock or not. Of course only one
- transaction can be granted the lock but there can be multiple
- waiters. */
- ulong n_waiting_or_granted_auto_inc_locks;
-
- /** The transaction that currently holds the the AUTOINC lock on this
- table. Protected by lock_sys.mutex. */
- const trx_t* autoinc_trx;
+ /** Mutex protecting autoinc and freed_indexes. */
+ srw_spin_mutex autoinc_mutex;
+private:
+ /** Mutex protecting locks on this table. */
+ srw_spin_mutex lock_mutex;
+#ifdef UNIV_DEBUG
+ /** The owner of lock_mutex (0 if none) */
+ Atomic_relaxed<pthread_t> lock_mutex_owner{0};
+#endif
+public:
+ /** Autoinc counter value to give to the next inserted row. */
+ uint64_t autoinc;
+
+ /** The transaction that currently holds the AUTOINC lock on this table.
+ Protected by lock_mutex.
+ The thread that is executing autoinc_trx may read this field without
+ holding a latch, in row_lock_table_autoinc_for_mysql().
+ Only the autoinc_trx thread may clear this field; it cannot be
+ modified on the behalf of a transaction that is being handled by a
+ different thread. */
+ Atomic_relaxed<const trx_t*> autoinc_trx;
+
+ /** Number of granted or pending autoinc_lock on this table. This
+ value is set after acquiring lock_sys.latch but
+ in innodb_autoinc_lock_mode=1 (the default),
+ ha_innobase::innobase_lock_autoinc() will perform a dirty read
+ to determine whether other transactions have acquired the autoinc_lock. */
+ uint32_t n_waiting_or_granted_auto_inc_locks;
/* @} */
+ /** Number of granted or pending LOCK_S or LOCK_X on the table.
+ Protected by lock_sys.assert_locked(*this). */
+ uint32_t n_lock_x_or_s;
+
/** FTS specific state variables. */
fts_t* fts;
@@ -2320,22 +2382,28 @@ public:
in X mode of this table's indexes. */
ib_quiesce_t quiesce;
- /** Count of the number of record locks on this table. We use this to
- determine whether we can evict the table from the dictionary cache.
- It is protected by lock_sys.mutex. */
- ulint n_rec_locks;
+ /** Count of the number of record locks on this table. We use this to
+ determine whether we can evict the table from the dictionary cache.
+ Modified when lock_sys.is_writer(), or
+ lock_sys.assert_locked(page_id) and trx->mutex_is_owner() hold.
+ @see trx_lock_t::trx_locks */
+ Atomic_counter<uint32_t> n_rec_locks;
private:
- /** Count of how many handles are opened to this table. Dropping of the
- table is NOT allowed until this count gets to zero. MySQL does NOT
- itself check the number of open handles at DROP. */
- Atomic_counter<uint32_t> n_ref_count;
-
+ /** Count of how many handles are opened to this table. Dropping of the
+ table is NOT allowed until this count gets to zero. MySQL does NOT
+ itself check the number of open handles at DROP. */
+ Atomic_counter<uint32_t> n_ref_count;
public:
- /** List of locks on the table. Protected by lock_sys.mutex. */
- table_lock_list_t locks;
+ /** List of locks on the table. Protected by lock_sys.assert_locked(lock). */
+ table_lock_list_t locks;
- /** Timestamp of the last modification of this table. */
- time_t update_time;
+ /** Timestamp of the last modification of this table. */
+ Atomic_relaxed<time_t> update_time;
+ /** Transactions whose view low limit is greater than this number are
+ not allowed to access the MariaDB query cache.
+ @see innobase_query_caching_table_check_low()
+ @see trx_t::commit_tables() */
+ Atomic_relaxed<trx_id_t> query_cache_inv_trx_id;
#ifdef UNIV_DEBUG
/** Value of 'magic_n'. */
@@ -2359,13 +2427,30 @@ public:
return false;
}
- /** Check whether the table name is same as mysql/innodb_stats_table
- or mysql/innodb_index_stats.
- @return true if the table name is same as stats table */
+ /** @return whether a DDL operation is in progress on this table */
+ bool is_active_ddl() const
+ {
+ return UT_LIST_GET_FIRST(indexes)->online_log;
+ }
+
+ /** @return whether the name is
+ mysql.innodb_index_stats or mysql.innodb_table_stats */
bool is_stats_table() const;
/** @return number of unique columns in FTS_DOC_ID index */
unsigned fts_n_uniq() const { return versioned() ? 2 : 1; }
+
+ /** Create metadata.
+ @param name table name
+ @param space tablespace
+ @param n_cols total number of columns (both virtual and non-virtual)
+ @param n_v_cols number of virtual columns
+ @param flags table flags
+ @param flags2 table flags2
+ @return newly allocated table object */
+ static dict_table_t *create(const span<const char> &name, fil_space_t *space,
+ ulint n_cols, ulint n_v_cols, ulint flags,
+ ulint flags2);
};
inline void dict_index_t::set_modified(mtr_t& mtr) const
diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl
index 0a554a54dbd..d60ee5d9bf4 100644
--- a/storage/innobase/include/dict0mem.inl
+++ b/storage/innobase/include/dict0mem.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -64,10 +64,5 @@ dict_mem_fill_index_struct(
/* The '1 +' above prevents allocation
of an empty mem block */
index->nulls_equal = false;
-#ifdef BTR_CUR_HASH_ADAPT
-#ifdef MYSQL_INDEX_DISABLE_AHI
- index->disable_ahi = false;
-#endif
-#endif /* BTR_CUR_HASH_ADAPT */
ut_d(index->magic_n = DICT_INDEX_MAGIC_N);
}
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
index 34c1bef26c5..0dc1b984577 100644
--- a/storage/innobase/include/dict0stats.h
+++ b/storage/innobase/include/dict0stats.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2009, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,9 +30,6 @@ Created Jan 06, 2010 Vasil Dimov
#include "dict0types.h"
#include "trx0types.h"
-#define TABLE_STATS_NAME "mysql/innodb_table_stats"
-#define INDEX_STATS_NAME "mysql/innodb_index_stats"
-
enum dict_stats_upd_option_t {
DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
statistics using a precise and slow
@@ -140,40 +137,33 @@ dict_stats_update(
the stats or to fetch them from
the persistent storage */
-/** Remove the information for a particular index's stats from the persistent
-storage if it exists and if there is data stored for this index.
-This function creates its own trx and commits it.
-
-We must modify system tables in a separate transaction in order to
-adhere to the InnoDB design constraint that dict_sys.latch prevents
-lock waits on system tables. If we modified system and user tables in
-the same transaction, we should exclusively hold dict_sys.latch until
-the transaction is committed, and effectively block other transactions
-that will attempt to open any InnoDB tables. Because we have no
-guarantee that user transactions will be committed fast, we cannot
-afford to keep the system tables locked in a user transaction.
+/** Execute DELETE FROM mysql.innodb_table_stats
+@param database_name database name
+@param table_name table name
+@param trx transaction
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_drop_index(
-/*==================*/
- const char* tname, /*!< in: table name */
- const char* iname, /*!< in: index name */
- char* errstr, /*!< out: error message if != DB_SUCCESS
- is returned */
- ulint errstr_sz);/*!< in: size of the errstr buffer */
-
-/*********************************************************************//**
-Removes the statistics for a table and all of its indexes from the
-persistent storage if it exists and if there is data stored for the table.
-This function creates its own transaction and commits it.
+dberr_t dict_stats_delete_from_table_stats(const char *database_name,
+ const char *table_name,
+ trx_t *trx)
+ MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name database name
+@param table_name table name
+@param trx transaction
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_drop_table(
-/*==================*/
- const char* table_name, /*!< in: table name */
- char* errstr, /*!< out: error message
- if != DB_SUCCESS is returned */
- ulint errstr_sz); /*!< in: size of errstr buffer */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+ const char *table_name,
+ trx_t *trx)
+ MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name database name
+@param table_name table name
+@param index_name name of the index
+@param trx transaction (nullptr=start and commit a new one)
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+ const char *table_name,
+ const char *index_name, trx_t *trx);
/*********************************************************************//**
Fetches or calculates new estimates for index statistics. */
@@ -183,31 +173,29 @@ dict_stats_update_for_index(
dict_index_t* index) /*!< in/out: index */
MY_ATTRIBUTE((nonnull));
-/*********************************************************************//**
-Renames a table in InnoDB persistent stats storage.
-This function creates its own transaction and commits it.
+/** Rename a table in InnoDB persistent stats storage.
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_rename_table(
-/*====================*/
- const char* old_name, /*!< in: old table name */
- const char* new_name, /*!< in: new table name */
- char* errstr, /*!< out: error string if != DB_SUCCESS
- is returned */
- size_t errstr_sz); /*!< in: errstr size */
-/*********************************************************************//**
-Renames an index in InnoDB persistent stats storage.
-This function creates its own transaction and commits it.
-@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned
-if the persistent stats do not exist. */
-dberr_t
-dict_stats_rename_index(
-/*====================*/
- const dict_table_t* table, /*!< in: table whose index
- is renamed */
- const char* old_index_name, /*!< in: old index name */
- const char* new_index_name) /*!< in: new index name */
- __attribute__((warn_unused_result));
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+ trx_t *trx);
+/** Rename an index in InnoDB persistent statistics.
+@param db database name
+@param table table name
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+ const char *old_name, const char *new_name,
+ trx_t *trx);
+
+/** Delete all persistent statistics for a database.
+@param db database name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete(const char *db, trx_t *trx);
/** Save an individual index's statistic into the persistent statistics
storage.
@@ -217,9 +205,7 @@ storage.
@param[in] stat_value value of the stat
@param[in] sample_size n pages sampled or NULL
@param[in] stat_description description of the stat
-@param[in,out] trx in case of NULL the function will
-allocate and free the trx object. If it is not NULL then it will be
-rolled back only in the case of error, but not freed.
+@param[in,out] trx transaction
@return DB_SUCCESS or error code */
dberr_t
dict_stats_save_index_stat(
@@ -229,7 +215,8 @@ dict_stats_save_index_stat(
ib_uint64_t stat_value,
ib_uint64_t* sample_size,
const char* stat_description,
- trx_t* trx);
+ trx_t* trx)
+ MY_ATTRIBUTE((nonnull(1, 3, 6, 7)));
/** Report an error if updating table statistics failed because
.ibd file is missing, table decryption failed or table is corrupted.
diff --git a/storage/innobase/include/dict0stats.inl b/storage/innobase/include/dict0stats.inl
index 4972efe8961..dd516275156 100644
--- a/storage/innobase/include/dict0stats.inl
+++ b/storage/innobase/include/dict0stats.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -75,7 +75,7 @@ dict_stats_is_persistent_enabled(const dict_table_t* table)
+ dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has
just been PS-enabled.
This is acceptable. Avoiding this would mean that we would have to
- protect the stat_persistent with dict_sys.mutex like the
+ hold dict_sys.latch or stats_mutex_lock() like for accessing the
other ::stat_ members which would be too big performance penalty,
especially when this function is called from
dict_stats_update_if_needed(). */
@@ -148,7 +148,7 @@ dict_stats_init(
/*============*/
dict_table_t* table) /*!< in/out: table */
{
- ut_ad(!mutex_own(&dict_sys.mutex));
+ ut_ad(!table->stats_mutex_is_owner());
if (table->stat_initialized) {
return;
@@ -174,17 +174,14 @@ dict_stats_deinit(
/*==============*/
dict_table_t* table) /*!< in/out: table */
{
- ut_ad(mutex_own(&dict_sys.mutex));
-
- ut_a(table->get_ref_count() == 0);
+ ut_ad(table->stats_mutex_is_owner());
+ ut_ad(table->get_ref_count() == 0);
+#ifdef HAVE_valgrind
if (!table->stat_initialized) {
return;
}
- table->stat_initialized = FALSE;
-
-#ifdef HAVE_valgrind
MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows);
MEM_UNDEFINED(&table->stat_clustered_index_size,
sizeof table->stat_clustered_index_size);
@@ -218,4 +215,5 @@ dict_stats_deinit(
sizeof(index->stat_n_leaf_pages));
}
#endif /* HAVE_valgrind */
+ table->stat_initialized = FALSE;
}
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
index c09bf4df8e3..d9a2f6282a1 100644
--- a/storage/innobase/include/dict0stats_bg.h
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -28,66 +28,16 @@ Created Apr 26, 2012 Vasil Dimov
#define dict0stats_bg_h
#include "dict0types.h"
-#include "os0thread.h"
#ifdef HAVE_PSI_INTERFACE
-extern mysql_pfs_key_t dict_stats_recalc_pool_mutex_key;
+extern mysql_pfs_key_t recalc_pool_mutex_key;
#endif /* HAVE_PSI_INTERFACE */
-/*****************************************************************//**
-Delete a given table from the auto recalc pool.
-dict_stats_recalc_pool_del() */
-void
-dict_stats_recalc_pool_del(
-/*=======================*/
- const dict_table_t* table); /*!< in: table to remove */
-
-/** Yield the data dictionary latch when waiting
-for the background thread to stop accessing a table.
-@param trx transaction holding the data dictionary locks */
-#define DICT_BG_YIELD(trx) do { \
- row_mysql_unlock_data_dictionary(trx); \
- os_thread_sleep(250000); \
- row_mysql_lock_data_dictionary(trx); \
-} while (0)
-
-/*****************************************************************//**
-Request the background collection of statistics to stop for a table.
-@retval true when no background process is active
-@retval false when it is not safe to modify the table definition */
-UNIV_INLINE
-bool
-dict_stats_stop_bg(
-/*===============*/
- dict_table_t* table) /*!< in/out: table */
-{
- ut_ad(!srv_read_only_mode);
- ut_ad(mutex_own(&dict_sys.mutex));
-
- if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) {
- return(true);
- }
-
- table->stats_bg_flag |= BG_STAT_SHOULD_QUIT;
- return(false);
-}
+/** Delete a table from the auto recalc pool, and ensure that
+no statistics are being updated on it. */
+void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive);
/*****************************************************************//**
-Wait until background stats thread has stopped using the specified table.
-The caller must have locked the data dictionary using
-row_mysql_lock_data_dictionary() and this function may unlock it temporarily
-and restore the lock before it exits.
-The background stats thread is guaranteed not to start using the specified
-table after this function returns and before the caller unlocks the data
-dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
-under dict_sys.mutex. */
-void
-dict_stats_wait_bg_to_stop_using_table(
-/*===================================*/
- dict_table_t* table, /*!< in/out: table */
- trx_t* trx); /*!< in/out: transaction to use for
- unlocking/locking the data dict */
-/*****************************************************************//**
Initialize global variables needed for the operation of dict_stats_thread().
Must be called before dict_stats task is started. */
void dict_stats_init();
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
index 5c4aaf8c87a..ec50e8cd951 100644
--- a/storage/innobase/include/dict0types.h
+++ b/storage/innobase/include/dict0types.h
@@ -27,9 +27,12 @@ Created 1/8/1996 Heikki Tuuri
#ifndef dict0types_h
#define dict0types_h
-#include <ut0mutex.h>
+#include "univ.i"
+#include "span.h"
#include <rem0types.h>
+using st_::span;
+
struct dict_col_t;
struct dict_field_t;
struct dict_index_t;
@@ -68,18 +71,20 @@ enum dict_err_ignore_t {
DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */
DICT_ERR_IGNORE_FK_NOKEY = 1, /*!< ignore error if any foreign
key is missing */
- DICT_ERR_IGNORE_INDEX_ROOT = 2, /*!< ignore error if index root
- page is FIL_NULL or incorrect value */
- DICT_ERR_IGNORE_CORRUPT = 4, /*!< skip corrupted indexes */
- DICT_ERR_IGNORE_RECOVER_LOCK = 8 | DICT_ERR_IGNORE_FK_NOKEY,
+ DICT_ERR_IGNORE_INDEX = 2, /*!< ignore corrupted indexes */
+ DICT_ERR_IGNORE_RECOVER_LOCK = 4 | DICT_ERR_IGNORE_FK_NOKEY,
/*!< Used when recovering table locks
for resurrected transactions.
Silently load a missing
tablespace, and do not load
incomplete index definitions. */
/** ignore all errors above */
- DICT_ERR_IGNORE_ALL = 15,
- /** prepare to drop the table; do not attempt to load tablespace */
+ DICT_ERR_IGNORE_ALL = 7,
+ /** prepare some DDL operation;
+ do not attempt to load tablespace */
+ DICT_ERR_IGNORE_TABLESPACE = 15,
+ /** prepare to drop the table; do not attempt to load tablespace
+ or the metadata */
DICT_ERR_IGNORE_DROP = 31
};
@@ -90,18 +95,9 @@ enum ib_quiesce_t {
QUIESCE_COMPLETE /*!< All done */
};
-#ifndef UNIV_INNOCHECKSUM
-typedef ib_mutex_t DictSysMutex;
-#endif /* !UNIV_INNOCHECKSUM */
-
-/** Prefix for tmp tables, adopted from sql/table.h */
-#define TEMP_FILE_PREFIX "#sql"
-#define TEMP_FILE_PREFIX_LENGTH 4
+/** Prefix for InnoDB internal tables, adopted from sql/table.h */
#define TEMP_FILE_PREFIX_INNODB "#sql-ib"
-#define TEMP_TABLE_PREFIX "#sql"
-#define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX
-
/** Table name wrapper for pretty-printing */
struct table_name_t
{
@@ -174,4 +170,7 @@ enum spatial_status_t {
SPATIAL_ONLY = 3
};
+#define TABLE_STATS_NAME "mysql/innodb_table_stats"
+#define INDEX_STATS_NAME "mysql/innodb_index_stats"
+
#endif
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
index cb8b998f0ea..208e49c34a7 100644
--- a/storage/innobase/include/dyn0buf.h
+++ b/storage/innobase/include/dyn0buf.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -331,60 +331,6 @@ public:
}
/**
- Iterate over each block and call the functor.
- @return false if iteration was terminated. */
- template <typename Functor>
- bool for_each_block(const Functor& functor) const
- {
- for (typename list_t::iterator it = m_list.begin(),
- end = m_list.end();
- it != end; ++it) {
-
- if (!functor(&*it)) {
- return false;
- }
- }
-
- return(true);
- }
-
- /**
- Iterate over all the blocks in reverse and call the iterator
- @return false if iteration was terminated. */
- template <typename Functor>
- bool for_each_block_in_reverse(Functor& functor) const
- {
- for (list_t::reverse_iterator it = m_list.rbegin(),
- end = m_list.rend();
- it != end; ++it) {
-
- if (!functor(&*it)) {
- return false;
- }
- }
-
- return(true);
- }
-
- /**
- Iterate over all the blocks in reverse and call the iterator
- @return false if iteration was terminated. */
- template <typename Functor>
- bool for_each_block_in_reverse(const Functor& functor) const
- {
- for (list_t::reverse_iterator it = m_list.rbegin(),
- end = m_list.rend();
- it != end; ++it) {
-
- if (!functor(&*it)) {
- return false;
- }
- }
-
- return(true);
- }
-
- /**
@return the first block */
block_t* front()
MY_ATTRIBUTE((warn_unused_result))
diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h
index 62043003a6c..26272761f43 100644
--- a/storage/innobase/include/fil0crypt.h
+++ b/storage/innobase/include/fil0crypt.h
@@ -1,6 +1,6 @@
/*****************************************************************************
Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,7 +26,6 @@ Created 04/01/2015 Jan Lindström
#ifndef fil0crypt_h
#define fil0crypt_h
-#include "os0event.h"
#include "my_crypt.h"
#include "fil0fil.h"
@@ -41,7 +40,8 @@ static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = {
/* This key will be used if nothing else is given */
#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA
-extern os_event_t fil_crypt_threads_event;
+/** Wake up the encryption threads */
+void fil_crypt_threads_signal(bool broadcast= false);
/**
* CRYPT_SCHEME_UNENCRYPTED
@@ -116,7 +116,7 @@ struct fil_space_crypt_t : st_encryption_scheme
{
key_id = new_key_id;
my_random_bytes(iv, sizeof(iv));
- mutex_create(LATCH_ID_FIL_CRYPT_DATA_MUTEX, &mutex);
+ mysql_mutex_init(0, &mutex, nullptr);
locker = crypt_data_scheme_locker;
type = new_type;
@@ -135,7 +135,7 @@ struct fil_space_crypt_t : st_encryption_scheme
/** Destructor */
~fil_space_crypt_t()
{
- mutex_free(&mutex);
+ mysql_mutex_destroy(&mutex);
}
/** Get latest key version from encryption plugin
@@ -172,12 +172,6 @@ struct fil_space_crypt_t : st_encryption_scheme
return (encryption == FIL_ENCRYPTION_OFF);
}
- /** Fill crypt data information to the give page.
- It should be called during ibd file creation.
- @param[in] flags tablespace flags
- @param[in,out] page first page of the tablespace */
- void fill_page0(ulint flags, byte* page);
-
/** Write encryption metadata to the first page.
@param[in,out] block first page of the tablespace
@param[in,out] mtr mini-transaction */
@@ -186,7 +180,7 @@ struct fil_space_crypt_t : st_encryption_scheme
uint min_key_version; // min key version for this space
fil_encryption_t encryption; // Encryption setup
- ib_mutex_t mutex; // mutex protecting following variables
+ mysql_mutex_t mutex; // mutex protecting following variables
/** Return code from encryption_key_get_latest_version.
If ENCRYPTION_KEY_VERSION_INVALID encryption plugin
@@ -213,25 +207,20 @@ struct fil_space_crypt_status_t {
};
/** Statistics about encryption key rotation */
-struct fil_crypt_stat_t {
- ulint pages_read_from_cache;
- ulint pages_read_from_disk;
- ulint pages_modified;
- ulint pages_flushed;
- ulint estimated_iops;
+struct fil_crypt_stat_t
+{
+ ulint pages_read_from_cache= 0;
+ ulint pages_read_from_disk= 0;
+ ulint pages_modified= 0;
+ ulint pages_flushed= 0;
+ ulint estimated_iops= 0;
};
-/*********************************************************************
-Init space crypt */
-UNIV_INTERN
-void
-fil_space_crypt_init();
+/** Init space crypt */
+void fil_space_crypt_init();
-/*********************************************************************
-Cleanup space crypt */
-UNIV_INTERN
-void
-fil_space_crypt_cleanup();
+/** Cleanup space crypt */
+void fil_space_crypt_cleanup();
/**
Create a fil_space_crypt_t object
@@ -241,23 +230,12 @@ Create a fil_space_crypt_t object
@param[in] key_id Encryption key id
@return crypt object */
-UNIV_INTERN
fil_space_crypt_t*
fil_space_create_crypt_data(
fil_encryption_t encrypt_mode,
uint key_id)
MY_ATTRIBUTE((warn_unused_result));
-/******************************************************************
-Merge fil_space_crypt_t object
-@param[in,out] dst Destination cryp data
-@param[in] src Source crypt data */
-UNIV_INTERN
-void
-fil_space_merge_crypt_data(
- fil_space_crypt_t* dst,
- const fil_space_crypt_t* src);
-
/** Initialize encryption parameters from a tablespace header page.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] page first page of the tablespace
@@ -269,10 +247,7 @@ fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
/**
Free a crypt data object
@param[in,out] crypt_data crypt data to be freed */
-UNIV_INTERN
-void
-fil_space_destroy_crypt_data(
- fil_space_crypt_t **crypt_data);
+void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data);
/** Amend encryption information from redo log.
@param[in] space tablespace
@@ -288,7 +263,6 @@ void fil_crypt_parse(fil_space_t* space, const byte* data);
@param[in,out] dst_frame Output buffer
@param[in] use_full_checksum full crc32 algo is used
@return encrypted buffer or NULL */
-UNIV_INTERN
byte*
fil_encrypt_buf(
fil_space_crypt_t* crypt_data,
@@ -315,7 +289,6 @@ byte* fil_space_encrypt(
byte* dst_frame)
MY_ATTRIBUTE((warn_unused_result));
-
/** Decrypt a page.
@param[in] space_id space id
@param[in] crypt_data crypt_data
@@ -323,8 +296,8 @@ byte* fil_space_encrypt(
@param[in] physical_size page size
@param[in] fsp_flags Tablespace flags
@param[in,out] src_frame Page to decrypt
-@return DB_SUCCESS or error */
-UNIV_INTERN
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
dberr_t
fil_space_decrypt(
ulint space_id,
@@ -340,8 +313,8 @@ Decrypt a page
@param[in] tmp_frame Temporary buffer used for decrypting
@param[in,out] src_frame Page to decrypt
@return decrypted page, or original not encrypted page if decryption is
-not needed.*/
-UNIV_INTERN
+not needed.
+@retval nullptr on failure */
byte*
fil_space_decrypt(
const fil_space_t* space,
@@ -349,39 +322,20 @@ fil_space_decrypt(
byte* src_frame)
MY_ATTRIBUTE((warn_unused_result));
-/**
-Calculate post encryption checksum
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] dst_frame Block where checksum is calculated
-@return page checksum
-not needed. */
-uint32_t
-fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame)
- MY_ATTRIBUTE((warn_unused_result));
-
/*********************************************************************
Adjust thread count for key rotation
@param[in] new_cnt Number of threads to be used
-UNIV_INTERN
-void
-fil_crypt_set_thread_cnt(
- uint new_cnt);
+void fil_crypt_set_thread_cnt(const uint new_cnt);
/*********************************************************************
Adjust max key age
@param[in] val New max key age */
-UNIV_INTERN
-void
-fil_crypt_set_rotate_key_age(
- uint val);
+void fil_crypt_set_rotate_key_age(uint val);
/*********************************************************************
Adjust rotation iops
@param[in] val New max rotation iops
-UNIV_INTERN
-void
-fil_crypt_set_rotation_iops(
- uint val);
+void fil_crypt_set_rotation_iops(uint val);
/*********************************************************************
Adjust encrypt tables
@@ -390,30 +344,22 @@ void fil_crypt_set_encrypt_tables(ulong val);
/*********************************************************************
Init threads for key rotation */
-UNIV_INTERN
-void
-fil_crypt_threads_init();
+void fil_crypt_threads_init();
/*********************************************************************
Clean up key rotation threads resources */
-UNIV_INTERN
-void
-fil_crypt_threads_cleanup();
+void fil_crypt_threads_cleanup();
/*********************************************************************
Wait for crypt threads to stop accessing space
@param[in] space Tablespace */
-UNIV_INTERN
-void
-fil_space_crypt_close_tablespace(
- const fil_space_t* space);
+void fil_space_crypt_close_tablespace(const fil_space_t *space);
/*********************************************************************
Get crypt status for a space (used by information_schema)
@param[in] space Tablespace
@param[out] status Crypt status
return 0 if crypt data present */
-UNIV_INTERN
void
fil_space_crypt_get_status(
const fil_space_t* space,
@@ -422,10 +368,7 @@ fil_space_crypt_get_status(
/*********************************************************************
Return crypt statistics
@param[out] stat Crypt statistics */
-UNIV_INTERN
-void
-fil_crypt_total_stat(
- fil_crypt_stat_t *stat);
+void fil_crypt_total_stat(fil_crypt_stat_t *stat);
#include "fil0crypt.inl"
#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 385d547a060..165994eef35 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -24,8 +24,7 @@ The low-level file system
Created 10/25/1995 Heikki Tuuri
*******************************************************/
-#ifndef fil0fil_h
-#define fil0fil_h
+#pragma once
#include "fsp0types.h"
#include "mach0data.h"
@@ -33,6 +32,7 @@ Created 10/25/1995 Heikki Tuuri
#ifndef UNIV_INNOCHECKSUM
+#include "srw_lock.h"
#include "buf0dblwr.h"
#include "hash0hash.h"
#include "log0recv.h"
@@ -43,6 +43,10 @@ Created 10/25/1995 Heikki Tuuri
struct unflushed_spaces_tag_t;
struct rotation_list_tag_t;
+struct space_list_tag_t;
+struct named_spaces_tag_t;
+
+using space_list_t= ilist<fil_space_t, space_list_tag_t>;
// Forward declaration
extern my_bool srv_use_doublewrite_buf;
@@ -60,13 +64,11 @@ enum srv_flush_t
/** do not flush after writing */
SRV_NOSYNC,
/** invoke os_file_set_nocache() on data files. This implies using
- non-buffered IO but still using fsync, the reason for which is that
- some FS do not flush meta-data when unbuffered IO happens */
+ unbuffered I/O but still fdatasync(), because some filesystems might
+ not flush meta-data on write completion */
SRV_O_DIRECT,
- /** do not use fsync() when using direct IO i.e.: it can be set to
- avoid the fsync() call that we make when using SRV_UNIX_O_DIRECT.
- However, in this case user/DBA should be sure about the integrity of
- the meta-data */
+ /** Like O_DIRECT, but skip fdatasync(), assuming that the data is
+ durable on write completion */
SRV_O_DIRECT_NO_FSYNC
#ifdef _WIN32
/** Traditional Windows approach to open all files without caching,
@@ -332,18 +334,27 @@ enum fil_encryption_t
FIL_ENCRYPTION_OFF
};
-struct fil_space_t final :
- ilist_node<unflushed_spaces_tag_t>, ilist_node<rotation_list_tag_t>
+struct fil_space_t final : ilist_node<unflushed_spaces_tag_t>,
+ ilist_node<rotation_list_tag_t>,
+ ilist_node<space_list_tag_t>,
+ ilist_node<named_spaces_tag_t>
#else
struct fil_space_t final
#endif
{
#ifndef UNIV_INNOCHECKSUM
- ~fil_space_t() { ut_free(name); }
friend fil_node_t;
+ ~fil_space_t()
+ {
+ ut_ad(!latch_owner);
+ ut_ad(!latch_count);
+ latch.destroy();
+ }
+
ulint id; /*!< space id */
- hash_node_t hash; /*!< hash chain node */
- char* name; /*!< Tablespace name */
+
+ /** fil_system.spaces chain node */
+ fil_space_t *hash;
lsn_t max_lsn;
/*!< LSN of the most recent
fil_names_write_if_was_clean().
@@ -373,6 +384,14 @@ struct fil_space_t final
/*!< number of reserved free extents for
ongoing operations like B-tree page split */
private:
+#ifdef UNIV_DEBUG
+ fil_space_t *next_in_space_list();
+ fil_space_t *prev_in_space_list();
+
+ fil_space_t *next_in_unflushed_spaces();
+ fil_space_t *prev_in_unflushed_spaces();
+#endif
+
/** the committed size of the tablespace in pages */
Atomic_relaxed<uint32_t> committed_size;
/** Number of pending operations on the file.
@@ -390,49 +409,44 @@ private:
static constexpr uint32_t NEEDS_FSYNC= 1U << 29;
/** The reference count */
static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
+ /** latch protecting all page allocation bitmap pages */
+ srw_lock latch;
+ pthread_t latch_owner;
+ ut_d(Atomic_relaxed<uint32_t> latch_count;)
public:
- rw_lock_t latch; /*!< latch protecting the file space storage
- allocation */
- UT_LIST_NODE_T(fil_space_t) named_spaces;
- /*!< list of spaces for which FILE_MODIFY
- records have been issued */
- UT_LIST_NODE_T(fil_space_t) space_list;
- /*!< list of all spaces */
+ /** MariaDB encryption data */
+ fil_space_crypt_t *crypt_data;
- /** MariaDB encryption data */
- fil_space_crypt_t* crypt_data;
+ /** Whether needs_flush(), or this is in fil_system.unflushed_spaces */
+ bool is_in_unflushed_spaces;
- /** Checks that this tablespace in a list of unflushed tablespaces. */
- bool is_in_unflushed_spaces;
+ /** Whether this in fil_system.default_encrypt_tables (needs key rotation) */
+ bool is_in_default_encrypt;
- /** Checks that this tablespace needs key rotation. */
- bool is_in_default_encrypt;
-
- /** True if the device this filespace is on supports atomic writes */
- bool atomic_write_supported;
+private:
+ /** Whether any corruption of this tablespace has been reported */
+ mutable std::atomic_flag is_corrupted;
- /** True if file system storing this tablespace supports
- punch hole */
- bool punch_hole;
+ /** mutex to protect freed_ranges and last_freed_lsn */
+ std::mutex freed_range_mutex;
- /** mutex to protect freed ranges */
- std::mutex freed_range_mutex;
+ /** Ranges of freed page numbers; protected by freed_range_mutex */
+ range_set freed_ranges;
- /** Variables to store freed ranges. This can be used to write
- zeroes/punch the hole in files. Protected by freed_mutex */
- range_set freed_ranges;
+ /** LSN of freeing last page; protected by freed_range_mutex */
+ lsn_t last_freed_lsn;
- /** Stores last page freed lsn. Protected by freed_mutex */
- lsn_t last_freed_lsn;
+public:
+ /** @return whether doublewrite buffering is needed */
+ inline bool use_doublewrite() const;
- ulint magic_n;/*!< FIL_SPACE_MAGIC_N */
+ /** @return whether a page has been freed */
+ inline bool is_freed(uint32_t page);
- /** @return whether doublewrite buffering is needed */
- bool use_doublewrite() const
- {
- return !atomic_write_supported && srv_use_doublewrite_buf &&
- buf_dblwr.is_initialised();
- }
+ /** Apply freed_ranges to the file.
+ @param writable whether the file is writable
+ @return number of pages written or hole-punched */
+ uint32_t flush_freed(bool writable);
/** Append a file to the chain of files of a space.
@param[in] name file name of a file that is not open
@@ -459,7 +473,6 @@ public:
@return whether the reservation succeeded */
bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve)
{
- ut_ad(rw_lock_own(&latch, RW_LOCK_X));
if (n_reserved_extents + n_to_reserve > n_free_now) {
return false;
}
@@ -473,26 +486,27 @@ public:
void release_free_extents(uint32_t n_reserved)
{
if (!n_reserved) return;
- ut_ad(rw_lock_own(&latch, RW_LOCK_X));
ut_a(n_reserved_extents >= n_reserved);
n_reserved_extents -= n_reserved;
}
- /** Rename a file.
- @param[in] name table name after renaming
- @param[in] path tablespace file name after renaming
- @param[in] log whether to write redo log
- @param[in] replace whether to ignore the existence of path
- @return error code
- @retval DB_SUCCESS on success */
- dberr_t rename(const char* name, const char* path, bool log,
- bool replace = false);
+ /** Rename a file.
+ @param[in] path tablespace file name after renaming
+ @param[in] log whether to write redo log
+ @param[in] replace whether to ignore the existence of path
+ @return error code
+ @retval DB_SUCCESS on success */
+ dberr_t rename(const char *path, bool log, bool replace= false)
+ MY_ATTRIBUTE((nonnull));
/** Note that the tablespace has been imported.
Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
written while the space ID is being updated in each page. */
inline void set_imported();
+ /** Report the tablespace as corrupted */
+ ATTRIBUTE_COLD void set_corrupted() const;
+
/** @return whether the storage device is rotational (HDD, not SSD) */
inline bool is_rotational() const;
@@ -503,8 +517,20 @@ public:
/** Close each file. Only invoked on fil_system.temp_space. */
void close();
- /** Note that operations on the tablespace must stop or can resume */
- inline void set_stopping(bool stopping);
+ /** Note that operations on the tablespace must stop.
+ @return whether the operations were already stopped */
+ inline bool set_stopping_check();
+ /** Note that operations on the tablespace must stop. */
+ inline void set_stopping();
+
+ /** Note that operations on the tablespace can resume after truncation */
+ inline void clear_stopping();
+
+ /** Look up the tablespace and wait for pending operations to cease
+ @param id tablespace identifier
+ @return tablespace
+ @retval nullptr if no tablespace was found */
+ static fil_space_t *check_pending_operations(ulint id);
private:
MY_ATTRIBUTE((warn_unused_result))
@@ -549,9 +575,35 @@ public:
/** Clear the NEEDS_FSYNC flag */
void clear_flush()
- { n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release); }
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(NEEDS_FSYNC == 1U << 29, "compatibility");
+ __asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(NEEDS_FSYNC == 1U << 29, "compatibility");
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>
+ (&n_pending), 29);
+#else
+ n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
+#endif
+ }
private:
+ /** Clear the CLOSING flag */
+ void clear_closing()
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(CLOSING == 1U << 30, "compatibility");
+ __asm__ __volatile__("lock btrl $30, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(CLOSING == 1U << 30, "compatibility");
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>
+ (&n_pending), 30);
+#else
+ n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+#endif
+ }
+
/** @return pending operations (and flags) */
uint32_t pending()const { return n_pending.load(std::memory_order_acquire); }
public:
@@ -576,8 +628,7 @@ private:
@return number of pending operations, possibly with NEEDS_FSYNC flag */
uint32_t set_closing()
{
- return n_pending.fetch_or(CLOSING, std::memory_order_acquire) &
- (PENDING | NEEDS_FSYNC);
+ return n_pending.fetch_or(CLOSING, std::memory_order_acquire);
}
public:
@@ -901,7 +952,6 @@ public:
#ifndef UNIV_INNOCHECKSUM
MY_ATTRIBUTE((warn_unused_result))
/** Create a tablespace in fil_system.
- @param name tablespace name
@param id tablespace identifier
@param flags tablespace flags
@param purpose tablespace purpose
@@ -910,7 +960,7 @@ public:
@param opened true if space files are opened
@return pointer to created tablespace, to be filled in with add()
@retval nullptr on failure (such as when the same tablespace exists) */
- static fil_space_t *create(const char *name, ulint id, ulint flags,
+ static fil_space_t *create(ulint id, ulint flags,
fil_type_t purpose, fil_space_crypt_t *crypt_data,
fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT,
bool opened= false);
@@ -959,11 +1009,7 @@ public:
}
/** Update committed_size in mtr_t::commit() */
- void set_committed_size()
- {
- ut_ad(rw_lock_own(&latch, RW_LOCK_X));
- committed_size= size;
- }
+ void set_committed_size() { committed_size= size; }
/** @return the last persisted page number */
uint32_t last_page_number() const { return committed_size - 1; }
@@ -996,8 +1042,49 @@ public:
@param encrypt expected state of innodb_encrypt_tables
@return the next tablespace
@retval nullptr upon reaching the end of the iteration */
- static inline fil_space_t *next(fil_space_t *space, bool recheck,
- bool encrypt);
+ static space_list_t::iterator next(space_list_t::iterator space,
+ bool recheck, bool encrypt);
+
+#ifdef UNIV_DEBUG
+ bool is_latched() const { return latch_count != 0; }
+#endif
+ bool is_owner() const { return latch_owner == pthread_self(); }
+ /** Acquire the allocation latch in exclusive mode */
+ void x_lock()
+ {
+ latch.wr_lock(SRW_LOCK_CALL);
+ ut_ad(!latch_owner);
+ latch_owner= pthread_self();
+ ut_ad(!latch_count.fetch_add(1));
+ }
+ /** Release the allocation latch from exclusive mode */
+ void x_unlock()
+ {
+ ut_ad(latch_count.fetch_sub(1) == 1);
+ ut_ad(latch_owner == pthread_self());
+ latch_owner= 0;
+ latch.wr_unlock();
+ }
+ /** Acquire the allocation latch in shared mode */
+ void s_lock()
+ {
+ ut_ad(!is_owner());
+ latch.rd_lock(SRW_LOCK_CALL);
+ ut_ad(!latch_owner);
+ ut_d(latch_count.fetch_add(1));
+ }
+ /** Release the allocation latch from shared mode */
+ void s_unlock()
+ {
+ ut_ad(latch_count.fetch_sub(1));
+ ut_ad(!latch_owner);
+ latch.rd_unlock();
+ }
+
+ typedef span<const char> name_type;
+
+ /** @return the tablespace name (databasename/tablename) */
+ name_type name() const;
private:
/** @return whether the file is usable for io() */
@@ -1008,62 +1095,59 @@ private:
};
#ifndef UNIV_INNOCHECKSUM
-/** Value of fil_space_t::magic_n */
-#define FIL_SPACE_MAGIC_N 89472
-
/** File node of a tablespace or the log data space */
struct fil_node_t final
{
- /** tablespace containing this file */
- fil_space_t* space;
- /** file name; protected by fil_system.mutex and log_sys.mutex. */
- char* name;
- /** file handle (valid if is_open) */
- pfs_os_file_t handle;
- /** whether the file actually is a raw device or disk partition */
- bool is_raw_disk;
- /** whether the file is on non-rotational media (SSD) */
- bool on_ssd;
- /** size of the file in database pages (0 if not known yet);
- the possible last incomplete megabyte may be ignored
- if space->id == 0 */
- uint32_t size;
- /** initial size of the file in database pages;
- FIL_IBD_FILE_INITIAL_SIZE by default */
- uint32_t init_size;
- /** maximum size of the file in database pages (0 if unlimited) */
- uint32_t max_size;
- /** whether the file is currently being extended */
- Atomic_relaxed<bool> being_extended;
- /** link to other files in this tablespace */
- UT_LIST_NODE_T(fil_node_t) chain;
-
- /** whether this file could use atomic write (data file) */
- bool atomic_write;
-
- /** Filesystem block size */
- ulint block_size;
-
- /** FIL_NODE_MAGIC_N */
- ulint magic_n;
-
- /** @return whether this file is open */
- bool is_open() const
- {
- return(handle != OS_FILE_CLOSED);
- }
+ /** tablespace containing this file */
+ fil_space_t *space;
+ /** file name; protected by fil_system.mutex and log_sys.mutex */
+ char *name;
+ /** file handle */
+ pfs_os_file_t handle;
+ /** whether the file is on non-rotational media (SSD) */
+ unsigned on_ssd:1;
+ /** how to write page_compressed tables
+ (0=do not punch holes but write minimal amount of data, 1=punch holes,
+ 2=always write the same amount; thinly provisioned storage will compress) */
+ unsigned punch_hole:2;
+ /** whether this file could use atomic write */
+ unsigned atomic_write:1;
+ /** whether the file actually is a raw device or disk partition */
+ unsigned is_raw_disk:1;
+ /** whether the tablespace discovery is being deferred during crash
+ recovery due to incompletely written page 0 */
+ unsigned deferred:1;
+
+ /** size of the file in database pages (0 if not known yet);
+ the possible last incomplete megabyte may be ignored if space->id == 0 */
+ uint32_t size;
+ /** initial size of the file in database pages;
+ FIL_IBD_FILE_INITIAL_SIZE by default */
+ uint32_t init_size;
+ /** maximum size of the file in database pages (0 if unlimited) */
+ uint32_t max_size;
+ /** whether the file is currently being extended */
+ Atomic_relaxed<bool> being_extended;
+ /** link to other files in this tablespace */
+ UT_LIST_NODE_T(fil_node_t) chain;
+
+ /** Filesystem block size */
+ ulint block_size;
+
+ /** @return whether this file is open */
+ bool is_open() const { return handle != OS_FILE_CLOSED; }
- /** Read the first page of a data file.
- @return whether the page was found valid */
- bool read_page0();
+ /** Read the first page of a data file.
+ @return whether the page was found valid */
+ bool read_page0();
- /** Determine some file metadata when creating or reading the file.
- @param file the file that is being created, or OS_FILE_CLOSED */
- void find_metadata(os_file_t file = OS_FILE_CLOSED
+ /** Determine some file metadata when creating or reading the file.
+ @param file the file that is being created, or OS_FILE_CLOSED */
+ void find_metadata(os_file_t file= OS_FILE_CLOSED
#ifndef _WIN32
- , struct stat* statbuf = NULL
+ , bool create= false, struct stat *statbuf= nullptr
#endif
- );
+ );
/** Close the file handle. */
void close();
@@ -1082,8 +1166,11 @@ private:
void prepare_to_close_or_detach();
};
-/** Value of fil_node_t::magic_n */
-#define FIL_NODE_MAGIC_N 89389
+inline bool fil_space_t::use_doublewrite() const
+{
+ return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf &&
+ buf_dblwr.is_created();
+}
inline void fil_space_t::set_imported()
{
@@ -1113,8 +1200,8 @@ extern const char* dot_ext[];
#define DOT_ISL dot_ext[ISL]
#define DOT_CFG dot_ext[CFG]
-/** When mysqld is run, the default directory "." is the mysqld datadir,
-but in the MySQL Embedded Server Library and mysqlbackup it is not the default
+/** When mariadbd is run, the default directory "." is the mariadbd datadir,
+but in the MariaDB Embedded Server Library and mysqlbackup it is not the default
directory, and we must set the base file path explicitly */
extern const char* fil_path_to_mysql_datadir;
#else
@@ -1183,8 +1270,9 @@ struct fil_addr_t {
/** For the first page in a system tablespace data file(ibdata*, not *.ibd):
the file has been flushed to disk at least up to this lsn
-For other pages: 32-bit key version used to encrypt the page + 32-bit checksum
-or 64 bites of zero if no encryption */
+For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32
+format: 32-bit key version used to encrypt the page + 32-bit checksum
+or 64 bits of zero if no encryption */
#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U
/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */
@@ -1344,11 +1432,7 @@ struct fil_system_t {
Some members may require late initialisation, thus we just mark object as
uninitialised. Real initialisation happens in create().
*/
- fil_system_t(): m_initialised(false)
- {
- UT_LIST_INIT(space_list, &fil_space_t::space_list);
- UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces);
- }
+ fil_system_t() : m_initialised(false) {}
bool is_initialised() const { return m_initialised; }
@@ -1388,12 +1472,13 @@ public:
public:
/** Detach a tablespace from the cache and close the files.
@param space tablespace
- @param detach_handle whether to detach or close handles
- @return detached handles or empty vector */
- std::vector<pfs_os_file_t> detach(fil_space_t *space,
- bool detach_handle= false);
+ @param detach_handle whether to detach the handle, instead of closing
+ @return detached handle
+ @retval OS_FILE_CLOSED if no handle was detached */
+ pfs_os_file_t detach(fil_space_t *space, bool detach_handle= false);
- ib_mutex_t mutex; /*!< The mutex protecting the cache */
+ /** the mutex protecting most data fields, and some fields of fil_space_t */
+ mysql_mutex_t mutex;
fil_space_t* sys_space; /*!< The innodb_system tablespace */
fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
/** Map of fil_space_t::id to fil_space_t* */
@@ -1413,12 +1498,11 @@ public:
/** nonzero if fil_node_open_file_low() should avoid moving the tablespace
to the end of space_list, for FIFO policy of try_to_close() */
ulint freeze_space_list;
-
/** List of all file spaces, opened spaces should be at the top of the list
to optimize try_to_close() execution. Protected with fil_system.mutex. */
- UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+ ilist<fil_space_t, space_list_tag_t> space_list;
- UT_LIST_BASE_NODE_T(fil_space_t) named_spaces;
+ ilist<fil_space_t, named_spaces_tag_t> named_spaces;
/*!< list of all file spaces
for which a FILE_MODIFY
record has been written since
@@ -1443,7 +1527,7 @@ public:
fil_system.space_list, so that fil_space_t::try_to_close() should close
it as a last resort.
@param space space to move */
- void move_opened_last_to_space_list(fil_space_t *space)
+ inline void move_opened_last_to_space_list(fil_space_t *space)
{
/* In the case when several files of the same space are added in a
row, there is no need to remove and add a space to the same position
@@ -1451,8 +1535,7 @@ public:
if (freeze_space_list || space_list_last_opened == space)
return;
- UT_LIST_REMOVE(space_list, space);
-
+ space_list.erase(space_list_t::iterator(space));
add_opened_last_to_space_list(space);
}
@@ -1465,10 +1548,17 @@ public:
if (UNIV_UNLIKELY(freeze_space_list))
return;
+ space_list_t::iterator s= space_list_t::iterator(space);
+
if (space_list_last_opened == space)
- space_list_last_opened= UT_LIST_GET_PREV(space_list, space);
- UT_LIST_REMOVE(space_list, space);
- UT_LIST_ADD_LAST(space_list, space);
+ {
+ ut_ad(s != space_list.begin());
+ space_list_t::iterator prev= s;
+ space_list_last_opened= &*--prev;
+ }
+
+ space_list.erase(s);
+ space_list.push_back(*space);
}
/** Return the next tablespace from default_encrypt_tables list.
@@ -1484,6 +1574,11 @@ public:
/** Extend all open data files to the recovered size */
ATTRIBUTE_COLD void extend_to_recv_size();
+
+ /** Determine if a tablespace associated with a file name exists.
+ @param path tablespace file name to look for
+ @return a matching tablespace */
+ inline fil_space_t *find(const char *path) const;
};
/** The tablespace memory cache. */
@@ -1492,23 +1587,65 @@ extern fil_system_t fil_system;
inline void fil_space_t::reacquire()
{
ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed);
- ut_d(if (mutex_own(&fil_system.mutex)) return);
+#ifdef SAFE_MUTEX
+ if (mysql_mutex_is_owner(&fil_system.mutex)) return;
ut_ad(n & PENDING);
ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
+#endif /* SAFE_MUTEX */
+}
+
+/** Note that operations on the tablespace must stop.
+@return whether the operations were already stopped */
+inline bool fil_space_t::set_stopping_check()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+#if (defined __clang_major__ && __clang_major__ < 10) || defined __APPLE_CC__
+ /* Only clang-10 introduced support for asm goto */
+ return n_pending.fetch_or(STOPPING, std::memory_order_relaxed) & STOPPING;
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ __asm__ goto("lock btsl $31, %0\t\njnc %l1" : : "m" (n_pending)
+ : "cc", "memory" : not_stopped);
+ return true;
+not_stopped:
+ return false;
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ return _interlockedbittestandset(reinterpret_cast<volatile long*>
+ (&n_pending), 31);
+#else
+ return n_pending.fetch_or(STOPPING, std::memory_order_relaxed) & STOPPING;
+#endif
}
-/** Note that operations on the tablespace must stop or can resume */
-inline void fil_space_t::set_stopping(bool stopping)
+/** Note that operations on the tablespace must stop */
+inline void fil_space_t::set_stopping()
{
- ut_ad(mutex_own(&fil_system.mutex));
- ut_d(auto n=) n_pending.fetch_xor(STOPPING, std::memory_order_relaxed);
- ut_ad(!(n & STOPPING) == stopping);
+ mysql_mutex_assert_owner(&fil_system.mutex);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ __asm__ __volatile__("lock btsl $31, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ _interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 31);
+#else
+ n_pending.fetch_or(STOPPING, std::memory_order_relaxed);
+#endif
+}
+
+inline void fil_space_t::clear_stopping()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ ut_d(auto n=) n_pending.fetch_sub(STOPPING, std::memory_order_relaxed);
+ ut_ad(n & STOPPING);
}
/** Flush pending writes from the file system cache to the file. */
template<bool have_reference> inline void fil_space_t::flush()
{
- ut_ad(!mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
ut_ad(!have_reference || (pending() & PENDING));
ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
@@ -1531,9 +1668,9 @@ inline uint32_t fil_space_t::get_size()
{
if (!size)
{
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
read_page0();
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
return size;
}
@@ -1585,20 +1722,12 @@ fil_write_flushed_lsn(
lsn_t lsn)
MY_ATTRIBUTE((warn_unused_result));
+MY_ATTRIBUTE((warn_unused_result))
/** Delete a tablespace and associated .ibd file.
-@param[in] id tablespace identifier
-@param[in] if_exists whether to ignore missing tablespace
-@param[out] leaked_handles return detached handles here
-@return DB_SUCCESS or error */
-dberr_t
-fil_delete_tablespace(ulint id, bool if_exists= false,
- std::vector<pfs_os_file_t> *detached_handles= nullptr);
-
-/** Prepare to truncate an undo tablespace.
-@param[in] space_id undo tablespace id
-@return the tablespace
-@retval NULL if the tablespace does not exist */
-fil_space_t* fil_truncate_prepare(ulint space_id);
+@param id tablespace identifier
+@return detached file handle (to be closed by the caller)
+@retval OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(ulint id);
/** Close a single-table tablespace on failed IMPORT TABLESPACE.
The tablespace must be cached in the memory cache.
@@ -1609,15 +1738,15 @@ void fil_close_tablespace(ulint id);
Allocates and builds a file name from a path, a table or tablespace name
and a suffix. The string must be freed by caller with ut_free().
@param[in] path NULL or the directory path or the full path and filename.
-@param[in] name NULL if path is full, or Table/Tablespace name
-@param[in] suffix NULL or the file extention to use.
+@param[in] name {} if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed.
@return own: file name */
-char*
-fil_make_filepath(
- const char* path,
- const char* name,
- ib_extention suffix,
- bool strip_name);
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+ ib_extention ext, bool trim_name);
+
+char *fil_make_filepath(const char* path, const table_name_t name,
+ ib_extention suffix, bool strip_name);
/** Create a tablespace file.
@param[in] space_id Tablespace ID
@@ -1634,14 +1763,14 @@ must be >= FIL_IBD_FILE_INITIAL_SIZE
fil_space_t*
fil_ibd_create(
ulint space_id,
- const char* name,
+ const table_name_t name,
const char* path,
ulint flags,
uint32_t size,
fil_encryption_t mode,
uint32_t key_id,
dberr_t* err)
- MY_ATTRIBUTE((nonnull(2,8), warn_unused_result));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations.
(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
@@ -1655,7 +1784,7 @@ right in it. If does not succeed, prints an error message to the .err log. This
function is used to open a tablespace when we start up mysqld, and also in
IMPORT TABLESPACE.
NOTE that we assume this operation is used either at the database startup
-or under the protection of the dictionary mutex, so that two users cannot
+or under the protection of dict_sys.latch, so that two users cannot
race here. This operation does not leave the file associated with the
tablespace open, but closes it after we have looked at the space id in it.
@@ -1666,15 +1795,11 @@ file inode probably is much faster (the OS caches them) than accessing
the first page of the file. This boolean may be initially false, but if
a remote tablespace is found it will be changed to true.
-If the fix_dict boolean is set, then it is safe to use an internal SQL
-statement to update the dictionary tables if they are incorrect.
-
-@param[in] validate true if we should validate the tablespace
-@param[in] fix_dict true if the dictionary is available to be fixed
+@param[in] validate 0=maybe missing, 1=do not validate, 2=validate
@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
@param[in] id tablespace ID
@param[in] flags expected FSP_SPACE_FLAGS
-@param[in] tablename table name
+@param[in] name table name
If file-per-table, it is the table name in the databasename/tablename format
@param[in] path_in expected filepath, usually read from dictionary
@param[out] err DB_SUCCESS or error code
@@ -1682,12 +1807,11 @@ If file-per-table, it is the table name in the databasename/tablename format
@retval NULL if the tablespace could not be opened */
fil_space_t*
fil_ibd_open(
- bool validate,
- bool fix_dict,
+ unsigned validate,
fil_type_t purpose,
ulint id,
ulint flags,
- const table_name_t& tablename,
+ fil_space_t::name_type name,
const char* path_in,
dberr_t* err = NULL)
MY_ATTRIBUTE((warn_unused_result));
@@ -1700,7 +1824,9 @@ enum fil_load_status {
/** The file(s) were not found */
FIL_LOAD_NOT_FOUND,
/** The file(s) were not valid */
- FIL_LOAD_INVALID
+ FIL_LOAD_INVALID,
+ /** The tablespace file was deferred to open */
+ FIL_LOAD_DEFER
};
/** Open a single-file tablespace and add it to the InnoDB data structures.
@@ -1720,15 +1846,10 @@ fil_ibd_load(
memory cache. Note that if we have not done a crash recovery at the database
startup, there may be many tablespaces which are not yet in the memory cache.
@param[in] id Tablespace ID
-@param[in] name Tablespace name used in fil_space_t::create().
@param[in] table_flags table flags
@return the tablespace
@retval NULL if no matching tablespace exists in the memory cache */
-fil_space_t*
-fil_space_for_table_exists_in_mem(
- ulint id,
- const char* name,
- ulint table_flags);
+fil_space_t *fil_space_for_table_exists_in_mem(ulint id, ulint table_flags);
/** Try to extend a tablespace if it is smaller than the specified size.
@param[in,out] space tablespace
@@ -1759,31 +1880,6 @@ fil_delete_file(
/*============*/
const char* path); /*!< in: filepath of the ibd tablespace */
-/********************************************************************//**
-Looks for a pre-existing fil_space_t with the given tablespace ID
-and, if found, returns the name and filepath in newly allocated buffers that the caller must free.
-@param[in] space_id The tablespace ID to search for.
-@param[out] name Name of the tablespace found.
-@param[out] fileapth The filepath of the first datafile for thtablespace found.
-@return true if tablespace is found, false if not. */
-bool
-fil_space_read_name_and_filepath(
- ulint space_id,
- char** name,
- char** filepath);
-
-/** Convert a file name to a tablespace name.
-@param[in] filename directory/databasename/tablename.ibd
-@return database/tablename string, to be freed with ut_free() */
-char*
-fil_path_to_space_name(
- const char* filename);
-
-/** Acquire the fil_system mutex. */
-#define fil_system_enter() mutex_enter(&fil_system.mutex)
-/** Release the fil_system mutex. */
-#define fil_system_exit() mutex_exit(&fil_system.mutex)
-
/*******************************************************************//**
Returns the table space by a given id, NULL if not found. */
fil_space_t*
@@ -1848,11 +1944,6 @@ void test_make_filepath();
@param[in] space tablespace
@param[in] offset page number
@return block size */
-UNIV_INTERN
-ulint
-fil_space_get_block_size(const fil_space_t* space, unsigned offset);
+ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset);
-#include "fil0fil.inl"
#endif /* UNIV_INNOCHECKSUM */
-
-#endif /* fil0fil_h */
diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h
index 7db85e87ed0..8c11d61c5aa 100644
--- a/storage/innobase/include/fsp0file.h
+++ b/storage/innobase/include/fsp0file.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -49,7 +49,6 @@ public:
Datafile()
:
- m_name(),
m_filepath(),
m_filename(),
m_handle(),
@@ -68,9 +67,8 @@ public:
/* No op */
}
- Datafile(const char* name, ulint flags, uint32_t size, ulint order)
+ Datafile(ulint flags, uint32_t size, ulint order)
:
- m_name(mem_strdup(name)),
m_filepath(),
m_filename(),
m_handle(),
@@ -86,8 +84,6 @@ public:
m_last_os_error(),
m_file_info()
{
- ut_ad(m_name != NULL);
- /* No op */
}
Datafile(const Datafile& file)
@@ -105,9 +101,6 @@ public:
m_last_os_error(),
m_file_info()
{
- m_name = mem_strdup(file.m_name);
- ut_ad(m_name != NULL);
-
if (file.m_filepath != NULL) {
m_filepath = mem_strdup(file.m_filepath);
ut_a(m_filepath != NULL);
@@ -127,10 +120,6 @@ public:
{
ut_a(this != &file);
- ut_ad(m_name == NULL);
- m_name = mem_strdup(file.m_name);
- ut_a(m_name != NULL);
-
m_size = file.m_size;
m_order = file.m_order;
m_type = file.m_type;
@@ -164,10 +153,8 @@ public:
return(*this);
}
- /** Initialize the name and flags of this datafile.
- @param[in] name tablespace name, will be copied
- @param[in] flags tablespace flags */
- void init(const char* name, ulint flags);
+ /** Initialize the tablespace flags */
+ void init(ulint flags) { m_flags= flags; }
/** Release the resources. */
virtual void shutdown();
@@ -176,14 +163,12 @@ public:
so that it can be validated.
@param[in] strict whether to issue error messages
@return DB_SUCCESS or error code */
- virtual dberr_t open_read_only(bool strict);
+ dberr_t open_read_only(bool strict);
/** Open a data file in read-write mode during start-up so that
doublewrite pages can be restored and then it can be validated.
- @param[in] read_only_mode if true, then readonly mode checks
- are enforced.
@return DB_SUCCESS or error code */
- virtual dberr_t open_read_write(bool read_only_mode)
+ inline dberr_t open_read_write()
MY_ATTRIBUTE((warn_unused_result));
/** Initialize OS specific file info. */
@@ -197,24 +182,15 @@ public:
Prepend the dirpath to filename using the extension given.
If dirpath is NULL, prepend the default datadir to filepath.
Store the result in m_filepath.
- @param[in] dirpath directory path
- @param[in] filename filename or filepath
- @param[in] ext filename extension */
- void make_filepath(
- const char* dirpath,
- const char* filename,
- ib_extention ext);
+ @param dirpath directory path
+ @param name tablespace (table) name
+ @param ext filename extension */
+ void make_filepath(const char* dirpath, fil_space_t::name_type name,
+ ib_extention ext);
/** Set the filepath by duplicating the filepath sent in */
void set_filepath(const char* filepath);
- /** Allocate and set the datafile or tablespace name in m_name.
- If a name is provided, use it; else extract a file-per-table
- tablespace name from m_filepath. The value of m_name
- will be freed in the destructor.
- @param[in] name Tablespace Name if known, NULL if not */
- void set_name(const char* name);
-
/** Validates the datafile and checks that it conforms with
the expected space ID and flags. The file should exist and be
successfully opened in order for this function to validate it.
@@ -247,13 +223,6 @@ public:
dberr_t validate_first_page(lsn_t* flush_lsn)
MY_ATTRIBUTE((warn_unused_result));
- /** Get Datafile::m_name.
- @return m_name */
- const char* name() const
- {
- return(m_name);
- }
-
/** Get Datafile::m_filepath.
@return m_filepath */
const char* filepath() const
@@ -355,6 +324,9 @@ public:
@return the first data page */
const byte* get_first_page() const { return(m_first_page); }
+ void set_space_id(ulint space_id) { m_space_id= space_id; }
+
+ void set_flags(ulint flags) { m_flags = flags; }
private:
/** Free the filepath buffer. */
void free_filepath();
@@ -363,13 +335,22 @@ private:
in the filepath. */
void set_filename()
{
- if (m_filepath == NULL) {
+ if (!m_filepath) {
return;
}
- char* last_slash = strrchr(m_filepath, OS_PATH_SEPARATOR);
-
- m_filename = last_slash ? last_slash + 1 : m_filepath;
+ if (char *last_slash = strrchr(m_filepath, '/')) {
+#if _WIN32
+ if (char *last = strrchr(m_filepath, '\\')) {
+ if (last > last_slash) {
+ last_slash = last;
+ }
+ }
+#endif
+ m_filename = last_slash + 1;
+ } else {
+ m_filename = m_filepath;
+ }
}
/** Create/open a data file.
@@ -406,12 +387,6 @@ private:
/* DATA MEMBERS */
- /** Datafile name at the tablespace location.
- This is either the basename of the file if an absolute path
- was entered, or it is the relative path to the datadir or
- Tablespace::m_path. */
- char* m_name;
-
protected:
/** Physical file path with base name and extension */
char* m_filepath;
@@ -471,6 +446,8 @@ protected:
ulint m_last_os_error;
public:
+ /** true if table is deferred during recovery */
+ bool m_defer=false;
/** Use the following to determine the uniqueness of this datafile. */
#ifdef _WIN32
/* Use fields dwVolumeSerialNumber, nFileIndexLow, nFileIndexHigh. */
@@ -520,57 +497,28 @@ public:
return(m_link_filepath);
}
- /** Create a link filename based on the contents of m_name,
- open that file, and read the contents into m_filepath.
- @retval DB_SUCCESS if remote linked tablespace file is opened and read.
- @retval DB_CANNOT_OPEN_FILE if the link file does not exist. */
- dberr_t open_link_file();
+ /** Attempt to read the contents of an .isl file into m_filepath.
+ @param name table name
+ @return filepath()
+ @retval nullptr if the .isl file does not exist or cannot be read */
+ const char* open_link_file(const fil_space_t::name_type name);
/** Delete an InnoDB Symbolic Link (ISL) file. */
void delete_link_file(void);
- /** Open a handle to the file linked to in an InnoDB Symbolic Link file
- in read-only mode so that it can be validated.
- @param[in] strict whether to issue error messages
- @return DB_SUCCESS or error code */
- dberr_t open_read_only(bool strict) override;
-
- /** Opens a handle to the file linked to in an InnoDB Symbolic Link
- file in read-write mode so that it can be restored from doublewrite
- and validated.
- @param[in] read_only_mode If true, then readonly mode checks
- are enforced.
- @return DB_SUCCESS or error code */
- dberr_t open_read_write(bool read_only_mode) override
- MY_ATTRIBUTE((warn_unused_result));
-
/******************************************************************
Global Static Functions; Cannot refer to data members.
******************************************************************/
- /** Creates a new InnoDB Symbolic Link (ISL) file. It is always
- created under the 'datadir' of MySQL. The datadir is the directory
- of a running mysqld program. We can refer to it by simply using
- the path ".".
- @param[in] name tablespace name
- @param[in] filepath remote filepath of tablespace datafile
+ /** Create InnoDB Symbolic Link (ISL) file.
+ @param name tablespace name
+ @param filepath full file name
@return DB_SUCCESS or error code */
- static dberr_t create_link_file(
- const char* name,
- const char* filepath);
+ static dberr_t create_link_file(fil_space_t::name_type name,
+ const char *filepath);
/** Delete an InnoDB Symbolic Link (ISL) file by name.
- @param[in] name tablespace name */
- static void delete_link_file(const char* name);
-
- /** Read an InnoDB Symbolic Link (ISL) file by name.
- It is always created under the datadir of MySQL.
- For file-per-table tablespaces, the isl file is expected to be
- in a 'database' directory and called 'tablename.isl'.
- The caller must free the memory returned if it is not null.
- @param[in] link_filepath filepath of the ISL file
- @return Filepath of the IBD file read from the ISL file */
- static char* read_link_file(
- const char* link_filepath);
+ @param name tablespace name */
+ static void delete_link_file(fil_space_t::name_type name);
};
#endif /* fsp0file_h */
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
index 1be45915239..b73fc2b54eb 100644
--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -207,16 +207,17 @@ typedef byte fseg_inode_t;
(16 + 3 * FLST_BASE_NODE_SIZE \
+ FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
-static constexpr uint32_t FSEG_MAGIC_N_VALUE= 97937874;
+static constexpr byte FSEG_MAGIC_N_BYTES[4]={0x05,0xd6,0x69,0xd2};
-#define FSEG_FILLFACTOR 8 /* If this value is x, then if
- the number of unused but reserved
+#define FSEG_FILLFACTOR 8 /* If the number of unused but reserved
pages in a segment is less than
- reserved pages * 1/x, and there are
+ reserved pages / FSEG_FILLFACTOR,
+ and there are
at least FSEG_FRAG_LIMIT used pages,
then we allow a new empty extent to
be added to the segment in
- fseg_alloc_free_page. Otherwise, we
+ fseg_alloc_free_page_general().
+ Otherwise, we
use unused pages of the segment. */
#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS
@@ -342,36 +343,28 @@ fsp_header_check_encryption_key(
ulint fsp_flags,
page_t* page);
-/**********************************************************************//**
-Writes the space id and flags to a tablespace header. The flags contain
-row type, physical/compressed page size, and logical/uncompressed page
-size of the tablespace. */
-void
-fsp_header_init_fields(
-/*===================*/
- page_t* page, /*!< in/out: first page in the space */
- ulint space_id, /*!< in: space id */
- ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS):
- 0, or table->flags if newer than COMPACT */
/** Initialize a tablespace header.
@param[in,out] space tablespace
@param[in] size current size in blocks
-@param[in,out] mtr mini-transaction */
-void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Create a new segment.
@param space tablespace
@param byte_offset byte offset of the created segment header
@param mtr mini-transaction
+@param[out] err error code
@param has_done_reservation whether fsp_reserve_free_extents() was invoked
@param block block where segment header is placed,
or NULL to allocate an additional page for that
@return the block where the segment header is placed, x-latched
-@retval NULL if could not create segment because of lack of space */
+@retval nullptr if could not create segment */
buf_block_t*
-fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
- bool has_done_reservation= false, buf_block_t *block= NULL);
+fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
+ bool has_done_reservation= false, buf_block_t *block= nullptr)
+ MY_ATTRIBUTE((nonnull(1,3,4), warn_unused_result));
/** Calculate the number of pages reserved by a segment,
and how many pages are currently used.
@@ -386,22 +379,6 @@ ulint fseg_n_reserved_pages(const buf_block_t &block,
MY_ATTRIBUTE((nonnull));
/**********************************************************************//**
Allocates a single free page from a segment. This function implements
-the intelligent allocation strategy which tries to minimize
-file space fragmentation.
-@param[in,out] seg_header segment header
-@param[in] hint hint of which page would be desirable
-@param[in] direction if the new page is needed because
- of an index page split, and records are
- inserted there in order, into which
- direction they go alphabetically: FSP_DOWN,
- FSP_UP, FSP_NO_DIR
-@param[in,out] mtr mini-transaction
-@return X-latched block, or NULL if no page could be allocated */
-#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \
- fseg_alloc_free_page_general(seg_header, hint, direction, \
- false, mtr, mtr)
-/**********************************************************************//**
-Allocates a single free page from a segment. This function implements
the intelligent allocation strategy which tries to minimize file space
fragmentation.
@retval NULL if no page could be allocated */
@@ -422,8 +399,9 @@ fseg_alloc_free_page_general(
is no need to do the check for this individual
page */
mtr_t* mtr, /*!< in/out: mini-transaction */
- mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction
in which the page should be initialized. */
+ dberr_t* err) /*!< out: error code */
MY_ATTRIBUTE((warn_unused_result, nonnull));
/** Reserves free pages from a tablespace. All mini-transactions which may
@@ -452,19 +430,21 @@ if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
different rules in that special case, just ensuring that there are n_pages
free pages available.
-@param[out] n_reserved number of extents actually reserved; if we
- return true and the tablespace size is <
- FSP_EXTENT_SIZE pages, then this can be 0,
- otherwise it is n_ext
-@param[in,out] space tablespace
-@param[in] n_ext number of extents to reserve
-@param[in] alloc_type page reservation type (FSP_BLOB, etc)
-@param[in,out] mtr the mini transaction
-@param[in] n_pages for small tablespaces (tablespace size is
- less than FSP_EXTENT_SIZE), number of free
- pages to reserve.
-@return true if we were able to make the reservation */
-bool
+@param[out] n_reserved number of extents actually reserved; if we
+ return true and the tablespace size is <
+ FSP_EXTENT_SIZE pages, then this can be 0,
+ otherwise it is n_ext
+@param[in,out] space tablespace
+@param[in] n_ext number of extents to reserve
+@param[in] alloc_type page reservation type (FSP_BLOB, etc)
+@param[in,out] mtr the mini transaction
+@param[out] err error code
+@param[in] n_pages for small tablespaces (tablespace size is
+ less than FSP_EXTENT_SIZE), number of free
+ pages to reserve.
+@return error code
+@retval DB_SUCCESS if we were able to make the reservation */
+dberr_t
fsp_reserve_free_extents(
uint32_t* n_reserved,
fil_space_t* space,
@@ -477,43 +457,62 @@ fsp_reserve_free_extents(
@param[in,out] seg_header file segment header
@param[in,out] space tablespace
@param[in] offset page number
-@param[in,out] mtr mini-transaction */
-void
+@param[in,out] mtr mini-transaction
+@param[in] have_latch whether space->x_lock() was already called
+@return error code */
+dberr_t
fseg_free_page(
fseg_header_t* seg_header,
fil_space_t* space,
uint32_t offset,
- mtr_t* mtr);
-/** Determine whether a page is free.
-@param[in,out] space tablespace
-@param[in] page page number
-@return whether the page is marked as free */
-bool
-fseg_page_is_free(fil_space_t* space, unsigned page)
+ mtr_t* mtr,
+ bool have_latch = false)
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/**********************************************************************//**
-Frees part of a segment. This function can be used to free a segment
-by repeatedly calling this function in different mini-transactions.
-Doing the freeing in a single mini-transaction might result in
-too big a mini-transaction.
+
+/** Determine whether a page is allocated.
+@param space tablespace
+@param page page number
+@return error code
+@retval DB_SUCCESS if the page is marked as free
+@retval DB_SUCCESS_LOCKED_REC if the page is marked as allocated */
+dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
+@param header segment header; NOTE: if the header resides on first
+ page of the frag list of the segment, this pointer
+ becomes obsolete after the last freeing step
+@param mtr mini-transaction
+@param ahi Drop the adaptive hash index
@return whether the freeing was completed */
bool
fseg_free_step(
- fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
- resides on the first page of the frag list
- of the segment, this pointer becomes obsolete
- after the last freeing step */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
MY_ATTRIBUTE((warn_unused_result));
-/**********************************************************************//**
-Frees part of a segment. Differs from fseg_free_step because this function
-leaves the header page unfreed.
+
+/** Frees part of a segment. Differs from fseg_free_step because
+this function leaves the header page unfreed.
+@param header segment header which must reside on the first
+ fragment page of the segment
+@param mtr mini-transaction
+@param ahi drop the adaptive hash index
@return whether the freeing was completed, except for the header page */
bool
fseg_free_step_not_header(
- fseg_header_t* header, /*!< in: segment header which must reside on
- the first fragment page of the segment */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
MY_ATTRIBUTE((warn_unused_result));
/** Reset the page type.
@@ -541,9 +540,8 @@ fil_block_check_type(
ulint type,
mtr_t* mtr)
{
- if (UNIV_UNLIKELY(type != fil_page_get_type(block.frame))) {
- fil_block_reset_type(block, type, mtr);
- }
+ if (UNIV_UNLIKELY(type != fil_page_get_type(block.page.frame)))
+ fil_block_reset_type(block, type, mtr);
}
/** Checks if a page address is an extent descriptor page address.
diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h
index c00c8d689bf..ed65af52bc8 100644
--- a/storage/innobase/include/fsp0space.h
+++ b/storage/innobase/include/fsp0space.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -50,7 +50,6 @@ public:
Tablespace()
:
m_files(),
- m_name(),
m_space_id(ULINT_UNDEFINED),
m_path(),
m_flags(),
@@ -79,9 +78,6 @@ public:
/** Data file iterator */
iterator end() { return m_files.end(); }
- void set_name(const char* name) { m_name = name; }
- const char* name() const { return m_name; }
-
/** Set tablespace path and filename members.
@param[in] path where tablespace file(s) resides
@param[in] len length of the file path */
@@ -90,8 +86,6 @@ public:
ut_ad(m_path == NULL);
m_path = mem_strdupl(path, len);
ut_ad(m_path != NULL);
-
- os_normalize_path(m_path);
}
/** Set tablespace path and filename members.
@@ -218,9 +212,6 @@ private:
/* DATA MEMBERS */
- /** Name of the tablespace. */
- const char* m_name;
-
/** Tablespace ID */
ulint m_space_id;
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
index f8e4c06baae..1912c31b744 100644
--- a/storage/innobase/include/fsp0types.h
+++ b/storage/innobase/include/fsp0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2020, MariaDB Corporation.
+Copyright (c) 2014, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -49,7 +49,7 @@ static constexpr size_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0;
If records are inserted in order, there are the following
flags to tell this (their type is made byte for the compiler
to warn if direction and hint parameters are switched in
-fseg_alloc_free_page) */
+fseg_alloc_free_page_general) */
/* @{ */
#define FSP_UP ((byte)111) /*!< alphabetically upwards */
#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
index 9c2153b7ca3..720fe7f25b9 100644
--- a/storage/innobase/include/fts0fts.h
+++ b/storage/innobase/include/fts0fts.h
@@ -314,7 +314,7 @@ public:
/** Whether the ADDED table record sync-ed after crash recovery */
unsigned added_synced:1;
- /** Whether the table holds dict_sys.mutex */
+ /** Whether the table holds dict_sys.latch */
unsigned dict_locked:1;
/** Work queue for scheduling jobs for the FTS 'Add' thread, or NULL
@@ -373,13 +373,6 @@ extern ulong fts_min_token_size;
need a sync to free some memory */
extern bool fts_need_sync;
-#define fts_que_graph_free(graph) \
-do { \
- mutex_enter(&dict_sys.mutex); \
- que_graph_free(graph); \
- mutex_exit(&dict_sys.mutex); \
-} while (0)
-
/******************************************************************//**
Create a FTS cache. */
fts_cache_t*
@@ -439,8 +432,7 @@ fts_trx_free(
fts_trx_t* fts_trx); /*!< in, own: FTS trx */
/** Creates the common auxiliary tables needed for supporting an FTS index
-on the given table. row_mysql_lock_data_dictionary must have been called
-before this.
+on the given table.
The following tables are created.
CREATE TABLE $FTS_PREFIX_DELETED
(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
@@ -463,8 +455,7 @@ fts_create_common_tables(
bool skip_doc_id_index)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Creates the column specific ancillary tables needed for supporting an
-FTS index on the given table. row_mysql_lock_data_dictionary must have
-been called before this.
+FTS index on the given table.
All FTS AUX Index tables have the following schema.
CREATE TABLE $FTS_PREFIX_INDEX_[1-6](
@@ -489,17 +480,29 @@ fts_add_doc_id_column(
dict_table_t* table, /*!< in/out: Table with FTS index */
mem_heap_t* heap); /*!< in: temporary memory heap, or NULL */
-/*********************************************************************//**
-Drops the ancillary tables needed for supporting an FTS index on the
-given table. row_mysql_lock_data_dictionary must have been called before
-this.
+/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables().
+@param trx transaction
+@param index fulltext index */
+dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index);
+
+/** Lock the internal common FTS_ tables, before fts_drop_common_tables().
+@param trx transaction
+@param table table containing FULLTEXT INDEX
@return DB_SUCCESS or error code */
-dberr_t
-fts_drop_tables(
-/*============*/
- trx_t* trx, /*!< in: transaction */
- dict_table_t* table); /*!< in: table has the FTS
- index */
+dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table);
+
+/** Lock the internal FTS_ tables for table, before fts_drop_tables().
+@param trx transaction
+@param table table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table);
+
+/** Drop the internal FTS_ tables for table.
+@param trx transaction
+@param table table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table);
+
/******************************************************************//**
The given transaction is about to be committed; do whatever is necessary
from the FTS system's POV.
@@ -624,11 +627,7 @@ fts_optimize_init(void);
/****************************************************************//**
Drops index ancillary tables for a FTS index
@return DB_SUCCESS or error code */
-dberr_t
-fts_drop_index_tables(
-/*==================*/
- trx_t* trx, /*!< in: transaction */
- dict_index_t* index) /*!< in: Index to drop */
+dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index)
MY_ATTRIBUTE((warn_unused_result));
/** Add the table to add to the OPTIMIZER's list.
@@ -649,12 +648,6 @@ fts_optimize_remove_table(
void
fts_optimize_shutdown();
-/** Send sync fts cache for the table.
-@param[in] table table to sync */
-void
-fts_optimize_request_sync_table(
- dict_table_t* table);
-
/**********************************************************************//**
Take a FTS savepoint. */
void
@@ -706,26 +699,11 @@ fts_savepoint_rollback_last_stmt(
/*=============================*/
trx_t* trx); /*!< in: transaction */
-/** Drop all orphaned FTS auxiliary tables, those that don't have a parent
-table or FTS index defined on them. */
-void fts_drop_orphaned_tables();
-
/** Run SYNC on the table, i.e., write out data from the cache to the
FTS auxiliary INDEX table and clear the cache at the end.
@param[in,out] table fts table
-@param[in] wait whether to wait for existing sync to finish
@return DB_SUCCESS on success, error code on failure. */
-dberr_t fts_sync_table(dict_table_t* table, bool wait = true);
-
-/****************************************************************//**
-Free the query graph but check whether dict_sys.mutex is already
-held */
-void
-fts_que_graph_free_check_lock(
-/*==========================*/
- fts_table_t* fts_table, /*!< in: FTS table */
- const fts_index_cache_t*index_cache, /*!< in: FTS index cache */
- que_t* graph); /*!< in: query graph */
+dberr_t fts_sync_table(dict_table_t* table);
/****************************************************************//**
Create an FTS index cache. */
@@ -863,13 +841,12 @@ fts_table_fetch_doc_ids(
This function brings FTS index in sync when FTS index is first
used. There are documents that have not yet sync-ed to auxiliary
tables from last server abnormally shutdown, we will need to bring
-such document into FTS cache before any further operations
-@return TRUE if all OK */
-ibool
+such document into FTS cache before any further operations */
+void
fts_init_index(
/*===========*/
dict_table_t* table, /*!< in: Table with FTS */
- ibool has_cache_lock); /*!< in: Whether we already
+ bool has_cache_lock); /*!< in: Whether we already
have cache lock */
/*******************************************************************//**
Add a newly create index in FTS cache */
@@ -933,9 +910,8 @@ fts_trx_create(
/** Clear all fts resources when there is no internal DOC_ID
and there are no new fts index to add.
-@param[in,out] table table where fts is to be freed
-@param[in] trx transaction to drop all fts tables */
-void fts_clear_all(dict_table_t *table, trx_t *trx);
+@param[in,out] table table where fts is to be freed */
+void fts_clear_all(dict_table_t *table);
/** Check whether the given name is fts auxiliary table
and fetch the parent table id and index id
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
index 660f7459249..ae0bb036e37 100644
--- a/storage/innobase/include/fts0priv.h
+++ b/storage/innobase/include/fts0priv.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -135,7 +135,7 @@ fts_eval_sql(
/** Construct the name of an internal FTS table for the given table.
@param[in] fts_table metadata on fulltext-indexed table
@param[out] table_name a name up to MAX_FULL_NAME_LEN
-@param[in] dict_locked whether dict_sys.mutex is being held */
+@param[in] dict_locked whether dict_sys.latch is being held */
void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
bool dict_locked = false)
MY_ATTRIBUTE((nonnull));
@@ -295,16 +295,6 @@ fts_trx_table_id_cmp(
#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
#define fts_sql_rollback(trx) (trx)->rollback()
/******************************************************************//**
-Parse an SQL string. %s is replaced with the table's id. Don't acquire
-the dict mutex
-@return query graph */
-que_t*
-fts_parse_sql_no_dict_lock(
-/*=======================*/
- pars_info_t* info, /*!< in: parser info */
- const char* sql) /*!< in: SQL string to evaluate */
- MY_ATTRIBUTE((nonnull(2), malloc, warn_unused_result));
-/******************************************************************//**
Get value from config table. The caller must ensure that enough
space is allocated for value to hold the column contents
@return DB_SUCCESS or error code */
@@ -424,8 +414,7 @@ Append deleted doc ids to vector and sort the vector. */
void
fts_cache_append_deleted_doc_ids(
/*=============================*/
- const fts_cache_t*
- cache, /*!< in: cache to use */
+ fts_cache_t* cache, /*!< in: cache to use */
ib_vector_t* vector); /*!< in: append to this vector */
/******************************************************************//**
Search the index specific cache for a particular FTS index.
@@ -470,12 +459,6 @@ fts_get_table_id(
FTS_AUX_MIN_TABLE_ID_LENGTH bytes
long */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/** Construct the name of an internal FTS table for the given table.
-@param[in] fts_table metadata on fulltext-indexed table
-@param[in] dict_locked whether dict_sys.mutex is being held
-@return the prefix, must be freed with ut_free() */
-char* fts_get_table_name_prefix(const fts_table_t* fts_table)
- MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
/******************************************************************//**
Add node positions. */
void
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
index 2cddf152d04..04e99d595c5 100644
--- a/storage/innobase/include/fts0types.h
+++ b/storage/innobase/include/fts0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,7 +28,6 @@ Created 2007-03-27 Sunny Bains
#define INNOBASE_FTS0TYPES_H
#include "fts0fts.h"
-#include "fut0fut.h"
#include "pars0pars.h"
#include "que0types.h"
#include "ut0byte.h"
@@ -76,7 +75,6 @@ struct fts_index_cache_t {
que_t** ins_graph; /*!< Insert query graphs */
- que_t** sel_graph; /*!< Select query graphs */
CHARSET_INFO* charset; /*!< charset */
};
@@ -88,52 +86,23 @@ struct fts_stopword_t {
CHARSET_INFO* charset; /*!< charset for stopword */
};
-/** The SYNC state of the cache. There is one instance of this struct
-associated with each ADD thread. */
-struct fts_sync_t {
- trx_t* trx; /*!< The transaction used for SYNCing
- the cache to disk */
- dict_table_t* table; /*!< Table with FTS index(es) */
- ulint max_cache_size; /*!< Max size in bytes of the cache */
- ibool cache_full; /*!< flag, when true it indicates that
- we need to sync the cache to disk */
- ulint lower_index; /*!< the start index of the doc id
- vector from where to start adding
- documents to the FTS cache */
- ulint upper_index; /*!< max index of the doc id vector to
- add to the FTS cache */
- ibool interrupted; /*!< TRUE if SYNC was interrupted */
- doc_id_t min_doc_id; /*!< The smallest doc id added to the
- cache. It should equal to
- doc_ids[lower_index] */
- doc_id_t max_doc_id; /*!< The doc id at which the cache was
- noted as being full, we use this to
- set the upper_limit field */
- time_t start_time; /*!< SYNC start time; only used if
- fts_enable_diag_print */
- bool in_progress; /*!< flag whether sync is in progress.*/
- bool unlock_cache; /*!< flag whether unlock cache when
- write fts node */
- os_event_t event; /*!< sync finish event;
- only os_event_set() and os_event_wait()
- are used */
-};
+struct fts_sync_t;
/** The cache for the FTS system. It is a memory-based inverted index
that new entries are added to, until it grows over the configured maximum
size, at which time its contents are written to the INDEX table. */
-struct fts_cache_t {
- rw_lock_t lock; /*!< lock protecting all access to the
- memory buffer. FIXME: this needs to
- be our new upgrade-capable rw-lock */
-
- rw_lock_t init_lock; /*!< lock used for the cache
- intialization, it has different
- SYNC level as above cache lock */
+struct fts_cache_t
+{
+ /** lock protecting all access to the memory buffer */
+ mysql_mutex_t lock;
+ /** cache initialization */
+ mysql_mutex_t init_lock;
- ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
+ /** protection for deleted_doc_ids */
+ mysql_mutex_t deleted_lock;
- ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
+ /** protection for DOC_ID */
+ mysql_mutex_t doc_id_lock;
ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each
element is of type fts_update_t */
@@ -206,7 +175,6 @@ struct fts_node_t {
ulint ilist_size_alloc;
/*!< Allocated size of ilist in
bytes */
- bool synced; /*!< flag whether the node is synced */
};
/** A tokenizer word. Contains information about one word. */
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
index 1ade24cd069..746dab80400 100644
--- a/storage/innobase/include/fut0lst.h
+++ b/storage/innobase/include/fut0lst.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,30 +24,21 @@ File-based list utilities
Created 11/28/1995 Heikki Tuuri
***********************************************************************/
-#ifndef fut0lst_h
-#define fut0lst_h
+#pragma once
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
#ifdef UNIV_INNOCHECKSUM
# include "fil0fil.h"
#else
-#include "fut0fut.h"
-#include "mtr0log.h"
-
-/* The C 'types' of base node and list node: these should be used to
-write self-documenting code. Of course, the sizeof macro cannot be
-applied to these types! */
+# include "mtr0log.h"
typedef byte flst_base_node_t;
typedef byte flst_node_t;
-#endif /* !UNIV_INNOCHECKSUM */
-
-/* The physical size of a list base node in bytes */
-#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
-/* The physical size of a list node in bytes */
-#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
-
-#ifndef UNIV_INNOCHECKSUM
/* We define the field offsets of a node for the list */
#define FLST_PREV 0 /* 6-byte address of the previous list element;
the page part of address is FIL_NULL, if no
@@ -70,9 +61,10 @@ typedef byte flst_node_t;
@param[in,out] mtr mini-transaction */
inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
{
- ut_ad(!mach_read_from_2(FLST_LEN + ofs + block->frame));
- ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + block->frame));
- ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + block->frame));
+ ut_d(const page_t *page= block->page.frame);
+ ut_ad(!mach_read_from_2(FLST_LEN + ofs + page));
+ ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + page));
+ ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + page));
compile_time_assert(FIL_NULL == 0xffU * 0x1010101U);
mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff);
mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
@@ -82,7 +74,7 @@ inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
@param[in] block file page
@param[in,out] base base node
@param[in,out] mtr mini-transaction */
-void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
+void flst_init(const buf_block_t &block, byte *base, mtr_t *mtr)
MY_ATTRIBUTE((nonnull));
/** Append a file list node to a list.
@@ -90,28 +82,31 @@ void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
@param[in] boffset byte offset of the base node
@param[in,out] add block to be added
@param[in] aoffset byte offset of the node to be added
-@param[in,outr] mtr mini-transaction */
-void flst_add_last(buf_block_t *base, uint16_t boffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Prepend a file list node to a list.
@param[in,out] base base node block
@param[in] boffset byte offset of the base node
@param[in,out] add block to be added
@param[in] aoffset byte offset of the node to be added
-@param[in,outr] mtr mini-transaction */
-void flst_add_first(buf_block_t *base, uint16_t boffset,
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
- MY_ATTRIBUTE((nonnull));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Remove a file list node.
@param[in,out] base base node block
@param[in] boffset byte offset of the base node
@param[in,out] cur block to be removed
@param[in] coffset byte offset of the current record to be removed
-@param[in,outr] mtr mini-transaction */
-void flst_remove(buf_block_t *base, uint16_t boffset,
- buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_remove(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** @return the length of a list */
inline uint32_t flst_get_len(const flst_base_node_t *base)
@@ -153,11 +148,9 @@ inline fil_addr_t flst_get_prev_addr(const flst_node_t *node)
return flst_read_addr(node + FLST_PREV);
}
-#ifdef UNIV_DEBUG
+# ifdef UNIV_DEBUG
/** Validate a file-based list. */
void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr);
-#endif
+# endif
#endif /* !UNIV_INNOCHECKSUM */
-
-#endif
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
index 9a350325bca..b07261ce042 100644
--- a/storage/innobase/include/gis0rtree.h
+++ b/storage/innobase/include/gis0rtree.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -59,6 +59,44 @@ Created 2013/03/27 Jimmy Yang and Allen Lai
/* Geometry data header */
#define GEO_DATA_HEADER_SIZE 4
+
+/** Search for a spatial index leaf page record.
+@param cur cursor
+@param tuple search tuple
+@param latch_mode latching mode
+@param mtr mini-transaction
+@param mode search mode */
+dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+ btr_latch_mode latch_mode, mtr_t *mtr,
+ page_cur_mode_t mode= PAGE_CUR_RTREE_LOCATE)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Search for inserting a spatial index leaf page record.
+@param cur cursor
+@param tuple search tuple
+@param latch_mode latching mode
+@param mtr mini-transaction */
+inline dberr_t rtr_insert_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+ btr_latch_mode latch_mode, mtr_t *mtr)
+{
+ return rtr_search_leaf(cur, tuple, latch_mode, mtr, PAGE_CUR_RTREE_INSERT);
+}
+
+/** Search for a spatial index leaf page record.
+@param pcur cursor
+@param tuple search tuple
+@param mode search mode
+@param mtr mini-transaction */
+dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple,
+ page_cur_mode_t mode, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
+ page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_cur_t *cur, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
/**********************************************************************//**
Builds a Rtree node pointer out of a physical record and a page number.
@return own: node pointer */
@@ -93,7 +131,8 @@ rtr_page_split_and_insert(
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr); /*!< in: mtr */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err); /*!< out: error code */
/**************************************************************//**
Sets the child node mbr in a node pointer. */
@@ -123,7 +162,8 @@ rtr_pcur_move_to_next(
function may release the page latch */
ulint cur_level,
/*!< in: current level */
- mtr_t* mtr); /*!< in: mtr */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
/****************************************************************//**
Searches the right position in rtree for a page cursor. */
@@ -254,21 +294,14 @@ rtr_get_mbr_from_tuple(
rtr_mbr* mbr); /*!< out: mbr to fill */
/* Get the rtree page father.
-@param[in] offsets work area for the return value
-@param[in] index rtree index
-@param[in] block child page in the index
-@param[in] mtr mtr
+@param[in,out] mtr mtr
@param[in] sea_cur search cursor, contains information
about parent nodes in search
-@param[in] cursor cursor on node pointer record,
- its page x-latched */
-void
-rtr_page_get_father(
- dict_index_t* index,
- buf_block_t* block,
- mtr_t* mtr,
- btr_cur_t* sea_cur,
- btr_cur_t* cursor);
+@param[in,out] cursor cursor on node pointer record,
+ its page x-latched
+@return whether the cursor was successfully positioned */
+bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor)
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
/************************************************************//**
Returns the father block to a page. It is assumed that mtr holds
@@ -279,8 +312,6 @@ rtr_page_get_father_block(
/*======================*/
rec_offs* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
- dict_index_t* index, /*!< in: b-tree index */
- buf_block_t* block, /*!< in: child page in the index */
mtr_t* mtr, /*!< in: mtr */
btr_cur_t* sea_cur,/*!< in: search cursor, contains information
about parent nodes in search */
@@ -294,7 +325,7 @@ rtr_store_parent_path(
/*==================*/
const buf_block_t* block, /*!< in: block of the page */
btr_cur_t* btr_cur,/*!< in/out: persistent cursor */
- ulint latch_mode,
+ btr_latch_mode latch_mode,
/*!< in: latch_mode */
ulint level, /*!< in: index level */
mtr_t* mtr); /*!< in: mtr */
@@ -302,28 +333,12 @@ rtr_store_parent_path(
/**************************************************************//**
Initializes and opens a persistent cursor to an index tree. It should be
closed with btr_pcur_close. */
-void
-rtr_pcur_open_low(
-/*==============*/
- dict_index_t* index, /*!< in: index */
- ulint level, /*!< in: level in the btree */
+bool rtr_search(
const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
- NOTE that if the search is made using a unique
- prefix of a record, mode should be
- PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
- may end up on the previous page from the
- record! */
- ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-
-#define rtr_pcur_open(i,t,md,l,c,m) \
- rtr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m)
-
-struct btr_cur_t;
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
/*********************************************************//**
Returns the R-Tree node stored in the parent search path
@@ -347,9 +362,12 @@ rtr_get_parent_cursor(
ulint level, /*!< in: index level of buffer page */
ulint is_insert); /*!< in: whether insert operation */
+MY_ATTRIBUTE((warn_unused_result))
/*************************************************************//**
-Copy recs from a page to new_block of rtree. */
-void
+Copy recs from a page to new_block of rtree.
+
+@return error code */
+dberr_t
rtr_page_copy_rec_list_end_no_locks(
/*================================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -362,9 +380,12 @@ rtr_page_copy_rec_list_end_no_locks(
ulint* num_moved, /*!< out: num of rec to move */
mtr_t* mtr); /*!< in: mtr */
+MY_ATTRIBUTE((warn_unused_result))
/*************************************************************//**
-Copy recs till a specified rec from a page to new_block of rtree. */
-void
+Copy recs till a specified rec from a page to new_block of rtree.
+
+@return error code */
+dberr_t
rtr_page_copy_rec_list_start_no_locks(
/*==================================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -436,7 +457,6 @@ rtr_check_same_block(
btr_cur_t* cur, /*!< in/out: position at the parent entry
pointing to the child if successful */
buf_block_t* parentb,/*!< in: parent page to check */
- buf_block_t* childb, /*!< in: child Page */
mem_heap_t* heap); /*!< in: memory heap */
/*********************************************************************//**
diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl
index 1b53caa306b..5101eeb6f7a 100644
--- a/storage/innobase/include/gis0rtree.inl
+++ b/storage/innobase/include/gis0rtree.inl
@@ -57,6 +57,9 @@ rtr_page_cal_mbr(
page = buf_block_get_frame(block);
rec = page_rec_get_next(page_get_infimum_rec(page));
+ if (UNIV_UNLIKELY(!rec)) {
+ return;
+ }
offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page)
? index->n_fields : 0,
ULINT_UNDEFINED, &heap);
@@ -176,12 +179,12 @@ rtr_get_parent_node(
return(NULL);
}
- mutex_enter(&btr_cur->rtr_info->rtr_path_mutex);
+ mysql_mutex_lock(&btr_cur->rtr_info->rtr_path_mutex);
num = btr_cur->rtr_info->parent_path->size();
if (!num) {
- mutex_exit(&btr_cur->rtr_info->rtr_path_mutex);
+ mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex);
return(NULL);
}
@@ -204,7 +207,7 @@ rtr_get_parent_node(
}
}
- mutex_exit(&btr_cur->rtr_info->rtr_path_mutex);
+ mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex);
return(found_node);
}
diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h
index 55944bfcce3..d6a4ef67a38 100644
--- a/storage/innobase/include/gis0type.h
+++ b/storage/innobase/include/gis0type.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -72,7 +72,7 @@ typedef struct matched_rec {
buf_block_t block; /*!< the shadow buffer block */
ulint used; /*!< memory used */
rtr_rec_vector* matched_recs; /*!< vector holding the matching rec */
- ib_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs
+ mysql_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs
vector */
bool valid; /*!< whether result in matched_recs
or this search is valid (page not
@@ -103,14 +103,8 @@ typedef struct rtr_info{
/*!< vector holding parent pages during
search */
matched_rec_t* matches;/*!< struct holding matching leaf records */
- ib_mutex_t rtr_path_mutex;
+ mysql_mutex_t rtr_path_mutex;
/*!< mutex protect the "path" vector */
- buf_block_t* tree_blocks[RTR_MAX_LEVELS + RTR_LEAF_LATCH_NUM];
- /*!< tracking pages that would be locked
- at leaf level, for future free */
- ulint tree_savepoints[RTR_MAX_LEVELS + RTR_LEAF_LATCH_NUM];
- /*!< savepoint used to release latches/blocks
- on each level and leaf level */
rtr_mbr_t mbr; /*!< the search MBR */
que_thr_t* thr; /*!< the search thread */
mem_heap_t* heap; /*!< memory heap */
@@ -137,7 +131,7 @@ typedef struct rtr_info{
struct rtr_info_track_t {
/** Active search info */
std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active;
- ib_mutex_t rtr_active_mutex;
+ mysql_mutex_t rtr_active_mutex;
/*!< mutex to protect
rtr_active */
};
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index 04e1ec96b73..d5239ec3f9a 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -109,15 +109,6 @@ innobase_convert_name(
THD* thd); /*!< in: MySQL connection thread, or NULL */
/******************************************************************//**
-Returns true if the thread is the replication thread on the slave
-server.
-@return true if thd is the replication thread */
-ibool
-thd_is_replication_slave_thread(
-/*============================*/
- THD* thd); /*!< in: thread handle */
-
-/******************************************************************//**
Returns true if the transaction this thread is processing has edited
non-transactional tables. Used by the deadlock detector when deciding
which transaction to rollback in case of a deadlock - we try to avoid
@@ -128,13 +119,6 @@ thd_has_edited_nontrans_tables(
/*===========================*/
THD* thd); /*!< in: thread handle */
-/**
-Get high resolution timestamp for the current query start time.
-
-@retval timestamp in microseconds precision
-*/
-unsigned long long thd_query_start_micro(const MYSQL_THD thd);
-
/*************************************************************//**
Prints info of a THD object (== user session thread) to the given file. */
void
@@ -156,15 +140,6 @@ uint8_t
get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field);
/******************************************************************//**
-Get the variable length bounds of the given character set. */
-void
-innobase_get_cset_width(
-/*====================*/
- ulint cset, /*!< in: MySQL charset-collation code */
- unsigned*mbminlen, /*!< out: minimum length of a char (in bytes) */
- unsigned*mbmaxlen); /*!< out: maximum length of a char (in bytes) */
-
-/******************************************************************//**
Compares NUL-terminated UTF-8 strings case insensitively.
@return 0 if a=b, <0 if a<b, >0 if a>b */
int
@@ -208,11 +183,6 @@ innobase_casedn_str(
char* a); /*!< in/out: string to put in lower case */
#ifdef WITH_WSREP
-void
-wsrep_innobase_kill_one_trx(
- THD* bf_thd,
- trx_t *victim_trx,
- my_bool signal);
ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
unsigned char* str, ulint str_length,
ulint buf_length);
@@ -220,6 +190,15 @@ ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
extern "C" struct charset_info_st *thd_charset(THD *thd);
+/** Get high resolution timestamp for the current query start time.
+The timestamp is not anchored to any specific point in time,
+but can be used for comparison.
+@param thd user thread
+@return timestamp with microsecond precision
+*/
+extern "C" unsigned long long thd_start_utime(const MYSQL_THD thd);
+
+
/** Determines the current SQL statement.
Thread unsafe, can only be called from the thread owning the THD.
@param[in] thd MySQL thread handle
@@ -250,45 +229,16 @@ innobase_get_at_most_n_mbchars(
@param[in] thd thread handle, or NULL to query
the global innodb_tmpdir.
@retval NULL if innodb_tmpdir="" */
-UNIV_INTERN
-const char*
-thd_innodb_tmpdir(
- THD* thd);
+const char *thd_innodb_tmpdir(THD *thd);
/******************************************************************//**
Returns the lock wait timeout for the current connection.
@return the lock wait timeout, in seconds */
-ulong
+uint&
thd_lock_wait_timeout(
/*==================*/
THD* thd); /*!< in: thread handle, or NULL to query
the global innodb_lock_wait_timeout */
-/** Get status of innodb_tmpdir.
-@param[in] thd thread handle, or NULL to query
- the global innodb_tmpdir.
-@retval NULL if innodb_tmpdir="" */
-const char*
-thd_innodb_tmpdir(
- THD* thd);
-
-/**********************************************************************//**
-Get the current setting of the table_cache_size global parameter. We do
-a dirty read because for one there is no synchronization object and
-secondly there is little harm in doing so even if we get a torn read.
-@return SQL statement string */
-ulint
-innobase_get_table_cache_size(void);
-/*===============================*/
-
-/**********************************************************************//**
-Get the current setting of the lower_case_table_names global parameter from
-mysqld.cc. We do a dirty read because for one there is no synchronization
-object and secondly there is little harm in doing so even if we get a torn
-read.
-@return value of lower_case_table_names */
-ulint
-innobase_get_lower_case_table_names(void);
-/*=====================================*/
/******************************************************************//**
compare two character strings case insensitively according to their charset. */
@@ -458,7 +408,6 @@ innobase_convert_to_filename_charset(
/********************************************************************//**
Helper function to push warnings from InnoDB internals to SQL-layer. */
-UNIV_INTERN
void
ib_push_warning(
trx_t* trx, /*!< in: trx */
@@ -468,7 +417,6 @@ ib_push_warning(
/********************************************************************//**
Helper function to push warnings from InnoDB internals to SQL-layer. */
-UNIV_INTERN
void
ib_push_warning(
void* ithd, /*!< in: thd */
@@ -478,7 +426,6 @@ ib_push_warning(
/********************************************************************//**
Helper function to push warnings from InnoDB internals to SQL-layer. */
-UNIV_INTERN
void
ib_foreign_warn(
trx_t* trx, /*!< in: trx */
@@ -498,19 +445,13 @@ normalize_table_name_c_low(
char* norm_name, /*!< out: normalized name as a
null-terminated string */
const char* name, /*!< in: table name string */
- ibool set_lower_case); /*!< in: TRUE if we want to set
+ bool set_lower_case); /*!< in: true if we want to set
name to lower case */
-/** Update the system variable with the given value of the InnoDB
-buffer pool size.
-@param[in] buf_pool_size given value of buffer pool size.*/
-void
-innodb_set_buf_pool_size(ulonglong buf_pool_size);
/** Create a MYSQL_THD for a background thread and mark it as such.
@param name thread info for SHOW PROCESSLIST
@return new MYSQL_THD */
-MYSQL_THD
-innobase_create_background_thd(const char* name);
+MYSQL_THD innobase_create_background_thd(const char* name);
/** Destroy a THD object associated with a background task.
@param[in] thd MYSQL_THD to destroy */
@@ -521,5 +462,15 @@ void destroy_background_thd(MYSQL_THD thd);
void
innobase_reset_background_thd(MYSQL_THD);
+#ifdef WITH_WSREP
+/** Append table-level exclusive key.
+@param thd MySQL thread handle
+@param table table
+@retval false on success
+@retval true on failure */
+struct dict_table_t;
+bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table);
+#endif /* WITH_WSREP */
+
#endif /* !UNIV_INNOCHECKSUM */
#endif /* HA_INNODB_PROTOTYPES_H */
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
index 981ff5a0814..6eb5bb3f183 100644
--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,12 +26,47 @@ Created 5/20/1997 Heikki Tuuri
#pragma once
#include "ut0rnd.h"
+#include "ut0new.h"
struct hash_table_t;
-struct hash_cell_t{
- void* node; /*!< hash chain node, NULL if none */
+struct hash_cell_t
+{
+ /** singly-linked, nullptr terminated list of hash buckets */
+ void *node;
+
+ /** Append an element.
+ @tparam T type of the element
+ @param insert the being-inserted element
+ @param next the next-element pointer in T */
+ template<typename T>
+ void append(T &insert, T *T::*next)
+ {
+ void **after;
+ for (after= &node; *after;
+ after= reinterpret_cast<void**>(&(static_cast<T*>(*after)->*next)));
+ insert.*next= nullptr;
+ *after= &insert;
+ }
+
+ /** Insert an element after another.
+ @tparam T type of the element
+ @param after the element after which to insert
+ @param insert the being-inserted element
+ @param next the next-element pointer in T */
+ template<typename T>
+ void insert_after(T &after, T &insert, T *T::*next)
+ {
+#ifdef UNIV_DEBUG
+ for (const T *c= static_cast<const T*>(node); c; c= c->*next)
+ if (c == &after)
+ goto found;
+ ut_error;
+ found:
+#endif
+ insert.*next= after.*next;
+ after.*next= &insert;
+ }
};
-typedef void* hash_node_t;
/*******************************************************************//**
Inserts a struct to a hash table. */
@@ -59,29 +94,6 @@ do {\
}\
} while (0)
-/*******************************************************************//**
-Inserts a struct to the head of hash table. */
-
-#define HASH_PREPEND(TYPE, NAME, TABLE, FOLD, DATA) \
-do { \
- hash_cell_t* cell3333; \
- TYPE* struct3333; \
- \
- (DATA)->NAME = NULL; \
- \
- cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
- \
- if (cell3333->node == NULL) { \
- cell3333->node = DATA; \
- DATA->NAME = NULL; \
- } else { \
- struct3333 = (TYPE*) cell3333->node; \
- \
- DATA->NAME = struct3333; \
- \
- cell3333->node = DATA; \
- } \
-} while (0)
#ifdef UNIV_HASH_DEBUG
# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1
@@ -117,18 +129,6 @@ do {\
HASH_INVALIDATE(DATA, NAME);\
} while (0)
-#define HASH_REPLACE(TYPE, NAME, TABLE, FOLD, DATA_OLD, DATA_NEW) \
- do { \
- (DATA_NEW)->NAME = (DATA_OLD)->NAME; \
- \
- hash_cell_t& cell3333 \
- = (TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
- TYPE** struct3333 = (TYPE**)&cell3333.node; \
- while (*struct3333 != DATA_OLD) { \
- struct3333 = &((*struct3333)->NAME); \
- } \
- *struct3333 = DATA_NEW; \
- } while (0)
/*******************************************************************//**
Gets the first struct in a hash chain, NULL if none. */
@@ -183,33 +183,6 @@ do { \
} \
} while (0)
-/****************************************************************//**
-Move all hash table entries from OLD_TABLE to NEW_TABLE. */
-
-#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
-do {\
- ulint i2222;\
- ulint cell_count2222;\
-\
- cell_count2222 = (OLD_TABLE)->n_cells; \
-\
- for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
- NODE_TYPE* node2222 = static_cast<NODE_TYPE*>(\
- HASH_GET_FIRST((OLD_TABLE), i2222));\
-\
- while (node2222) {\
- NODE_TYPE* next2222 = static_cast<NODE_TYPE*>(\
- node2222->PTR_NAME);\
- ulint fold2222 = FOLD_FUNC(node2222);\
-\
- HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
- fold2222, node2222);\
-\
- node2222 = next2222;\
- }\
- }\
-} while (0)
-
/** Hash table with singly-linked overflow lists */
struct hash_table_t
{
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
index 73be4b0a8e8..e38515f0402 100644
--- a/storage/innobase/include/ibuf0ibuf.h
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -279,8 +279,6 @@ Must not be called when recv_no_ibuf_operations==true.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] x_latch FALSE if relaxed check (avoid latching the
bitmap page)
-@param[in] file file name
-@param[in] line line where called
@param[in,out] mtr mtr which will contain an x-latch to the
bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
in which case a new transaction is created.
@@ -292,8 +290,6 @@ ibuf_page_low(
#ifdef UNIV_DEBUG
bool x_latch,
#endif /* UNIV_DEBUG */
- const char* file,
- unsigned line,
mtr_t* mtr)
MY_ATTRIBUTE((warn_unused_result));
@@ -305,7 +301,7 @@ Must not be called when recv_no_ibuf_operations==true.
@param[in,out] mtr mini-transaction or NULL
@return TRUE if level 2 or level 3 page */
# define ibuf_page(page_id, zip_size, mtr) \
- ibuf_page_low(page_id, zip_size, true, __FILE__, __LINE__, mtr)
+ ibuf_page_low(page_id, zip_size, true, mtr)
#else /* UNIV_DEBUG */
@@ -316,7 +312,7 @@ Must not be called when recv_no_ibuf_operations==true.
@param[in,out] mtr mini-transaction or NULL
@return TRUE if level 2 or level 3 page */
# define ibuf_page(page_id, zip_size, mtr) \
- ibuf_page_low(page_id, zip_size, __FILE__, __LINE__, mtr)
+ ibuf_page_low(page_id, zip_size, mtr)
#endif /* UNIV_DEBUG */
/***********************************************************************//**
@@ -360,9 +356,11 @@ exist entries for such a page if the page belonged to an index which
subsequently was dropped.
@param block X-latched page to try to apply changes to, or NULL to discard
@param page_id page identifier
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
-void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
- ulint zip_size);
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return error code */
+dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block,
+ const page_id_t page_id,
+ ulint zip_size);
/** Delete all change buffer entries for a tablespace,
in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl
index 9f4e937f31d..1e21f74ff2b 100644
--- a/storage/innobase/include/ibuf0ibuf.inl
+++ b/storage/innobase/include/ibuf0ibuf.inl
@@ -100,9 +100,8 @@ ibuf_should_try(
decide */
{
return(innodb_change_buffering
+ && !(index->type & (DICT_CLUSTERED | DICT_IBUF))
&& ibuf.max_size != 0
- && !dict_index_is_clust(index)
- && !dict_index_is_spatial(index)
&& index->table->quiesce == QUIESCE_NONE
&& (ignore_sec_unique || !dict_index_is_unique(index)));
}
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 40bb557a5b2..16acd031177 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -28,40 +28,30 @@ Created 5/7/1996 Heikki Tuuri
#define lock0lock_h
#include "buf0types.h"
-#include "trx0types.h"
+#include "trx0trx.h"
#include "mtr0types.h"
#include "rem0types.h"
-#include "que0types.h"
-#include "lock0types.h"
#include "hash0hash.h"
#include "srv0srv.h"
#include "ut0vec.h"
#include "gis0rtree.h"
#include "lock0prdt.h"
-
-/** Alternatives for innodb_lock_schedule_algorithm, which can be changed by
- setting innodb_lock_schedule_algorithm. */
-enum innodb_lock_schedule_algorithm_t {
- /*!< First Come First Served */
- INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
- /*!< Variance-Aware-Transaction-Scheduling */
- INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
-};
-
-extern ulong innodb_lock_schedule_algorithm;
+#include "transactional_lock_guard.h"
// Forward declaration
class ReadView;
/** The value of innodb_deadlock_detect */
-extern my_bool innobase_deadlock_detect;
+extern my_bool innodb_deadlock_detect;
+/** The value of innodb_deadlock_report */
+extern ulong innodb_deadlock_report;
+
+namespace Deadlock
+{
+ /** The allowed values of innodb_deadlock_report */
+ enum report { REPORT_OFF, REPORT_BASIC, REPORT_FULL };
+}
-/*********************************************************************//**
-Gets the size of a lock struct.
-@return size in bytes */
-ulint
-lock_get_size(void);
-/*===============*/
/*********************************************************************//**
Gets the heap_no of the smallest user record on a page.
@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
@@ -70,6 +60,12 @@ ulint
lock_get_min_heap_no(
/*=================*/
const buf_block_t* block); /*!< in: buffer block */
+
+/** Discard locks for an index when purging DELETE FROM SYS_INDEXES
+after an aborted CREATE INDEX operation.
+@param index a stale index on which ADD INDEX operation was aborted */
+ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index);
+
/*************************************************************//**
Updates the lock table when we have reorganized a page. NOTE: we copy
also the locks set on the infimum of the page; the infimum may carry
@@ -129,28 +125,18 @@ lock_update_merge_right(
const buf_block_t* left_block); /*!< in: merged index
page which will be
discarded */
-/*************************************************************//**
-Updates the lock table when the root page is copied to another in
-btr_root_raise_and_insert. Note that we leave lock structs on the
+/** Update locks when the root page is copied to another in
+btr_root_raise_and_insert(). Note that we leave lock structs on the
root page, even though they do not make sense on other than leaf
pages: the reason is that in a pessimistic update the infimum record
of the root page will act as a dummy carrier of the locks of the record
to be updated. */
-void
-lock_update_root_raise(
-/*===================*/
- const buf_block_t* block, /*!< in: index page to which copied */
- const buf_block_t* root); /*!< in: root page */
-/*************************************************************//**
-Updates the lock table when a page is copied to another and the original page
-is removed from the chain of leaf pages, except if page is the root! */
-void
-lock_update_copy_and_discard(
-/*=========================*/
- const buf_block_t* new_block, /*!< in: index page to
- which copied */
- const buf_block_t* block); /*!< in: index page;
- NOT the root! */
+void lock_update_root_raise(const buf_block_t &block, const page_id_t root);
+/** Update the lock table when a page is copied to another.
+@param new_block the target page
+@param old old page (not index root page) */
+void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old);
+
/** Update gap locks between the last record of the left_block and the
first record of the right_block when a record is about to be inserted
at the start of the right_block, even though it should "naturally" be
@@ -192,24 +178,16 @@ lock_update_split_left(
/*===================*/
const buf_block_t* right_block, /*!< in: right page */
const buf_block_t* left_block); /*!< in: left page */
-/*************************************************************//**
-Updates the lock table when a page is merged to the left. */
-void
-lock_update_merge_left(
-/*===================*/
- const buf_block_t* left_block, /*!< in: left page to
- which merged */
- const rec_t* orig_pred, /*!< in: original predecessor
- of supremum on the left page
- before merge */
- const buf_block_t* right_block); /*!< in: merged index page
- which will be discarded */
-/*************************************************************//**
-Updates the lock table when a page is split and merged to
-two pages. */
-UNIV_INTERN
-void
-lock_update_split_and_merge(
+/** Update the lock table when a page is merged to the left.
+@param left left page
+@param orig_pred original predecessor of supremum on the left page before merge
+@param right merged, to-be-discarded right page */
+void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
+ const page_id_t right);
+
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation. */
+void lock_update_split_and_merge(
const buf_block_t* left_block, /*!< in: left page to which merged */
const rec_t* orig_pred, /*!< in: original predecessor of
supremum on the left page before merge*/
@@ -220,9 +198,9 @@ inherited from rec. */
void
lock_rec_reset_and_inherit_gap_locks(
/*=================================*/
- const buf_block_t* heir_block, /*!< in: block containing the
+ const buf_block_t& heir_block, /*!< in: block containing the
record which inherits */
- const buf_block_t* block, /*!< in: block containing the
+ const page_id_t donor, /*!< in: page containing the
record from which inherited;
does NOT reset the locks on
this record */
@@ -271,20 +249,25 @@ lock_rec_store_on_page_infimum(
record of the same page; lock
bits are reset on the
record */
-/*********************************************************************//**
-Restores the state of explicit lock requests on a single record, where the
-state was stored on the infimum of the page. */
-void
-lock_rec_restore_from_page_infimum(
-/*===============================*/
- const buf_block_t* block, /*!< in: buffer block containing rec */
- const rec_t* rec, /*!< in: record whose lock state
- is restored */
- const buf_block_t* donator);/*!< in: page (rec is not
- necessarily on this page)
- whose infimum stored the lock
- state; lock bits are reset on
- the infimum */
+/** Restore the explicit lock requests on a single record, where the
+state was stored on the infimum of a page.
+@param block buffer block containing rec
+@param rec record whose lock state is restored
+@param donator page (rec is not necessarily on this page)
+whose infimum stored the lock state; lock bits are reset on the infimum */
+void lock_rec_restore_from_page_infimum(const buf_block_t &block,
+ const rec_t *rec, page_id_t donator);
+
+/**
+Create a table lock, without checking for deadlocks or lock compatibility.
+@param table table on which the lock is created
+@param type_mode lock type and mode
+@param trx transaction
+@param c_lock conflicting lock
+@return the created lock object */
+lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx,
+ lock_t *c_lock= nullptr);
+
/*********************************************************************//**
Checks if locks of other transactions prevent an immediate insert of
a record. If they do, first tests if the query thread should anyway
@@ -295,8 +278,6 @@ for a gap x-lock to the lock queue.
dberr_t
lock_rec_insert_check_and_lock(
/*===========================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
- set, does nothing */
const rec_t* rec, /*!< in: record after which to insert */
buf_block_t* block, /*!< in/out: buffer block of rec */
dict_index_t* index, /*!< in: index */
@@ -319,8 +300,6 @@ lock queue.
dberr_t
lock_clust_rec_modify_check_and_lock(
/*=================================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
- bit is set, does nothing */
const buf_block_t* block, /*!< in: buffer block of rec */
const rec_t* rec, /*!< in: record which should be
modified */
@@ -430,71 +409,41 @@ lock_clust_rec_read_check_and_lock_alt(
LOCK_REC_NOT_GAP */
que_thr_t* thr) /*!< in: query thread */
MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Checks that a record is seen in a consistent read.
-@return true if sees, or false if an earlier version of the record
-should be retrieved */
-bool
-lock_clust_rec_cons_read_sees(
-/*==========================*/
- const rec_t* rec, /*!< in: user record which should be read or
- passed over by a read cursor */
- dict_index_t* index, /*!< in: clustered index */
- const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
- ReadView* view); /*!< in: consistent read view */
-/*********************************************************************//**
-Checks that a non-clustered index record is seen in a consistent read.
-NOTE that a non-clustered index page contains so little information on
-its modifications that also in the case false, the present version of
-rec may be the right, but we must check this from the clustered index
-record.
-
-@return true if certainly sees, or false if an earlier version of the
-clustered index record might be needed */
-bool
-lock_sec_rec_cons_read_sees(
-/*========================*/
- const rec_t* rec, /*!< in: user record which
- should be read or passed over
- by a read cursor */
- const dict_index_t* index, /*!< in: index */
- const ReadView* view) /*!< in: consistent read view */
- MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Locks the specified database table in the mode given. If the lock cannot
-be granted immediately, the query thread is put to wait.
-@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
-dberr_t
-lock_table(
-/*=======*/
- unsigned flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
- does nothing */
- dict_table_t* table, /*!< in/out: database table
- in dictionary cache */
- lock_mode mode, /*!< in: lock mode */
- que_thr_t* thr) /*!< in: query thread */
- MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Creates a table IX lock object for a resurrected transaction. */
-void
-lock_table_ix_resurrect(
-/*====================*/
- dict_table_t* table, /*!< in/out: table */
- trx_t* trx); /*!< in/out: transaction */
+/** Acquire a table lock.
+@param table table to be locked
+@param fktable pointer to table, in case of a FOREIGN key check
+@param mode lock mode
+@param thr SQL execution thread
+@retval DB_SUCCESS if the lock was acquired
+@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table
+@retval DB_LOCK_WAIT if lock_wait() must be invoked */
+dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable,
+ lock_mode mode, que_thr_t *thr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Create a table lock object for a resurrected transaction.
+@param table table to be X-locked
+@param trx transaction
+@param mode LOCK_X or LOCK_IX */
+void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode);
/** Sets a lock on a table based on the given mode.
-@param[in] table table to lock
-@param[in,out] trx transaction
-@param[in] mode LOCK_X or LOCK_S
-@return error code or DB_SUCCESS. */
-dberr_t
-lock_table_for_trx(
- dict_table_t* table,
- trx_t* trx,
- enum lock_mode mode)
+@param table table to lock
+@param trx transaction
+@param mode LOCK_X or LOCK_S
+@param no_wait whether to skip handling DB_LOCK_WAIT
+@return error code */
+dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
+ bool no_wait= false)
MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Exclusively lock the data dictionary tables.
+@param trx dictionary transaction
+@return error code
+@retval DB_SUCCESS on success */
+dberr_t lock_sys_tables(trx_t *trx);
+
/*************************************************************//**
Removes a granted record lock of a transaction from the queue and grants
locks to other transactions waiting in the queue if they now are entitled
@@ -504,7 +453,7 @@ lock_rec_unlock(
/*============*/
trx_t* trx, /*!< in/out: transaction that has
set a record lock */
- const buf_block_t* block, /*!< in: buffer block containing rec */
+ const page_id_t id, /*!< in: page containing rec */
const rec_t* rec, /*!< in: record */
lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */
@@ -512,17 +461,17 @@ lock_rec_unlock(
and release possible other transactions waiting because of these locks. */
void lock_release(trx_t* trx);
+/** Release the explicit locks of a committing transaction while
+dict_sys.latch is exclusively locked,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_drop(trx_t *trx);
+
/** Release non-exclusive locks on XA PREPARE,
and release possible other transactions waiting because of these locks. */
void lock_release_on_prepare(trx_t *trx);
-/*************************************************************//**
-Get the lock hash table */
-UNIV_INLINE
-hash_table_t*
-lock_hash_get(
-/*==========*/
- ulint mode); /*!< in: lock mode */
+/** Release locks on a table whose creation is being rolled back */
+ATTRIBUTE_COLD void lock_release_on_rollback(trx_t *trx, dict_table_t *table);
/**********************************************************************//**
Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
@@ -559,124 +508,48 @@ lock_report_trx_id_insanity(
trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */
/*********************************************************************//**
Prints info of locks for all transactions.
-@return FALSE if not able to obtain lock mutex and exits without
-printing info */
+@return FALSE if not able to acquire lock_sys.latch (and display info) */
ibool
lock_print_info_summary(
/*====================*/
FILE* file, /*!< in: file where to print */
- ibool nowait) /*!< in: whether to wait for the lock mutex */
+ ibool nowait) /*!< in: whether to wait for lock_sys.latch */
MY_ATTRIBUTE((warn_unused_result));
/** Prints transaction lock wait and MVCC state.
@param[in,out] file file where to print
@param[in] trx transaction
-@param[in] now current time */
-void
-lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now);
+@param[in] now current my_hrtime_coarse() */
+void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx,
+ my_hrtime_t now);
/*********************************************************************//**
-Prints info of locks for each transaction. This function assumes that the
-caller holds the lock mutex and more importantly it will release the lock
-mutex on behalf of the caller. (This should be fixed in the future). */
+Prints info of locks for each transaction. This function will release
+lock_sys.latch, which the caller must be holding in exclusive mode. */
void
lock_print_info_all_transactions(
/*=============================*/
FILE* file); /*!< in: file where to print */
-/*********************************************************************//**
-Return approximate number or record locks (bits set in the bitmap) for
-this transaction. Since delete-marked records may be removed, the
-record count will not be precise.
-The caller must be holding lock_sys.mutex. */
-ulint
-lock_number_of_rows_locked(
-/*=======================*/
- const trx_lock_t* trx_lock) /*!< in: transaction locks */
- MY_ATTRIBUTE((warn_unused_result));
/*********************************************************************//**
Return the number of table locks for a transaction.
-The caller must be holding lock_sys.mutex. */
+The caller must be holding lock_sys.latch. */
ulint
lock_number_of_tables_locked(
/*=========================*/
const trx_lock_t* trx_lock) /*!< in: transaction locks */
MY_ATTRIBUTE((warn_unused_result));
-/*******************************************************************//**
-Gets the type of a lock. Non-inline version for using outside of the
-lock module.
-@return LOCK_TABLE or LOCK_REC */
-ulint
-lock_get_type(
-/*==========*/
- const lock_t* lock); /*!< in: lock */
-
-/*******************************************************************//**
-Gets the id of the table on which the lock is.
-@return id of the table */
-table_id_t
-lock_get_table_id(
-/*==============*/
- const lock_t* lock); /*!< in: lock */
-
-/** Determine which table a lock is associated with.
-@param[in] lock the lock
-@return name of the table */
-const table_name_t&
-lock_get_table_name(
- const lock_t* lock);
-
-/*******************************************************************//**
-For a record lock, gets the index on which the lock is.
-@return index */
-const dict_index_t*
-lock_rec_get_index(
-/*===============*/
- const lock_t* lock); /*!< in: lock */
-
-/*******************************************************************//**
-For a record lock, gets the name of the index on which the lock is.
-The string should not be free()'d or modified.
-@return name of the index */
-const char*
-lock_rec_get_index_name(
-/*====================*/
- const lock_t* lock); /*!< in: lock */
-
-/*******************************************************************//**
-Check if there are any locks (table or rec) against table.
-@return TRUE if locks exist */
-bool
-lock_table_has_locks(
-/*=================*/
- const dict_table_t* table); /*!< in: check if there are any locks
- held on records in this table or on the
- table itself */
+/** Check if there are any locks on a table.
+@return true if table has either table or record locks. */
+bool lock_table_has_locks(dict_table_t *table);
-/** A task which wakes up threads whose lock wait may have lasted too long */
-void lock_wait_timeout_task(void*);
-
-/********************************************************************//**
-Releases a user OS thread waiting for a lock to be released, if the
-thread is already suspended. */
-void
-lock_wait_release_thread_if_suspended(
-/*==================================*/
- que_thr_t* thr); /*!< in: query thread associated with the
- user OS thread */
-
-/***************************************************************//**
-Puts a user OS thread to wait for a lock to be released. If an error
-occurs during the wait trx->error_state associated with thr is
-!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
-are possible errors. DB_DEADLOCK is returned if selective deadlock
-resolution chose this transaction as a victim. */
-void
-lock_wait_suspend_thread(
-/*=====================*/
- que_thr_t* thr); /*!< in: query thread associated with the
- user OS thread */
+/** Wait for a lock to be released.
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim
+@retval DB_INTERRUPTED if the execution was interrupted by the user
+@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out
+@retval DB_SUCCESS if the lock was granted */
+dberr_t lock_wait(que_thr_t *thr);
/*********************************************************************//**
Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
function should be called at the the end of an SQL statement, by the
@@ -685,28 +558,15 @@ void
lock_unlock_table_autoinc(
/*======================*/
trx_t* trx); /*!< in/out: transaction */
-/*********************************************************************//**
-Check whether the transaction has already been rolled back because it
-was selected as a deadlock victim, or if it has to wait then cancel
-the wait lock.
-@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
-dberr_t
-lock_trx_handle_wait(
-/*=================*/
- trx_t* trx); /*!< in/out: trx lock state */
-/*********************************************************************//**
-Get the number of locks on a table.
-@return number of locks */
-ulint
-lock_table_get_n_locks(
-/*===================*/
- const dict_table_t* table); /*!< in: table */
-/*******************************************************************//**
-Initialise the trx lock list. */
-void
-lock_trx_lock_list_init(
-/*====================*/
- trx_lock_list_t* lock_list); /*!< List to initialise */
+
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
+@param trx transaction that is or was waiting for a lock
+@retval DB_SUCCESS if the lock was granted
+@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+ lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx);
/*********************************************************************//**
Checks that a transaction id is sensible, i.e., not in the future.
@@ -732,16 +592,11 @@ lock_trx_has_sys_table_locks(
/** Check if the transaction holds an explicit exclusive lock on a record.
@param[in] trx transaction
@param[in] table table
-@param[in] block leaf page
+@param[in] id leaf page identifier
@param[in] heap_no heap number identifying the record
@return whether an explicit X-lock is held */
-bool
-lock_trx_has_expl_x_lock(
- const trx_t* trx, /*!< in: transaction to check */
- const dict_table_t* table, /*!< in: table to check */
- const buf_block_t* block, /*!< in: buffer block of the record */
- ulint heap_no)/*!< in: record heap number */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table,
+ page_id_t id, ulint heap_no);
#endif /* UNIV_DEBUG */
/** Lock operation struct */
@@ -750,47 +605,164 @@ struct lock_op_t{
lock_mode mode; /*!< lock mode */
};
-typedef ib_mutex_t LockMutex;
-
/** The lock system struct */
class lock_sys_t
{
+ friend struct LockGuard;
+ friend struct LockMultiGuard;
+ friend struct TMLockGuard;
+ friend struct TMLockMutexGuard;
+ friend struct TMLockTrxGuard;
+
+ /** Hash table latch */
+ struct hash_latch
+#ifdef SUX_LOCK_GENERIC
+ : private rw_lock
+ {
+ /** Wait for an exclusive lock */
+ void wait();
+ /** Try to acquire a lock */
+ bool try_acquire() { return write_trylock(); }
+ /** Acquire a lock */
+ void acquire() { if (!try_acquire()) wait(); }
+ /** Release a lock */
+ void release();
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const
+ { return rw_lock::is_locked_or_waiting(); }
+ /** @return whether this latch is possibly held by any thread */
+ bool is_locked() const { return rw_lock::is_locked(); }
+#else
+ {
+ private:
+ srw_spin_lock_low lock;
+ public:
+ /** Try to acquire a lock */
+ bool try_acquire() { return lock.wr_lock_try(); }
+ /** Acquire a lock */
+ void acquire() { lock.wr_lock(); }
+ /** Release a lock */
+ void release() { lock.wr_unlock(); }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return lock.is_locked_or_waiting(); }
+ /** @return whether this latch is possibly held by any thread */
+ bool is_locked() const noexcept { return lock.is_locked(); }
+#endif
+ };
+
+public:
+ struct hash_table
+ {
+ /** Number of consecutive array[] elements occupied by a hash_latch */
+ static constexpr size_t LATCH= sizeof(void*) >= sizeof(hash_latch) ? 1 : 2;
+ static_assert(sizeof(hash_latch) <= LATCH * sizeof(void*), "allocation");
+
+ /** Number of array[] elements per hash_latch.
+ Must be LATCH less than a power of 2. */
+ static constexpr size_t ELEMENTS_PER_LATCH= (64 / sizeof(void*)) - LATCH;
+ static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+ ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+ /** number of payload elements in array[]. Protected by lock_sys.latch. */
+ ulint n_cells;
+ /** the hash table, with pad(n_cells) elements, aligned to L1 cache size;
+ in any hash chain, lock_t::is_waiting() entries must not precede
+ granted locks */
+ hash_cell_t *array;
+
+ /** Create the hash table.
+ @param n the lower bound of n_cells */
+ void create(ulint n);
+
+ /** Resize the hash table.
+ @param n the lower bound of n_cells */
+ void resize(ulint n);
+
+ /** Free the hash table. */
+ void free() { aligned_free(array); array= nullptr; }
+
+ /** @return the index of an array element */
+ inline ulint calc_hash(ulint fold) const;
+
+ /** @return raw array index converted to padded index */
+ static ulint pad(ulint h)
+ {
+ ulint latches= LATCH * (h / ELEMENTS_PER_LATCH);
+ ulint empty_slots= (h / ELEMENTS_PER_LATCH) * EMPTY_SLOTS_PER_LATCH;
+ return LATCH + latches + empty_slots + h;
+ }
+
+ /** Get a latch. */
+ static hash_latch *latch(hash_cell_t *cell)
+ {
+ void *l= ut_align_down(cell, sizeof *cell *
+ (ELEMENTS_PER_LATCH + LATCH));
+ return static_cast<hash_latch*>(l);
+ }
+ /** Get a hash table cell. */
+ inline hash_cell_t *cell_get(ulint fold) const;
+
+#ifdef UNIV_DEBUG
+ void assert_locked(const page_id_t id) const;
+#else
+ void assert_locked(const page_id_t) const {}
+#endif
+
+ private:
+ /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+ static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+ /** @return the index of an array element */
+ static ulint calc_hash(ulint fold, ulint n_cells)
+ {
+ return pad(hash(fold, n_cells));
+ }
+ };
+
+private:
bool m_initialised;
+ /** mutex protecting the locks */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
+#ifdef UNIV_DEBUG
+ /** The owner of exclusive latch (0 if none); protected by latch */
+ std::atomic<pthread_t> writer{0};
+ /** Number of shared latches */
+ std::atomic<ulint> readers{0};
+#endif
+#ifdef SUX_LOCK_GENERIC
+protected:
+ /** mutex for hash_latch::wait() */
+ pthread_mutex_t hash_mutex;
+ /** condition variable for hash_latch::wait() */
+ pthread_cond_t hash_cond;
+#endif
public:
- MY_ALIGNED(CACHE_LINE_SIZE)
- LockMutex mutex; /*!< Mutex protecting the
- locks */
/** record locks */
- hash_table_t rec_hash;
+ hash_table rec_hash;
/** predicate locks for SPATIAL INDEX */
- hash_table_t prdt_hash;
+ hash_table prdt_hash;
/** page locks for SPATIAL INDEX */
- hash_table_t prdt_page_hash;
-
- MY_ALIGNED(CACHE_LINE_SIZE)
- LockMutex wait_mutex; /*!< Mutex protecting the
- next two fields */
- srv_slot_t* waiting_threads; /*!< Array of user threads
- suspended while waiting for
- locks within InnoDB, protected
- by the lock_sys.wait_mutex;
- os_event_set() and
- os_event_reset() on
- waiting_threads[]->event
- are protected by
- trx_t::mutex */
- srv_slot_t* last_slot; /*!< highest slot ever used
- in the waiting_threads array,
- protected by
- lock_sys.wait_mutex */
-
- ulint n_lock_max_wait_time; /*!< Max wait time */
-
- std::unique_ptr<tpool::timer> timeout_timer; /*!< Thread pool timer task */
- bool timeout_timer_active;
-
-
+ hash_table prdt_page_hash;
+
+ /** mutex covering lock waits; @see trx_lock_t::wait_lock */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t wait_mutex;
+private:
+ /** The increment of wait_count for a wait. Anything smaller is a
+ pending wait count. */
+ static constexpr uint64_t WAIT_COUNT_STEP= 1U << 19;
+ /** waits and total number of lock waits; protected by wait_mutex */
+ uint64_t wait_count;
+ /** Cumulative wait time; protected by wait_mutex */
+ uint32_t wait_time;
+ /** Longest wait time; protected by wait_mutex */
+ uint32_t wait_time_max;
+public:
+ /** number of deadlocks detected; protected by wait_mutex */
+ ulint deadlocks;
+ /** number of lock wait timeouts; protected by wait_mutex */
+ ulint timeouts;
/**
Constructor.
@@ -800,8 +772,99 @@ public:
lock_sys_t(): m_initialised(false) {}
- bool is_initialised() { return m_initialised; }
+ bool is_initialised() const { return m_initialised; }
+
+#ifdef UNIV_PFS_RWLOCK
+ /** Acquire exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE
+ void wr_lock(const char *file, unsigned line);
+ /** Release exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE void wr_unlock();
+ /** Acquire shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_lock(const char *file, unsigned line);
+ /** Release shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_unlock();
+#else
+ /** Acquire exclusive lock_sys.latch */
+ void wr_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.wr_lock();
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release exclusive lock_sys.latch */
+ void wr_unlock()
+ {
+ ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
+ pthread_self());
+ latch.wr_unlock();
+ }
+ /** Acquire shared lock_sys.latch */
+ void rd_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.rd_lock();
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ }
+ /** Release shared lock_sys.latch */
+ void rd_unlock()
+ {
+ ut_ad(!is_writer());
+ ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
+ latch.rd_unlock();
+ }
+#endif
+ /** Try to acquire exclusive lock_sys.latch
+ @return whether the latch was acquired */
+ bool wr_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.wr_lock_try()) return false;
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ return true;
+ }
+ /** Try to acquire shared lock_sys.latch
+ @return whether the latch was acquired */
+ bool rd_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.rd_lock_try()) return false;
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ return true;
+ }
+ /** Assert that wr_lock() has been invoked by this thread */
+ void assert_locked() const { ut_ad(is_writer()); }
+ /** Assert that wr_lock() has not been invoked by this thread */
+ void assert_unlocked() const { ut_ad(!is_writer()); }
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread is the lock_sys.latch writer */
+ bool is_writer() const
+ {
+# ifdef SUX_LOCK_GENERIC
+ return writer.load(std::memory_order_relaxed) == pthread_self();
+# else
+ return writer.load(std::memory_order_relaxed) == pthread_self() ||
+ (xtest() && !latch.is_locked_or_waiting());
+# endif
+ }
+ /** Assert that a lock shard is exclusively latched (by some thread) */
+ void assert_locked(const lock_t &lock) const;
+ /** Assert that a table lock shard is exclusively latched by this thread */
+ void assert_locked(const dict_table_t &table) const;
+ /** Assert that a hash table cell is exclusively latched (by some thread) */
+ void assert_locked(const hash_cell_t &cell) const;
+#else
+ void assert_locked(const lock_t &) const {}
+ void assert_locked(const dict_table_t &) const {}
+ void assert_locked(const hash_cell_t &) const {}
+#endif
/**
Creates the lock system at database start.
@@ -822,45 +885,296 @@ public:
/** Closes the lock system at database shutdown. */
void close();
- /** @return the hash value for a page address */
- ulint hash(const page_id_t id) const
- { ut_ad(mutex_own(&mutex)); return rec_hash.calc_hash(id.fold()); }
- /** Get the first lock on a page.
- @param lock_hash hash table to look at
- @param id page number
- @return first lock
- @retval nullptr if none exists */
- lock_t *get_first(const hash_table_t &lock_hash, const page_id_t id) const
+ /** Check for deadlocks while holding only lock_sys.wait_mutex. */
+ void deadlock_check();
+
+ /** Cancel a waiting lock request.
+ @tparam check_victim whether to check for DB_DEADLOCK
+ @param trx active transaction
+ @param lock waiting lock request
+ @retval DB_SUCCESS if no lock existed
+ @retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
+ @retval DB_LOCK_WAIT if the lock was canceled */
+ template<bool check_victim>
+ static dberr_t cancel(trx_t *trx, lock_t *lock);
+ /** Cancel a waiting lock request (if any) when killing a transaction */
+ static void cancel(trx_t *trx);
+
+ /** Note that a record lock wait started */
+ inline void wait_start();
+
+ /** Note that a record lock wait resumed */
+ inline void wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now);
+
+ /** @return pending number of lock waits */
+ ulint get_wait_pending() const
+ {
+ return static_cast<ulint>(wait_count & (WAIT_COUNT_STEP - 1));
+ }
+ /** @return cumulative number of lock waits */
+ ulint get_wait_cumulative() const
+ { return static_cast<ulint>(wait_count / WAIT_COUNT_STEP); }
+ /** Cumulative wait time; protected by wait_mutex */
+ ulint get_wait_time_cumulative() const { return wait_time; }
+ /** Longest wait time; protected by wait_mutex */
+ ulint get_wait_time_max() const { return wait_time_max; }
+
+ /** Get the lock hash table for a mode */
+ hash_table &hash_get(ulint mode)
{
- ut_ad(&lock_hash == &rec_hash || &lock_hash == &prdt_hash ||
- &lock_hash == &prdt_page_hash);
- for (lock_t *lock= static_cast<lock_t*>
- (HASH_GET_FIRST(&lock_hash, hash(id)));
- lock; lock= static_cast<lock_t*>(HASH_GET_NEXT(hash, lock)))
- if (lock->un_member.rec_lock.page_id == id)
- return lock;
- return nullptr;
+ if (UNIV_LIKELY(!(mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))))
+ return rec_hash;
+ return (mode & LOCK_PREDICATE) ? prdt_hash : prdt_page_hash;
}
- /** Get the first record lock on a page.
- @param id page number
- @return first lock
- @retval nullptr if none exists */
- lock_t *get_first(const page_id_t id) const
- { return get_first(rec_hash, id); }
- /** Get the first predicate lock on a SPATIAL INDEX page.
+ /** Get the lock hash table for a predicate mode */
+ hash_table &prdt_hash_get(bool page)
+ { return page ? prdt_page_hash : prdt_hash; }
+
+ /** Get the first lock on a page.
+ @param cell hash table cell
@param id page number
@return first lock
@retval nullptr if none exists */
- lock_t *get_first_prdt(const page_id_t id) const
- { return get_first(prdt_hash, id); }
- /** Get the first predicate lock on a SPATIAL INDEX page.
- @param id page number
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id);
+
+ /** Get the first explicit lock request on a record.
+ @param cell first lock hash table cell
+ @param id page identifier
+ @param heap_no record identifier in page
@return first lock
@retval nullptr if none exists */
- lock_t *get_first_prdt_page(const page_id_t id) const
- { return get_first(prdt_page_hash, id); }
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no);
+
+ /** Remove locks on a discarded SPATIAL INDEX page.
+ @param id page to be discarded
+ @param all whether to discard also from lock_sys.prdt_hash */
+ void prdt_page_free_from_discard(const page_id_t id, bool all= false);
+
+ /** Cancel possible lock waiting for a transaction */
+ static void cancel_lock_wait_for_trx(trx_t *trx);
+};
+
+/** The lock system */
+extern lock_sys_t lock_sys;
+
+/** @return the index of an array element */
+inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return calc_hash(fold, n_cells);
+}
+
+/** Get a hash table cell. */
+inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return &array[calc_hash(fold)];
+}
+
+/** Get the first lock on a page.
+@param cell hash table cell
+@param id page number
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id)
+{
+ lock_sys.assert_locked(cell);
+ for (auto lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id)
+ return lock;
+ }
+ return nullptr;
+}
+
+/** lock_sys.latch exclusive guard */
+struct LockMutexGuard
+{
+ LockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ { lock_sys.wr_lock(SRW_LOCK_ARGS(file, line)); }
+ ~LockMutexGuard() { lock_sys.wr_unlock(); }
+};
+
+/** lock_sys latch guard for 1 page_id_t */
+struct LockGuard
+{
+ LockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ ~LockGuard()
+ {
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+};
+
+/** lock_sys latch guard for 2 page_id_t */
+struct LockMultiGuard
+{
+ LockMultiGuard(lock_sys_t::hash_table &hash,
+ const page_id_t id1, const page_id_t id2);
+ ~LockMultiGuard();
+
+ /** @return the first hash array cell */
+ hash_cell_t &cell1() const { return *cell1_; }
+ /** @return the second hash array cell */
+ hash_cell_t &cell2() const { return *cell2_; }
+private:
+ /** The first hash array cell */
+ hash_cell_t *cell1_;
+ /** The second hash array cell */
+ hash_cell_t *cell2_;
+};
+
+/** lock_sys.latch exclusive guard using transactional memory */
+struct TMLockMutexGuard
+{
+ TRANSACTIONAL_INLINE
+ TMLockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.wr_lock(SRW_LOCK_ARGS(file, line));
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockMutexGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided()) xend(); else
+#endif
+ lock_sys.wr_unlock();
+ }
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept
+ { return !lock_sys.latch.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** lock_sys latch guard for 1 page_id_t, using transactional memory */
+struct TMLockGuard
+{
+ TRANSACTIONAL_TARGET
+ TMLockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ TRANSACTIONAL_INLINE ~TMLockGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ /** whether the latches were elided */
+ bool elided;
+#endif
+};
+
+/** guard for shared lock_sys.latch and trx_t::mutex using
+transactional memory */
+struct TMLockTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE
+#ifndef UNIV_PFS_RWLOCK
+ TMLockTrxGuard(trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) trx
+#else
+ TMLockTrxGuard(const char *file, unsigned line, trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) SRW_LOCK_CALL, trx
+#endif
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (!lock_sys.latch.is_write_locked() && was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.rd_lock(SRW_LOCK_ARGS(file, line));
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys.rd_unlock();
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** guard for trx_t::mutex using transactional memory */
+struct TMTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE TMTrxGuard(trx_t &trx) : trx(trx)
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE ~TMTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
};
/*********************************************************************//**
@@ -871,13 +1185,8 @@ UNIV_INLINE
lock_t*
lock_rec_create(
/*============*/
-#ifdef WITH_WSREP
lock_t* c_lock, /*!< conflicting lock */
- que_thr_t* thr, /*!< thread owning trx */
-#endif
- unsigned type_mode,/*!< in: lock mode and wait
- flag, type is ignored and
- replaced by LOCK_REC */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
const buf_block_t* block, /*!< in: buffer block containing
the record */
ulint heap_no,/*!< in: heap number of the record */
@@ -887,19 +1196,15 @@ lock_rec_create(
/*!< in: true if caller owns
trx mutex */
-/*************************************************************//**
-Removes a record lock request, waiting or granted, from the queue. */
-void
-lock_rec_discard(
-/*=============*/
- lock_t* in_lock); /*!< in: record lock object: all
- record locks which are contained
- in this lock object are removed */
+/** Remove a record lock request, waiting or granted, on a discarded page
+@param hash hash table
+@param in_lock lock object */
+void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock);
/** Create a new record lock and inserts it to the lock queue,
without checking for deadlocks or conflicts.
-@param[in] type_mode lock mode and wait flag; type will be replaced
- with LOCK_REC
+@param[in] c_lock conflicting lock, or NULL
+@param[in] type_mode lock mode and wait flag
@param[in] page_id index page number
@param[in] page R-tree index page, or NULL
@param[in] heap_no record heap number in the index page
@@ -909,10 +1214,7 @@ without checking for deadlocks or conflicts.
@return created lock */
lock_t*
lock_rec_create_low(
-#ifdef WITH_WSREP
- lock_t* c_lock, /*!< conflicting lock */
- que_thr_t* thr, /*!< thread owning trx */
-#endif
+ lock_t* c_lock,
unsigned type_mode,
const page_id_t page_id,
const page_t* page,
@@ -920,8 +1222,10 @@ lock_rec_create_low(
dict_index_t* index,
trx_t* trx,
bool holds_trx_mutex);
+
/** Enqueue a waiting request for a lock which cannot be granted immediately.
Check for deadlocks.
+@param[in] c_lock conflicting lock
@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
possibly ORed with LOCK_GAP or
LOCK_REC_NOT_GAP, ORed with
@@ -929,22 +1233,20 @@ Check for deadlocks.
waiting lock request is set
when performing an insert of
an index record
-@param[in] block leaf page in the index
+@param[in] id page identifier
+@param[in] page leaf page in the index
@param[in] heap_no record heap number in the block
@param[in] index index tree
@param[in,out] thr query thread
@param[in] prdt minimum bounding box (spatial index)
@retval DB_LOCK_WAIT if the waiting lock was enqueued
-@retval DB_DEADLOCK if this transaction was chosen as the victim
-@retval DB_SUCCESS_LOCKED_REC if the other transaction was chosen as a victim
- (or it happened to commit) */
+@retval DB_DEADLOCK if this transaction was chosen as the victim */
dberr_t
lock_rec_enqueue_waiting(
-#ifdef WITH_WSREP
- lock_t* c_lock, /*!< conflicting lock */
-#endif
+ lock_t* c_lock,
unsigned type_mode,
- const buf_block_t* block,
+ const page_id_t id,
+ const page_t* page,
ulint heap_no,
dict_index_t* index,
que_thr_t* thr,
@@ -962,67 +1264,6 @@ lock_rtr_move_rec_list(
moved */
ulint num_move); /*!< in: num of rec to move */
-/*************************************************************//**
-Removes record lock objects set on an index page which is discarded. This
-function does not move locks, or check for waiting locks, therefore the
-lock bitmaps must already be reset when this function is called. */
-void
-lock_rec_free_all_from_discard_page(
-/*================================*/
- const buf_block_t* block); /*!< in: page to be discarded */
-
-/** The lock system */
-extern lock_sys_t lock_sys;
-
-/** Test if lock_sys.mutex can be acquired without waiting. */
-#define lock_mutex_enter_nowait() \
- (lock_sys.mutex.trylock(__FILE__, __LINE__))
-
-/** Test if lock_sys.mutex is owned. */
-#define lock_mutex_own() (lock_sys.mutex.is_owned())
-
-/** Acquire the lock_sys.mutex. */
-#define lock_mutex_enter() do { \
- mutex_enter(&lock_sys.mutex); \
-} while (0)
-
-/** Release the lock_sys.mutex. */
-#define lock_mutex_exit() do { \
- lock_sys.mutex.exit(); \
-} while (0)
-
-/** Test if lock_sys.wait_mutex is owned. */
-#define lock_wait_mutex_own() (lock_sys.wait_mutex.is_owned())
-
-/** Acquire the lock_sys.wait_mutex. */
-#define lock_wait_mutex_enter() do { \
- mutex_enter(&lock_sys.wait_mutex); \
-} while (0)
-
-/** Release the lock_sys.wait_mutex. */
-#define lock_wait_mutex_exit() do { \
- lock_sys.wait_mutex.exit(); \
-} while (0)
-
-#ifdef WITH_WSREP
-/*********************************************************************//**
-Cancels a waiting lock request and releases possible other transactions
-waiting behind it. */
-UNIV_INTERN
-void
-lock_cancel_waiting_and_release(
-/*============================*/
- lock_t* lock); /*!< in/out: waiting lock request */
-
-/*******************************************************************//**
-Get lock mode and table/index name
-@return string containing lock info */
-std::string
-lock_get_info(
- const lock_t*);
-
-#endif /* WITH_WSREP */
-
#include "lock0lock.inl"
#endif
diff --git a/storage/innobase/include/lock0lock.inl b/storage/innobase/include/lock0lock.inl
index 2d5b6ff37f1..1b9255ffb3e 100644
--- a/storage/innobase/include/lock0lock.inl
+++ b/storage/innobase/include/lock0lock.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -37,7 +37,7 @@ lock_get_min_heap_no(
/*=================*/
const buf_block_t* block) /*!< in: buffer block */
{
- const page_t* page = block->frame;
+ const page_t* page = block->page.frame;
if (page_is_comp(page)) {
return(rec_get_heap_no_new(
@@ -52,23 +52,6 @@ lock_get_min_heap_no(
}
}
-/*************************************************************//**
-Get the lock hash table */
-UNIV_INLINE
-hash_table_t*
-lock_hash_get(
-/*==========*/
- ulint mode) /*!< in: lock mode */
-{
- if (mode & LOCK_PREDICATE) {
- return &lock_sys.prdt_hash;
- } else if (mode & LOCK_PRDT_PAGE) {
- return &lock_sys.prdt_page_hash;
- } else {
- return &lock_sys.rec_hash;
- }
-}
-
/*********************************************************************//**
Creates a new record lock and inserts it to the lock queue. Does NOT check
for deadlocks or lock compatibility!
@@ -77,13 +60,8 @@ UNIV_INLINE
lock_t*
lock_rec_create(
/*============*/
-#ifdef WITH_WSREP
lock_t* c_lock, /*!< conflicting lock */
- que_thr_t* thr, /*!< thread owning trx */
-#endif
- unsigned type_mode,/*!< in: lock mode and wait
- flag, type is ignored and
- replaced by LOCK_REC */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
const buf_block_t* block, /*!< in: buffer block containing
the record */
ulint heap_no,/*!< in: heap number of the record */
@@ -93,11 +71,8 @@ lock_rec_create(
/*!< in: TRUE if caller owns
trx mutex */
{
- btr_assert_not_corrupted(block, index);
return lock_rec_create_low(
-#ifdef WITH_WSREP
- c_lock, thr,
-#endif
- type_mode, block->page.id(), block->frame, heap_no,
+ c_lock,
+ type_mode, block->page.id(), block->page.frame, heap_no,
index, trx, caller_owns_trx_mutex);
}
diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h
index 43d68996691..db8e33922c4 100644
--- a/storage/innobase/include/lock0prdt.h
+++ b/storage/innobase/include/lock0prdt.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -126,8 +126,6 @@ a predicate record.
dberr_t
lock_prdt_insert_check_and_lock(
/*============================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
- set, does nothing */
const rec_t* rec, /*!< in: record after which to insert */
buf_block_t* block, /*!< in/out: buffer block of rec */
dict_index_t* index, /*!< in: index */
@@ -183,8 +181,7 @@ lock_prdt_rec_move(
/*===============*/
const buf_block_t* receiver, /*!< in: buffer block containing
the receiving record */
- const buf_block_t* donator); /*!< in: buffer block containing
- the donating record */
+ const page_id_t donator); /*!< in: target page */
/** Check whether there are R-tree Page lock on a page
@param[in] trx trx to test the lock
@@ -192,13 +189,4 @@ lock_prdt_rec_move(
@return true if there is none */
bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id);
-/** Removes predicate lock objects set on an index page which is discarded.
-@param[in] block page to be discarded
-@param[in] lock_hash lock hash */
-void
-lock_prdt_page_free_from_discard(
-/*=============================*/
- const buf_block_t* block,
- hash_table_t* lock_hash);
-
#endif
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
index f39692903fa..b0a5f7aaf3b 100644
--- a/storage/innobase/include/lock0priv.h
+++ b/storage/innobase/include/lock0priv.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2018, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -64,49 +64,44 @@ operator<<(std::ostream& out, const lock_table_t& lock)
return(lock.print(out));
}
-/** Convert the member 'type_mode' into a human readable string.
-@return human readable string */
-inline
-std::string
-ib_lock_t::type_mode_string() const
-{
- std::ostringstream sout;
- sout << type_string();
- sout << " | " << lock_mode_string(mode());
-
- if (is_record_not_gap()) {
- sout << " | LOCK_REC_NOT_GAP";
- }
-
- if (is_waiting()) {
- sout << " | LOCK_WAIT";
- }
-
- if (is_gap()) {
- sout << " | LOCK_GAP";
- }
-
- if (is_insert_intention()) {
- sout << " | LOCK_INSERT_INTENTION";
- }
- return(sout.str());
-}
-
inline
std::ostream&
ib_lock_t::print(std::ostream& out) const
{
- out << "[lock_t: type_mode=" << type_mode << "("
- << type_mode_string() << ")";
-
- if (is_record_lock()) {
- out << un_member.rec_lock;
- } else {
- out << un_member.tab_lock;
- }
-
- out << "]";
- return(out);
+ static_assert(LOCK_MODE_MASK == 7, "compatibility");
+ static_assert(LOCK_IS == 0, "compatibility");
+ static_assert(LOCK_IX == 1, "compatibility");
+ static_assert(LOCK_S == 2, "compatibility");
+ static_assert(LOCK_X == 3, "compatibility");
+ static_assert(LOCK_AUTO_INC == 4, "compatibility");
+ static_assert(LOCK_NONE == 5, "compatibility");
+ static_assert(LOCK_NONE_UNSET == 7, "compatibility");
+ const char *const modes[8]=
+ { "IS", "IX", "S", "X", "AUTO_INC", "NONE", "?", "NONE_UNSET" };
+
+ out << "[lock_t: type_mode=" << type_mode << "(" << type_string()
+ << " | LOCK_" << modes[mode()];
+
+ if (is_record_not_gap())
+ out << " | LOCK_REC_NOT_GAP";
+ if (is_waiting())
+ out << " | LOCK_WAIT";
+
+ if (is_gap())
+ out << " | LOCK_GAP";
+
+ if (is_insert_intention())
+ out << " | LOCK_INSERT_INTENTION";
+
+ out << ")";
+
+ if (is_table())
+ out << un_member.tab_lock;
+ else
+ out << un_member.rec_lock;
+
+ out << "]";
+ return out;
}
inline
@@ -120,24 +115,6 @@ operator<<(std::ostream& out, const ib_lock_t& lock)
extern ibool lock_print_waits;
#endif /* UNIV_DEBUG */
-/** Restricts the length of search we will do in the waits-for
-graph of transactions */
-static const ulint LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK = 1000000;
-
-/** Restricts the search depth we will do in the waits-for graph of
-transactions */
-static const ulint LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK = 200;
-
-/** When releasing transaction locks, this specifies how often we release
-the lock mutex for a moment to give also others access to it */
-static const ulint LOCK_RELEASE_INTERVAL = 1000;
-
-/* Safety margin when creating a new record lock: this many extra records
-can be inserted to the page without need to create a lock with a bigger
-bitmap */
-
-static const ulint LOCK_PAGE_BITMAP_MARGIN = 64;
-
/* An explicit record lock affects both the record and the gap before it.
An implicit x-lock does not affect the gap, it only locks the index
record from read or update.
@@ -414,9 +391,6 @@ static const byte lock_strength_matrix[5][5] = {
/* AI */ { FALSE, FALSE, FALSE, FALSE, TRUE}
};
-/** Maximum depth of the DFS stack. */
-static const ulint MAX_STACK_SIZE = 4096;
-
#define PRDT_HEAPNO PAGE_HEAP_NO_INFIMUM
/** Record locking request status */
enum lock_rec_req_status {
@@ -434,15 +408,6 @@ static const ulint lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
#endif /* UNIV_DEBUG */
/*********************************************************************//**
-Gets the type of a lock.
-@return LOCK_TABLE or LOCK_REC */
-UNIV_INLINE
-ulint
-lock_get_type_low(
-/*==============*/
- const lock_t* lock); /*!< in: lock */
-
-/*********************************************************************//**
Gets the previous record lock set on a record.
@return previous lock on the same record, NULL if none exists */
const lock_t*
@@ -452,14 +417,6 @@ lock_rec_get_prev(
ulint heap_no);/*!< in: heap number of the record */
/*********************************************************************//**
-Cancels a waiting lock request and releases possible other transactions
-waiting behind it. */
-void
-lock_cancel_waiting_and_release(
-/*============================*/
- lock_t* lock); /*!< in/out: waiting lock request */
-
-/*********************************************************************//**
Checks if some transaction has an implicit x-lock on a record in a clustered
index.
@return transaction id of the transaction which has the x-lock, or 0 */
@@ -502,7 +459,7 @@ lock_rec_get_n_bits(
/**********************************************************************//**
Sets the nth bit of a record lock to TRUE. */
-UNIV_INLINE
+inline
void
lock_rec_set_nth_bit(
/*=================*/
@@ -515,7 +472,13 @@ lock_rec_set_nth_bit(
@return previous value of the bit */
inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
{
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
ut_ad(i < lock->un_member.rec_lock.n_bits);
byte* b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
@@ -524,8 +487,9 @@ inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
*b &= byte(~mask);
if (bit != 0) {
- ut_ad(lock->trx->lock.n_rec_locks > 0);
- --lock->trx->lock.n_rec_locks;
+ ut_d(auto n=)
+ lock->trx->lock.n_rec_locks--;
+ ut_ad(n);
}
return(bit);
@@ -560,25 +524,26 @@ lock_rec_get_next_const(
ulint heap_no,/*!< in: heap number of the record */
const lock_t* lock); /*!< in: lock */
-/*********************************************************************//**
-Gets the first explicit lock request on a record.
-@return first lock, NULL if none exists */
-UNIV_INLINE
-lock_t*
-lock_rec_get_first(
-/*===============*/
- hash_table_t* hash, /*!< in: hash chain the lock on */
- const buf_block_t* block, /*!< in: block containing the record */
- ulint heap_no);/*!< in: heap number of the record */
-
-/*********************************************************************//**
-Gets the mode of a lock.
-@return mode */
-UNIV_INLINE
-enum lock_mode
-lock_get_mode(
-/*==========*/
- const lock_t* lock); /*!< in: lock */
+/** Get the first explicit lock request on a record.
+@param cell first lock hash table cell
+@param id page identifier
+@param heap_no record identifier in page
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no)
+{
+ lock_sys.assert_locked(cell);
+
+ for (lock_t *lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id &&
+ lock_rec_get_nth_bit(lock, heap_no))
+ return lock;
+ }
+ return nullptr;
+}
/*********************************************************************//**
Calculates if lock mode 1 is compatible with lock mode 2.
@@ -601,15 +566,6 @@ lock_mode_stronger_or_eq(
enum lock_mode mode2); /*!< in: lock mode */
/*********************************************************************//**
-Gets the wait flag of a lock.
-@return LOCK_WAIT if waiting, 0 if not */
-UNIV_INLINE
-ulint
-lock_get_wait(
-/*==========*/
- const lock_t* lock); /*!< in: lock */
-
-/*********************************************************************//**
Checks if a transaction has the specified table lock, or stronger. This
function should only be called by the thread that owns the transaction.
@return lock or NULL */
@@ -621,33 +577,6 @@ lock_table_has(
const dict_table_t* table, /*!< in: table */
enum lock_mode mode); /*!< in: lock mode */
-/** Set the wait status of a lock.
-@param[in,out] lock lock that will be waited for
-@param[in,out] trx transaction that will wait for the lock */
-inline void lock_set_lock_and_trx_wait(lock_t* lock, trx_t* trx)
-{
- ut_ad(lock);
- ut_ad(lock->trx == trx);
- ut_ad(trx->lock.wait_lock == NULL);
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(trx));
-
- trx->lock.wait_lock = lock;
- lock->type_mode |= LOCK_WAIT;
-}
-
-/** Reset the wait status of a lock.
-@param[in,out] lock lock that was possibly being waited for */
-inline void lock_reset_lock_and_trx_wait(lock_t* lock)
-{
- ut_ad(lock_get_wait(lock));
- ut_ad(lock_mutex_own());
- ut_ad(lock->trx->lock.wait_lock == NULL
- || lock->trx->lock.wait_lock == lock);
- lock->trx->lock.wait_lock = NULL;
- lock->type_mode &= ~LOCK_WAIT;
-}
-
#include "lock0priv.inl"
#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.inl b/storage/innobase/include/lock0priv.inl
index e16949a4917..3b4ebcc835b 100644
--- a/storage/innobase/include/lock0priv.inl
+++ b/storage/innobase/include/lock0priv.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,20 +36,6 @@ methods but they are used only in that file. */
#include "row0row.h"
/*********************************************************************//**
-Gets the type of a lock.
-@return LOCK_TABLE or LOCK_REC */
-UNIV_INLINE
-ulint
-lock_get_type_low(
-/*==============*/
- const lock_t* lock) /*!< in: lock */
-{
- ut_ad(lock);
-
- return(lock->type_mode & LOCK_TYPE_MASK);
-}
-
-/*********************************************************************//**
Checks if some transaction has an implicit x-lock on a record in a clustered
index.
@return transaction id of the transaction which has the x-lock, or 0 */
@@ -81,7 +67,7 @@ lock_rec_get_n_bits(
/**********************************************************************//**
Sets the nth bit of a record lock to TRUE. */
-UNIV_INLINE
+inline
void
lock_rec_set_nth_bit(
/*=================*/
@@ -91,8 +77,7 @@ lock_rec_set_nth_bit(
ulint byte_index;
ulint bit_index;
- ut_ad(lock);
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
ut_ad(i < lock->un_member.rec_lock.n_bits);
byte_index = i / 8;
@@ -106,7 +91,13 @@ lock_rec_set_nth_bit(
#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
# pragma GCC diagnostic pop
#endif
- ++lock->trx->lock.n_rec_locks;
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+ lock->trx->lock.n_rec_locks++;
}
/*********************************************************************//**
@@ -118,7 +109,7 @@ lock_rec_get_next_on_page(
/*======================*/
lock_t* lock) /*!< in: a record lock */
{
- return((lock_t*) lock_rec_get_next_on_page_const(lock));
+ return const_cast<lock_t*>(lock_rec_get_next_on_page_const(lock));
}
/*********************************************************************//**
@@ -131,10 +122,7 @@ lock_rec_get_next(
ulint heap_no,/*!< in: heap number of the record */
lock_t* lock) /*!< in: lock */
{
- ut_ad(lock_mutex_own());
-
do {
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
lock = lock_rec_get_next_on_page(lock);
} while (lock && !lock_rec_get_nth_bit(lock, heap_no));
@@ -151,25 +139,7 @@ lock_rec_get_next_const(
ulint heap_no,/*!< in: heap number of the record */
const lock_t* lock) /*!< in: lock */
{
- return(lock_rec_get_next(heap_no, (lock_t*) lock));
-}
-
-/*********************************************************************//**
-Gets the first explicit lock request on a record.
-@return first lock, NULL if none exists */
-UNIV_INLINE
-lock_t*
-lock_rec_get_first(
-/*===============*/
- hash_table_t* hash, /*!< in: hash chain the lock on */
- const buf_block_t* block, /*!< in: block containing the record */
- ulint heap_no)/*!< in: heap number of the record */
-{
- for (lock_t *lock= lock_sys.get_first(*hash, block->page.id());
- lock; lock= lock_rec_get_next_on_page(lock))
- if (lock_rec_get_nth_bit(lock, heap_no))
- return lock;
- return nullptr;
+ return lock_rec_get_next(heap_no, const_cast<lock_t*>(lock));
}
/*********************************************************************//**
@@ -184,8 +154,7 @@ lock_rec_get_nth_bit(
{
const byte* b;
- ut_ad(lock);
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
if (i >= lock->un_member.rec_lock.n_bits) {
@@ -206,10 +175,9 @@ lock_rec_get_next_on_page_const(
/*============================*/
const lock_t* lock) /*!< in: a record lock */
{
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
- const page_id_t page_id(lock->un_member.rec_lock.page_id);
+ const page_id_t page_id{lock->un_member.rec_lock.page_id};
while (!!(lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))))
if (lock->un_member.rec_lock.page_id == page_id)
@@ -218,20 +186,6 @@ lock_rec_get_next_on_page_const(
}
/*********************************************************************//**
-Gets the mode of a lock.
-@return mode */
-UNIV_INLINE
-enum lock_mode
-lock_get_mode(
-/*==========*/
- const lock_t* lock) /*!< in: lock */
-{
- ut_ad(lock);
-
- return(static_cast<enum lock_mode>(lock->type_mode & LOCK_MODE_MASK));
-}
-
-/*********************************************************************//**
Calculates if lock mode 1 is compatible with lock mode 2.
@return nonzero if mode1 compatible with mode2 */
UNIV_INLINE
@@ -264,20 +218,6 @@ lock_mode_stronger_or_eq(
}
/*********************************************************************//**
-Gets the wait flag of a lock.
-@return LOCK_WAIT if waiting, 0 if not */
-UNIV_INLINE
-ulint
-lock_get_wait(
-/*==========*/
- const lock_t* lock) /*!< in: lock */
-{
- ut_ad(lock);
-
- return(lock->type_mode & LOCK_WAIT);
-}
-
-/*********************************************************************//**
Checks if a transaction has the specified table lock, or stronger. This
function should only be called by the thread that owns the transaction.
@return lock or NULL */
@@ -300,22 +240,16 @@ lock_table_has(
continue;
}
- lock_mode mode = lock_get_mode(lock);
-
ut_ad(trx == lock->trx);
- ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
- ut_ad(lock->un_member.tab_lock.table != NULL);
+ ut_ad(lock->is_table());
+ ut_ad(lock->un_member.tab_lock.table);
if (table == lock->un_member.tab_lock.table
- && lock_mode_stronger_or_eq(mode, in_mode)) {
-
- ut_ad(!lock_get_wait(lock));
-
+ && lock_mode_stronger_or_eq(lock->mode(), in_mode)) {
+ ut_ad(!lock->is_waiting());
return(lock);
}
}
return(NULL);
}
-
-/* vim: set filetype=c: */
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
index 23307375426..dc57a31c5f8 100644
--- a/storage/innobase/include/lock0types.h
+++ b/storage/innobase/include/lock0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -46,35 +46,9 @@ enum lock_mode {
in an exclusive mode */
LOCK_NONE, /* this is used elsewhere to note consistent read */
LOCK_NUM = LOCK_NONE, /* number of lock modes */
- LOCK_NONE_UNSET = 255
+ LOCK_NONE_UNSET = 7
};
-/** Convert the given enum value into string.
-@param[in] mode the lock mode
-@return human readable string of the given enum value */
-inline
-const char* lock_mode_string(enum lock_mode mode)
-{
- switch (mode) {
- case LOCK_IS:
- return("LOCK_IS");
- case LOCK_IX:
- return("LOCK_IX");
- case LOCK_S:
- return("LOCK_S");
- case LOCK_X:
- return("LOCK_X");
- case LOCK_AUTO_INC:
- return("LOCK_AUTO_INC");
- case LOCK_NONE:
- return("LOCK_NONE");
- case LOCK_NONE_UNSET:
- return("LOCK_NONE_UNSET");
- default:
- ut_error;
- }
-}
-
/** A table lock */
struct lock_table_t {
dict_table_t* table; /*!< database table in dictionary
@@ -121,17 +95,12 @@ operator<<(std::ostream& out, const lock_rec_t& lock)
return(lock.print(out));
}
-#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the
+#define LOCK_MODE_MASK 0x7 /*!< mask used to extract mode from the
type_mode field in a lock */
/** Lock types */
/* @{ */
-#define LOCK_TABLE 16U /*!< table lock */
-#define LOCK_REC 32U /*!< record lock */
-#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the
- type_mode field in a lock */
-#if LOCK_MODE_MASK & LOCK_TYPE_MASK
-# error "LOCK_MODE_MASK & LOCK_TYPE_MASK"
-#endif
+/** table lock (record lock if the flag is not set) */
+#define LOCK_TABLE 8U
#define LOCK_WAIT 256U /*!< Waiting lock flag; when set, it
means that the lock has not yet been
@@ -176,14 +145,14 @@ operator<<(std::ostream& out, const lock_rec_t& lock)
#endif
/* @} */
-/** Lock struct; protected by lock_sys.mutex */
+/** Lock struct; protected by lock_sys.latch */
struct ib_lock_t
{
- trx_t* trx; /*!< transaction owning the
- lock */
- UT_LIST_NODE_T(ib_lock_t)
- trx_locks; /*!< list of the locks of the
- transaction */
+ /** the owner of the lock */
+ trx_t *trx;
+ /** other locks of the transaction; protected by
+ lock_sys.is_writer() and trx->mutex_is_owner(); @see trx_lock_t::trx_locks */
+ UT_LIST_NODE_T(ib_lock_t) trx_locks;
dict_index_t* index; /*!< index for a record lock */
@@ -210,13 +179,6 @@ struct ib_lock_t
LOCK_INSERT_INTENTION,
wait flag, ORed */
- /** Determine if the lock object is a record lock.
- @return true if record lock, false otherwise. */
- bool is_record_lock() const
- {
- return(type() == LOCK_REC);
- }
-
bool is_waiting() const
{
return(type_mode & LOCK_WAIT);
@@ -237,9 +199,7 @@ struct ib_lock_t
return(type_mode & LOCK_INSERT_INTENTION);
}
- ulint type() const {
- return(type_mode & LOCK_TYPE_MASK);
- }
+ bool is_table() const { return type_mode & LOCK_TABLE; }
enum lock_mode mode() const
{
@@ -251,21 +211,8 @@ struct ib_lock_t
@return the given output stream. */
std::ostream& print(std::ostream& out) const;
- /** Convert the member 'type_mode' into a human readable string.
- @return human readable string */
- std::string type_mode_string() const;
-
const char* type_string() const
- {
- switch (type_mode & LOCK_TYPE_MASK) {
- case LOCK_REC:
- return("LOCK_REC");
- case LOCK_TABLE:
- return("LOCK_TABLE");
- default:
- ut_error;
- }
- }
+ { return is_table() ? "LOCK_TABLE" : "LOCK_REC"; }
};
typedef UT_LIST_BASE_NODE_T(ib_lock_t) trx_lock_list_t;
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
index 980a79d8f9e..b9390927ece 100644
--- a/storage/innobase/include/log0crypt.h
+++ b/storage/innobase/include/log0crypt.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
-Copyright (C) 2014, 2020, MariaDB Corporation.
+Copyright (C) 2014, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,19 +38,13 @@ The random parameters will be persisted in the log checkpoint pages.
@see log_crypt_write_checkpoint_buf()
@see log_crypt_read_checkpoint_buf()
@return whether the operation succeeded */
-UNIV_INTERN
-bool
-log_crypt_init();
+bool log_crypt_init();
/*********************************************************************//**
Writes the crypto (version, msg and iv) info, which has been used for
log blocks with lsn <= this checkpoint's lsn, to a log header's
checkpoint buf. */
-UNIV_INTERN
-void
-log_crypt_write_checkpoint_buf(
-/*===========================*/
- byte* buf); /*!< in/out: checkpoint buffer */
+void log_crypt_write_checkpoint_buf(byte *buf);
/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
@param[in] buf checkpoint buffer
@@ -93,9 +87,7 @@ bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op = LOG_ENCRYPT);
@param[in] offs offset to block
@param[in] encrypt true=encrypt; false=decrypt
@return whether the operation succeeded */
-UNIV_INTERN
-bool
-log_tmp_block_encrypt(
+bool log_tmp_block_encrypt(
const byte* src,
ulint size,
byte* dst,
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 4a5567ff62d..0f9a4da049b 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -77,7 +77,7 @@ log_reserve_and_write_fast(
Checks if there is need for a log buffer flush or a new checkpoint, and does
this if yes. Any database operation should call this when it has modified
more than about 4 pages. NOTE that this function may only be called when the
-OS thread owns no synchronization objects except the dictionary mutex. */
+OS thread owns no synchronization objects except dict_sys.latch. */
UNIV_INLINE
void
log_free_check(void);
@@ -97,15 +97,21 @@ bool
log_set_capacity(ulonglong file_size)
MY_ATTRIBUTE((warn_unused_result));
-/** Ensure that the log has been written to the log file up to a given
+/**
+Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in] lsn log sequence number that should be
included in the redo log file write
@param[in] flush_to_disk whether the written log should also
be flushed to the file system
-@param[in] rotate_key whether to rotate the encryption key */
-void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false);
+@param[in] rotate_key whether to rotate the encryption key
+@param[in] cb completion callback. If not NULL, the callback will be called
+ whenever lsn is written or flushed.
+*/
+struct completion_callback;
+void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false,
+ const completion_callback* cb=nullptr);
/** Write to the log file up to the last log entry.
@param sync whether to wait for a durable write to complete */
@@ -445,7 +451,7 @@ struct log_t{
private:
/** The log sequence number of the last change of durable InnoDB files */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE)
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
std::atomic<lsn_t> lsn;
/** the first guaranteed-durable log sequence number */
std::atomic<lsn_t> flushed_to_disk_lsn;
@@ -455,7 +461,7 @@ private:
std::atomic<bool> check_flush_or_checkpoint_;
public:
/** mutex protecting the log */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
/** first free offset within the log buffer in use */
size_t buf_free;
/** recommended maximum size of buf, after which the buffer is flushed */
@@ -469,7 +475,7 @@ public:
dirty blocks in the list. The idea behind this mutex is to be able
to release log_sys.mutex during mtr_commit and still ensure that
insertions in the flush_list happen in the LSN order. */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex;
/** log_buffer, append data here */
byte *buf;
/** log_buffer, writing data to file from this buffer.
diff --git a/storage/innobase/include/log0log.inl b/storage/innobase/include/log0log.inl
index 0ff8c2523d7..c29c0bfa55f 100644
--- a/storage/innobase/include/log0log.inl
+++ b/storage/innobase/include/log0log.inl
@@ -294,7 +294,7 @@ log_reserve_and_write_fast(
Checks if there is need for a log buffer flush or a new checkpoint, and does
this if yes. Any database operation should call this when it has modified
more than about 4 pages. NOTE that this function may only be called when the
-OS thread owns no synchronization objects except the dictionary mutex. */
+OS thread owns no synchronization objects except dict_sys.latch. */
UNIV_INLINE
void
log_free_check(void)
@@ -304,22 +304,6 @@ log_free_check(void)
are holding some latches. This is OK, as long as we are not holding
any latches on buffer blocks. */
-#ifdef UNIV_DEBUG
- static const latch_level_t latches[] = {
- SYNC_REDO_RSEG, /* trx_purge_free_segment() */
- SYNC_DICT, /* dict_sys.mutex during
- commit_try_rebuild() */
- SYNC_DICT_OPERATION, /* dict_sys.latch X-latch during
- commit_try_rebuild() */
- SYNC_FTS_CACHE, /* fts_cache_t::lock */
- SYNC_INDEX_TREE /* index->lock */
- };
-#endif /* UNIV_DEBUG */
-
- ut_ad(!sync_check_iterate(
- sync_allowed_latches(latches,
- latches + UT_ARR_SIZE(latches))));
-
if (log_sys.check_flush_or_checkpoint()) {
log_check_margins();
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index e4a8f0d25a2..5e8dc1c0160 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,12 +26,13 @@ Created 9/20/1997 Heikki Tuuri
#pragma once
-#include "ut0byte.h"
+#include "ut0new.h"
#include "buf0types.h"
#include "log0log.h"
#include "mtr0types.h"
#include <deque>
+#include <map>
/** @return whether recovery is currently running. */
#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)
@@ -43,11 +44,12 @@ dberr_t
recv_find_max_checkpoint(ulint* max_field)
MY_ATTRIBUTE((nonnull, warn_unused_result));
+ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Apply any buffered redo log to a page that was just read from a data file.
@param[in,out] space tablespace
-@param[in,out] bpage buffer pool page */
-ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] bpage buffer pool page
+@return whether the page was recovered correctly */
+bool recv_recover_page(fil_space_t* space, buf_page_t* bpage);
/** Start recovering from a redo log checkpoint.
@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN
@@ -81,12 +83,12 @@ void recv_sys_justify_left_parsing_buf();
/** Report an operation to create, delete, or rename a file during backup.
@param[in] space_id tablespace identifier
-@param[in] create whether the file is being created
+@param[in] type file operation redo log type
@param[in] name file name (not NUL-terminated)
@param[in] len length of name, in bytes
@param[in] new_name new file name (NULL if not rename)
@param[in] new_len length of new_name, in bytes (0 if NULL) */
-extern void (*log_file_op)(ulint space_id, bool create,
+extern void (*log_file_op)(ulint space_id, int type,
const byte* name, ulint len,
const byte* new_name, ulint new_len);
@@ -95,6 +97,10 @@ during backup
@param space_id undo tablespace identifier */
extern void (*undo_space_trunc)(uint32_t space_id);
+/** Report an operation which does INIT_PAGE for page0 during backup.
+@param space_id tablespace identifier */
+extern void (*first_page_init)(ulint space_id);
+
/** Stored redo log record */
struct log_rec_t
{
@@ -213,14 +219,25 @@ struct page_recv_t
struct recv_sys_t
{
/** mutex protecting apply_log_recs and page_recv_t::state */
- ib_mutex_t mutex;
+ mysql_mutex_t mutex;
+private:
+ /** condition variable for
+ !apply_batch_on || pages.empty() || found_corrupt_log || found_corrupt_fs */
+ pthread_cond_t cond;
+ /** whether recv_apply_hashed_log_recs() is running */
+ bool apply_batch_on;
+ /** set when finding a corrupt log block or record, or there is a
+ log parsing buffer overflow */
+ bool found_corrupt_log;
+ /** set when an inconsistency with the file system contents is detected
+ during log scan or apply */
+ bool found_corrupt_fs;
+public:
/** whether we are applying redo log records during crash recovery */
bool recovery_on;
- /** whether recv_recover_page(), invoked from buf_page_read_complete(),
+ /** whether recv_recover_page(), invoked from buf_page_t::read_complete(),
should apply log records*/
bool apply_log_recs;
- /** whether apply() is running */
- bool apply_batch_on;
byte* buf; /*!< buffer for parsing log records */
ulint len; /*!< amount of data in buf */
lsn_t parse_start_lsn;
@@ -240,14 +257,6 @@ struct recv_sys_t
lsn_t recovered_lsn;
/*!< the log records have been parsed up to
this lsn */
- bool found_corrupt_log;
- /*!< set when finding a corrupt log
- block or record, or there is a log
- parsing buffer overflow */
- bool found_corrupt_fs;
- /*!< set when an inconsistency with
- the file system contents is detected
- during log scan or apply */
lsn_t mlog_checkpoint_lsn;
/*!< the LSN of a FILE_CHECKPOINT
record, or 0 if none was parsed */
@@ -293,13 +302,16 @@ private:
@param p iterator pointing to page_id
@param mtr mini-transaction
@param b pre-allocated buffer pool block
- @return whether the page was successfully initialized */
+ @return the recovered block
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
inline buf_block_t *recover_low(const page_id_t page_id, map::iterator &p,
mtr_t &mtr, buf_block_t *b);
/** Attempt to initialize a page based on redo log records.
@param page_id page identifier
@return the recovered block
- @retval nullptr if the page cannot be initialized based on log records */
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
buf_block_t *recover_low(const page_id_t page_id);
/** All found log files (multiple ones are possible if we are upgrading
@@ -386,14 +398,36 @@ public:
@param page_id corrupted page identifier */
ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);
+ /** Flag data file corruption during recovery. */
+ ATTRIBUTE_COLD void set_corrupt_fs();
+ /** Flag log file corruption during recovery. */
+ ATTRIBUTE_COLD void set_corrupt_log();
+ /** Possibly finish a recovery batch. */
+ inline void maybe_finish_batch();
+
+ /** @return whether data file corruption was found */
+ bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); }
+ /** @return whether log file corruption was found */
+ bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); }
+
/** Attempt to initialize a page based on redo log records.
@param page_id page identifier
@return the recovered block
- @retval nullptr if the page cannot be initialized based on log records */
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
buf_block_t *recover(const page_id_t page_id)
{
return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr;
}
+
+ /** Try to recover a tablespace that was not readable earlier
+ @param p iterator, initially pointing to page_id_t{space_id,0};
+ the records will be freed and the iterator advanced
+ @param name tablespace file name
+ @param free_block spare buffer block
+ @return whether recovery failed */
+ bool recover_deferred(map::iterator &p, const std::string &name,
+ buf_block_t *&free_block);
};
/** The recovery system */
diff --git a/storage/innobase/include/mach0data.inl b/storage/innobase/include/mach0data.inl
index bfccf611991..2f970fd27f0 100644
--- a/storage/innobase/include/mach0data.inl
+++ b/storage/innobase/include/mach0data.inl
@@ -28,6 +28,7 @@ Created 11/28/1995 Heikki Tuuri
#ifndef UNIV_INNOCHECKSUM
#include "mtr0types.h"
+#include "ut0byte.h"
/*******************************************************//**
The following function is used to store data in one byte. */
diff --git a/storage/innobase/include/mem0mem.inl b/storage/innobase/include/mem0mem.inl
index 9236bbef05d..9906daf3eb9 100644
--- a/storage/innobase/include/mem0mem.inl
+++ b/storage/innobase/include/mem0mem.inl
@@ -24,6 +24,8 @@ The memory management
Created 6/8/1994 Heikki Tuuri
*************************************************************************/
+#include "ut0new.h"
+
#ifdef UNIV_DEBUG
# define mem_heap_create_block(heap, n, type, file_name, line) \
mem_heap_create_block_func(heap, n, file_name, line, type)
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
index 285672be898..d34a62e7bb2 100644
--- a/storage/innobase/include/mtr0log.h
+++ b/storage/innobase/include/mtr0log.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2019, 2022, MariaDB Corporation.
+Copyright (c) 2019, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -173,7 +173,7 @@ inline uint32_t mlog_decode_len(const byte *log, const byte *end)
template<unsigned l,mtr_t::write_type w,typename V>
inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
{
- ut_ad(ut_align_down(ptr, srv_page_size) == block.frame);
+ ut_ad(ut_align_down(ptr, srv_page_size) == block.page.frame);
static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
byte buf[l];
@@ -196,7 +196,7 @@ inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
}
byte *p= static_cast<byte*>(ptr);
const byte *const end= p + l;
- if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+ if (w != FORCED && is_logged())
{
const byte *b= buf;
while (*p++ == *b++)
@@ -224,7 +224,7 @@ inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
{
ut_ad(len);
set_modified(b);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
@@ -245,7 +245,7 @@ inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
{
ut_ad(ofs <= ulint(srv_page_size));
ut_ad(ofs + len <= ulint(srv_page_size));
- ::memset(ofs + b->frame, val, len);
+ ::memset(ofs + b->page.frame, val, len);
memset(*b, ofs, len, val);
}
@@ -261,7 +261,7 @@ inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
ut_ad(size);
ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
set_modified(b);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
@@ -289,10 +289,10 @@ inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
size_t s= 0;
while (s < len)
{
- ::memcpy(ofs + s + b->frame, str, size);
+ ::memcpy(ofs + s + b->page.frame, str, size);
s+= len;
}
- ::memcpy(ofs + s + b->frame, str, len - s);
+ ::memcpy(ofs + s + b->page.frame, str, len - s);
memset(*b, ofs, len, str, size);
}
@@ -306,7 +306,7 @@ inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
ut_ad(len);
ut_ad(offset <= ulint(srv_page_size));
ut_ad(offset + len <= ulint(srv_page_size));
- memcpy_low(b, uint16_t(offset), &b.frame[offset], len);
+ memcpy_low(b, uint16_t(offset), &b.page.frame[offset], len);
}
/** Log a write of a byte string to a page.
@@ -319,7 +319,7 @@ inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
{
ut_ad(len);
set_modified(block);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
{
@@ -354,7 +354,7 @@ inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
ut_ad(d + len <= ulint(srv_page_size));
set_modified(b);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
@@ -387,7 +387,7 @@ template<byte type>
inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
size_t len, bool alloc, size_t offset)
{
- static_assert(!(type & 15) && type != RESERVED && type != OPTION &&
+ static_assert(!(type & 15) && type != RESERVED &&
type <= FILE_CHECKPOINT, "invalid type");
ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
ut_ad(!bpage || bpage->id() == id);
@@ -401,7 +401,8 @@ inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
ut_ad(have_offset || offset == 0);
ut_ad(offset + len <= srv_page_size);
static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
-
+ ut_ad(type == FREE_PAGE || type == OPTION || (type == EXTENDED && !bpage) ||
+ memo_contains_flagged(bpage, MTR_MEMO_MODIFY));
size_t max_len;
if (!have_len)
max_len= 1 + 5 + 5;
@@ -488,10 +489,10 @@ template<mtr_t::write_type w>
inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
ulint len)
{
- ut_ad(ut_align_down(dest, srv_page_size) == b.frame);
+ ut_ad(ut_align_down(dest, srv_page_size) == b.page.frame);
char *d= static_cast<char*>(dest);
const char *s= static_cast<const char*>(str);
- if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+ if (w != FORCED && is_logged())
{
ut_ad(len);
const char *const end= d + len;
@@ -511,55 +512,13 @@ inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
memcpy(b, ut_align_offset(d, srv_page_size), len);
}
-/** Initialize an entire page.
-@param[in,out] b buffer page */
-inline void mtr_t::init(buf_block_t *b)
-{
- const page_id_t id{b->page.id()};
- ut_ad(is_named_space(id.space()));
- ut_ad(!m_freed_pages == !m_freed_space);
-
- if (UNIV_LIKELY_NULL(m_freed_space) &&
- m_freed_space->id == id.space() &&
- m_freed_pages->remove_if_exists(b->page.id().page_no()) &&
- m_freed_pages->empty())
- {
- delete m_freed_pages;
- m_freed_pages= nullptr;
- m_freed_space= nullptr;
- }
-
- b->page.status= buf_page_t::INIT_ON_FLUSH;
-
- if (m_log_mode != MTR_LOG_ALL)
- {
- ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO);
- return;
- }
-
- m_log.close(log_write<INIT_PAGE>(b->page.id(), &b->page));
- m_last_offset= FIL_PAGE_TYPE;
-}
-
-/** Free a page.
-@param[in] space tablespace contains page to be freed
-@param[in] offset page offset to be freed */
-inline void mtr_t::free(fil_space_t &space, uint32_t offset)
-{
- ut_ad(is_named_space(&space));
- ut_ad(!m_freed_space || m_freed_space == &space);
-
- if (m_log_mode == MTR_LOG_ALL)
- m_log.close(log_write<FREE_PAGE>({space.id, offset}, nullptr));
-}
-
/** Write an EXTENDED log record.
@param block buffer pool page
@param type extended record subtype; @see mrec_ext_t */
inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
{
set_modified(block);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
*l++= type;
@@ -586,7 +545,7 @@ inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
ut_ad(!block.zip_size());
ut_ad(prev_rec < block.physical_size());
set_modified(block);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4);
byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
@@ -613,7 +572,7 @@ inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
ut_ad(hdr_size < MIN_3BYTE);
ut_ad(prev_rec < block.physical_size());
ut_ad(data_size < block.physical_size());
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
len+= hdr_size < MIN_2BYTE ? 1 : 2;
@@ -645,7 +604,7 @@ inline void mtr_t::undo_append(const buf_block_t &block,
{
ut_ad(len > 2);
set_modified(block);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small);
@@ -668,7 +627,7 @@ inline void mtr_t::undo_append(const buf_block_t &block,
@param id first page identifier that will not be in the file */
inline void mtr_t::trim_pages(const page_id_t id)
{
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
*l++= TRIM_PAGES;
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index b64dccb887f..1c044319ca0 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,11 +24,12 @@ Mini-transaction buffer
Created 11/26/1995 Heikki Tuuri
*******************************************************/
-#ifndef mtr0mtr_h
-#define mtr0mtr_h
+#pragma once
#include "fil0fil.h"
#include "dyn0buf.h"
+#include "buf0buf.h"
+#include "small_vector.h"
/** Start a mini-transaction. */
#define mtr_start(m) (m)->start()
@@ -36,73 +37,73 @@ Created 11/26/1995 Heikki Tuuri
/** Commit a mini-transaction. */
#define mtr_commit(m) (m)->commit()
-/** Set and return a savepoint in mtr.
-@return savepoint */
-#define mtr_set_savepoint(m) (m)->get_savepoint()
-
-/** Release the (index tree) s-latch stored in an mtr memo after a
-savepoint. */
-#define mtr_release_s_latch_at_savepoint(m, s, l) \
- (m)->release_s_latch_at_savepoint((s), (l))
-
/** Change the logging mode of a mini-transaction.
@return old mode */
#define mtr_set_log_mode(m, d) (m)->set_log_mode((d))
-/** Release an object in the memo stack.
-@return true if released */
-#define mtr_memo_release(m, o, t) \
- (m)->memo_release((o), (t))
-
-/** Push an object to an mtr memo stack. */
-#define mtr_memo_push(m, o, t) (m)->memo_push(o, t)
-
-#define mtr_x_lock_space(s, m) (m)->x_lock_space((s), __FILE__, __LINE__)
-#define mtr_sx_lock_space(s, m) (m)->sx_lock_space((s), __FILE__, __LINE__)
-
-#define mtr_s_lock_index(i, m) (m)->s_lock(&(i)->lock, __FILE__, __LINE__)
-#define mtr_x_lock_index(i, m) (m)->x_lock(&(i)->lock, __FILE__, __LINE__)
-#define mtr_sx_lock_index(i, m) (m)->sx_lock(&(i)->lock, __FILE__, __LINE__)
-
-#define mtr_release_block_at_savepoint(m, s, b) \
- (m)->release_block_at_savepoint((s), (b))
-
-#define mtr_block_sx_latch_at_savepoint(m, s, b) \
- (m)->sx_latch_at_savepoint((s), (b))
-
-#define mtr_block_x_latch_at_savepoint(m, s, b) \
- (m)->x_latch_at_savepoint((s), (b))
+#ifdef UNIV_PFS_RWLOCK
+# define mtr_s_lock_index(i,m) (m)->s_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_x_lock_index(i,m) (m)->x_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_sx_lock_index(i,m) (m)->u_lock(__FILE__, __LINE__, &(i)->lock)
+#else
+# define mtr_s_lock_index(i,m) (m)->s_lock(&(i)->lock)
+# define mtr_x_lock_index(i,m) (m)->x_lock(&(i)->lock)
+# define mtr_sx_lock_index(i,m) (m)->u_lock(&(i)->lock)
+#endif
/** Mini-transaction memo stack slot. */
-struct mtr_memo_slot_t {
- /** pointer to the object */
- void* object;
-
- /** type of the stored object */
- mtr_memo_type_t type;
+struct mtr_memo_slot_t
+{
+ /** pointer to the object */
+ void *object;
+ /** type of the stored object */
+ mtr_memo_type_t type;
+
+ /** Release the object */
+ void release() const;
};
/** Mini-transaction handle and buffer */
struct mtr_t {
+ mtr_t();
+ ~mtr_t();
+
/** Start a mini-transaction. */
void start();
/** Commit the mini-transaction. */
void commit();
- /** Release latches till savepoint. To simplify the code only
- MTR_MEMO_S_LOCK and MTR_MEMO_PAGE_S_FIX slot types are allowed to be
- released, otherwise it would be neccesary to add one more argument in the
- function to point out what slot types are allowed for rollback, and this
- would be overengineering as currently the function is used only in one place
- in the code.
- @param savepoint savepoint, can be obtained with get_savepoint */
- void rollback_to_savepoint(ulint savepoint);
+ /** Release latches of unmodified buffer pages.
+ @param begin first slot to release
+ @param end last slot to release, or get_savepoint() */
+ void rollback_to_savepoint(ulint begin, ulint end);
+
+ /** Release latches of unmodified buffer pages.
+ @param begin first slot to release */
+ void rollback_to_savepoint(ulint begin)
+ { rollback_to_savepoint(begin, m_memo.size()); }
+
+ /** Release the last acquired buffer page latch. */
+ void release_last_page()
+ { auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }
/** Commit a mini-transaction that is shrinking a tablespace.
@param space tablespace that is being shrunk */
ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
+ /** Commit a mini-transaction that is deleting or renaming a file.
+ @param space tablespace that is being renamed or deleted
+ @param name new file name (nullptr=the file will be deleted)
+ @param detached_handle if detached_handle != nullptr and if space is detached
+ during the function execution the file handle if its
+ node will be set to OS_FILE_CLOSED, and the previous
+ value of the file handle will be assigned to the
+ address, pointed by detached_handle.
+ @return whether the operation succeeded */
+ ATTRIBUTE_COLD bool commit_file(fil_space_t &space, const char *name,
+ pfs_os_file_t *detached_handle= nullptr);
+
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
@@ -112,35 +113,59 @@ struct mtr_t {
void commit_files(lsn_t checkpoint_lsn= 0);
/** @return mini-transaction savepoint (current size of m_memo) */
- ulint get_savepoint() const { ut_ad(is_active()); return m_memo.size(); }
-
- /** Release the (index tree) s-latch stored in an mtr memo after a
- savepoint.
- @param savepoint value returned by @see set_savepoint.
- @param lock latch to release */
- inline void release_s_latch_at_savepoint(
- ulint savepoint,
- rw_lock_t* lock);
+ ulint get_savepoint() const
+ {
+ ut_ad(is_active());
+ return m_memo.size();
+ }
- /** Release the block in an mtr memo after a savepoint. */
- inline void release_block_at_savepoint(
- ulint savepoint,
- buf_block_t* block);
+ /** Get the block at a savepoint */
+ buf_block_t *at_savepoint(ulint savepoint) const
+ {
+ ut_ad(is_active());
+ const mtr_memo_slot_t &slot= m_memo[savepoint];
+ ut_ad(slot.type < MTR_MEMO_S_LOCK);
+ ut_ad(slot.object);
+ return static_cast<buf_block_t*>(slot.object);
+ }
- /** SX-latch a not yet latched block after a savepoint. */
- inline void sx_latch_at_savepoint(ulint savepoint, buf_block_t* block);
+ /** Try to get a block at a savepoint.
+ @param savepoint the savepoint right before the block was acquired
+ @return the block at the savepoint
+ @retval nullptr if no buffer block was registered at that savepoint */
+ buf_block_t *block_at_savepoint(ulint savepoint) const
+ {
+ ut_ad(is_active());
+ const mtr_memo_slot_t &slot= m_memo[savepoint];
+ return slot.type < MTR_MEMO_S_LOCK
+ ? static_cast<buf_block_t*>(slot.object)
+ : nullptr;
+ }
- /** X-latch a not yet latched block after a savepoint. */
- inline void x_latch_at_savepoint(ulint savepoint, buf_block_t* block);
+ /** Retrieve a page that has already been latched.
+ @param id page identifier
+ @param type page latch type
+ @return block
+ @retval nullptr if the block had not been latched yet */
+ buf_block_t *get_already_latched(const page_id_t id, mtr_memo_type_t type)
+ const;
/** @return the logging mode */
mtr_log_t get_log_mode() const
{
static_assert(MTR_LOG_ALL == 0, "efficiency");
- ut_ad(m_log_mode <= MTR_LOG_NO_REDO);
return static_cast<mtr_log_t>(m_log_mode);
}
+ /** @return whether log is to be written for changes */
+ bool is_logged() const
+ {
+ static_assert(MTR_LOG_ALL == 0, "efficiency");
+ static_assert(MTR_LOG_NONE & MTR_LOG_NO_REDO, "efficiency");
+ static_assert(!(MTR_LOG_NONE & MTR_LOG_SUB), "efficiency");
+ return !(m_log_mode & MTR_LOG_NONE);
+ }
+
/** Change the logging mode.
@param mode logging mode
@return old mode */
@@ -151,10 +176,23 @@ struct mtr_t {
return old_mode;
}
+ /** Set the log mode of a sub-minitransaction
+ @param mtr parent mini-transaction */
+ void set_log_mode_sub(const mtr_t &mtr)
+ {
+ ut_ad(mtr.m_log_mode == MTR_LOG_ALL || mtr.m_log_mode == MTR_LOG_NO_REDO);
+ m_log_mode= mtr.m_log_mode | MTR_LOG_SUB;
+ static_assert((MTR_LOG_SUB | MTR_LOG_NO_REDO) == MTR_LOG_NO_REDO, "");
+ }
+
/** Check if we are holding a block latch in exclusive mode
@param block buffer pool block to search for */
bool have_x_latch(const buf_block_t &block) const;
+ /** Check if we are holding a block latch in S or U mode
+ @param block buffer pool block to search for */
+ bool have_u_or_x_latch(const buf_block_t &block) const;
+
/** Copy the tablespaces associated with the mini-transaction
(needed for generating FILE_MODIFY records)
@param[in] mtr mini-transaction that may modify
@@ -214,89 +252,61 @@ struct mtr_t {
/** Acquire a tablespace X-latch.
@param[in] space_id tablespace ID
- @param[in] file file name from where called
- @param[in] line line number in file
@return the tablespace object (never NULL) */
- fil_space_t* x_lock_space(
- ulint space_id,
- const char* file,
- unsigned line);
-
- /** Acquire a shared rw-latch.
- @param[in] lock rw-latch
- @param[in] file file name from where called
- @param[in] line line number in file */
- void s_lock(rw_lock_t* lock, const char* file, unsigned line)
- {
- rw_lock_s_lock_inline(lock, 0, file, line);
- memo_push(lock, MTR_MEMO_S_LOCK);
- }
+ fil_space_t* x_lock_space(ulint space_id);
- /** Acquire an exclusive rw-latch.
- @param[in] lock rw-latch
- @param[in] file file name from where called
- @param[in] line line number in file */
- void x_lock(rw_lock_t* lock, const char* file, unsigned line)
- {
- rw_lock_x_lock_inline(lock, 0, file, line);
- memo_push(lock, MTR_MEMO_X_LOCK);
- }
+ /** Acquire a shared rw-latch. */
+ void s_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->s_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_S_LOCK);
+ }
- /** Acquire an shared/exclusive rw-latch.
- @param[in] lock rw-latch
- @param[in] file file name from where called
- @param[in] line line number in file */
- void sx_lock(rw_lock_t* lock, const char* file, unsigned line)
- {
- rw_lock_sx_lock_inline(lock, 0, file, line);
- memo_push(lock, MTR_MEMO_SX_LOCK);
- }
+ /** Acquire an exclusive rw-latch. */
+ void x_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->x_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_X_LOCK);
+ }
- /** Acquire a tablespace X-latch.
- @param[in] space tablespace
- @param[in] file file name from where called
- @param[in] line line number in file */
- void x_lock_space(fil_space_t* space, const char* file, unsigned line)
- {
- ut_ad(space->purpose == FIL_TYPE_TEMPORARY
- || space->purpose == FIL_TYPE_IMPORT
- || space->purpose == FIL_TYPE_TABLESPACE);
- memo_push(space, MTR_MEMO_SPACE_X_LOCK);
- rw_lock_x_lock_inline(&space->latch, 0, file, line);
- }
+ /** Acquire an update latch. */
+ void u_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->u_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_SX_LOCK);
+ }
- /** Acquire a tablespace SX-latch.
- @param[in] space tablespace
- @param[in] file file name from where called
- @param[in] line line number in file */
- void sx_lock_space(fil_space_t *space, const char *file, unsigned line)
- {
- ut_ad(space->purpose == FIL_TYPE_TEMPORARY
- || space->purpose == FIL_TYPE_IMPORT
- || space->purpose == FIL_TYPE_TABLESPACE);
- sx_lock(&space->latch, file, line);
- }
-
- /** Release an object in the memo stack.
- @param object object
- @param type object type
- @return bool if lock released */
- bool memo_release(const void* object, ulint type);
+ /** Acquire an exclusive tablespace latch.
+ @param space tablespace */
+ void x_lock_space(fil_space_t *space);
+ /** Release an index latch. */
+ void release(const index_lock &lock) { release(&lock); }
+ /** Release a latch to an unmodified page. */
+ void release(const buf_block_t &block) { release(&block); }
private:
- /** Note that the mini-transaction will modify data. */
- void flag_modified() { m_modifications = true; }
+ /** Release an unmodified object. */
+ void release(const void *object);
+public:
/** Mark the given latched page as modified.
@param block page that will be modified */
- void modify(const buf_block_t& block);
-public:
- /** Note that the mini-transaction will modify a block. */
- void set_modified(const buf_block_t &block)
- { flag_modified(); if (m_log_mode != MTR_LOG_NONE) modify(block); }
+ void set_modified(const buf_block_t &block);
/** Set the state to not-modified. This will not log the changes.
This is only used during redo log apply, to avoid logging the changes. */
- void discard_modifications() { m_modifications = false; }
+ void discard_modifications() { m_modifications= false; }
/** Get the LSN of commit().
@return the commit LSN
@@ -318,59 +328,127 @@ public:
/** @return true if pages has been trimed */
bool is_trim_pages() { return m_trim_pages; }
+ /** Latch a buffer pool block.
+ @param block block to be latched
+ @param rw_latch RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ void page_lock(buf_block_t *block, ulint rw_latch);
+
+ /** Acquire a latch on a buffer-fixed buffer pool block.
+ @param savepoint savepoint location of the buffer-fixed block
+ @param rw_latch latch to acquire */
+ void upgrade_buffer_fix(ulint savepoint, rw_lock_type_t rw_latch);
+
+ /** Register a change to the page latch state. */
+ void lock_register(ulint savepoint, mtr_memo_type_t type)
+ {
+ mtr_memo_slot_t &slot= m_memo[savepoint];
+ ut_ad(slot.type <= MTR_MEMO_BUF_FIX);
+ ut_ad(type <= MTR_MEMO_BUF_FIX);
+ slot.type= type;
+ }
+
+ /** Upgrade U locks on a block to X */
+ void page_lock_upgrade(const buf_block_t &block);
+
+ /** Upgrade index U lock to X */
+ ATTRIBUTE_COLD void index_lock_upgrade();
+
+ /** Check if we are holding tablespace latch
+ @param space tablespace to search for
+ @return whether space.latch is being held */
+ bool memo_contains(const fil_space_t& space) const
+ MY_ATTRIBUTE((warn_unused_result));
#ifdef UNIV_DEBUG
/** Check if we are holding an rw-latch in this mini-transaction
@param lock latch to search for
@param type held latch type
@return whether (lock,type) is contained */
- bool memo_contains(const rw_lock_t &lock, mtr_memo_type_t type)
- MY_ATTRIBUTE((warn_unused_result));
- /** Check if we are holding exclusive tablespace latch
- @param space tablespace to search for
- @return whether space.latch is being held */
- bool memo_contains(const fil_space_t& space)
+ bool memo_contains(const index_lock &lock, mtr_memo_type_t type) const
MY_ATTRIBUTE((warn_unused_result));
-
- /** Check if memo contains the given item.
- @param object object to search
- @param flags specify types of object (can be ORred) of
- MTR_MEMO_PAGE_S_FIX ... values
- @return true if contains */
- bool memo_contains_flagged(const void* ptr, ulint flags) const;
-
- /** Check if memo contains the given page.
- @param[in] ptr pointer to within buffer frame
- @param[in] flags specify types of object with OR of
- MTR_MEMO_PAGE_S_FIX... values
- @return the block
- @retval NULL if not found */
- buf_block_t* memo_contains_page_flagged(
- const byte* ptr,
- ulint flags) const;
-
- /** @return true if mini-transaction contains modifications. */
- bool has_modifications() const { return m_modifications; }
+ /** Check if memo contains an index or buffer block latch.
+ @param object object to search
+ @param flags specify types of object latches
+ @return true if contains */
+ bool memo_contains_flagged(const void *object, ulint flags) const
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+ /** Check if memo contains the given page.
+ @param ptr pointer to within page frame
+ @param flags types latch to look for
+ @return the block
+ @retval nullptr if not found */
+ buf_block_t *memo_contains_page_flagged(const byte *ptr, ulint flags) const;
+
+ /** @return whether this mini-transaction modifies persistent data */
+ bool has_modifications() const { return m_modifications; }
#endif /* UNIV_DEBUG */
- /** @return true if a record was added to the mini-transaction */
- bool is_dirty() const { return m_made_dirty; }
-
- /** Push an object to an mtr memo stack.
- @param object object
- @param type object type: MTR_MEMO_S_LOCK, ... */
- inline void memo_push(void* object, mtr_memo_type_t type);
+ /** Push a buffer page to an the memo.
+ @param block buffer block
+ @param type object type: MTR_MEMO_S_LOCK, ... */
+ void memo_push(buf_block_t *block, mtr_memo_type_t type)
+ __attribute__((nonnull))
+ {
+ ut_ad(is_active());
+ ut_ad(type <= MTR_MEMO_PAGE_SX_MODIFY);
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.in_file());
+#ifdef UNIV_DEBUG
+ switch (type) {
+ case MTR_MEMO_PAGE_S_FIX:
+ ut_ad(block->page.lock.have_s());
+ break;
+ case MTR_MEMO_PAGE_X_FIX: case MTR_MEMO_PAGE_X_MODIFY:
+ ut_ad(block->page.lock.have_x());
+ break;
+ case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_SX_MODIFY:
+ ut_ad(block->page.lock.have_u_or_x());
+ break;
+ case MTR_MEMO_BUF_FIX:
+ break;
+ case MTR_MEMO_MODIFY:
+ case MTR_MEMO_S_LOCK: case MTR_MEMO_X_LOCK: case MTR_MEMO_SX_LOCK:
+ case MTR_MEMO_SPACE_X_LOCK:
+ ut_ad("invalid type" == 0);
+ }
+#endif
+ if (!(type & MTR_MEMO_MODIFY));
+ else if (block->page.id().space() >= SRV_TMP_SPACE_ID)
+ {
+ block->page.set_temp_modified();
+ type= mtr_memo_type_t(type & ~MTR_MEMO_MODIFY);
+ }
+ else
+ {
+ m_modifications= true;
+ if (!m_made_dirty)
+ /* If we are going to modify a previously clean persistent page,
+ we must set m_made_dirty, so that commit() will acquire
+ log_sys.flush_order_mutex and insert the block into
+ buf_pool.flush_list. */
+ m_made_dirty= block->page.oldest_modification() <= 1;
+ }
+ m_memo.emplace_back(mtr_memo_slot_t{block, type});
+ }
- /** Check if this mini-transaction is dirtying a clean page.
- @param block block being x-fixed
- @return true if the mtr is dirtying a clean page. */
- static inline bool is_block_dirtied(const buf_block_t* block)
- MY_ATTRIBUTE((warn_unused_result));
+ /** Push an index lock or tablespace latch to the memo.
+ @param object index lock or tablespace latch
+ @param type object type: MTR_MEMO_S_LOCK, ... */
+ void memo_push(void *object, mtr_memo_type_t type) __attribute__((nonnull))
+ {
+ ut_ad(is_active());
+ ut_ad(type >= MTR_MEMO_S_LOCK);
+ m_memo.emplace_back(mtr_memo_slot_t{object, type});
+ }
/** @return the size of the log is empty */
size_t get_log_size() const { return m_log.size(); }
/** @return whether the log and memo are empty */
- bool is_empty() const { return m_memo.size() == 0 && m_log.size() == 0; }
+ bool is_empty() const { return !get_savepoint() && !get_log_size(); }
+
+ /** Write an OPT_PAGE_CHECKSUM record. */
+ inline void page_checksum(const buf_page_t &bpage);
/** Write request types */
enum write_type
@@ -470,9 +548,9 @@ public:
@param[in,out] b buffer page */
void init(buf_block_t *b);
/** Free a page.
- @param[in] space tablespace contains page to be freed
- @param[in] offset page offset to be freed */
- inline void free(fil_space_t &space, uint32_t offset);
+ @param space tablespace
+ @param offset offset of the page to be freed */
+ void free(const fil_space_t &space, uint32_t offset);
/** Write log for partly initializing a B-tree or R-tree page.
@param block B-tree or R-tree page
@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
@@ -618,6 +696,8 @@ private:
@return {start_lsn,flush_ahead} */
inline std::pair<lsn_t,page_flush_ahead> finish_write(ulint len);
+ /** Release all latches. */
+ void release();
/** Release the resources */
inline void release_resources();
@@ -628,11 +708,17 @@ public:
{ ut_ad(!m_commit || m_start); return m_start && !m_commit; }
/** @return whether the mini-transaction has been committed */
bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
+ /** @return whether the mini-transaction is freeing an index tree */
+ bool is_freeing_tree() const { return m_freeing_tree; }
+ /** Notify that the mini-transaction is freeing an index tree */
+ void freeing_tree() { m_freeing_tree= true; }
private:
/** whether start() has been called */
bool m_start= false;
/** whether commit() has been called */
bool m_commit= false;
+ /** whether freeing_tree() has been called */
+ bool m_freeing_tree= false;
#endif
/** The page of the most recent m_log record written, or NULL */
@@ -643,7 +729,7 @@ private:
/** specifies which operations should be logged; default MTR_LOG_ALL */
uint16_t m_log_mode:2;
- /** whether at least one buffer pool page was written to */
+ /** whether at least one persistent page was written to */
uint16_t m_modifications:1;
/** whether at least one previously clean buffer pool page was written to */
@@ -663,7 +749,7 @@ private:
#endif /* UNIV_DEBUG */
/** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
- mtr_buf_t m_memo;
+ small_vector<mtr_memo_slot_t, 16> m_memo;
/** mini-transaction log */
mtr_buf_t m_log;
@@ -679,7 +765,3 @@ private:
/** set of freed page ids */
range_set *m_freed_pages= nullptr;
};
-
-#include "mtr0mtr.inl"
-
-#endif /* mtr0mtr_h */
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
index 9e59dc814d3..465c20fe7d2 100644
--- a/storage/innobase/include/mtr0types.h
+++ b/storage/innobase/include/mtr0types.h
@@ -24,14 +24,11 @@ Mini-transaction buffer global types
Created 11/26/1995 Heikki Tuuri
*******************************************************/
-#ifndef mtr0types_h
-#define mtr0types_h
+#pragma once
-#ifndef UNIV_INNOCHECKSUM
-#include "sync0rw.h"
-#else
-#include "univ.i"
-#endif /* UNIV_INNOCHECKSUM */
+#include "buf0types.h"
+
+#include "ut0byte.h"
struct mtr_t;
@@ -44,6 +41,11 @@ enum mtr_log_t {
Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */
MTR_LOG_NONE,
+ /** Log all operations, but do not write any OPT_PAGE_CHECKSUM
+ records because some of the modified pages were also modified
+ by another mini-transaction that did not write its log yet. */
+ MTR_LOG_SUB,
+
/** Don't generate REDO log but add dirty pages to flush list */
MTR_LOG_NO_REDO
};
@@ -80,12 +82,8 @@ type. The following record types refer to data pages:
RESERVED (6): reserved for future use; a subtype code
(encoded immediately after the length) would be written
to reserve code space for further extensions
- OPTION (7): optional record that may be ignored; a subtype code
- (encoded immediately after the length) would distinguish actual
- usage, such as:
- * MDEV-18976 page checksum record
- * binlog record
- * SQL statement (at the start of statement)
+ OPTION (7): optional record that may be ignored; a subtype @see mrec_opt
+ (encoded immediately after the length) would distinguish actual usage
Bits 3..0 indicate the redo log record length, excluding the first
byte, but including additional length bytes and any other bytes,
@@ -232,9 +230,7 @@ enum mrec_type_t
/** Reserved for future use. */
RESERVED= 0x60,
/** Optional record that may be ignored in crash recovery.
- A subtype code will be encoded immediately after the length.
- Possible subtypes would include a MDEV-18976 page checksum record,
- a binlog record, or an SQL statement. */
+ A subtype (@see mrec_opt) will be encoded after the page identifier. */
OPTION= 0x70
};
@@ -286,6 +282,15 @@ enum mrec_ext_t
};
+/** Recognized OPTION record subtypes. */
+enum mrec_opt
+{
+ /** page checksum at the end of the mini-transaction */
+ OPT_PAGE_CHECKSUM= 0
+ /* Other possible subtypes: a binlog record, or an SQL statement. */
+};
+
+
/** Redo log record types for file-level operations. These bit
patterns will be written to redo log files, so the existing codes or
their interpretation on crash recovery must not be changed. */
@@ -331,9 +336,7 @@ enum mtr_memo_type_t {
MTR_MEMO_SX_LOCK = RW_SX_LATCH << 5,
- /** acquire X-latch on fil_space_t::latch */
+ /** wr_lock() on fil_space_t::latch */
MTR_MEMO_SPACE_X_LOCK = MTR_MEMO_SX_LOCK << 1
};
-#endif /* !UNIV_CHECKSUM */
-
-#endif /* mtr0types_h */
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index a22dc3562b5..f8ae0f51557 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -51,6 +51,8 @@ extern bool os_has_said_disk_full;
/** File offset in bytes */
typedef ib_uint64_t os_offset_t;
+class buf_tmp_buffer_t;
+
#ifdef _WIN32
/** We define always WIN_ASYNC_IO, and check at run-time whether
@@ -206,11 +208,13 @@ public:
PUNCH_RANGE= WRITE_SYNC | 128,
};
- constexpr IORequest(buf_page_t *bpage, fil_node_t *node, Type type) :
- bpage(bpage), node(node), type(type) {}
+ constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot,
+ fil_node_t *node, Type type) :
+ bpage(bpage), slot(slot), node(node), type(type) {}
- constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr) :
- bpage(bpage), type(type) {}
+ constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr,
+ buf_tmp_buffer_t *slot= nullptr) :
+ bpage(bpage), slot(slot), type(type) {}
bool is_read() const { return (type & READ_SYNC) != 0; }
bool is_write() const { return (type & WRITE_SYNC) != 0; }
@@ -237,7 +241,10 @@ private:
public:
/** Page to be written on write operation */
- buf_page_t* const bpage= nullptr;
+ buf_page_t *const bpage= nullptr;
+
+ /** Memory to be used for encrypted or page_compressed pages */
+ buf_tmp_buffer_t *const slot= nullptr;
/** File descriptor */
fil_node_t *const node= nullptr;
@@ -263,8 +270,8 @@ struct os_file_size_t {
constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256;
extern Atomic_counter<ulint> os_n_file_reads;
-extern ulint os_n_file_writes;
-extern ulint os_n_fsyncs;
+extern Atomic_counter<size_t> os_n_file_writes;
+extern Atomic_counter<size_t> os_n_fsyncs;
/* File types for directory entry data type */
@@ -575,12 +582,8 @@ The wrapper functions have the prefix of "innodb_". */
# define os_file_close(file) \
pfs_os_file_close_func(file, __FILE__, __LINE__)
-# define os_file_read(type, file, buf, offset, n) \
- pfs_os_file_read_func(type, file, buf, offset, n, __FILE__, __LINE__)
-
-# define os_file_read_no_error_handling(type, file, buf, offset, n, o) \
- pfs_os_file_read_no_error_handling_func( \
- type, file, buf, offset, n, o, __FILE__, __LINE__)
+# define os_file_read(type, file, buf, offset, n, o) \
+ pfs_os_file_read_func(type, file, buf, offset, n,o, __FILE__, __LINE__)
# define os_file_write(type, name, file, buf, offset, n) \
pfs_os_file_write_func(type, name, file, buf, offset, \
@@ -725,31 +728,6 @@ pfs_os_file_read_func(
void* buf,
os_offset_t offset,
ulint n,
- const char* src_file,
- uint src_line);
-
-/** NOTE! Please use the corresponding macro os_file_read_no_error_handling(),
-not directly this function!
-This is the performance schema instrumented wrapper function for
-os_file_read_no_error_handling_func() which requests a synchronous
-read operation.
-@param[in] type IO request context
-@param[in] file Open file handle
-@param[out] buf buffer where to read
-@param[in] offset file offset where to read
-@param[in] n number of bytes to read
-@param[out] o number of bytes actually read
-@param[in] src_file file name where func invoked
-@param[in] src_line line where the func invoked
-@return DB_SUCCESS if request was successful */
-UNIV_INLINE
-dberr_t
-pfs_os_file_read_no_error_handling_func(
- const IORequest& type,
- pfs_os_file_t file,
- void* buf,
- os_offset_t offset,
- ulint n,
ulint* o,
const char* src_file,
uint src_line);
@@ -875,11 +853,8 @@ to original un-instrumented file I/O APIs */
# define os_file_close(file) os_file_close_func(file)
-# define os_file_read(type, file, buf, offset, n) \
- os_file_read_func(type, file, buf, offset, n)
-
-# define os_file_read_no_error_handling(type, file, buf, offset, n, o) \
- os_file_read_no_error_handling_func(type, file, buf, offset, n, o)
+# define os_file_read(type, file, buf, offset, n, o) \
+ os_file_read_func(type, file, buf, offset, n, o)
# define os_file_write(type, name, file, buf, offset, n) \
os_file_write_func(type, name, file, buf, offset, n)
@@ -985,6 +960,7 @@ Requests a synchronous read operation.
@param[out] buf buffer where to read
@param[in] offset file offset where to read
@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
@return DB_SUCCESS if request was successful */
dberr_t
os_file_read_func(
@@ -992,7 +968,8 @@ os_file_read_func(
os_file_t file,
void* buf,
os_offset_t offset,
- ulint n)
+ ulint n,
+ ulint* o)
MY_ATTRIBUTE((warn_unused_result));
/** Rewind file to its start, read at most size - 1 bytes from it to str, and
@@ -1007,27 +984,6 @@ os_file_read_string(
char* str,
ulint size);
-/** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
-not directly this function!
-Requests a synchronous positioned read operation. This function does not do
-any error handling. In case of error it returns FALSE.
-@param[in] type IO request context
-@param[in] file Open file handle
-@param[out] buf buffer where to read
-@param[in] offset file offset where to read
-@param[in] n number of bytes to read
-@param[out] o number of bytes actually read
-@return DB_SUCCESS or error code */
-dberr_t
-os_file_read_no_error_handling_func(
- const IORequest& type,
- os_file_t file,
- void* buf,
- os_offset_t offset,
- ulint n,
- ulint* o)
- MY_ATTRIBUTE((warn_unused_result));
-
/** NOTE! Use the corresponding macro os_file_write(), not directly this
function!
Requests a synchronous write operation.
@@ -1058,23 +1014,6 @@ os_file_status(
bool* exists,
os_file_type_t* type);
-/** This function returns a new path name after replacing the basename
-in an old path with a new basename. The old_path is a full path
-name including the extension. The tablename is in the normal
-form "databasename/tablename". The new base name is found after
-the forward slash. Both input strings are null terminated.
-
-This function allocates memory to be returned. It is the callers
-responsibility to free the return value after it is no longer needed.
-
-@param[in] old_path pathname
-@param[in] new_name new file name
-@return own: new full pathname */
-char*
-os_file_make_new_pathname(
- const char* old_path,
- const char* new_name);
-
/** This function reduces a null-terminated full remote path name into
the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
the 'databasename/tablename.ibd' found at the end of the path with just
@@ -1120,14 +1059,19 @@ void os_aio_free();
@retval DB_IO_ERROR on I/O error */
dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
+/** @return number of pending reads */
+size_t os_aio_pending_reads();
+/** @return approximate number of pending reads */
+size_t os_aio_pending_reads_approx();
+/** @return number of pending writes */
+size_t os_aio_pending_writes();
+
/** Wait until there are no pending asynchronous writes. */
void os_aio_wait_until_no_pending_writes();
-
-/** Wait until there are no pending asynchronous reads. */
+/** Wait until all pending asynchronous reads have completed. */
void os_aio_wait_until_no_pending_reads();
-
/** Prints info of the aio arrays.
@param[in/out] file file where to print */
void
@@ -1208,31 +1152,34 @@ os_file_punch_hole(
os_offset_t len)
MY_ATTRIBUTE((warn_unused_result));
-/** Normalizes a directory path for the current OS:
-On Windows, we convert '/' to '\', else we convert '\' to '/'.
-@param[in,out] str A null-terminated directory and file path */
-void os_normalize_path(char* str);
-
/* Determine if a path is an absolute path or not.
@param[in] OS directory or file path to evaluate
@retval true if an absolute path
@retval false if a relative path */
-UNIV_INLINE
-bool
-is_absolute_path(
- const char* path)
+inline bool is_absolute_path(const char *path)
{
- if (path[0] == OS_PATH_SEPARATOR) {
- return(true);
- }
+ switch (path[0]) {
+#ifdef _WIN32
+ case '\0':
+ return false;
+ case '\\':
+#endif
+ case '/':
+ return true;
+ }
#ifdef _WIN32
- if (path[1] == ':' && path[2] == OS_PATH_SEPARATOR) {
- return(true);
- }
+ if (path[1] == ':')
+ {
+ switch (path[2]) {
+ case '/':
+ case '\\':
+ return true;
+ }
+ }
#endif /* _WIN32 */
- return(false);
+ return false;
}
#include "os0file.inl"
diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl
index e88f94b8ff3..7de3150540d 100644
--- a/storage/innobase/include/os0file.inl
+++ b/storage/innobase/include/os0file.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2020, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -210,6 +210,7 @@ os_file_read() which requests a synchronous read operation.
@param[out] buf buffer where to read
@param[in] offset file offset where to read
@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
@param[in] src_file file name where func invoked
@param[in] src_line line where the func invoked
@return DB_SUCCESS if request was successful */
@@ -221,6 +222,7 @@ pfs_os_file_read_func(
void* buf,
os_offset_t offset,
ulint n,
+ ulint* o,
const char* src_file,
uint src_line)
{
@@ -232,47 +234,7 @@ pfs_os_file_read_func(
dberr_t result;
- result = os_file_read_func(type, file, buf, offset, n);
-
- register_pfs_file_io_end(locker, n);
-
- return(result);
-}
-
-/** NOTE! Please use the corresponding macro os_file_read_no_error_handling(),
-not directly this function!
-This is the performance schema instrumented wrapper function for
-os_file_read_no_error_handling_func() which requests a synchronous
-read operation.
-@param[in] type IO request context
-@param[in] file Open file handle
-@param[out] buf buffer where to read
-@param[in] offset file offset where to read
-@param[in] n number of bytes to read
-@param[out] o number of bytes actually read
-@param[in] src_file file name where func invoked
-@param[in] src_line line where the func invoked
-@return DB_SUCCESS if request was successful */
-UNIV_INLINE
-dberr_t
-pfs_os_file_read_no_error_handling_func(
- const IORequest& type,
- pfs_os_file_t file,
- void* buf,
- os_offset_t offset,
- ulint n,
- ulint* o,
- const char* src_file,
- uint src_line)
-{
- PSI_file_locker_state state;
- struct PSI_file_locker* locker = NULL;
-
- register_pfs_file_io_begin(
- &state, locker, file, n, PSI_FILE_READ, src_file, src_line);
-
- dberr_t result = os_file_read_no_error_handling_func(
- type, file, buf, offset, n, o);
+ result = os_file_read_func(type, file, buf, offset, n, o);
register_pfs_file_io_end(locker, n);
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
index d80eb4567e5..28aa30565e4 100644
--- a/storage/innobase/include/page0cur.h
+++ b/storage/innobase/include/page0cur.h
@@ -54,14 +54,11 @@ page_zip_des_t*
page_cur_get_page_zip(
/*==================*/
page_cur_t* cur); /*!< in: page cursor */
-/*********************************************************//**
-Gets the record where the cursor is positioned.
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
@return record */
UNIV_INLINE
-rec_t*
-page_cur_get_rec(
-/*=============*/
- page_cur_t* cur); /*!< in: page cursor */
+rec_t *page_cur_get_rec(const page_cur_t *cur);
#else /* UNIV_DEBUG */
# define page_cur_get_page(cur) page_align((cur)->rec)
# define page_cur_get_block(cur) (cur)->block
@@ -113,20 +110,6 @@ page_cur_position(
const buf_block_t* block, /*!< in: buffer block containing
the record */
page_cur_t* cur); /*!< out: page cursor */
-/**********************************************************//**
-Moves the cursor to the next record on page. */
-UNIV_INLINE
-void
-page_cur_move_to_next(
-/*==================*/
- page_cur_t* cur); /*!< in/out: cursor; must not be after last */
-/**********************************************************//**
-Moves the cursor to the previous record on page. */
-UNIV_INLINE
-void
-page_cur_move_to_prev(
-/*==================*/
- page_cur_t* cur); /*!< in/out: cursor; not before first */
/***********************************************************//**
Inserts a record next to page cursor. Returns pointer to inserted record if
@@ -146,7 +129,6 @@ page_cur_tuple_insert(
/*==================*/
page_cur_t* cursor, /*!< in/out: a page cursor */
const dtuple_t* tuple, /*!< in: pointer to a data tuple */
- dict_index_t* index, /*!< in: record descriptor */
rec_offs** offsets,/*!< out: offsets on *rec */
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
ulint n_ext, /*!< in: number of externally stored columns */
@@ -160,7 +142,6 @@ rec_t*
page_cur_insert_rec_low(
/*====================*/
const page_cur_t*cur, /*!< in: page cursor */
- dict_index_t* index, /*!< in: record descriptor */
const rec_t* rec, /*!< in: record to insert after cur */
rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
@@ -168,21 +149,20 @@ page_cur_insert_rec_low(
/***********************************************************//**
Inserts a record next to page cursor on a compressed and uncompressed
-page. Returns pointer to inserted record if succeed, i.e.,
-enough space available, NULL otherwise.
-The cursor stays at the same position.
+page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().
-@return pointer to record if succeed, NULL otherwise */
+@return pointer to inserted record
+@return nullptr on failure */
rec_t*
page_cur_insert_rec_zip(
/*====================*/
- page_cur_t* cursor, /*!< in/out: page cursor */
- dict_index_t* index, /*!< in: record descriptor */
+ page_cur_t* cursor, /*!< in/out: page cursor,
+ logical position unchanged */
const rec_t* rec, /*!< in: pointer to a physical record */
rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
@@ -194,7 +174,6 @@ void
page_cur_delete_rec(
/*================*/
page_cur_t* cursor, /*!< in/out: a page cursor */
- const dict_index_t* index, /*!< in: record descriptor */
const rec_offs* offsets,/*!< in: rec_get_offsets(
cursor->rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
@@ -250,43 +229,12 @@ page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
size_t hdr_size, size_t data_size);
-/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
-@param[in] tuple data tuple
-@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE
-@param[out] cursor page cursor
-@return number of matched fields on the left */
-UNIV_INLINE
-ulint
-page_cur_search(
- const buf_block_t* block,
- const dict_index_t* index,
- const dtuple_t* tuple,
- page_cur_mode_t mode,
- page_cur_t* cursor);
-
-/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
-@param[in] tuple data tuple
-@param[out] cursor page cursor
-@return number of matched fields on the left */
-UNIV_INLINE
-ulint
-page_cur_search(
- const buf_block_t* block,
- const dict_index_t* index,
- const dtuple_t* tuple,
- page_cur_t* cursor);
-
+MY_ATTRIBUTE((warn_unused_result))
/****************************************************************//**
Searches the right position for a page cursor. */
-void
+bool
page_cur_search_with_match(
/*=======================*/
- const buf_block_t* block, /*!< in: buffer block */
- const dict_index_t* index, /*!< in: record descriptor */
const dtuple_t* tuple, /*!< in: data tuple */
page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
PAGE_CUR_LE, PAGE_CUR_G, or
@@ -297,12 +245,11 @@ page_cur_search_with_match(
ulint* ilow_matched_fields,
/*!< in/out: already matched
fields in lower limit record */
- page_cur_t* cursor, /*!< out: page cursor */
+ page_cur_t* cursor, /*!< in/out: page cursor */
rtr_info_t* rtr_info);/*!< in/out: rtree search stack */
#ifdef BTR_CUR_HASH_ADAPT
+MY_ATTRIBUTE((warn_unused_result))
/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
@param[in] tuple key to be searched for
@param[in] mode search mode
@param[in,out] iup_matched_fields already matched fields in the
@@ -313,11 +260,9 @@ first partially matched field in the upper limit record
lower limit record
@param[in,out] ilow_matched_bytes already matched bytes in the
first partially matched field in the lower limit record
-@param[out] cursor page cursor */
-void
+@param[in,out] cursor page cursor */
+bool
page_cur_search_with_match_bytes(
- const buf_block_t* block,
- const dict_index_t* index,
const dtuple_t* tuple,
page_cur_mode_t mode,
ulint* iup_matched_fields,
@@ -329,21 +274,30 @@ page_cur_search_with_match_bytes(
/***********************************************************//**
Positions a page cursor on a randomly chosen user record on a page. If there
are no user records, sets the cursor on the infimum record. */
-void
-page_cur_open_on_rnd_user_rec(
-/*==========================*/
- buf_block_t* block, /*!< in: page */
- page_cur_t* cursor);/*!< out: page cursor */
+void page_cur_open_on_rnd_user_rec(page_cur_t *cursor);
/** Index page cursor */
struct page_cur_t{
- const dict_index_t* index;
+ dict_index_t* index;
rec_t* rec; /*!< pointer to a record on page */
rec_offs* offsets;
buf_block_t* block; /*!< pointer to the block containing rec */
};
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_next(page_cur_t *cur)
+{
+ return cur->rec= page_rec_get_next(cur->rec);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_prev(page_cur_t *cur)
+{
+ return cur->rec= page_rec_get_prev(cur->rec);
+}
+
#include "page0cur.inl"
#endif
diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl
index 828be6840d2..1638b5749ff 100644
--- a/storage/innobase/include/page0cur.inl
+++ b/storage/innobase/include/page0cur.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -34,13 +34,7 @@ page_cur_get_page(
/*==============*/
page_cur_t* cur) /*!< in: page cursor */
{
- ut_ad(cur);
-
- if (cur->rec) {
- ut_ad(page_align(cur->rec) == cur->block->frame);
- }
-
- return(page_align(cur->rec));
+ return page_align(page_cur_get_rec(cur));
}
/*********************************************************//**
@@ -52,13 +46,9 @@ page_cur_get_block(
/*===============*/
page_cur_t* cur) /*!< in: page cursor */
{
- ut_ad(cur);
-
- if (cur->rec) {
- ut_ad(page_align(cur->rec) == cur->block->frame);
- }
-
- return(cur->block);
+ ut_ad(cur);
+ ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+ return cur->block;
}
/*********************************************************//**
@@ -73,22 +63,15 @@ page_cur_get_page_zip(
return(buf_block_get_page_zip(page_cur_get_block(cur)));
}
-/*********************************************************//**
-Gets the record where the cursor is positioned.
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
@return record */
UNIV_INLINE
-rec_t*
-page_cur_get_rec(
-/*=============*/
- page_cur_t* cur) /*!< in: page cursor */
+rec_t *page_cur_get_rec(const page_cur_t *cur)
{
- ut_ad(cur);
-
- if (cur->rec) {
- ut_ad(page_align(cur->rec) == cur->block->frame);
- }
-
- return(cur->rec);
+ ut_ad(cur);
+ ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+ return cur->rec;
}
#endif /* UNIV_DEBUG */
@@ -102,7 +85,7 @@ page_cur_set_before_first(
const buf_block_t* block, /*!< in: index page */
page_cur_t* cur) /*!< in: cursor */
{
- cur->block = (buf_block_t*) block;
+ cur->block = const_cast<buf_block_t*>(block);
cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
}
@@ -116,7 +99,7 @@ page_cur_set_after_last(
const buf_block_t* block, /*!< in: index page */
page_cur_t* cur) /*!< in: cursor */
{
- cur->block = (buf_block_t*) block;
+ cur->block = const_cast<buf_block_t*>(block);
cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
}
@@ -130,7 +113,7 @@ page_cur_is_before_first(
const page_cur_t* cur) /*!< in: cursor */
{
ut_ad(cur);
- ut_ad(page_align(cur->rec) == cur->block->frame);
+ ut_ad(page_align(cur->rec) == cur->block->page.frame);
return(page_rec_is_infimum(cur->rec));
}
@@ -144,7 +127,7 @@ page_cur_is_after_last(
const page_cur_t* cur) /*!< in: cursor */
{
ut_ad(cur);
- ut_ad(page_align(cur->rec) == cur->block->frame);
+ ut_ad(page_align(cur->rec) == cur->block->page.frame);
return(page_rec_is_supremum(cur->rec));
}
@@ -160,81 +143,12 @@ page_cur_position(
page_cur_t* cur) /*!< out: page cursor */
{
ut_ad(rec && block && cur);
- ut_ad(page_align(rec) == block->frame);
+ ut_ad(page_align(rec) == block->page.frame);
cur->rec = (rec_t*) rec;
cur->block = (buf_block_t*) block;
}
-/**********************************************************//**
-Moves the cursor to the next record on page. */
-UNIV_INLINE
-void
-page_cur_move_to_next(
-/*==================*/
- page_cur_t* cur) /*!< in/out: cursor; must not be after last */
-{
- ut_ad(!page_cur_is_after_last(cur));
-
- cur->rec = page_rec_get_next(cur->rec);
-}
-
-/**********************************************************//**
-Moves the cursor to the previous record on page. */
-UNIV_INLINE
-void
-page_cur_move_to_prev(
-/*==================*/
- page_cur_t* cur) /*!< in/out: page cursor, not before first */
-{
- ut_ad(!page_cur_is_before_first(cur));
-
- cur->rec = page_rec_get_prev(cur->rec);
-}
-
-/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
-@param[in] tuple data tuple
-@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE
-@param[out] cursor page cursor
-@return number of matched fields on the left */
-UNIV_INLINE
-ulint
-page_cur_search(
- const buf_block_t* block,
- const dict_index_t* index,
- const dtuple_t* tuple,
- page_cur_mode_t mode,
- page_cur_t* cursor)
-{
- ulint low_match = 0;
- ulint up_match = 0;
-
- ut_ad(dtuple_check_typed(tuple));
-
- page_cur_search_with_match(block, index, tuple, mode,
- &up_match, &low_match, cursor, NULL);
- return(low_match);
-}
-
-/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
-@param[in] tuple data tuple
-@param[out] cursor page cursor
-@return number of matched fields on the left */
-UNIV_INLINE
-ulint
-page_cur_search(
- const buf_block_t* block,
- const dict_index_t* index,
- const dtuple_t* tuple,
- page_cur_t* cursor)
-{
- return(page_cur_search(block, index, tuple, PAGE_CUR_LE, cursor));
-}
-
/***********************************************************//**
Inserts a record next to page cursor. Returns pointer to inserted record if
succeed, i.e., enough space available, NULL otherwise. The cursor stays at
@@ -253,14 +167,12 @@ page_cur_tuple_insert(
/*==================*/
page_cur_t* cursor, /*!< in/out: a page cursor */
const dtuple_t* tuple, /*!< in: pointer to a data tuple */
- dict_index_t* index, /*!< in: record descriptor */
rec_offs** offsets,/*!< out: offsets on *rec */
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
ulint n_ext, /*!< in: number of externally stored columns */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- rec_t* rec;
- ulint size = rec_get_converted_size(index, tuple, n_ext);
+ ulint size = rec_get_converted_size(cursor->index, tuple, n_ext);
if (!*heap) {
*heap = mem_heap_create(size
@@ -269,21 +181,20 @@ page_cur_tuple_insert(
* sizeof **offsets);
}
- rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size),
- index, tuple, n_ext);
+ rec_t* rec = rec_convert_dtuple_to_rec(
+ static_cast<byte*>(mem_heap_alloc(*heap, size)),
+ cursor->index, tuple, n_ext);
- *offsets = rec_get_offsets(rec, index, *offsets,
- page_is_leaf(cursor->block->frame)
- ? index->n_core_fields : 0,
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ page_is_leaf(cursor->block->page.frame)
+ ? cursor->index->n_core_fields : 0,
ULINT_UNDEFINED, heap);
ut_ad(size == rec_offs_size(*offsets));
if (is_buf_block_get_page_zip(cursor->block)) {
- rec = page_cur_insert_rec_zip(
- cursor, index, rec, *offsets, mtr);
+ rec = page_cur_insert_rec_zip(cursor, rec, *offsets, mtr);
} else {
- rec = page_cur_insert_rec_low(cursor,
- index, rec, *offsets, mtr);
+ rec = page_cur_insert_rec_low(cursor, rec, *offsets, mtr);
}
ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets));
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
index eb6bf56e8dd..0ad42474f84 100644
--- a/storage/innobase/include/page0page.h
+++ b/storage/innobase/include/page0page.h
@@ -1,6 +1,6 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -418,8 +418,8 @@ template<bool compressed>
inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned,
bool comp, mtr_t *mtr)
{
- ut_ad(block->frame == page_align(rec));
- ut_ad(comp == (page_is_comp(block->frame) != 0));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(comp == (page_is_comp(block->page.frame) != 0));
if (page_zip_des_t *page_zip= compressed
? buf_block_get_page_zip(block) : nullptr)
@@ -534,7 +534,8 @@ inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
/************************************************************//**
Returns the nth record of the record list.
This is the inverse function of page_rec_get_n_recs_before().
-@return nth record */
+@return nth record
+@retval nullptr on corrupted page */
const rec_t*
page_rec_get_nth_const(
/*===================*/
@@ -544,14 +545,12 @@ page_rec_get_nth_const(
/************************************************************//**
Returns the nth record of the record list.
This is the inverse function of page_rec_get_n_recs_before().
-@return nth record */
-UNIV_INLINE
-rec_t*
-page_rec_get_nth(
-/*=============*/
- page_t* page, /*< in: page */
- ulint nth) /*!< in: nth record */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+@return nth record
+@retval nullptr on corrupted page */
+inline rec_t *page_rec_get_nth(page_t* page, ulint nth)
+{
+ return const_cast<rec_t*>(page_rec_get_nth_const(page, nth));
+}
/************************************************************//**
Returns the middle record of the records on the page. If there is an
@@ -592,15 +591,11 @@ page_get_n_recs(
/*============*/
const page_t* page); /*!< in: index page */
-/***************************************************************//**
-Returns the number of records before the given record in chain.
-The number includes infimum and supremum records.
-This is the inverse function of page_rec_get_nth().
-@return number of records */
-ulint
-page_rec_get_n_recs_before(
-/*=======================*/
- const rec_t* rec); /*!< in: the physical record */
+/** Return the number of preceding records in an index page.
+@param rec index record
+@return number of preceding records, including the infimum pseudo-record
+@retval ULINT_UNDEFINED on corrupted page */
+ulint page_rec_get_n_recs_before(const rec_t *rec);
/*************************************************************//**
Gets the number of records in the heap.
@return number of user records */
@@ -649,6 +644,23 @@ inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot)
{
return page_dir_slot_get_rec(const_cast<rec_t*>(slot));
}
+
+inline rec_t *page_dir_slot_get_rec_validate(page_dir_slot_t *slot)
+{
+ const size_t s= mach_read_from_2(my_assume_aligned<2>(slot));
+ page_t *page= page_align(slot);
+
+ return UNIV_LIKELY(s >= PAGE_NEW_INFIMUM &&
+ s <= page_header_get_field(page, PAGE_HEAP_TOP))
+ ? page + s
+ : nullptr;
+}
+inline const rec_t *page_dir_slot_get_rec_validate(const page_dir_slot_t *slot)
+{
+ return page_dir_slot_get_rec_validate(const_cast<rec_t*>(slot));
+}
+
+
/***************************************************************//**
Gets the number of records owned by a directory slot.
@return number of records */
@@ -669,7 +681,8 @@ page_dir_calc_reserved_space(
ulint n_recs); /*!< in: number of records */
/***************************************************************//**
Looks for the directory slot which owns the given record.
-@return the directory slot number */
+@return the directory slot number
+@retval ULINT_UNDEFINED on corruption */
ulint
page_dir_find_owner_slot(
/*=====================*/
@@ -752,19 +765,9 @@ page_rec_get_next_const(
/*====================*/
const rec_t* rec); /*!< in: pointer to record */
/************************************************************//**
-Gets the pointer to the next non delete-marked record on the page.
-If all subsequent records are delete-marked, then this function
-will return the supremum record.
-@return pointer to next non delete-marked record or pointer to supremum */
-UNIV_INLINE
-const rec_t*
-page_rec_get_next_non_del_marked(
-/*=============================*/
- const rec_t* rec); /*!< in: pointer to record */
-/************************************************************//**
Gets the pointer to the previous record.
-@return pointer to previous record */
-UNIV_INLINE
+@return pointer to previous record
+@retval nullptr on error */
const rec_t*
page_rec_get_prev_const(
/*====================*/
@@ -772,13 +775,13 @@ page_rec_get_prev_const(
infimum */
/************************************************************//**
Gets the pointer to the previous record.
-@return pointer to previous record */
-UNIV_INLINE
-rec_t*
-page_rec_get_prev(
-/*==============*/
- rec_t* rec); /*!< in: pointer to record,
- must not be page infimum */
+@param rec record (not page infimum)
+@return pointer to previous record
+@retval nullptr on error */
+inline rec_t *page_rec_get_prev(rec_t *rec)
+{
+ return const_cast<rec_t*>(page_rec_get_prev_const(rec));
+}
/************************************************************//**
true if the record is the first user record on a page.
@@ -792,17 +795,6 @@ page_rec_is_first(
MY_ATTRIBUTE((warn_unused_result));
/************************************************************//**
-true if the record is the second user record on a page.
-@return true if the second user record */
-UNIV_INLINE
-bool
-page_rec_is_second(
-/*===============*/
- const rec_t* rec, /*!< in: record */
- const page_t* page) /*!< in: page */
- MY_ATTRIBUTE((warn_unused_result));
-
-/************************************************************//**
true if the record is the last user record on a page.
@return true if the last user record */
UNIV_INLINE
@@ -814,33 +806,6 @@ page_rec_is_last(
MY_ATTRIBUTE((warn_unused_result));
/************************************************************//**
-true if distance between the records (measured in number of times we have to
-move to the next record) is at most the specified value
-@param[in] left_rec lefter record
-@param[in] right_rec righter record
-@param[in] val specified value to compare
-@return true if the distance is smaller than the value */
-UNIV_INLINE
-bool
-page_rec_distance_is_at_most(
-/*=========================*/
- const rec_t* left_rec,
- const rec_t* right_rec,
- ulint val)
- MY_ATTRIBUTE((warn_unused_result));
-
-/************************************************************//**
-true if the record is the second last user record on a page.
-@return true if the second last user record */
-UNIV_INLINE
-bool
-page_rec_is_second_last(
-/*====================*/
- const rec_t* rec, /*!< in: record */
- const page_t* page) /*!< in: page */
- MY_ATTRIBUTE((warn_unused_result));
-
-/************************************************************//**
Returns the maximum combined size of records which can be inserted on top
of record heap.
@return maximum combined size for inserted records */
@@ -930,6 +895,8 @@ page_create_empty(
dict_index_t* index, /*!< in: the index of the page */
mtr_t* mtr) /*!< in/out: mini-transaction */
MY_ATTRIBUTE((nonnull(1,2)));
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/*************************************************************//**
Differs from page_copy_rec_list_end, because this function does not
touch the lock table and max trx id on page or compress the page.
@@ -937,8 +904,10 @@ touch the lock table and max trx id on page or compress the page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit(). */
-void
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
+
+@return error code */
+dberr_t
page_copy_rec_list_end_no_locks(
/*============================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -954,10 +923,10 @@ The records are copied to the start of the record list on new_page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
-@return pointer to the original successor of the infimum record on
-new_page, or NULL on zip overflow (new_block will be decompressed) */
+@return pointer to the original successor of the infimum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
page_copy_rec_list_end(
/*===================*/
@@ -965,8 +934,9 @@ page_copy_rec_list_end(
buf_block_t* block, /*!< in: index page containing rec */
rec_t* rec, /*!< in: record on page */
dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull(1,2,3,4,5), warn_unused_result));
/*************************************************************//**
Copies records from page to new_page, up to the given record, NOT
including that record. Infimum and supremum records are not copied.
@@ -977,8 +947,8 @@ if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().
-@return pointer to the original predecessor of the supremum record on
-new_page, or NULL on zip overflow (new_block will be decompressed) */
+@return pointer to the original predecessor of the supremum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
page_copy_rec_list_start(
/*=====================*/
@@ -986,12 +956,13 @@ page_copy_rec_list_start(
buf_block_t* block, /*!< in: index page containing rec */
rec_t* rec, /*!< in: record on page */
dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Deletes records from a page from a given record onward, including that record.
The infimum and supremum records are not deleted. */
-void
+dberr_t
page_delete_rec_list_end(
/*=====================*/
rec_t* rec, /*!< in: pointer to record on page */
@@ -1003,7 +974,7 @@ page_delete_rec_list_end(
records in the end of the chain to
delete, or ULINT_UNDEFINED if not known */
mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Deletes records from page, up to the given record, NOT including
that record. Infimum and supremum records are not deleted. */
@@ -1015,45 +986,6 @@ page_delete_rec_list_start(
dict_index_t* index, /*!< in: record descriptor */
mtr_t* mtr) /*!< in: mtr */
MY_ATTRIBUTE((nonnull));
-/*************************************************************//**
-Moves record list end to another page. Moved records include
-split_rec.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return TRUE on success; FALSE on compression failure (new_block will
-be decompressed) */
-ibool
-page_move_rec_list_end(
-/*===================*/
- buf_block_t* new_block, /*!< in/out: index page where to move */
- buf_block_t* block, /*!< in: index page from where to move */
- rec_t* split_rec, /*!< in: first record to move */
- dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull(1, 2, 4, 5)));
-/*************************************************************//**
-Moves record list start to another page. Moved records do not include
-split_rec.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return TRUE on success; FALSE on compression failure */
-ibool
-page_move_rec_list_start(
-/*=====================*/
- buf_block_t* new_block, /*!< in/out: index page where to move */
- buf_block_t* block, /*!< in/out: page containing split_rec */
- rec_t* split_rec, /*!< in: first record not to move */
- dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull(1, 2, 4, 5)));
/** Create an index page.
@param[in,out] block buffer block
@param[in] comp nonzero=compact page format */
@@ -1160,9 +1092,7 @@ page_find_rec_with_heap_no(
@param[in] page index tree leaf page
@return the last record, not delete-marked
@retval infimum record if all records are delete-marked */
-const rec_t*
-page_find_rec_max_not_deleted(
- const page_t* page);
+const rec_t *page_find_rec_max_not_deleted(const page_t *page);
#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/page0page.inl b/storage/innobase/include/page0page.inl
index 6514886dd67..6c0167edcf9 100644
--- a/storage/innobase/include/page0page.inl
+++ b/storage/innobase/include/page0page.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2021, MariaDB Corporation.
+Copyright (c) 2016, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,9 +24,6 @@ Index page routines
Created 2/2/1994 Heikki Tuuri
*******************************************************/
-#ifndef page0page_ic
-#define page0page_ic
-
#ifndef UNIV_INNOCHECKSUM
#include "rem0cmp.h"
#include "mtr0log.h"
@@ -87,7 +84,7 @@ page_set_ssn_id(
MTR_MEMO_PAGE_X_FIX));
ut_ad(!page_zip || page_zip == &block->page.zip);
constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM;
- byte *b= my_assume_aligned<2>(&block->frame[field]);
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, b, ssn_id) &&
UNIV_LIKELY_NULL(page_zip))
memcpy_aligned<2>(&page_zip->data[field], b, 8);
@@ -125,7 +122,7 @@ Reset PAGE_LAST_INSERT.
inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
{
constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT;
- byte *b= my_assume_aligned<2>(&block->frame[field]);
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, 0U) &&
UNIV_LIKELY_NULL(block->page.zip.data))
memset_aligned<2>(&block->page.zip.data[field], 0, 2);
@@ -196,22 +193,6 @@ page_rec_is_first(
}
/************************************************************//**
-true if the record is the second user record on a page.
-@return true if the second user record */
-UNIV_INLINE
-bool
-page_rec_is_second(
-/*===============*/
- const rec_t* rec, /*!< in: record */
- const page_t* page) /*!< in: page */
-{
- ut_ad(page_get_n_recs(page) > 1);
-
- return(page_rec_get_next_const(
- page_rec_get_next_const(page_get_infimum_rec(page))) == rec);
-}
-
-/************************************************************//**
true if the record is the last user record on a page.
@return true if the last user record */
UNIV_INLINE
@@ -227,57 +208,6 @@ page_rec_is_last(
}
/************************************************************//**
-true if distance between the records (measured in number of times we have to
-move to the next record) is at most the specified value */
-UNIV_INLINE
-bool
-page_rec_distance_is_at_most(
-/*=========================*/
- const rec_t* left_rec,
- const rec_t* right_rec,
- ulint val)
-{
- for (ulint i = 0; i <= val; i++) {
- if (left_rec == right_rec) {
- return (true);
- }
- left_rec = page_rec_get_next_const(left_rec);
- }
- return (false);
-}
-
-/************************************************************//**
-true if the record is the second last user record on a page.
-@return true if the second last user record */
-UNIV_INLINE
-bool
-page_rec_is_second_last(
-/*====================*/
- const rec_t* rec, /*!< in: record */
- const page_t* page) /*!< in: page */
-{
- ut_ad(page_get_n_recs(page) > 1);
- ut_ad(!page_rec_is_last(rec, page));
-
- return(page_rec_get_next_const(
- page_rec_get_next_const(rec)) == page_get_supremum_rec(page));
-}
-
-/************************************************************//**
-Returns the nth record of the record list.
-This is the inverse function of page_rec_get_n_recs_before().
-@return nth record */
-UNIV_INLINE
-rec_t*
-page_rec_get_nth(
-/*=============*/
- page_t* page, /*!< in: page */
- ulint nth) /*!< in: nth record */
-{
- return((rec_t*) page_rec_get_nth_const(page, nth));
-}
-
-/************************************************************//**
Returns the middle record of the records on the page. If there is an
even number of records in the list, returns the first record of the
upper half-list.
@@ -424,36 +354,19 @@ page_rec_get_next_low(
const rec_t* rec, /*!< in: pointer to record */
ulint comp) /*!< in: nonzero=compact page layout */
{
- ulint offs;
- const page_t* page;
-
- ut_ad(page_rec_check(rec));
-
- page = page_align(rec);
-
- offs = rec_get_next_offs(rec, comp);
-
- if (offs >= srv_page_size) {
- fprintf(stderr,
- "InnoDB: Next record offset is nonsensical %lu"
- " in record at offset %lu\n"
- "InnoDB: rec address %p, space id %lu, page %lu\n",
- (ulong) offs, (ulong) page_offset(rec),
- (void*) rec,
- (ulong) page_get_space_id(page),
- (ulong) page_get_page_no(page));
- ut_error;
- } else if (offs == 0) {
-
- return(NULL);
- }
-
- ut_ad(page_rec_is_infimum(rec)
- || (!page_is_leaf(page) && !page_has_prev(page))
- || !(rec_get_info_bits(page + offs, comp)
- & REC_INFO_MIN_REC_FLAG));
-
- return(page + offs);
+ const page_t *page= page_align(rec);
+ ut_ad(page_rec_check(rec));
+ ulint offs= rec_get_next_offs(rec, comp);
+ if (!offs)
+ return nullptr;
+ if (UNIV_UNLIKELY(offs < (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)))
+ return nullptr;
+ if (UNIV_UNLIKELY(offs > page_header_get_field(page, PAGE_HEAP_TOP)))
+ return nullptr;
+ ut_ad(page_rec_is_infimum(rec) ||
+ (!page_is_leaf(page) && !page_has_prev(page)) ||
+ !(rec_get_info_bits(page + offs, comp) & REC_INFO_MIN_REC_FLAG));
+ return page + offs;
}
/************************************************************//**
@@ -479,91 +392,6 @@ page_rec_get_next_const(
{
return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
}
-
-/************************************************************//**
-Gets the pointer to the next non delete-marked record on the page.
-If all subsequent records are delete-marked, then this function
-will return the supremum record.
-@return pointer to next non delete-marked record or pointer to supremum */
-UNIV_INLINE
-const rec_t*
-page_rec_get_next_non_del_marked(
-/*=============================*/
- const rec_t* rec) /*!< in: pointer to record */
-{
- const rec_t* r;
- ulint page_is_compact = page_rec_is_comp(rec);
-
- for (r = page_rec_get_next_const(rec);
- !page_rec_is_supremum(r)
- && rec_get_deleted_flag(r, page_is_compact);
- r = page_rec_get_next_const(r)) {
- /* noop */
- }
-
- return(r);
-}
-
-/************************************************************//**
-Gets the pointer to the previous record.
-@return pointer to previous record */
-UNIV_INLINE
-const rec_t*
-page_rec_get_prev_const(
-/*====================*/
- const rec_t* rec) /*!< in: pointer to record, must not be page
- infimum */
-{
- const page_dir_slot_t* slot;
- ulint slot_no;
- const rec_t* rec2;
- const rec_t* prev_rec = NULL;
- const page_t* page;
-
- ut_ad(page_rec_check(rec));
-
- page = page_align(rec);
-
- ut_ad(!page_rec_is_infimum(rec));
-
- slot_no = page_dir_find_owner_slot(rec);
-
- ut_a(slot_no != 0);
-
- slot = page_dir_get_nth_slot(page, slot_no - 1);
-
- rec2 = page_dir_slot_get_rec(slot);
-
- if (page_is_comp(page)) {
- while (rec != rec2) {
- prev_rec = rec2;
- rec2 = page_rec_get_next_low(rec2, TRUE);
- }
- } else {
- while (rec != rec2) {
- prev_rec = rec2;
- rec2 = page_rec_get_next_low(rec2, FALSE);
- }
- }
-
- ut_a(prev_rec);
-
- return(prev_rec);
-}
-
-/************************************************************//**
-Gets the pointer to the previous record.
-@return pointer to previous record */
-UNIV_INLINE
-rec_t*
-page_rec_get_prev(
-/*==============*/
- rec_t* rec) /*!< in: pointer to record, must not be page
- infimum */
-{
- return((rec_t*) page_rec_get_prev_const(rec));
-}
-
#endif /* UNIV_INNOCHECKSUM */
/************************************************************//**
@@ -720,5 +548,3 @@ page_get_instant(const page_t* page)
return static_cast<uint16_t>(i >> 3); /* i / 8 */
}
#endif /* !UNIV_INNOCHECKSUM */
-
-#endif
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
index 6c5a681f3b5..83fc45cdfc4 100644
--- a/storage/innobase/include/page0types.h
+++ b/storage/innobase/include/page0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,6 +30,7 @@ Created 2/2/1994 Heikki Tuuri
#include "dict0types.h"
#include "mtr0types.h"
#include "rem0types.h"
+#include "ut0new.h"
#include <map>
@@ -87,26 +88,52 @@ enum page_cur_mode_t {
PAGE_CUR_RTREE_GET_FATHER = 14
};
+class buf_pool_t;
+class buf_page_t;
+
/** Compressed page descriptor */
struct page_zip_des_t
{
page_zip_t* data; /*!< compressed page data */
-#ifdef UNIV_DEBUG
- unsigned m_start:16; /*!< start offset of modification log */
- bool m_external; /*!< Allocated externally, not from the
- buffer pool */
-#endif /* UNIV_DEBUG */
- unsigned m_end:16; /*!< end offset of modification log */
- unsigned m_nonempty:1; /*!< TRUE if the modification log
+ uint32_t m_end:16; /*!< end offset of modification log */
+ uint32_t m_nonempty:1; /*!< TRUE if the modification log
is not empty */
- unsigned n_blobs:12; /*!< number of externally stored
+ uint32_t n_blobs:12; /*!< number of externally stored
columns on the page; the maximum
is 744 on a 16 KiB page */
- unsigned ssize:PAGE_ZIP_SSIZE_BITS;
+ uint32_t ssize:PAGE_ZIP_SSIZE_BITS;
/*!< 0 or compressed page shift size;
the size in bytes is
(UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
+#ifdef UNIV_DEBUG
+ uint16_t m_start; /*!< start offset of modification log */
+ bool m_external; /*!< Allocated externally, not from the
+ buffer pool */
+#endif /* UNIV_DEBUG */
+
+ void clear() {
+ /* Clear everything except the member "fix". */
+ memset((void*) this, 0,
+ reinterpret_cast<char*>(&fix)
+ - reinterpret_cast<char*>(this));
+ }
+
+ page_zip_des_t() = default;
+ page_zip_des_t(const page_zip_des_t&) = default;
+
+ /* Initialize everything except the member "fix". */
+ page_zip_des_t(const page_zip_des_t& old, bool) {
+ memcpy((void*) this, (void*) &old,
+ reinterpret_cast<char*>(&fix)
+ - reinterpret_cast<char*>(this));
+ }
+
+private:
+ friend buf_pool_t;
+ friend buf_page_t;
+ /** fix count and state used in buf_page_t */
+ Atomic_relaxed<uint32_t> fix;
};
/** Compression statistics for a given page size */
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
index 5b98fdea004..4332990619e 100644
--- a/storage/innobase/include/page0zip.h
+++ b/storage/innobase/include/page0zip.h
@@ -2,7 +2,7 @@
Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -109,12 +109,7 @@ page_zip_is_too_big(
/**********************************************************************//**
Initialize a compressed page descriptor. */
-UNIV_INLINE
-void
-page_zip_des_init(
-/*==============*/
- page_zip_des_t* page_zip); /*!< in/out: compressed page
- descriptor */
+#define page_zip_des_init(page_zip) (page_zip)->clear()
/**********************************************************************//**
Configure the zlib allocator to use the given memory heap. */
@@ -332,9 +327,9 @@ IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
non-clustered index, the caller must update the insert buffer free
bits in the same mini-transaction in such a way that the modification
will be redo-logged.
-@retval true on success
-@retval false on failure; the block_zip will be left intact */
-bool
+@return error code
+@retval DB_FAIL on overflow; the block_zip will be left intact */
+dberr_t
page_zip_reorganize(
buf_block_t* block, /*!< in/out: page with compressed page;
on the compressed page, in: size;
@@ -344,7 +339,7 @@ page_zip_reorganize(
ulint z_level,/*!< in: compression level */
mtr_t* mtr, /*!< in: mini-transaction */
bool restore = false)/*!< whether to restore on failure */
- MY_ATTRIBUTE((nonnull));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/**********************************************************************//**
Copy the records of a page byte for byte. Do not copy the page header
@@ -361,15 +356,11 @@ page_zip_copy_recs(
#endif /* !UNIV_INNOCHECKSUM */
/** Calculate the compressed page checksum.
-@param[in] data compressed page
-@param[in] size size of compressed page
-@param[in] algo algorithm to use
+@param data compressed page
+@param size size of compressed page
+@param use_adler whether to use Adler32 instead of a XOR of 3 CRC-32C
@return page checksum */
-uint32_t
-page_zip_calc_checksum(
- const void* data,
- ulint size,
- srv_checksum_algorithm_t algo);
+uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler);
/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
@param data ROW_FORMAT=COMPRESSED page
diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl
index b0622ba79c3..afc877c3720 100644
--- a/storage/innobase/include/page0zip.inl
+++ b/storage/innobase/include/page0zip.inl
@@ -304,18 +304,6 @@ page_zip_available(
}
/**********************************************************************//**
-Initialize a compressed page descriptor. */
-UNIV_INLINE
-void
-page_zip_des_init(
-/*==============*/
- page_zip_des_t* page_zip) /*!< in/out: compressed page
- descriptor */
-{
- memset(page_zip, 0, sizeof *page_zip);
-}
-
-/**********************************************************************//**
Reset the counters used for filling
INFORMATION_SCHEMA.innodb_cmp_per_index. */
UNIV_INLINE
@@ -323,11 +311,7 @@ void
page_zip_reset_stat_per_index()
/*===========================*/
{
- mutex_enter(&page_zip_stat_per_index_mutex);
-
- page_zip_stat_per_index.erase(
- page_zip_stat_per_index.begin(),
- page_zip_stat_per_index.end());
-
- mutex_exit(&page_zip_stat_per_index_mutex);
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index.clear();
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
}
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
index 58d424abfdc..e7112d9996f 100644
--- a/storage/innobase/include/pars0grm.h
+++ b/storage/innobase/include/pars0grm.h
@@ -1,8 +1,8 @@
-/* A Bison parser, made by GNU Bison 3.4.2. */
+/* A Bison parser, made by GNU Bison 3.7.6. */
/* Bison interface for Yacc-like parsers in C
- Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2019 Free Software Foundation,
+ Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation,
Inc.
This program is free software: you can redistribute it and/or modify
@@ -16,7 +16,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>. */
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* As a special exception, you may create a larger work that contains
part or all of the Bison parser skeleton and distribute that work
@@ -31,8 +31,9 @@
This special exception was added by the Free Software Foundation in
version 2.2 of Bison. */
-/* Undocumented macros, especially those whose name start with YY_,
- are private implementation details. Do not rely on them. */
+/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual,
+ especially those whose name start with YY_ or yy_. They are
+ private implementation details that can be changed or removed. */
#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
# define YY_YY_PARS0GRM_TAB_H_INCLUDED
@@ -44,90 +45,95 @@
extern int yydebug;
#endif
-/* Token type. */
+/* Token kinds. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
enum yytokentype
{
- PARS_INT_LIT = 258,
- PARS_FLOAT_LIT = 259,
- PARS_STR_LIT = 260,
- PARS_NULL_LIT = 261,
- PARS_ID_TOKEN = 262,
- PARS_AND_TOKEN = 263,
- PARS_OR_TOKEN = 264,
- PARS_NOT_TOKEN = 265,
- PARS_GE_TOKEN = 266,
- PARS_LE_TOKEN = 267,
- PARS_NE_TOKEN = 268,
- PARS_PROCEDURE_TOKEN = 269,
- PARS_IN_TOKEN = 270,
- PARS_INT_TOKEN = 271,
- PARS_CHAR_TOKEN = 272,
- PARS_IS_TOKEN = 273,
- PARS_BEGIN_TOKEN = 274,
- PARS_END_TOKEN = 275,
- PARS_IF_TOKEN = 276,
- PARS_THEN_TOKEN = 277,
- PARS_ELSE_TOKEN = 278,
- PARS_ELSIF_TOKEN = 279,
- PARS_LOOP_TOKEN = 280,
- PARS_WHILE_TOKEN = 281,
- PARS_RETURN_TOKEN = 282,
- PARS_SELECT_TOKEN = 283,
- PARS_COUNT_TOKEN = 284,
- PARS_FROM_TOKEN = 285,
- PARS_WHERE_TOKEN = 286,
- PARS_FOR_TOKEN = 287,
- PARS_DDOT_TOKEN = 288,
- PARS_ORDER_TOKEN = 289,
- PARS_BY_TOKEN = 290,
- PARS_ASC_TOKEN = 291,
- PARS_DESC_TOKEN = 292,
- PARS_INSERT_TOKEN = 293,
- PARS_INTO_TOKEN = 294,
- PARS_VALUES_TOKEN = 295,
- PARS_UPDATE_TOKEN = 296,
- PARS_SET_TOKEN = 297,
- PARS_DELETE_TOKEN = 298,
- PARS_CURRENT_TOKEN = 299,
- PARS_OF_TOKEN = 300,
- PARS_CREATE_TOKEN = 301,
- PARS_TABLE_TOKEN = 302,
- PARS_INDEX_TOKEN = 303,
- PARS_UNIQUE_TOKEN = 304,
- PARS_CLUSTERED_TOKEN = 305,
- PARS_ON_TOKEN = 306,
- PARS_ASSIGN_TOKEN = 307,
- PARS_DECLARE_TOKEN = 308,
- PARS_CURSOR_TOKEN = 309,
- PARS_SQL_TOKEN = 310,
- PARS_OPEN_TOKEN = 311,
- PARS_FETCH_TOKEN = 312,
- PARS_CLOSE_TOKEN = 313,
- PARS_NOTFOUND_TOKEN = 314,
- PARS_TO_BINARY_TOKEN = 315,
- PARS_SUBSTR_TOKEN = 316,
- PARS_CONCAT_TOKEN = 317,
- PARS_INSTR_TOKEN = 318,
- PARS_LENGTH_TOKEN = 319,
- PARS_COMMIT_TOKEN = 320,
- PARS_ROLLBACK_TOKEN = 321,
- PARS_WORK_TOKEN = 322,
- PARS_EXIT_TOKEN = 323,
- PARS_FUNCTION_TOKEN = 324,
- PARS_LOCK_TOKEN = 325,
- PARS_SHARE_TOKEN = 326,
- PARS_MODE_TOKEN = 327,
- PARS_LIKE_TOKEN = 328,
- PARS_LIKE_TOKEN_EXACT = 329,
- PARS_LIKE_TOKEN_PREFIX = 330,
- PARS_LIKE_TOKEN_SUFFIX = 331,
- PARS_LIKE_TOKEN_SUBSTR = 332,
- PARS_TABLE_NAME_TOKEN = 333,
- PARS_BIGINT_TOKEN = 334,
- NEG = 335
+ YYEMPTY = -2,
+ YYEOF = 0, /* "end of file" */
+ YYerror = 256, /* error */
+ YYUNDEF = 257, /* "invalid token" */
+ PARS_INT_LIT = 258, /* PARS_INT_LIT */
+ PARS_FLOAT_LIT = 259, /* PARS_FLOAT_LIT */
+ PARS_STR_LIT = 260, /* PARS_STR_LIT */
+ PARS_NULL_LIT = 261, /* PARS_NULL_LIT */
+ PARS_ID_TOKEN = 262, /* PARS_ID_TOKEN */
+ PARS_AND_TOKEN = 263, /* PARS_AND_TOKEN */
+ PARS_OR_TOKEN = 264, /* PARS_OR_TOKEN */
+ PARS_NOT_TOKEN = 265, /* PARS_NOT_TOKEN */
+ PARS_GE_TOKEN = 266, /* PARS_GE_TOKEN */
+ PARS_LE_TOKEN = 267, /* PARS_LE_TOKEN */
+ PARS_NE_TOKEN = 268, /* PARS_NE_TOKEN */
+ PARS_PROCEDURE_TOKEN = 269, /* PARS_PROCEDURE_TOKEN */
+ PARS_IN_TOKEN = 270, /* PARS_IN_TOKEN */
+ PARS_INT_TOKEN = 271, /* PARS_INT_TOKEN */
+ PARS_CHAR_TOKEN = 272, /* PARS_CHAR_TOKEN */
+ PARS_IS_TOKEN = 273, /* PARS_IS_TOKEN */
+ PARS_BEGIN_TOKEN = 274, /* PARS_BEGIN_TOKEN */
+ PARS_END_TOKEN = 275, /* PARS_END_TOKEN */
+ PARS_IF_TOKEN = 276, /* PARS_IF_TOKEN */
+ PARS_THEN_TOKEN = 277, /* PARS_THEN_TOKEN */
+ PARS_ELSE_TOKEN = 278, /* PARS_ELSE_TOKEN */
+ PARS_ELSIF_TOKEN = 279, /* PARS_ELSIF_TOKEN */
+ PARS_LOOP_TOKEN = 280, /* PARS_LOOP_TOKEN */
+ PARS_WHILE_TOKEN = 281, /* PARS_WHILE_TOKEN */
+ PARS_RETURN_TOKEN = 282, /* PARS_RETURN_TOKEN */
+ PARS_SELECT_TOKEN = 283, /* PARS_SELECT_TOKEN */
+ PARS_COUNT_TOKEN = 284, /* PARS_COUNT_TOKEN */
+ PARS_FROM_TOKEN = 285, /* PARS_FROM_TOKEN */
+ PARS_WHERE_TOKEN = 286, /* PARS_WHERE_TOKEN */
+ PARS_FOR_TOKEN = 287, /* PARS_FOR_TOKEN */
+ PARS_DDOT_TOKEN = 288, /* PARS_DDOT_TOKEN */
+ PARS_ORDER_TOKEN = 289, /* PARS_ORDER_TOKEN */
+ PARS_BY_TOKEN = 290, /* PARS_BY_TOKEN */
+ PARS_ASC_TOKEN = 291, /* PARS_ASC_TOKEN */
+ PARS_DESC_TOKEN = 292, /* PARS_DESC_TOKEN */
+ PARS_INSERT_TOKEN = 293, /* PARS_INSERT_TOKEN */
+ PARS_INTO_TOKEN = 294, /* PARS_INTO_TOKEN */
+ PARS_VALUES_TOKEN = 295, /* PARS_VALUES_TOKEN */
+ PARS_UPDATE_TOKEN = 296, /* PARS_UPDATE_TOKEN */
+ PARS_SET_TOKEN = 297, /* PARS_SET_TOKEN */
+ PARS_DELETE_TOKEN = 298, /* PARS_DELETE_TOKEN */
+ PARS_CURRENT_TOKEN = 299, /* PARS_CURRENT_TOKEN */
+ PARS_OF_TOKEN = 300, /* PARS_OF_TOKEN */
+ PARS_CREATE_TOKEN = 301, /* PARS_CREATE_TOKEN */
+ PARS_TABLE_TOKEN = 302, /* PARS_TABLE_TOKEN */
+ PARS_INDEX_TOKEN = 303, /* PARS_INDEX_TOKEN */
+ PARS_UNIQUE_TOKEN = 304, /* PARS_UNIQUE_TOKEN */
+ PARS_CLUSTERED_TOKEN = 305, /* PARS_CLUSTERED_TOKEN */
+ PARS_ON_TOKEN = 306, /* PARS_ON_TOKEN */
+ PARS_ASSIGN_TOKEN = 307, /* PARS_ASSIGN_TOKEN */
+ PARS_DECLARE_TOKEN = 308, /* PARS_DECLARE_TOKEN */
+ PARS_CURSOR_TOKEN = 309, /* PARS_CURSOR_TOKEN */
+ PARS_SQL_TOKEN = 310, /* PARS_SQL_TOKEN */
+ PARS_OPEN_TOKEN = 311, /* PARS_OPEN_TOKEN */
+ PARS_FETCH_TOKEN = 312, /* PARS_FETCH_TOKEN */
+ PARS_CLOSE_TOKEN = 313, /* PARS_CLOSE_TOKEN */
+ PARS_NOTFOUND_TOKEN = 314, /* PARS_NOTFOUND_TOKEN */
+ PARS_TO_BINARY_TOKEN = 315, /* PARS_TO_BINARY_TOKEN */
+ PARS_SUBSTR_TOKEN = 316, /* PARS_SUBSTR_TOKEN */
+ PARS_CONCAT_TOKEN = 317, /* PARS_CONCAT_TOKEN */
+ PARS_INSTR_TOKEN = 318, /* PARS_INSTR_TOKEN */
+ PARS_LENGTH_TOKEN = 319, /* PARS_LENGTH_TOKEN */
+ PARS_COMMIT_TOKEN = 320, /* PARS_COMMIT_TOKEN */
+ PARS_ROLLBACK_TOKEN = 321, /* PARS_ROLLBACK_TOKEN */
+ PARS_WORK_TOKEN = 322, /* PARS_WORK_TOKEN */
+ PARS_EXIT_TOKEN = 323, /* PARS_EXIT_TOKEN */
+ PARS_FUNCTION_TOKEN = 324, /* PARS_FUNCTION_TOKEN */
+ PARS_LOCK_TOKEN = 325, /* PARS_LOCK_TOKEN */
+ PARS_SHARE_TOKEN = 326, /* PARS_SHARE_TOKEN */
+ PARS_MODE_TOKEN = 327, /* PARS_MODE_TOKEN */
+ PARS_LIKE_TOKEN = 328, /* PARS_LIKE_TOKEN */
+ PARS_LIKE_TOKEN_EXACT = 329, /* PARS_LIKE_TOKEN_EXACT */
+ PARS_LIKE_TOKEN_PREFIX = 330, /* PARS_LIKE_TOKEN_PREFIX */
+ PARS_LIKE_TOKEN_SUFFIX = 331, /* PARS_LIKE_TOKEN_SUFFIX */
+ PARS_LIKE_TOKEN_SUBSTR = 332, /* PARS_LIKE_TOKEN_SUBSTR */
+ PARS_TABLE_NAME_TOKEN = 333, /* PARS_TABLE_NAME_TOKEN */
+ PARS_BIGINT_TOKEN = 334, /* PARS_BIGINT_TOKEN */
+ NEG = 335 /* NEG */
};
+ typedef enum yytokentype yytoken_kind_t;
#endif
/* Value type. */
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
index 4c588dca061..16823ce1461 100644
--- a/storage/innobase/include/pars0pars.h
+++ b/storage/innobase/include/pars0pars.h
@@ -367,19 +367,8 @@ pars_procedure_definition(
table */
que_node_t* stat_list); /*!< in: statement list */
-/*************************************************************//**
-Parses a stored procedure call, when this is not within another stored
-procedure, that is, the client issues a procedure call directly.
-In MySQL/InnoDB, stored InnoDB procedures are invoked via the
-parsed procedure tree, not via InnoDB SQL, so this function is not used.
-@return query graph */
-que_fork_t*
-pars_stored_procedure_call(
-/*=======================*/
- sym_node_t* sym_node); /*!< in: stored procedure name */
/** Completes a query graph by adding query thread and fork nodes
-above it and prepares the graph for running. The fork created is of
-type QUE_FORK_MYSQL_INTERFACE.
+above it and prepares the graph for running.
@param[in] node root node for an incomplete query
graph, or NULL for dummy graph
@param[in] trx transaction handle
@@ -402,13 +391,6 @@ pars_info_create(void);
/*==================*/
/****************************************************************//**
-Free info struct and everything it contains. */
-void
-pars_info_free(
-/*===========*/
- pars_info_t* info); /*!< in, own: info struct */
-
-/****************************************************************//**
Add bound literal. */
void
pars_info_add_literal(
@@ -570,11 +552,10 @@ struct pars_info_t {
(pars_bound_lit_t*) */
ib_vector_t* bound_ids; /*!< bound ids, or NULL
(pars_bound_id_t*) */
-
- ibool graph_owns_us; /*!< if TRUE (which is the default),
- que_graph_free() will free us */
};
+inline void pars_info_free(pars_info_t *info) { mem_heap_free(info->heap); }
+
/** User-supplied function and argument. */
struct pars_user_func_t {
const char* name; /*!< function name */
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
index 962bd359f0b..c60f390a092 100644
--- a/storage/innobase/include/que0que.h
+++ b/storage/innobase/include/que0que.h
@@ -38,15 +38,7 @@ Created 5/27/1996 Heikki Tuuri
/***********************************************************************//**
Creates a query graph fork node.
@return own: fork node */
-que_fork_t*
-que_fork_create(
-/*============*/
- que_t* graph, /*!< in: graph, if NULL then this
- fork node is assumed to be the
- graph root */
- que_node_t* parent, /*!< in: parent node */
- ulint fork_type, /*!< in: fork type */
- mem_heap_t* heap); /*!< in: memory heap where created */
+que_fork_t *que_fork_create(mem_heap_t* heap);
/***********************************************************************//**
Gets the first thr in a fork. */
UNIV_INLINE
@@ -96,43 +88,14 @@ que_graph_free(
to this graph: if not, then use
que_graph_free_recursive and free the heap
afterwards! */
-/**********************************************************************//**
-Stops a query thread if graph or trx is in a state requiring it. The
-conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex
-has to be reserved.
-@return TRUE if stopped */
-ibool
-que_thr_stop(
-/*=========*/
- que_thr_t* thr); /*!< in: query thread */
/**********************************************************************//**
-A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
-query thread is stopped and made inactive, except in the case where
-it was put to the lock wait state in lock0lock.cc, but the lock has already
-been granted or the transaction chosen as a victim in deadlock resolution. */
-void
-que_thr_stop_for_mysql(
-/*===================*/
- que_thr_t* thr); /*!< in: query thread */
-/**********************************************************************//**
Run a query thread. Handles lock waits. */
void
que_run_threads(
/*============*/
que_thr_t* thr); /*!< in: query thread */
/**********************************************************************//**
-Moves a suspended query thread to the QUE_THR_RUNNING state and release
-a worker thread to execute it. This function should be used to end
-the wait state of a query thread waiting for a lock or a stored procedure
-completion.
-@return query thread instance of thread to wakeup or NULL */
-que_thr_t*
-que_thr_end_lock_wait(
-/*==================*/
- trx_t* trx); /*!< in: transaction in the
- QUE_THR_LOCK_WAIT state */
-/**********************************************************************//**
Starts execution of a command in a query fork. Picks a query thread which
is not in the QUE_THR_RUNNING state and moves it to that state. If none
can be chosen, a situation which may arise in parallelized fetches, NULL
@@ -236,31 +199,6 @@ ulint
que_node_list_get_len(
/*==================*/
que_node_t* node_list); /*!< in: node list, or NULL */
-/**********************************************************************//**
-Checks if graph, trx, or session is in a state where the query thread should
-be stopped.
-@return TRUE if should be stopped; NOTE that if the peek is made
-without reserving the trx_t::mutex, then another peek with the mutex
-reserved is necessary before deciding the actual stopping */
-UNIV_INLINE
-ibool
-que_thr_peek_stop(
-/*==============*/
- que_thr_t* thr); /*!< in: query thread */
-/***********************************************************************//**
-Returns TRUE if the query graph is for a SELECT statement.
-@return TRUE if a select */
-UNIV_INLINE
-ibool
-que_graph_is_select(
-/*================*/
- que_t* graph); /*!< in: graph */
-/**********************************************************************//**
-Prints info of an SQL query graph node. */
-void
-que_node_print_info(
-/*================*/
- que_node_t* node); /*!< in: query graph node */
/*********************************************************************//**
Evaluate the given SQL
@return error code or DB_SUCCESS */
@@ -269,9 +207,6 @@ que_eval_sql(
/*=========*/
pars_info_t* info, /*!< in: info struct, or NULL */
const char* sql, /*!< in: SQL string */
- bool reserve_dict_mutex,
- /*!< in: whether to acquire/release
- dict_sys.mutex around call to pars_sql. */
trx_t* trx); /*!< in: trx */
/**********************************************************************//**
@@ -287,14 +222,11 @@ que_fork_scheduler_round_robin(
/** Query thread states */
enum que_thr_state_t {
- QUE_THR_RUNNING,
/** in selects this means that the thread is at the end of its
result set (or start, in case of a scroll cursor); in other
statements, this means the thread has done its task */
QUE_THR_COMPLETED,
- QUE_THR_COMMAND_WAIT,
- QUE_THR_LOCK_WAIT,
- QUE_THR_SUSPENDED
+ QUE_THR_RUNNING
};
/** Query thread lock states */
@@ -312,7 +244,6 @@ struct que_thr_t{
que_node_t* child; /*!< graph child node */
que_t* graph; /*!< graph where this node belongs */
que_thr_state_t state; /*!< state of the query thread */
- bool is_active; /*!< whether the thread is active */
/*------------------------------*/
/* The following fields are private to the OS thread executing the
query thread, and are not protected by any mutex: */
@@ -326,9 +257,6 @@ struct que_thr_t{
thus far */
ulint lock_state; /*!< lock state of thread (table or
row) */
- struct srv_slot_t*
- slot; /* The thread slot in the wait
- array in srv_sys_t */
/*------------------------------*/
/* The following fields are links for the various lists that
this type can be on. */
@@ -343,40 +271,12 @@ struct que_thr_t{
related delete/updates */
row_prebuilt_t* prebuilt; /*!< prebuilt structure processed by
the query thread */
-
-#ifdef UNIV_DEBUG
- /** Change the 'active' status */
- inline void set_active(bool active);
-#endif
- /** Transition to the QUE_THR_RUNNING state. */
- inline void start_running()
- {
- ut_d(if (!is_active) set_active(true));
- is_active= true;
- state= QUE_THR_RUNNING;
- }
-
- /** Stop query execution when there is no error or lock wait. */
- void stop_no_error()
- {
- ut_ad(is_active);
- ut_d(set_active(false));
- state= QUE_THR_COMPLETED;
- is_active= false;
- }
};
/* Query graph fork node: its fields are protected by the query thread mutex */
struct que_fork_t{
que_common_t common; /*!< type: QUE_NODE_FORK */
que_t* graph; /*!< query graph of this node */
- ulint fork_type; /*!< fork type */
-#ifdef UNIV_DEBUG
- /** For the query graph root, updated in set_active() */
- ulint n_active_thrs;
- /** Change the 'active' status */
- void set_active(bool active);
-#endif
trx_t* trx; /*!< transaction: this is set only in
the root node */
ulint state; /*!< state of the fork node */
@@ -402,30 +302,9 @@ struct que_fork_t{
};
-#ifdef UNIV_DEBUG
-inline void que_thr_t::set_active(bool active) { graph->set_active(active); };
-#endif
-
-/* Query fork (or graph) types */
-#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */
-#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */
-#define QUE_FORK_INSERT 3
-#define QUE_FORK_UPDATE 4
-#define QUE_FORK_ROLLBACK 5
- /* This is really the undo graph used in rollback,
- no signal-sending roll_node in this graph */
-#define QUE_FORK_PURGE 6
-#define QUE_FORK_EXECUTE 7
-#define QUE_FORK_PROCEDURE 8
-#define QUE_FORK_PROCEDURE_CALL 9
-#define QUE_FORK_MYSQL_INTERFACE 10
-#define QUE_FORK_RECOVERY 11
-
/* Query fork (or graph) states */
#define QUE_FORK_ACTIVE 1
#define QUE_FORK_COMMAND_WAIT 2
-#define QUE_FORK_INVALID 3
-#define QUE_FORK_BEING_FREED 4
/* Flag which is ORed to control structure statement node types */
#define QUE_NODE_CONTROL_STAT 1024
diff --git a/storage/innobase/include/que0que.inl b/storage/innobase/include/que0que.inl
index 1c3ac242bf2..e21cbad3815 100644
--- a/storage/innobase/include/que0que.inl
+++ b/storage/innobase/include/que0que.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2020, MariaDB Corporation.
+Copyright (c) 2020, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -243,51 +243,3 @@ que_node_get_parent(
{
return(((que_common_t*) node)->parent);
}
-
-/**********************************************************************//**
-Checks if graph, trx, or session is in a state where the query thread should
-be stopped.
-@return TRUE if should be stopped; NOTE that if the peek is made
-without reserving the trx mutex, then another peek with the mutex
-reserved is necessary before deciding the actual stopping */
-UNIV_INLINE
-ibool
-que_thr_peek_stop(
-/*==============*/
- que_thr_t* thr) /*!< in: query thread */
-{
- trx_t* trx;
- que_t* graph;
-
- graph = thr->graph;
- trx = graph->trx;
-
- if (graph->state != QUE_FORK_ACTIVE
- || trx->lock.que_state == TRX_QUE_LOCK_WAIT
- || (trx->lock.que_state != TRX_QUE_ROLLING_BACK
- && trx->lock.que_state != TRX_QUE_RUNNING)) {
-
- return(TRUE);
- }
-
- return(FALSE);
-}
-
-/***********************************************************************//**
-Returns TRUE if the query graph is for a SELECT statement.
-@return TRUE if a select */
-UNIV_INLINE
-ibool
-que_graph_is_select(
-/*================*/
- que_t* graph) /*!< in: graph */
-{
- if (graph->fork_type == QUE_FORK_SELECT_SCROLL
- || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) {
-
- return(TRUE);
- }
-
- return(FALSE);
-}
-
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
index 21143ab609d..e002f1b77e1 100644
--- a/storage/innobase/include/read0types.h
+++ b/storage/innobase/include/read0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,14 +24,13 @@ Cursor read
Created 2/16/1997 Heikki Tuuri
*******************************************************/
-#ifndef read0types_h
-#define read0types_h
+#pragma once
#include "dict0mem.h"
#include "trx0types.h"
+#include "srw_lock.h"
#include <algorithm>
-
/**
Read view lists the trx ids of those transactions for which a consistent read
should not see the modifications to the database.
@@ -42,7 +41,7 @@ class ReadViewBase
The read should not see any transaction with trx id >= this value.
In other words, this is the "high water mark".
*/
- trx_id_t m_low_limit_id;
+ trx_id_t m_low_limit_id= 0;
/**
The read should see all trx ids which are strictly
@@ -68,9 +67,6 @@ protected:
trx_id_t up_limit_id() const { return m_up_limit_id; }
public:
- ReadViewBase(): m_low_limit_id(0) {}
-
-
/**
Append state from another view.
@@ -126,39 +122,20 @@ loop:
/**
- Check whether transaction id is valid.
- @param[in] id transaction id to check
- @param[in] name table name
-
- @todo changes_visible() was an unfortunate choice for this check.
- It should be moved towards the functions that load trx id like
- trx_read_trx_id(). No need to issue a warning, error log message should
- be enough. Although statement should ideally fail if it sees corrupt
- data.
- */
- static void check_trx_id_sanity(trx_id_t id, const table_name_t &name);
-
-
- /**
Check whether the changes by id are visible.
@param[in] id transaction id to check against the view
- @param[in] name table name
@return whether the view sees the modifications of id.
*/
- bool changes_visible(trx_id_t id, const table_name_t &name) const
+ bool changes_visible(trx_id_t id) const
MY_ATTRIBUTE((warn_unused_result))
{
if (id >= m_low_limit_id)
- {
- check_trx_id_sanity(id, name);
return false;
- }
return id < m_up_limit_id ||
m_ids.empty() ||
!std::binary_search(m_ids.begin(), m_ids.end(), id);
}
-
/**
@param id transaction to check
@return true if view sees transaction id
@@ -170,6 +147,13 @@ loop:
/** @return the low limit id */
trx_id_t low_limit_id() const { return m_low_limit_id; }
+
+ /** Clamp the low limit id for purge_sys.end_view */
+ void clamp_low_limit_id(trx_id_t limit)
+ {
+ if (m_low_limit_id > limit)
+ m_low_limit_id= limit;
+ }
};
@@ -190,7 +174,7 @@ class ReadView: public ReadViewBase
std::atomic<bool> m_open;
/** For synchronisation with purge coordinator. */
- mutable ib_mutex_t m_mutex;
+ mutable srw_mutex m_mutex;
/**
trx id of creating transaction.
@@ -199,8 +183,12 @@ class ReadView: public ReadViewBase
trx_id_t m_creator_trx_id;
public:
- ReadView(): m_open(false) { mutex_create(LATCH_ID_READ_VIEW, &m_mutex); }
- ~ReadView() { mutex_free(&m_mutex); }
+ ReadView()
+ {
+ memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+ m_mutex.init();
+ }
+ ~ReadView() { m_mutex.destroy(); }
/**
@@ -236,7 +224,6 @@ public:
*/
void set_creator_trx_id(trx_id_t id)
{
- ut_ad(id > 0);
ut_ad(m_creator_trx_id == 0);
m_creator_trx_id= id;
}
@@ -248,12 +235,12 @@ public:
*/
void print_limits(FILE *file) const
{
- mutex_enter(&m_mutex);
+ m_mutex.wr_lock();
if (is_open())
fprintf(file, "Trx read view will not see trx with"
" id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n",
low_limit_id(), up_limit_id());
- mutex_exit(&m_mutex);
+ m_mutex.wr_unlock();
}
@@ -261,9 +248,8 @@ public:
A wrapper around ReadViewBase::changes_visible().
Intended to be called by the ReadView owner thread.
*/
- bool changes_visible(trx_id_t id, const table_name_t &name) const
- { return id == m_creator_trx_id || ReadViewBase::changes_visible(id, name); }
-
+ bool changes_visible(trx_id_t id) const
+ { return id == m_creator_trx_id || ReadViewBase::changes_visible(id); }
/**
A wrapper around ReadViewBase::append().
@@ -271,23 +257,19 @@ public:
*/
void append_to(ReadViewBase *to) const
{
- mutex_enter(&m_mutex);
+ m_mutex.wr_lock();
if (is_open())
to->append(*this);
- mutex_exit(&m_mutex);
+ m_mutex.wr_unlock();
}
-
/**
Declare the object mostly unaccessible.
- innodb_monitor_set_option is operating also on freed transaction objects.
*/
void mem_noaccess() const
{
MEM_NOACCESS(&m_open, sizeof m_open);
- /* m_mutex is accessed by innodb_show_mutex_status()
- and innodb_monitor_update() even after trx_t::free() */
+ /* m_mutex is accessed via trx_sys.rw_trx_hash */
MEM_NOACCESS(&m_creator_trx_id, sizeof m_creator_trx_id);
}
};
-#endif
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
index a179c313235..2f038ab349f 100644
--- a/storage/innobase/include/rem0rec.h
+++ b/storage/innobase/include/rem0rec.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -141,28 +141,7 @@ constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
/** Default value flag in offsets returned by rec_get_offsets() */
constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
-/******************************************************//**
-The following function is used to get the pointer of the next chained record
-on the same page.
-@return pointer to the next chained record, or NULL if none */
-UNIV_INLINE
-const rec_t*
-rec_get_next_ptr_const(
-/*===================*/
- const rec_t* rec, /*!< in: physical record */
- ulint comp) /*!< in: nonzero=compact page format */
- MY_ATTRIBUTE((warn_unused_result));
-/******************************************************//**
-The following function is used to get the pointer of the next chained record
-on the same page.
-@return pointer to the next chained record, or NULL if none */
-UNIV_INLINE
-rec_t*
-rec_get_next_ptr(
-/*=============*/
- rec_t* rec, /*!< in: physical record */
- ulint comp) /*!< in: nonzero=compact page format */
- MY_ATTRIBUTE((warn_unused_result));
+
/******************************************************//**
The following function is used to get the offset of the
next chained record on the same page.
@@ -727,11 +706,9 @@ in the clustered index for instant ADD COLUMN or ALTER TABLE.
@param[in] rec leaf page record
@param[in] index index of the record
@return whether the record is the metadata pseudo-record */
-inline bool rec_is_metadata(const rec_t* rec, const dict_index_t& index)
+inline bool rec_is_metadata(const rec_t *rec, const dict_index_t &index)
{
- bool is = rec_is_metadata(rec, dict_table_is_comp(index.table));
- ut_ad(!is || index.is_instant());
- return is;
+ return rec_is_metadata(rec, index.table->not_redundant());
}
/** Determine if the record is the metadata pseudo-record
diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl
index 30c72a7415a..46c209cbdec 100644
--- a/storage/innobase/include/rem0rec.inl
+++ b/storage/innobase/include/rem0rec.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -204,76 +204,6 @@ rec_set_bit_field_2(
}
/******************************************************//**
-The following function is used to get the pointer of the next chained record
-on the same page.
-@return pointer to the next chained record, or NULL if none */
-UNIV_INLINE
-const rec_t*
-rec_get_next_ptr_const(
-/*===================*/
- const rec_t* rec, /*!< in: physical record */
- ulint comp) /*!< in: nonzero=compact page format */
-{
- ulint field_value;
-
- compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
- compile_time_assert(REC_NEXT_SHIFT == 0);
-
- field_value = mach_read_from_2(rec - REC_NEXT);
-
- if (field_value == 0) {
-
- return(NULL);
- }
-
- if (comp) {
-#if UNIV_PAGE_SIZE_MAX <= 32768
- /* Note that for 64 KiB pages, field_value can 'wrap around'
- and the debug assertion is not valid */
-
- /* In the following assertion, field_value is interpreted
- as signed 16-bit integer in 2's complement arithmetics.
- If all platforms defined int16_t in the standard headers,
- the expression could be written simpler as
- (int16_t) field_value + ut_align_offset(...) < srv_page_size
- */
- ut_ad((field_value >= 32768
- ? field_value - 65536
- : field_value)
- + ut_align_offset(rec, srv_page_size)
- < srv_page_size);
-#endif
- /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
- between each record. */
- ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
- && field_value < 32768)
- || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
-
- return((byte*) ut_align_down(rec, srv_page_size)
- + ut_align_offset(rec + field_value, srv_page_size));
- } else {
- ut_ad(field_value < srv_page_size);
-
- return((byte*) ut_align_down(rec, srv_page_size)
- + field_value);
- }
-}
-
-/******************************************************//**
-The following function is used to get the pointer of the next chained record
-on the same page.
-@return pointer to the next chained record, or NULL if none */
-UNIV_INLINE
-rec_t*
-rec_get_next_ptr(
-/*=============*/
- rec_t* rec, /*!< in: physical record */
- ulint comp) /*!< in: nonzero=compact page format */
-{
- return(const_cast<rec_t*>(rec_get_next_ptr_const(rec, comp)));
-}
-
-/******************************************************//**
The following function is used to get the offset of the next chained record
on the same page.
@return the page offset of the next chained record, or 0 if none */
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
index 99c85601d5d..65508caf751 100644
--- a/storage/innobase/include/row0ftsort.h
+++ b/storage/innobase/include/row0ftsort.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2019, MariaDB Corporation.
+Copyright (c) 2015, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -65,7 +65,7 @@ struct fts_psort_common_t {
ulint old_zip_size;
trx_t* trx; /*!< transaction */
fts_psort_t* all_info; /*!< all parallel sort info */
- os_event_t sort_event; /*!< sort event */
+ pthread_cond_t sort_cond; /*!< sort completion */
ibool opt_doc_id_size;/*!< whether to use 4 bytes
instead of 8 bytes integer to
store Doc ID during sort, if
@@ -90,7 +90,7 @@ struct fts_psort_t {
tpool::waitable_task* task; /*!< threadpool task */
dberr_t error; /*!< db error during psort */
ulint memory_used; /*!< memory used by fts_doc_list */
- ib_mutex_t mutex; /*!< mutex for fts_doc_list */
+ mysql_mutex_t mutex; /*!< mutex for fts_doc_list */
};
/** Row fts token for plugin parser */
@@ -152,7 +152,6 @@ typedef struct fts_psort_insert fts_psort_insert_t;
#define FTS_PARENT_COMPLETE 1
#define FTS_PARENT_EXITING 2
#define FTS_CHILD_COMPLETE 1
-#define FTS_CHILD_EXITING 2
/** Print some debug information */
#define FTSORT_PRINT
diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h
index 75db0ad04b2..ac2479c4863 100644
--- a/storage/innobase/include/row0ins.h
+++ b/storage/innobase/include/row0ins.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -78,7 +78,7 @@ dberr_t
row_ins_clust_index_entry_low(
/*==========================*/
ulint flags, /*!< in: undo logging and locking flags */
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
depending on whether we wish optimistic or
pessimistic descent down the index tree */
dict_index_t* index, /*!< in: clustered index */
@@ -94,13 +94,13 @@ same fields is found, the other record is necessarily marked deleted.
It is then unmarked. Otherwise, the entry is just inserted to the index.
@retval DB_SUCCESS on success
@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
-@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@retval DB_FAIL if retry with BTR_INSERT_TREE is needed
@return error code */
dberr_t
row_ins_sec_index_entry_low(
/*========================*/
ulint flags, /*!< in: undo logging and locking flags */
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE,
depending on whether we wish optimistic or
pessimistic descent down the index tree */
dict_index_t* index, /*!< in: secondary index */
diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h
index 978a3f906c0..469f1f8a356 100644
--- a/storage/innobase/include/row0log.h
+++ b/storage/innobase/include/row0log.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,15 +24,15 @@ Modification log for online index creation and online table rebuild
Created 2011-05-26 Marko Makela
*******************************************************/
-#ifndef row0log_h
-#define row0log_h
+#pragma once
#include "que0types.h"
#include "mtr0types.h"
#include "row0types.h"
#include "rem0types.h"
-#include "data0types.h"
+#include "dict0dict.h"
#include "trx0types.h"
+#include "trx0undo.h"
class ut_stage_alter_t;
@@ -74,37 +74,23 @@ row_log_free(
/******************************************************//**
Free the row log for an index on which online creation was aborted. */
-UNIV_INLINE
-void
-row_log_abort_sec(
-/*==============*/
- dict_index_t* index) /*!< in/out: index (x-latched) */
- MY_ATTRIBUTE((nonnull));
-
-/******************************************************//**
-Try to log an operation to a secondary index that is
-(or was) being created.
-@retval true if the operation was logged or can be ignored
-@retval false if online index creation is not taking place */
-UNIV_INLINE
-bool
-row_log_online_op_try(
-/*==================*/
- dict_index_t* index, /*!< in/out: index, S or X latched */
- const dtuple_t* tuple, /*!< in: index tuple */
- trx_id_t trx_id) /*!< in: transaction ID for insert,
- or 0 for delete */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-/******************************************************//**
-Logs an operation to a secondary index that is (or was) being created. */
-void
-row_log_online_op(
-/*==============*/
- dict_index_t* index, /*!< in/out: index, S or X latched */
- const dtuple_t* tuple, /*!< in: index tuple */
- trx_id_t trx_id) /*!< in: transaction ID for insert,
- or 0 for delete */
- ATTRIBUTE_COLD __attribute__((nonnull));
+inline void row_log_abort_sec(dict_index_t *index)
+{
+ ut_ad(index->lock.have_u_or_x());
+ ut_ad(!index->is_clust());
+ dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+ row_log_free(index->online_log);
+ index->online_log= nullptr;
+}
+
+/** Logs an operation to a secondary index that is (or was) being created.
+@param index index, S or X latched
+@param tuple index tuple
+@param trx_id transaction ID for insert, or 0 for delete
+@retval false if row_log_apply() failure happens
+@retval true otherwise */
+bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple,
+ trx_id_t trx_id) ATTRIBUTE_COLD;
/******************************************************//**
Gets the error status of the online index rebuild log.
@@ -185,22 +171,6 @@ row_log_table_insert(
dict_index_t* index, /*!< in/out: clustered index, S-latched
or X-latched */
const rec_offs* offsets);/*!< in: rec_get_offsets(rec,index) */
-/******************************************************//**
-Notes that a BLOB is being freed during online ALTER TABLE. */
-void
-row_log_table_blob_free(
-/*====================*/
- dict_index_t* index, /*!< in/out: clustered index, X-latched */
- ulint page_no)/*!< in: starting page number of the BLOB */
- ATTRIBUTE_COLD __attribute__((nonnull));
-/******************************************************//**
-Notes that a BLOB is being allocated during online ALTER TABLE. */
-void
-row_log_table_blob_alloc(
-/*=====================*/
- dict_index_t* index, /*!< in/out: clustered index, X-latched */
- ulint page_no)/*!< in: starting page number of the BLOB */
- ATTRIBUTE_COLD __attribute__((nonnull));
/** Apply the row_log_table log to a table upon completing rebuild.
@param[in] thr query graph
@@ -252,6 +222,11 @@ row_log_apply(
@return number of n_core_fields */
unsigned row_log_get_n_core_fields(const dict_index_t *index);
+/** Get the error code of online log for the index
+@param index online index
+@return error code present in online log */
+dberr_t row_log_get_error(const dict_index_t *index);
+
#ifdef HAVE_PSI_STAGE_INTERFACE
/** Estimate how much work is to be done by the log apply phase
of an ALTER TABLE for this index.
@@ -262,7 +237,3 @@ ulint
row_log_estimate_work(
const dict_index_t* index);
#endif /* HAVE_PSI_STAGE_INTERFACE */
-
-#include "row0log.inl"
-
-#endif /* row0log.h */
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
index 1d7f9bb145b..52096d48313 100644
--- a/storage/innobase/include/row0merge.h
+++ b/storage/innobase/include/row0merge.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -145,28 +145,6 @@ row_merge_dup_report(
const dfield_t* entry) /*!< in: duplicate index entry */
MY_ATTRIBUTE((nonnull));
-/*********************************************************************//**
-Sets an exclusive lock on a table, for the duration of creating indexes.
-@return error code or DB_SUCCESS */
-dberr_t
-row_merge_lock_table(
-/*=================*/
- trx_t* trx, /*!< in/out: transaction */
- dict_table_t* table, /*!< in: table to lock */
- enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
- MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
-
-/*********************************************************************//**
-Drop indexes that were created before an error occurred.
-The data dictionary must have been locked exclusively by the caller,
-because the transaction will not be committed. */
-void
-row_merge_drop_indexes_dict(
-/*========================*/
- trx_t* trx, /*!< in/out: dictionary transaction */
- table_id_t table_id)/*!< in: table identifier */
- MY_ATTRIBUTE((nonnull));
-
/** Drop indexes that were created before an error occurred.
The data dictionary must have been locked exclusively by the caller,
because the transaction will not be committed.
@@ -182,11 +160,9 @@ row_merge_drop_indexes(
bool locked,
const trx_t* alter_trx=NULL);
-/*********************************************************************//**
-Drop all partially created indexes during crash recovery. */
-void
-row_merge_drop_temp_indexes(void);
-/*=============================*/
+/** During recovery, drop recovered index stubs that were created in
+prepare_inplace_alter_table_dict(). */
+void row_merge_drop_temp_indexes();
/** Create temporary merge files in the given paramater path, and if
UNIV_PFS_IO defined, register the file descriptor with Performance Schema.
@@ -217,19 +193,6 @@ row_merge_rename_index_to_add(
index_id_t index_id) /*!< in: index identifier */
MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-/*********************************************************************//**
-Rename an index in the dictionary that is to be dropped. The data
-dictionary must have been locked exclusively by the caller, because
-the transaction will not be committed.
-@return DB_SUCCESS if all OK */
-dberr_t
-row_merge_rename_index_to_drop(
-/*===========================*/
- trx_t* trx, /*!< in/out: transaction */
- table_id_t table_id, /*!< in: table identifier */
- index_id_t index_id) /*!< in: index identifier */
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-
/** Create the index and load in to the dictionary.
@param[in,out] table the index is on this table
@param[in] index_def the index definition
@@ -253,18 +216,10 @@ row_merge_is_index_usable(
const dict_index_t* index) /*!< in: index to check */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*********************************************************************//**
-Drop a table. The caller must have ensured that the background stats
-thread is not processing the table. This can be done by calling
-dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
-before calling this function.
-@return DB_SUCCESS or error code */
-dberr_t
-row_merge_drop_table(
-/*=================*/
- trx_t* trx, /*!< in: transaction */
- dict_table_t* table) /*!< in: table instance to drop */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Map from column numbers to column definitions that include
+changes to the collation, when the encoding is compatible with
+the original column and no table rebuild is needed */
+typedef std::map<unsigned, dict_col_t*> col_collations;
/** Build indexes on a table by reading a clustered index, creating a temporary
file containing index entries, merge sorting these index entries and inserting
@@ -294,6 +249,7 @@ this function and it will be passed to other functions for further accounting.
@param[in] eval_table mysql table used to evaluate virtual column
value, see innobase_get_computed_value().
@param[in] allow_non_null allow the conversion from null to not-null
+@param[in] col_collate columns whose collations changed, or nullptr
@return DB_SUCCESS or error code */
dberr_t
row_merge_build_indexes(
@@ -313,7 +269,8 @@ row_merge_build_indexes(
ut_stage_alter_t* stage,
const dict_add_v_col_t* add_v,
struct TABLE* eval_table,
- bool allow_non_null)
+ bool allow_non_null,
+ const col_collations* col_collate)
MY_ATTRIBUTE((warn_unused_result));
/********************************************************************//**
@@ -341,10 +298,8 @@ Write a merge block to the file system.
@return whether the request was completed successfully
@retval false on error
@retval true on success */
-UNIV_INTERN
bool
row_merge_write(
-/*============*/
const pfs_os_file_t& fd, /*!< in: file descriptor */
ulint offset, /*!< in: offset where to write,
in number of row_merge_block_t elements */
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
index eb90ec0f04c..a9f1c87d600 100644
--- a/storage/innobase/include/row0mysql.h
+++ b/storage/innobase/include/row0mysql.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -37,11 +37,6 @@ Created 9/17/2000 Heikki Tuuri
#include "fts0fts.h"
#include "gis0type.h"
-#include "sql_list.h"
-#include "sql_cmd.h"
-
-extern ibool row_rollback_on_timeout;
-
struct row_prebuilt_t;
class ha_innobase;
@@ -187,13 +182,8 @@ row_create_prebuilt(
dict_table_t* table, /*!< in: Innobase table handle */
ulint mysql_row_len); /*!< in: length in bytes of a row in
the MySQL format */
-/********************************************************************//**
-Free a prebuilt struct for a MySQL table handle. */
-void
-row_prebuilt_free(
-/*==============*/
- row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
- ibool dict_locked); /*!< in: TRUE=data dictionary locked */
+/** Free a prebuilt struct for a TABLE handle. */
+void row_prebuilt_free(row_prebuilt_t *prebuilt);
/*********************************************************************//**
Updates the transaction pointers in query graphs stored in the prebuilt
struct. */
@@ -273,7 +263,7 @@ row_update_for_mysql(
/** This can only be used when the current transaction is at
READ COMMITTED or READ UNCOMMITTED isolation level.
-Before calling this function row_search_for_mysql() must have
+Before calling this function row_search_mvcc() must have
initialized prebuilt->new_rec_locks to store the information which new
record locks really were set. This function removes a newly set
clustered index record lock under prebuilt->pcur or
@@ -310,40 +300,24 @@ row_update_cascade_for_mysql(
or set null operation */
dict_table_t* table) /*!< in: table where we do the operation */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*********************************************************************//**
-Locks the data dictionary exclusively for performing a table create or other
-data dictionary modification operation. */
-void
-row_mysql_lock_data_dictionary_func(
-/*================================*/
- trx_t* trx, /*!< in/out: transaction */
- const char* file, /*!< in: file name */
- unsigned line); /*!< in: line number */
-#define row_mysql_lock_data_dictionary(trx) \
- row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__)
-/*********************************************************************//**
-Unlocks the data dictionary exclusive lock. */
-void
-row_mysql_unlock_data_dictionary(
-/*=============================*/
- trx_t* trx); /*!< in/out: transaction */
-/*********************************************************************//**
-Locks the data dictionary in shared mode from modifications, for performing
-foreign key check, rollback, or other operation invisible to MySQL. */
-void
-row_mysql_freeze_data_dictionary_func(
-/*==================================*/
- trx_t* trx, /*!< in/out: transaction */
- const char* file, /*!< in: file name */
- unsigned line); /*!< in: line number */
-#define row_mysql_freeze_data_dictionary(trx) \
- row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__)
-/*********************************************************************//**
-Unlocks the data dictionary shared lock. */
-void
-row_mysql_unfreeze_data_dictionary(
-/*===============================*/
- trx_t* trx); /*!< in/out: transaction */
+
+/** Lock the data dictionary cache exclusively. */
+#define row_mysql_lock_data_dictionary(trx) \
+ do { \
+ ut_ad(!trx->dict_operation_lock_mode); \
+ dict_sys.lock(SRW_LOCK_CALL); \
+ trx->dict_operation_lock_mode = true; \
+ } while (0)
+
+/** Unlock the data dictionary. */
+#define row_mysql_unlock_data_dictionary(trx) \
+ do { \
+ ut_ad(!lock_trx_has_sys_table_locks(trx)); \
+ ut_ad(trx->dict_operation_lock_mode); \
+ trx->dict_operation_lock_mode = false; \
+ dict_sys.unlock(); \
+ } while (0)
+
/*********************************************************************//**
Creates a table for MySQL. On failure the transaction will be rolled back
and the 'table' object will be freed.
@@ -354,9 +328,7 @@ row_create_table_for_mysql(
dict_table_t* table, /*!< in, own: table definition
(will be freed, or on DB_SUCCESS
added to the data dictionary cache) */
- trx_t* trx, /*!< in/out: transaction */
- fil_encryption_t mode, /*!< in: encryption mode */
- uint32_t key_id) /*!< in: encryption key_id */
+ trx_t* trx) /*!< in/out: transaction */
MY_ATTRIBUTE((warn_unused_result));
/*********************************************************************//**
@@ -369,78 +341,22 @@ row_create_index_for_mysql(
dict_index_t* index, /*!< in, own: index definition
(will be freed) */
trx_t* trx, /*!< in: transaction handle */
- const ulint* field_lengths) /*!< in: if not NULL, must contain
+ const ulint* field_lengths, /*!< in: if not NULL, must contain
dict_index_get_n_fields(index)
actual field lengths for the
index columns, which are
then checked for not being too
large. */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id) /*!< in: encryption key_id */
MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-The master thread in srv0srv.cc calls this regularly to drop tables which
-we must drop in background after queries to them have ended. Such lazy
-dropping of tables is needed in ALTER TABLE on Unix.
-@return how many tables dropped + remaining tables in list */
-ulint
-row_drop_tables_for_mysql_in_background(void);
-/*=========================================*/
-/*********************************************************************//**
-Get the background drop list length. NOTE: the caller must own the kernel
-mutex!
-@return how many tables in list */
-ulint
-row_get_background_drop_list_len_low(void);
-/*======================================*/
-
-/** Drop garbage tables during recovery. */
-void
-row_mysql_drop_garbage_tables();
-
-/*********************************************************************//**
-Sets an exclusive lock on a table.
-@return error code or DB_SUCCESS */
-dberr_t
-row_mysql_lock_table(
-/*=================*/
- trx_t* trx, /*!< in/out: transaction */
- dict_table_t* table, /*!< in: table to lock */
- enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */
- const char* op_info) /*!< in: string for trx->op_info */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-
-/** Drop a table.
-If the data dictionary was not already locked by the transaction,
-the transaction will be committed. Otherwise, the data dictionary
-will remain locked.
-@param[in] name Table name
-@param[in,out] trx Transaction handle
-@param[in] sqlcom type of SQL operation
-@param[in] create_failed true=create table failed
- because e.g. foreign key column
-@param[in] nonatomic Whether it is permitted to release
- and reacquire dict_sys.latch
-@return error code */
-dberr_t
-row_drop_table_for_mysql(
- const char* name,
- trx_t* trx,
- enum_sql_command sqlcom,
- bool create_failed = false,
- bool nonatomic = true);
-
-/** Drop a table after failed CREATE TABLE. */
-dberr_t row_drop_table_after_create_fail(const char* name, trx_t* trx);
/*********************************************************************//**
Discards the tablespace of a table which stored in an .ibd file. Discarding
means that this function deletes the .ibd file and assigns a new table id for
the table. Also the file_unreadable flag is set.
@return error code or DB_SUCCESS */
-dberr_t
-row_discard_tablespace_for_mysql(
-/*=============================*/
- const char* name, /*!< in: table name */
- trx_t* trx) /*!< in: transaction handle */
+dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/*****************************************************************//**
Imports a tablespace. The space id in the .ibd file must match the space id
@@ -453,17 +369,6 @@ row_import_tablespace_for_mysql(
row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/** Drop a database for MySQL.
-@param[in] name database name which ends at '/'
-@param[in] trx transaction handle
-@param[out] found number of dropped tables/partitions
-@return error code or DB_SUCCESS */
-dberr_t
-row_drop_database_for_mysql(
- const char* name,
- trx_t* trx,
- ulint* found);
-
/*********************************************************************//**
Renames a table for MySQL.
@return error code or DB_SUCCESS */
@@ -473,38 +378,10 @@ row_rename_table_for_mysql(
const char* old_name, /*!< in: old table name */
const char* new_name, /*!< in: new table name */
trx_t* trx, /*!< in/out: transaction */
- bool commit, /*!< in: whether to commit trx */
bool use_fk) /*!< in: whether to parse and enforce
FOREIGN KEY constraints */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*********************************************************************//**
-Scans an index for either COOUNT(*) or CHECK TABLE.
-If CHECK TABLE; Checks that the index contains entries in an ascending order,
-unique constraint is not broken, and calculates the number of index entries
-in the read view of the current transaction.
-@return DB_SUCCESS or other error */
-dberr_t
-row_scan_index_for_mysql(
-/*=====================*/
- row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
- in MySQL handle */
- const dict_index_t* index, /*!< in: index */
- ulint* n_rows) /*!< out: number of entries
- seen in the consistent read */
- MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Initialize this module */
-void
-row_mysql_init(void);
-/*================*/
-
-/*********************************************************************//**
-Close this module */
-void
-row_mysql_close(void);
-/*=================*/
-
/* A struct describing a place for an individual column in the MySQL
row format which is presented to the table handler in ha_innobase.
This template struct is used to speed up row transformations between
@@ -686,6 +563,7 @@ struct row_prebuilt_t {
dtuple_t* clust_ref; /*!< prebuilt dtuple used in
sel/upd/del */
lock_mode select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+ bool skip_locked; /*!< TL_{READ,WRITE}_SKIP_LOCKED */
lock_mode stored_select_lock_type;/*!< this field is used to
remember the original select_lock_type
that was decided in ha_innodb.cc,
@@ -712,7 +590,7 @@ struct row_prebuilt_t {
ROW_READ_TRY_SEMI_CONSISTENT and
to simply skip the row. If
the row matches, the next call to
- row_search_for_mysql() will lock
+ row_search_mvcc() will lock
the row.
This eliminates lock waits in some
cases; note that this breaks
@@ -721,7 +599,7 @@ struct row_prebuilt_t {
the session is using READ
COMMITTED or READ UNCOMMITTED
isolation level, set in
- row_search_for_mysql() if we set a new
+ row_search_mvcc() if we set a new
record lock on the secondary
or clustered index; this is
used in row_unlock_for_mysql()
@@ -861,9 +739,8 @@ struct VCOL_STORAGE
@return TRUE malloc failure
*/
-bool innobase_allocate_row_for_vcol(
- THD * thd,
- dict_index_t* index,
+bool innobase_allocate_row_for_vcol(THD *thd,
+ const dict_index_t* index,
mem_heap_t** heap,
TABLE** table,
VCOL_STORAGE* storage);
@@ -879,17 +756,13 @@ public:
ib_vcol_row(mem_heap_t *heap) : heap(heap) {}
- byte *record(THD *thd, dict_index_t *index, TABLE **table)
+ byte *record(THD *thd, const dict_index_t *index, TABLE **table)
{
- if (!storage.innobase_record)
- {
- bool ok = innobase_allocate_row_for_vcol(thd, index, &heap, table,
- &storage);
- if (!ok)
- return NULL;
- }
+ if (!storage.innobase_record &&
+ !innobase_allocate_row_for_vcol(thd, index, &heap, table, &storage))
+ return nullptr;
return storage.innobase_record;
- };
+ }
~ib_vcol_row()
{
@@ -958,7 +831,7 @@ innobase_rename_vc_templ(
#define ROW_MYSQL_REC_FIELDS 1
#define ROW_MYSQL_NO_TEMPLATE 2
#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in
- row_scan_and_check_index */
+ row_check_index() */
/* Values for hint_need_to_fetch_extra_cols */
#define ROW_RETRIEVE_PRIMARY_KEY 1
@@ -969,10 +842,4 @@ innobase_rename_vc_templ(
#define ROW_READ_TRY_SEMI_CONSISTENT 1
#define ROW_READ_DID_SEMI_CONSISTENT 2
-#ifdef UNIV_DEBUG
-/** Wait for the background drop list to become empty. */
-void
-row_wait_for_background_drop_list_empty();
-#endif /* UNIV_DEBUG */
-
#endif /* row0mysql.h */
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
index 091d80adec5..b1390fd1ef1 100644
--- a/storage/innobase/include/row0purge.h
+++ b/storage/innobase/include/row0purge.h
@@ -72,9 +72,8 @@ row_purge_poss_sec(
bool is_tree=false);
/***************************************************************
-Does the purge operation for a single undo log record. This is a high-level
-function used in an SQL execution graph.
-@return query thread to run next or NULL */
+Does the purge operation.
+@return query thread to run next */
que_thr_t*
row_purge_step(
/*===========*/
@@ -198,21 +197,7 @@ public:
}
/** Start processing an undo log record. */
- void start()
- {
- ut_ad(in_progress);
- DBUG_ASSERT(common.type == QUE_NODE_PURGE);
-
- row= nullptr;
- ref= nullptr;
- index= nullptr;
- update= nullptr;
- found_clust= FALSE;
- rec_type= ULINT_UNDEFINED;
- cmpl_info= ULINT_UNDEFINED;
- if (!purge_thd)
- purge_thd= current_thd;
- }
+ inline void start();
/** Close the existing table and release the MDL for it. */
@@ -226,7 +211,7 @@ public:
}
innobase_reset_background_thd(purge_thd);
- dict_table_close(table, false, false, purge_thd, mdl_ticket);
+ dict_table_close(table, false, purge_thd, mdl_ticket);
table= nullptr;
mdl_ticket= nullptr;
}
@@ -253,16 +238,7 @@ public:
/** Reset the state at end
@return the query graph parent */
- que_node_t* end()
- {
- DBUG_ASSERT(common.type == QUE_NODE_PURGE);
- close_table();
- ut_ad(undo_recs.empty());
- ut_d(in_progress= false);
- purge_thd= nullptr;
- mem_heap_empty(heap);
- return common.parent;
- }
+ inline que_node_t *end();
};
#endif
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
index 1e0fdc65238..a1350740e2a 100644
--- a/storage/innobase/include/row0row.h
+++ b/storage/innobase/include/row0row.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2020, MariaDB Corporation.
+Copyright (c) 2016, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -303,13 +303,13 @@ row_build_row_ref_fast(
/***************************************************************//**
Searches the clustered index record for a row, if we have the row
reference.
-@return TRUE if found */
-ibool
+@return true if found */
+bool
row_search_on_row_ref(
/*==================*/
btr_pcur_t* pcur, /*!< out: persistent cursor, which must
be closed by the caller */
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
const dict_table_t* table, /*!< in: table */
const dtuple_t* ref, /*!< in: row reference */
mtr_t* mtr) /*!< in/out: mtr */
@@ -321,7 +321,7 @@ on the secondary index record are preserved.
rec_t*
row_get_clust_rec(
/*==============*/
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
const rec_t* rec, /*!< in: record in a secondary index */
dict_index_t* index, /*!< in: secondary index */
dict_index_t** clust_index,/*!< out: clustered index */
@@ -363,9 +363,8 @@ Searches an index record.
enum row_search_result
row_search_index_entry(
/*===================*/
- dict_index_t* index, /*!< in: index */
const dtuple_t* entry, /*!< in: index entry */
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
be closed by the caller */
mtr_t* mtr) /*!< in: mtr */
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
index eb83a4bcad6..8134c60fe72 100644
--- a/storage/innobase/include/row0sel.h
+++ b/storage/innobase/include/row0sel.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,8 +24,7 @@ Select
Created 12/19/1997 Heikki Tuuri
*******************************************************/
-#ifndef row0sel_h
-#define row0sel_h
+#pragma once
#include "data0data.h"
#include "que0types.h"
@@ -58,15 +57,6 @@ void
sel_col_prefetch_buf_free(
/*======================*/
sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */
-/*********************************************************************//**
-Gets the plan node for the nth table in a join.
-@return plan node */
-UNIV_INLINE
-plan_t*
-sel_node_get_nth_plan(
-/*==================*/
- sel_node_t* node, /*!< in: select node */
- ulint i); /*!< in: get ith plan node */
/**********************************************************************//**
Performs a select step. This is a high-level function used in SQL execution
graphs.
@@ -76,14 +66,6 @@ row_sel_step(
/*=========*/
que_thr_t* thr); /*!< in: query thread */
/**********************************************************************//**
-Performs an execution step of an open or close cursor statement node.
-@return query thread to run next or NULL */
-UNIV_INLINE
-que_thr_t*
-open_step(
-/*======*/
- que_thr_t* thr); /*!< in: query thread */
-/**********************************************************************//**
Performs a fetch for a cursor.
@return query thread to run next or NULL */
que_thr_t*
@@ -136,37 +118,7 @@ row_sel_convert_mysql_key_to_innobase(
ulint key_len); /*!< in: MySQL key value length */
-/** Searches for rows in the database. This is used in the interface to
-MySQL. This function opens a cursor, and also implements fetch next
-and fetch prev. NOTE that if we do a search with a full key value
-from a unique index (ROW_SEL_EXACT), then we will not store the cursor
-position and fetch next or fetch prev must not be tried to the cursor!
-
-@param[out] buf buffer for the fetched row in MySQL format
-@param[in] mode search mode PAGE_CUR_L
-@param[in,out] prebuilt prebuilt struct for the table handler;
- this contains the info to search_tuple,
- index; if search tuple contains 0 field then
- we position the cursor at start or the end of
- index, depending on 'mode'
-@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
-@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
- Note: if this is != 0, then prebuilt must has a
- pcur with stored position! In opening of a
- cursor 'direction' should be 0.
-@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
-DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
-UNIV_INLINE
-dberr_t
-row_search_for_mysql(
- byte* buf,
- page_cur_mode_t mode,
- row_prebuilt_t* prebuilt,
- ulint match_mode,
- ulint direction)
- MY_ATTRIBUTE((warn_unused_result));
-
-/** Searches for rows in the database using cursor.
+/** Search for rows in the database using cursor.
Function is mainly used for tables that are shared across connections and
so it employs technique that can help re-construct the rows that
transaction is suppose to see.
@@ -184,7 +136,8 @@ It also has optimization such as pre-caching the rows, using AHI, etc.
Note: if this is != 0, then prebuilt must has a
pcur with stored position! In opening of a
cursor 'direction' should be 0.
-@return DB_SUCCESS or error code */
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
dberr_t
row_search_mvcc(
byte* buf,
@@ -210,6 +163,21 @@ row_count_rtree_recs(
ulint* n_rows); /*!< out: number of entries
seen in the consistent read */
+/**
+Check the index records in CHECK TABLE.
+The index must contain entries in an ascending order,
+unique constraint must not be violated by duplicated keys,
+and the number of index entries is counted in according to the
+current read view.
+
+@param prebuilt index and transaction
+@param n_rows number of records counted
+
+@return error code
+@retval DB_SUCCESS if no error was found */
+dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
/** Read the max AUTOINC value from an index.
@param[in] index index starting with an AUTO_INCREMENT column
@return the largest AUTO_INCREMENT value
@@ -382,6 +350,17 @@ struct sel_node_t{
fetches */
};
+/**
+Get the plan node for a table in a join.
+@param node query graph node for SELECT
+@param i plan node element
+@return ith plan node */
+inline plan_t *sel_node_get_nth_plan(sel_node_t *node, ulint i)
+{
+ ut_ad(i < node->n_tables);
+ return &node->plans[i];
+}
+
/** Fetch statement node */
struct fetch_node_t{
que_common_t common; /*!< type: QUE_NODE_FETCH */
@@ -476,7 +455,3 @@ row_sel_field_store_in_mysql_format_func(
#endif /* UNIV_DEBUG */
const byte* data, /*!< in: data to store */
ulint len); /*!< in: length of the data */
-
-#include "row0sel.inl"
-
-#endif
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
index 59ed14aeff6..f60fc3595dc 100644
--- a/storage/innobase/include/row0upd.h
+++ b/storage/innobase/include/row0upd.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -118,14 +118,6 @@ row_upd_changes_field_size_or_external(
dict_index_t* index, /*!< in: index */
const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
const upd_t* update);/*!< in: update vector */
-/***********************************************************//**
-Returns true if row update contains disowned external fields.
-@return true if the update contains disowned external fields. */
-bool
-row_upd_changes_disowned_external(
-/*==============================*/
- const upd_t* update) /*!< in: update vector */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
/***************************************************************//**
Builds an update vector from those fields which in a secondary index entry
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
index d54384f837c..60f310e1b0f 100644
--- a/storage/innobase/include/row0vers.h
+++ b/storage/innobase/include/row0vers.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -45,7 +45,7 @@ index record.
@param[in] index secondary index
@param[in] offsets rec_get_offsets(rec, index)
@return the active transaction; state must be rechecked after
-trx_mutex_enter(), and trx->release_reference() must be invoked
+acquiring trx->mutex, and trx->release_reference() must be invoked
@retval NULL if the record was committed */
trx_t*
row_vers_impl_x_locked(
@@ -55,7 +55,7 @@ row_vers_impl_x_locked(
const rec_offs* offsets);
/** Finds out if a version of the record, where the version >= the current
-purge view, should have ientry as its secondary index entry. We check
+purge_sys.view, should have ientry as its secondary index entry. We check
if there is any not delete marked version of the record where the trx
id >= purge view, and the secondary index entry == ientry; exactly in
this case we return TRUE.
@@ -85,7 +85,9 @@ row_vers_old_has_index_entry(
Constructs the version of a clustered index record which a consistent
read should see. We assume that the trx id stored in rec is such that
the consistent read should not see rec in its present version.
-@return DB_SUCCESS or DB_MISSING_HISTORY */
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
dberr_t
row_vers_build_for_consistent_read(
/*===============================*/
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
index f3d005ff764..4881f2f1d35 100644
--- a/storage/innobase/include/rw_lock.h
+++ b/storage/innobase/include/rw_lock.h
@@ -49,6 +49,18 @@ protected:
lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
#endif
}
+ /** Start waiting for an exclusive lock.
+ @return current value of the lock word */
+ uint32_t write_lock_wait_start_read()
+ { return lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
+ /** Wait for an exclusive lock.
+ @param l the value of the lock word
+ @return whether the exclusive lock was acquired */
+ bool write_lock_wait_try(uint32_t &l)
+ {
+ return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
/** Try to acquire a shared lock.
@param l the value of the lock word
@return whether the lock was acquired */
@@ -64,36 +76,46 @@ protected:
}
return true;
}
+
/** Wait for an exclusive lock.
@return whether the exclusive lock was acquired */
bool write_lock_poll()
{
auto l= WRITER_WAITING;
- if (lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
- std::memory_order_relaxed))
+ if (write_lock_wait_try(l))
return true;
if (!(l & WRITER_WAITING))
/* write_lock() must have succeeded for another thread */
write_lock_wait_start();
return false;
}
+ /** @return the lock word value */
+ uint32_t value() const { return lock.load(std::memory_order_acquire); }
public:
/** Default constructor */
rw_lock() : lock(UNLOCKED) {}
- /** Release a shared lock */
- void read_unlock()
+ /** Release a shared lock.
+ @return whether any writers may have to be woken up */
+ bool read_unlock()
{
- IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(1, std::memory_order_release);
- DBUG_ASSERT(l & ~WRITER_PENDING); /* at least one read lock */
+ auto l= lock.fetch_sub(1, std::memory_order_release);
DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */
+ DBUG_ASSERT(~(WRITER_PENDING) & l); /* at least one read lock */
+ return (~WRITER_PENDING & l) == 1;
}
/** Release an exclusive lock */
void write_unlock()
{
+ /* Below, we use fetch_sub(WRITER) instead of fetch_and(~WRITER).
+ The reason is that on IA-32 and AMD64 it translates into the 80486
+ instruction LOCK XADD, while fetch_and() translates into a loop
+ around LOCK CMPXCHG. For other ISA either form should be fine. */
+ static_assert(WRITER == 1U << 31, "compatibility");
IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release);
- DBUG_ASSERT(l & WRITER); /* the write lock must have existed */
+ /* the write lock must have existed */
+ DBUG_ASSERT(l & WRITER);
}
/** Try to acquire a shared lock.
@return whether the lock was acquired */
@@ -108,15 +130,9 @@ public:
}
/** @return whether an exclusive lock is being held by any thread */
- bool is_write_locked() const
- { return !!(lock.load(std::memory_order_relaxed) & WRITER); }
- /** @return whether a shared lock is being held by any thread */
- bool is_read_locked() const
- {
- auto l= lock.load(std::memory_order_relaxed);
- return (l & ~WRITER_PENDING) && !(l & WRITER);
- }
+ bool is_write_locked() const { return !!(value() & WRITER); }
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const { return value() != 0; }
/** @return whether any lock is being held by any thread */
- bool is_locked() const
- { return (lock.load(std::memory_order_relaxed) & ~WRITER_WAITING) != 0; }
+ bool is_locked() const { return (value() & ~WRITER_WAITING) != 0; }
};
diff --git a/storage/innobase/include/small_vector.h b/storage/innobase/include/small_vector.h
new file mode 100644
index 00000000000..d28a36184b8
--- /dev/null
+++ b/storage/innobase/include/small_vector.h
@@ -0,0 +1,100 @@
+/*****************************************************************************
+
+Copyright (c) 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+/* A normally small vector, inspired by llvm::SmallVector */
+#include "my_global.h"
+#include <iterator>
+#include <memory>
+
+class small_vector_base
+{
+protected:
+ typedef uint32_t Size_T;
+ void *BeginX;
+ Size_T Size= 0, Capacity;
+ small_vector_base()= delete;
+ small_vector_base(void *small, size_t small_size)
+ : BeginX(small), Capacity(Size_T(small_size)) {}
+ ATTRIBUTE_COLD void grow_by_1(void *small, size_t element_size);
+public:
+ size_t size() const { return Size; }
+ size_t capacity() const { return Capacity; }
+ bool empty() const { return !Size; }
+ void clear() { Size= 0; }
+protected:
+ void set_size(size_t N) { Size= Size_T(N); }
+};
+
+template <typename T, unsigned N>
+class small_vector : public small_vector_base
+{
+ /** The fixed storage allocation */
+ T small[N];
+
+ using small_vector_base::set_size;
+
+ void grow_if_needed()
+ {
+ if (unlikely(size() >= capacity()))
+ grow_by_1(small, sizeof *small);
+ }
+
+public:
+ small_vector() : small_vector_base(small, N)
+ {
+ TRASH_ALLOC(small, sizeof small);
+ }
+ ~small_vector()
+ {
+ if (small != begin())
+ my_free(begin());
+ MEM_MAKE_ADDRESSABLE(small, sizeof small);
+ }
+
+ using iterator= T *;
+ using const_iterator= const T *;
+ using reverse_iterator= std::reverse_iterator<iterator>;
+ using reference= T &;
+ using const_reference= const T&;
+
+ iterator begin() { return static_cast<iterator>(BeginX); }
+ const_iterator begin() const { return static_cast<const_iterator>(BeginX); }
+ iterator end() { return begin() + size(); }
+ const_iterator end() const { return begin() + size(); }
+
+ reverse_iterator rbegin() { return reverse_iterator(end()); }
+ reverse_iterator rend() { return reverse_iterator(begin()); }
+
+ reference operator[](size_t i) { assert(i < size()); return begin()[i]; }
+ const_reference operator[](size_t i) const
+ { return const_cast<small_vector&>(*this)[i]; }
+
+ void erase(const_iterator S, const_iterator E)
+ {
+ set_size(std::move(const_cast<iterator>(E), end(),
+ const_cast<iterator>(S)) - begin());
+ }
+
+ void emplace_back(T &&arg)
+ {
+ grow_if_needed();
+ ::new (end()) T(arg);
+ set_size(size() + 1);
+ }
+};
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
index e65d31bfa04..971f6363bdb 100644
--- a/storage/innobase/include/srv0mon.h
+++ b/storage/innobase/include/srv0mon.h
@@ -36,7 +36,7 @@ Created 12/15/2009 Jimmy Yang
#define __STDC_LIMIT_MACROS
#endif /* __STDC_LIMIT_MACROS */
-#include <stdint.h>
+#include <cstdint>
#include "my_atomic.h"
#include "my_atomic_wrapper.h"
@@ -136,8 +136,6 @@ enum monitor_id_t {
/* Start of Metadata counter */
MONITOR_MODULE_METADATA,
MONITOR_TABLE_OPEN,
- MONITOR_TABLE_CLOSE,
- MONITOR_TABLE_REFERENCE,
/* Lock manager related counters */
MONITOR_MODULE_LOCK,
@@ -218,11 +216,7 @@ enum monitor_id_t {
MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
MONITOR_LRU_BATCH_SCANNED_PER_CALL,
MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_FLUSH_COUNT,
- MONITOR_LRU_BATCH_FLUSH_PAGES,
MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
- MONITOR_LRU_BATCH_EVICT_COUNT,
- MONITOR_LRU_BATCH_EVICT_PAGES,
MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
MONITOR_LRU_GET_FREE_SEARCH,
MONITOR_LRU_SEARCH_SCANNED,
@@ -287,7 +281,6 @@ enum monitor_id_t {
MONITOR_TRX_COMMIT_UNDO,
MONITOR_TRX_ROLLBACK,
MONITOR_TRX_ROLLBACK_SAVEPOINT,
- MONITOR_TRX_ACTIVE,
MONITOR_RSEG_HISTORY_LEN,
MONITOR_NUM_UNDO_SLOT_USED,
MONITOR_NUM_UNDO_SLOT_CACHED,
@@ -350,9 +343,7 @@ enum monitor_id_t {
/* Adaptive Hash Index related counters */
MONITOR_MODULE_ADAPTIVE_HASH,
MONITOR_OVLD_ADAPTIVE_HASH_SEARCH,
-#endif /* BTR_CUR_HASH_ADAPT */
MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE,
-#ifdef BTR_CUR_HASH_ADAPT
MONITOR_ADAPTIVE_HASH_PAGE_ADDED,
MONITOR_ADAPTIVE_HASH_PAGE_REMOVED,
MONITOR_ADAPTIVE_HASH_ROW_ADDED,
@@ -382,7 +373,6 @@ enum monitor_id_t {
MONITOR_OVLD_SERVER_ACTIVITY,
MONITOR_MASTER_ACTIVE_LOOPS,
MONITOR_MASTER_IDLE_LOOPS,
- MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
MONITOR_SRV_LOG_FLUSH_MICROSECOND,
MONITOR_SRV_DICT_LRU_MICROSECOND,
MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
@@ -390,15 +380,6 @@ enum monitor_id_t {
MONITOR_OVLD_SRV_DBLWR_WRITES,
MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
MONITOR_OVLD_SRV_PAGE_SIZE,
- MONITOR_OVLD_RWLOCK_S_SPIN_WAITS,
- MONITOR_OVLD_RWLOCK_X_SPIN_WAITS,
- MONITOR_OVLD_RWLOCK_SX_SPIN_WAITS,
- MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS,
- MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS,
- MONITOR_OVLD_RWLOCK_SX_SPIN_ROUNDS,
- MONITOR_OVLD_RWLOCK_S_OS_WAITS,
- MONITOR_OVLD_RWLOCK_X_OS_WAITS,
- MONITOR_OVLD_RWLOCK_SX_OS_WAITS,
/* Data DML related counters */
MONITOR_MODULE_DML_STATS,
@@ -414,7 +395,6 @@ enum monitor_id_t {
/* Data DDL related counters */
MONITOR_MODULE_DDL_STATS,
MONITOR_BACKGROUND_DROP_INDEX,
- MONITOR_BACKGROUND_DROP_TABLE,
MONITOR_ONLINE_CREATE_INDEX,
MONITOR_PENDING_ALTER_TABLE,
MONITOR_ALTER_TABLE_SORT_FILES,
@@ -426,10 +406,6 @@ enum monitor_id_t {
MONITOR_ICP_OUT_OF_RANGE,
MONITOR_ICP_MATCH,
- /* Mutex/RW-Lock related counters */
- MONITOR_MODULE_LATCHES,
- MONITOR_LATCHES,
-
/* This is used only for control system to turn
on/off and reset all monitor counters */
MONITOR_ALL_COUNTER,
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 75718a92a10..96cfe886c02 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -3,7 +3,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, 2009, Google Inc.
Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -45,16 +45,40 @@ Created 10/10/1995 Heikki Tuuri
#include "que0types.h"
#include "trx0types.h"
#include "fil0fil.h"
+#include "ut0counter.h"
#include "mysql/psi/mysql_stage.h"
#include "mysql/psi/psi.h"
#include <tpool.h>
#include <memory>
+/** Simple non-atomic counter
+@tparam Type the integer type of the counter */
+template <typename Type>
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter
+{
+ /** Increment the counter */
+ Type inc() { return add(1); }
+ /** Decrement the counter */
+ Type dec() { return add(Type(~0)); }
+
+ /** Add to the counter
+ @param i amount to be added
+ @return the value of the counter after adding */
+ Type add(Type i) { return m_counter += i; }
+
+ /** @return the value of the counter */
+ operator Type() const { return m_counter; }
+
+private:
+ /** The counter */
+ Type m_counter;
+};
+
/** Global counters used inside InnoDB. */
struct srv_stats_t
{
- typedef ib_counter_t<ulint, 64> ulint_ctr_64_t;
+ typedef ib_counter_t<ulint> ulint_ctr_n_t;
typedef simple_counter<lsn_t> lsn_ctr_1_t;
typedef simple_counter<ulint> ulint_ctr_1_t;
typedef simple_counter<int64_t> int64_ctr_1_t;
@@ -84,91 +108,74 @@ struct srv_stats_t
/** Store the number of write requests issued */
ulint_ctr_1_t buf_pool_write_requests;
- /** Number of buffer pool reads that led to the reading of
- a disk page */
- ulint_ctr_1_t buf_pool_reads;
-
/** Number of bytes saved by page compression */
- ulint_ctr_64_t page_compression_saved;
+ ulint_ctr_n_t page_compression_saved;
/* Number of pages compressed with page compression */
- ulint_ctr_64_t pages_page_compressed;
+ ulint_ctr_n_t pages_page_compressed;
/* Number of TRIM operations induced by page compression */
- ulint_ctr_64_t page_compressed_trim_op;
+ ulint_ctr_n_t page_compressed_trim_op;
/* Number of pages decompressed with page compression */
- ulint_ctr_64_t pages_page_decompressed;
+ ulint_ctr_n_t pages_page_decompressed;
/* Number of page compression errors */
- ulint_ctr_64_t pages_page_compression_error;
+ ulint_ctr_n_t pages_page_compression_error;
/* Number of pages encrypted */
- ulint_ctr_64_t pages_encrypted;
+ ulint_ctr_n_t pages_encrypted;
/* Number of pages decrypted */
- ulint_ctr_64_t pages_decrypted;
+ ulint_ctr_n_t pages_decrypted;
/* Number of merge blocks encrypted */
- ulint_ctr_64_t n_merge_blocks_encrypted;
+ ulint_ctr_n_t n_merge_blocks_encrypted;
/* Number of merge blocks decrypted */
- ulint_ctr_64_t n_merge_blocks_decrypted;
+ ulint_ctr_n_t n_merge_blocks_decrypted;
/* Number of row log blocks encrypted */
- ulint_ctr_64_t n_rowlog_blocks_encrypted;
+ ulint_ctr_n_t n_rowlog_blocks_encrypted;
/* Number of row log blocks decrypted */
- ulint_ctr_64_t n_rowlog_blocks_decrypted;
+ ulint_ctr_n_t n_rowlog_blocks_decrypted;
/** Number of data read in total (in bytes) */
ulint_ctr_1_t data_read;
- /** Wait time of database locks */
- int64_ctr_1_t n_lock_wait_time;
-
- /** Number of database lock waits */
- ulint_ctr_1_t n_lock_wait_count;
-
- /** Number of threads currently waiting on database locks */
- MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<ulint>
- n_lock_wait_current_count;
-
/** Number of rows read. */
- ulint_ctr_64_t n_rows_read;
+ ulint_ctr_n_t n_rows_read;
/** Number of rows updated */
- ulint_ctr_64_t n_rows_updated;
+ ulint_ctr_n_t n_rows_updated;
/** Number of rows deleted */
- ulint_ctr_64_t n_rows_deleted;
+ ulint_ctr_n_t n_rows_deleted;
/** Number of rows inserted */
- ulint_ctr_64_t n_rows_inserted;
+ ulint_ctr_n_t n_rows_inserted;
/** Number of system rows read. */
- ulint_ctr_64_t n_system_rows_read;
+ ulint_ctr_n_t n_system_rows_read;
/** Number of system rows updated */
- ulint_ctr_64_t n_system_rows_updated;
+ ulint_ctr_n_t n_system_rows_updated;
/** Number of system rows deleted */
- ulint_ctr_64_t n_system_rows_deleted;
+ ulint_ctr_n_t n_system_rows_deleted;
/** Number of system rows inserted */
- ulint_ctr_64_t n_system_rows_inserted;
+ ulint_ctr_n_t n_system_rows_inserted;
/** Number of times secondary index lookup triggered cluster lookup */
- ulint_ctr_64_t n_sec_rec_cluster_reads;
+ ulint_ctr_n_t n_sec_rec_cluster_reads;
/** Number of times prefix optimization avoided triggering cluster lookup */
- ulint_ctr_64_t n_sec_rec_cluster_reads_avoided;
+ ulint_ctr_n_t n_sec_rec_cluster_reads_avoided;
/** Number of encryption_get_latest_key_version calls */
- ulint_ctr_64_t n_key_requests;
+ ulint_ctr_n_t n_key_requests;
/** Number of temporary tablespace blocks encrypted */
- ulint_ctr_64_t n_temp_blocks_encrypted;
+ ulint_ctr_n_t n_temp_blocks_encrypted;
/** Number of temporary tablespace blocks decrypted */
- ulint_ctr_64_t n_temp_blocks_decrypted;
-
- /** Number of lock deadlocks */
- ulint_ctr_1_t lock_deadlock_count;
+ ulint_ctr_n_t n_temp_blocks_decrypted;
};
/** We are prepared for a situation that we have this many threads waiting for
-a semaphore inside InnoDB. srv_start() sets the value. */
+a transactional lock inside InnoDB. srv_start() sets the value. */
extern ulint srv_max_n_threads;
extern const char* srv_main_thread_op_info;
@@ -193,15 +200,13 @@ at a time */
#define SRV_AUTO_EXTEND_INCREMENT (srv_sys_space.get_autoextend_increment())
/** Mutex protecting page_zip_stat_per_index */
-extern ib_mutex_t page_zip_stat_per_index_mutex;
-/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
-extern ib_mutex_t srv_monitor_file_mutex;
+extern mysql_mutex_t page_zip_stat_per_index_mutex;
+/** Mutex for locking srv_monitor_file */
+extern mysql_mutex_t srv_monitor_file_mutex;
/* Temporary file for innodb monitor output */
extern FILE* srv_monitor_file;
-/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode.
-This mutex has a very low rank; threads reserving it should not
-acquire any further latches or sleep before releasing this one. */
-extern ib_mutex_t srv_misc_tmpfile_mutex;
+/** Mutex for locking srv_misc_tmpfile */
+extern mysql_mutex_t srv_misc_tmpfile_mutex;
/* Temporary file for miscellanous diagnostic output */
extern FILE* srv_misc_tmpfile;
@@ -284,11 +289,6 @@ extern ulong srv_log_write_ahead_size;
extern my_bool srv_adaptive_flushing;
extern my_bool srv_flush_sync;
-/* If this flag is TRUE, then we will load the indexes' (and tables') metadata
-even if they are marked as "corrupted". Mostly it is for DBA to process
-corrupted index and table */
-extern my_bool srv_load_corrupted;
-
/** Requested size in bytes */
extern ulint srv_buf_pool_size;
/** Requested buffer pool chunk size. Each buffer pool instance consists
@@ -313,6 +313,8 @@ extern ulong srv_buf_pool_load_pages_abort;
/** Lock table size in bytes */
extern ulint srv_lock_table_size;
+/** the value of innodb_checksum_algorithm */
+extern ulong srv_checksum_algorithm;
extern my_bool srv_random_read_ahead;
extern ulong srv_read_ahead_threshold;
extern uint srv_n_read_io_threads;
@@ -397,12 +399,18 @@ enum srv_operation_mode {
/** Mariabackup restoring the incremental part of a backup */
SRV_OPERATION_RESTORE_DELTA,
/** Mariabackup restoring a backup for subsequent --export */
- SRV_OPERATION_RESTORE_EXPORT
+ SRV_OPERATION_RESTORE_EXPORT,
+	/** Mariabackup taking a backup without deferring
+	any tablespace */
+ SRV_OPERATION_BACKUP_NO_DEFER
};
/** Current mode of operation */
extern enum srv_operation_mode srv_operation;
+/** whether this is the server's first start after mariabackup --prepare */
+extern bool srv_start_after_restore;
+
extern my_bool srv_print_innodb_monitor;
extern my_bool srv_print_innodb_lock_monitor;
extern ibool srv_print_verbose_log;
@@ -425,7 +433,6 @@ extern ulint srv_log_writes_and_flush;
#ifdef UNIV_DEBUG
extern my_bool innodb_evict_tables_on_commit_debug;
-extern my_bool srv_sync_debug;
extern my_bool srv_purge_view_update_only_debug;
/** InnoDB system tablespace to set during recovery */
@@ -442,9 +449,6 @@ extern uint srv_n_purge_threads;
/* the number of pages to purge in one batch */
extern ulong srv_purge_batch_size;
-/* the number of sync wait arrays */
-extern ulong srv_sync_array_size;
-
/* print all user-level transactions deadlocks to mysqld stderr */
extern my_bool srv_print_all_deadlocks;
@@ -532,11 +536,9 @@ enum {
SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from
running: if a crash would occur
in purge, this prevents it */
- SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run trx rollback after
+ SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run DML rollback after
recovery */
- SRV_FORCE_NO_IBUF_MERGE = 4, /*!< prevent also ibuf operations:
- if they would cause a crash, better
- not do them */
+ SRV_FORCE_NO_DDL_UNDO = 4, /*!< prevent also DDL rollback */
SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when
starting the database: InnoDB will
treat even incomplete transactions
@@ -580,8 +582,7 @@ ibool
srv_printf_innodb_monitor(
/*======================*/
FILE* file, /*!< in: output stream */
- ibool nowait, /*!< in: whether to wait for the
- lock_sys_t::mutex */
+ ibool nowait, /*!< in: whether to wait for lock_sys.latch */
ulint* trx_start, /*!< out: file position of the start of
the list of active transactions */
ulint* trx_end); /*!< out: file position of the end of
@@ -659,29 +660,21 @@ void srv_init_purge_tasks();
/** Status variables to be passed to MySQL */
struct export_var_t{
+#ifdef BTR_CUR_HASH_ADAPT
+ ulint innodb_ahi_hit;
+ ulint innodb_ahi_miss;
+#endif /* BTR_CUR_HASH_ADAPT */
char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */
char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */
char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */
my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */
ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */
- ulint innodb_buffer_pool_pages_data; /*!< Data pages */
ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */
- ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */
- ulint innodb_buffer_pool_bytes_dirty; /*!< File bytes modified */
ulint innodb_buffer_pool_pages_misc; /*!< Miscellanous pages */
- ulint innodb_buffer_pool_pages_free; /*!< Free pages */
#ifdef UNIV_DEBUG
ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */
#endif /* UNIV_DEBUG */
- ulint innodb_buffer_pool_pages_made_not_young;
- ulint innodb_buffer_pool_pages_made_young;
- ulint innodb_buffer_pool_pages_old;
- ulint innodb_buffer_pool_read_requests; /*!< buf_pool.stat.n_page_gets */
- ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */
ulint innodb_buffer_pool_write_requests;/*!< srv_stats.buf_pool_write_requests */
- ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
- ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
- ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
ulint innodb_checkpoint_age;
ulint innodb_checkpoint_max_age;
ulint innodb_data_pending_reads; /*!< Pending reads */
@@ -791,30 +784,6 @@ struct export_var_t{
int64_t innodb_encryption_key_requests;
};
-/** Thread slot in the thread table. */
-struct srv_slot_t{
- ibool in_use; /*!< TRUE if this slot
- is in use */
- /** time(NULL) when the thread was suspended.
- FIXME: Use my_interval_timer() or similar, to avoid bogus
- timeouts in lock_wait_check_and_cancel() or lock_wait_suspend_thread()
- when the system time is adjusted to the past!
-
- FIXME: This is duplicating trx_lock_t::wait_started,
- which is being used for diagnostic purposes only. */
- time_t suspend_time;
- ulong wait_timeout; /*!< wait time that if exceeded
- the thread will be timed out.
- Initialized by
- lock_wait_table_reserve_slot()
- for lock wait */
- os_event_t event; /*!< event used in suspending
- the thread when it has nothing
- to do */
- que_thr_t* thr; /*!< suspended query thread
- (only used for user threads) */
-};
-
extern tpool::thread_pool *srv_thread_pool;
extern std::unique_ptr<tpool::timer> srv_master_timer;
extern std::unique_ptr<tpool::timer> srv_monitor_timer;
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
index 58488df4be6..44b19aa666b 100644
--- a/storage/innobase/include/srv0start.h
+++ b/storage/innobase/include/srv0start.h
@@ -93,8 +93,6 @@ extern lsn_t srv_shutdown_lsn;
/** TRUE if the server is being started */
extern bool srv_is_being_started;
-/** TRUE if SYS_TABLESPACES is available for lookups */
-extern bool srv_sys_tablespaces_open;
/** TRUE if the server is being started, before rolling back any
incomplete transactions */
extern bool srv_startup_is_before_trx_rollback_phase;
diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h
new file mode 100644
index 00000000000..1dca0cc1054
--- /dev/null
+++ b/storage/innobase/include/srw_lock.h
@@ -0,0 +1,554 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "univ.i"
+#include "rw_lock.h"
+
+#if defined __linux__
+/* futex(2): FUTEX_WAIT_PRIVATE, FUTEX_WAKE_PRIVATE */
+#elif defined __OpenBSD__ || defined __FreeBSD__ || defined __DragonFly__
+/* system calls similar to Linux futex(2) */
+#elif defined _WIN32
+/* SRWLOCK as well as WaitOnAddress(), WakeByAddressSingle() */
+#else
+# define SUX_LOCK_GENERIC /* fall back to generic synchronization primitives */
+#endif
+
+#if !defined SUX_LOCK_GENERIC && 0 /* defined SAFE_MUTEX */
+# define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */
+#endif
+
+#ifdef SUX_LOCK_GENERIC
+/** An exclusive-only variant of srw_lock */
+template<bool spinloop>
+class pthread_mutex_wrapper final
+{
+ pthread_mutex_t lock;
+public:
+ void init()
+ {
+ if (spinloop)
+ pthread_mutex_init(&lock, MY_MUTEX_INIT_FAST);
+ else
+ pthread_mutex_init(&lock, nullptr);
+ }
+ void destroy() { pthread_mutex_destroy(&lock); }
+# ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+ void wr_lock() { pthread_mutex_lock(&lock); }
+# else
+private:
+ void wr_wait();
+public:
+ inline void wr_lock();
+# endif
+ void wr_unlock() { pthread_mutex_unlock(&lock); }
+ bool wr_lock_try() { return !pthread_mutex_trylock(&lock); }
+};
+
+# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+template<> void pthread_mutex_wrapper<true>::wr_wait();
+template<>
+inline void pthread_mutex_wrapper<false>::wr_lock()
+{ pthread_mutex_lock(&lock); }
+template<>
+inline void pthread_mutex_wrapper<true>::wr_lock()
+{ if (!wr_lock_try()) wr_wait(); }
+# endif
+#endif
+
+/** Futex-based mutex */
+template<bool spinloop>
+class srw_mutex_impl final
+{
+ /** The lock word, containing HOLDER + 1 if the lock is being held,
+ plus the number of waiters */
+ std::atomic<uint32_t> lock;
+ /** Identifies that the lock is being held */
+ static constexpr uint32_t HOLDER= 1U << 31;
+
+#ifdef SUX_LOCK_GENERIC
+public:
+ /** The mutex for the condition variables. */
+ pthread_mutex_t mutex;
+private:
+ /** Condition variable for the lock word. Used with mutex. */
+ pthread_cond_t cond;
+#endif
+
+ /** Wait until the mutex has been acquired */
+ void wait_and_lock();
+ /** Wait for lock!=lk */
+ inline void wait(uint32_t lk);
+ /** Wake up one wait() thread */
+ void wake();
+public:
+ /** @return whether the mutex is being held or waited for */
+ bool is_locked_or_waiting() const
+ { return lock.load(std::memory_order_acquire) != 0; }
+ /** @return whether the mutex is being held by any thread */
+ bool is_locked() const
+ { return (lock.load(std::memory_order_acquire) & HOLDER) != 0; }
+
+ void init()
+ {
+ DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+ pthread_mutex_init(&mutex, nullptr);
+ pthread_cond_init(&cond, nullptr);
+#endif
+ }
+ void destroy()
+ {
+ DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+ pthread_mutex_destroy(&mutex);
+ pthread_cond_destroy(&cond);
+#endif
+ }
+
+ /** @return whether the mutex was acquired */
+ bool wr_lock_try()
+ {
+ uint32_t lk= 0;
+ return lock.compare_exchange_strong(lk, HOLDER + 1,
+ std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+
+ void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
+ void wr_unlock()
+ {
+ const uint32_t lk= lock.fetch_sub(HOLDER + 1, std::memory_order_release);
+ if (lk != HOLDER + 1)
+ {
+ DBUG_ASSERT(lk & HOLDER);
+ wake();
+ }
+ }
+};
+
+#ifdef SUX_LOCK_GENERIC
+typedef pthread_mutex_wrapper<true> srw_spin_mutex;
+typedef pthread_mutex_wrapper<false> srw_mutex;
+#else
+typedef srw_mutex_impl<true> srw_spin_mutex;
+typedef srw_mutex_impl<false> srw_mutex;
+#endif
+
+template<bool spinloop> class srw_lock_impl;
+
+/** Slim shared-update-exclusive lock with no recursion */
+template<bool spinloop>
+class ssux_lock_impl final
+{
+#ifdef UNIV_PFS_RWLOCK
+ friend class ssux_lock;
+# ifdef SUX_LOCK_GENERIC
+# elif defined _WIN32
+# else
+ friend srw_lock_impl<spinloop>;
+# endif
+#endif
+ /** mutex for synchronization; held by U or X lock holders */
+ srw_mutex_impl<spinloop> writer;
+#ifdef SUX_LOCK_GENERIC
+ /** Condition variable for "readers"; used with writer.mutex. */
+ pthread_cond_t readers_cond;
+#endif
+ /** S or U holders, and WRITER flag for X holder or waiter */
+ std::atomic<uint32_t> readers;
+ /** indicates an X request; readers=WRITER indicates granted X lock */
+ static constexpr uint32_t WRITER= 1U << 31;
+
+ /** Wait for readers!=lk */
+ inline void wait(uint32_t lk);
+
+ /** Wait for readers!=lk|WRITER */
+ void wr_wait(uint32_t lk);
+ /** Wake up wait() on the last rd_unlock() */
+ void wake();
+ /** Acquire a read lock */
+ void rd_wait();
+public:
+ void init()
+ {
+ writer.init();
+ DBUG_ASSERT(is_vacant());
+#ifdef SUX_LOCK_GENERIC
+ pthread_cond_init(&readers_cond, nullptr);
+#endif
+ }
+ void destroy()
+ {
+ DBUG_ASSERT(is_vacant());
+ writer.destroy();
+#ifdef SUX_LOCK_GENERIC
+ pthread_cond_destroy(&readers_cond);
+#endif
+ }
+ /** @return whether any writer is waiting */
+ bool is_waiting() const
+ { return (readers.load(std::memory_order_relaxed) & WRITER) != 0; }
+#ifndef DBUG_OFF
+ /** @return whether the lock is being held or waited for */
+ bool is_vacant() const { return !is_locked_or_waiting(); }
+#endif /* !DBUG_OFF */
+
+ bool rd_lock_try()
+ {
+ uint32_t lk= 0;
+ while (!readers.compare_exchange_weak(lk, lk + 1,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ if (lk & WRITER)
+ return false;
+ return true;
+ }
+
+ bool u_lock_try()
+ {
+ if (!writer.wr_lock_try())
+ return false;
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_add(1, std::memory_order_acquire);
+ DBUG_ASSERT(lk < WRITER - 1);
+ return true;
+ }
+
+ bool wr_lock_try()
+ {
+ if (!writer.wr_lock_try())
+ return false;
+ uint32_t lk= 0;
+ if (readers.compare_exchange_strong(lk, WRITER,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ return true;
+ writer.wr_unlock();
+ return false;
+ }
+
+ void rd_lock() { if (!rd_lock_try()) rd_wait(); }
+ void u_lock()
+ {
+ writer.wr_lock();
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_add(1, std::memory_order_acquire);
+ DBUG_ASSERT(lk < WRITER - 1);
+ }
+ void wr_lock()
+ {
+ writer.wr_lock();
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+ /* On IA-32 and AMD64, this type of fetch_or() can only be implemented
+ as a loop around LOCK CMPXCHG. In this particular case, setting the
+ most significant bit using fetch_add() is equivalent, and is
+ translated into a simple LOCK XADD. */
+ static_assert(WRITER == 1U << 31, "compatibility");
+ if (uint32_t lk= readers.fetch_add(WRITER, std::memory_order_acquire))
+ wr_wait(lk);
+#else
+ if (uint32_t lk= readers.fetch_or(WRITER, std::memory_order_acquire))
+ wr_wait(lk);
+#endif
+ }
+
+ void u_wr_upgrade()
+ {
+ DBUG_ASSERT(writer.is_locked());
+ uint32_t lk= readers.fetch_add(WRITER - 1, std::memory_order_acquire);
+ if (lk != 1)
+ wr_wait(lk - 1);
+ }
+ void wr_u_downgrade()
+ {
+ DBUG_ASSERT(writer.is_locked());
+ DBUG_ASSERT(is_write_locked());
+ readers.store(1, std::memory_order_release);
+ /* Note: Any pending rd_lock() will not be woken up until u_unlock() */
+ }
+
+ void rd_unlock()
+ {
+ uint32_t lk= readers.fetch_sub(1, std::memory_order_release);
+ ut_ad(~WRITER & lk);
+ if (lk == WRITER + 1)
+ wake();
+ }
+ void u_unlock()
+ {
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_sub(1, std::memory_order_release);
+ DBUG_ASSERT(lk);
+ DBUG_ASSERT(lk < WRITER);
+ writer.wr_unlock();
+ }
+ void wr_unlock()
+ {
+ DBUG_ASSERT(is_write_locked());
+ readers.store(0, std::memory_order_release);
+ writer.wr_unlock();
+ }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept
+ { return readers.load(std::memory_order_acquire) == WRITER; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept
+ { return readers.load(std::memory_order_acquire) != 0; }
+ /** @return whether any lock may be held or waited for by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return is_locked() || writer.is_locked_or_waiting(); }
+
+ void lock_shared() { rd_lock(); }
+ void unlock_shared() { rd_unlock(); }
+ void lock() { wr_lock(); }
+ void unlock() { wr_unlock(); }
+};
+
+#if defined _WIN32 || defined SUX_LOCK_GENERIC
+/** Slim read-write lock */
+template<bool spinloop>
+class srw_lock_
+{
+# ifdef UNIV_PFS_RWLOCK
+ friend srw_lock_impl<spinloop>;
+# endif
+# ifdef _WIN32
+ SRWLOCK lk;
+# else
+ rw_lock_t lk;
+# endif
+
+ void rd_wait();
+ void wr_wait();
+public:
+ void init() { IF_WIN(,my_rwlock_init(&lk, nullptr)); }
+ void destroy() { IF_WIN(,rwlock_destroy(&lk)); }
+ inline void rd_lock();
+ inline void wr_lock();
+ bool rd_lock_try()
+ { return IF_WIN(TryAcquireSRWLockShared(&lk), !rw_tryrdlock(&lk)); }
+ void rd_unlock()
+ { IF_WIN(ReleaseSRWLockShared(&lk), rw_unlock(&lk)); }
+ bool wr_lock_try()
+ { return IF_WIN(TryAcquireSRWLockExclusive(&lk), !rw_trywrlock(&lk)); }
+ void wr_unlock()
+ { IF_WIN(ReleaseSRWLockExclusive(&lk), rw_unlock(&lk)); }
+#ifdef _WIN32
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept { return (size_t&)(lk) != 0; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept { return is_locked_or_waiting(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept
+ {
+ // FIXME: this returns false positives for shared locks
+ return is_locked();
+ }
+
+ void lock_shared() { rd_lock(); }
+ void unlock_shared() { rd_unlock(); }
+ void lock() { wr_lock(); }
+ void unlock() { wr_unlock(); }
+#endif
+};
+
+template<> void srw_lock_<true>::rd_wait();
+template<> void srw_lock_<true>::wr_wait();
+
+template<>
+inline void srw_lock_<false>::rd_lock()
+{ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk)); }
+template<>
+inline void srw_lock_<false>::wr_lock()
+{ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk)); }
+
+template<>
+inline void srw_lock_<true>::rd_lock() { if (!rd_lock_try()) rd_wait(); }
+template<>
+inline void srw_lock_<true>::wr_lock() { if (!wr_lock_try()) wr_wait(); }
+
+typedef srw_lock_<false> srw_lock_low;
+typedef srw_lock_<true> srw_spin_lock_low;
+#else
+typedef ssux_lock_impl<false> srw_lock_low;
+typedef ssux_lock_impl<true> srw_spin_lock_low;
+#endif
+
+#ifndef UNIV_PFS_RWLOCK
+# define SRW_LOCK_INIT(key) init()
+# define SRW_LOCK_ARGS(file, line) /* nothing */
+# define SRW_LOCK_CALL /* nothing */
+typedef srw_lock_low srw_lock;
+typedef srw_spin_lock_low srw_spin_lock;
+#else
+# define SRW_LOCK_INIT(key) init(key)
+# define SRW_LOCK_ARGS(file, line) file, line
+# define SRW_LOCK_CALL __FILE__, __LINE__
+
+/** Slim shared-update-exclusive lock with PERFORMANCE_SCHEMA instrumentation */
+class ssux_lock
+{
+ PSI_rwlock *pfs_psi;
+ ssux_lock_impl<false> lock;
+
+ ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_u_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_u_wr_upgrade(const char *file, unsigned line);
+public:
+ void init(mysql_pfs_key_t key)
+ {
+ pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+ lock.init();
+ }
+ void destroy()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ {
+ PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+ pfs_psi= nullptr;
+ }
+ lock.destroy();
+ }
+ void rd_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_rd_lock(file, line);
+ else
+ lock.rd_lock();
+ }
+ void rd_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.rd_unlock();
+ }
+ void u_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_u_lock(file, line);
+ else
+ lock.u_lock();
+ }
+ void u_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.u_unlock();
+ }
+ void wr_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_wr_lock(file, line);
+ else
+ lock.wr_lock();
+ }
+ void wr_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.wr_unlock();
+ }
+ void u_wr_upgrade(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_u_wr_upgrade(file, line);
+ else
+ lock.u_wr_upgrade();
+ }
+ bool rd_lock_try() { return lock.rd_lock_try(); }
+ bool u_lock_try() { return lock.u_lock_try(); }
+ bool wr_lock_try() { return lock.wr_lock_try(); }
+ bool is_waiting() const { return lock.is_waiting(); }
+};
+
+/** Slim reader-writer lock with PERFORMANCE_SCHEMA instrumentation */
+template<bool spinloop>
+class srw_lock_impl
+{
+ PSI_rwlock *pfs_psi;
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+ srw_lock_<spinloop> lock;
+# else
+ ssux_lock_impl<spinloop> lock;
+# endif
+
+ ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+public:
+ void init(mysql_pfs_key_t key)
+ {
+ pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+ lock.init();
+ }
+ void destroy()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ {
+ PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+ pfs_psi= nullptr;
+ }
+ lock.destroy();
+ }
+ void rd_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_rd_lock(file, line);
+ else
+ lock.rd_lock();
+ }
+ void rd_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.rd_unlock();
+ }
+ void wr_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_wr_lock(file, line);
+ else
+ lock.wr_lock();
+ }
+ void wr_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.wr_unlock();
+ }
+ bool rd_lock_try() { return lock.rd_lock_try(); }
+ bool wr_lock_try() { return lock.wr_lock_try(); }
+ void lock_shared() { return rd_lock(SRW_LOCK_CALL); }
+ void unlock_shared() { return rd_unlock(); }
+#ifndef SUX_LOCK_GENERIC
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return lock.is_locked_or_waiting(); }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept { return lock.is_locked(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept { return lock.is_write_locked(); }
+#endif
+};
+
+typedef srw_lock_impl<false> srw_lock;
+typedef srw_lock_impl<true> srw_spin_lock;
+
+#endif
diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h
new file mode 100644
index 00000000000..2c0167ac651
--- /dev/null
+++ b/storage/innobase/include/sux_lock.h
@@ -0,0 +1,472 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "srw_lock.h"
+#include "my_atomic_wrapper.h"
+#ifdef UNIV_DEBUG
+# include <unordered_set>
+#endif
+
+/** A "fat" rw-lock that supports
+S (shared), U (update, or shared-exclusive), and X (exclusive) modes
+as well as recursive U and X latch acquisition
+@tparam ssux ssux_lock_impl or ssux_lock */
+template<typename ssux>
+class sux_lock final
+{
+ /** The underlying non-recursive lock */
+ ssux lock;
+ /** Numbers of U and X locks. Protected by lock. */
+ uint32_t recursive;
+ /** The owner of the U or X lock (0 if none); protected by lock */
+ std::atomic<pthread_t> writer;
+ /** Special writer!=0 value to indicate that the lock is non-recursive
+ and will be released by an I/O thread */
+#if defined __linux__ || defined _WIN32
+ static constexpr pthread_t FOR_IO= pthread_t(~0UL);
+#else
+# define FOR_IO ((pthread_t) ~0UL) /* it could be a pointer */
+#endif
+#ifdef UNIV_DEBUG
+ /** Protects readers */
+ mutable srw_mutex readers_lock;
+ /** Threads that hold the lock in shared mode */
+ std::atomic<std::unordered_multiset<pthread_t>*> readers;
+#endif
+
+ /** The multiplier in recursive for X locks */
+ static constexpr uint32_t RECURSIVE_X= 1U;
+ /** The multiplier in recursive for U locks */
+ static constexpr uint32_t RECURSIVE_U= 1U << 16;
+ /** The maximum allowed level of recursion */
+ static constexpr uint32_t RECURSIVE_MAX= RECURSIVE_U - 1;
+
+public:
+#ifdef UNIV_PFS_RWLOCK
+ inline void init();
+#endif
+ void SRW_LOCK_INIT(mysql_pfs_key_t key)
+ {
+ lock.SRW_LOCK_INIT(key);
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+ ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+ if (auto r= readers.load(std::memory_order_relaxed))
+ ut_ad(r->empty());
+#endif
+ }
+
+ /** Free the rw-lock after init() */
+ void free()
+ {
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+#ifdef UNIV_DEBUG
+ readers_lock.destroy();
+ if (auto r= readers.load(std::memory_order_relaxed))
+ {
+ ut_ad(r->empty());
+ delete r;
+ readers.store(nullptr, std::memory_order_relaxed);
+ }
+#endif
+ lock.destroy();
+ }
+
+ /** needed for dict_index_t::clone() */
+ inline void operator=(const sux_lock&);
+
+#ifdef UNIV_DEBUG
+ /** @return whether no recursive locks are being held */
+ bool not_recursive() const
+ {
+ ut_ad(recursive);
+ return recursive == RECURSIVE_X || recursive == RECURSIVE_U;
+ }
+
+ /** @return the number of X locks being held (by any thread) */
+ unsigned x_lock_count() const { return recursive & RECURSIVE_MAX; }
+#endif
+
+ /** Acquire a recursive lock */
+ template<bool allow_readers> void writer_recurse()
+ {
+ ut_ad(writer == pthread_self());
+ ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+ RECURSIVE_MAX);
+ ut_ad(allow_readers ? recursive : rec);
+ ut_ad(rec < RECURSIVE_MAX);
+ recursive+= allow_readers ? RECURSIVE_U : RECURSIVE_X;
+ }
+
+private:
+ /** Transfer the ownership of a write lock to another thread
+ @param id the new owner of the U or X lock */
+ void set_new_owner(pthread_t id)
+ {
+ IF_DBUG(DBUG_ASSERT(writer.exchange(id, std::memory_order_relaxed)),
+ writer.store(id, std::memory_order_relaxed));
+ }
+ /** Assign the ownership of a write lock to a thread
+ @param id the owner of the U or X lock */
+ void set_first_owner(pthread_t id)
+ {
+ IF_DBUG(DBUG_ASSERT(!writer.exchange(id, std::memory_order_relaxed)),
+ writer.store(id, std::memory_order_relaxed));
+ }
+#ifdef UNIV_DEBUG
+ /** Register the current thread as a holder of a shared lock */
+ void s_lock_register()
+ {
+ const pthread_t id= pthread_self();
+ readers_lock.wr_lock();
+ auto r= readers.load(std::memory_order_relaxed);
+ if (!r)
+ {
+ r= new std::unordered_multiset<pthread_t>();
+ readers.store(r, std::memory_order_relaxed);
+ }
+ r->emplace(id);
+ readers_lock.wr_unlock();
+ }
+#endif
+
+public:
+ /** In crash recovery or the change buffer, transfer the ownership
+ of the exclusive block lock to the current thread */
+ void claim_ownership() { set_new_owner(pthread_self()); }
+
+ /** @return whether the current thread is holding X or U latch */
+ bool have_u_or_x() const
+ {
+ if (pthread_self() != writer.load(std::memory_order_relaxed))
+ return false;
+ ut_ad(recursive);
+ return true;
+ }
+ /** @return whether the current thread is holding U but not X latch */
+ bool have_u_not_x() const
+ { return have_u_or_x() && !((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+ /** @return whether the current thread is holding X latch */
+ bool have_x() const
+ { return have_u_or_x() && ((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread is holding S latch */
+ bool have_s() const
+ {
+ if (auto r= readers.load(std::memory_order_relaxed))
+ {
+ readers_lock.wr_lock();
+ bool found= r->find(pthread_self()) != r->end();
+ readers_lock.wr_unlock();
+ return found;
+ }
+ return false;
+ }
+ /** @return whether the current thread is holding the latch */
+ bool have_any() const { return have_u_or_x() || have_s(); }
+#endif
+
+ /** Acquire a shared lock */
+ inline void s_lock();
+ inline void s_lock(const char *file, unsigned line);
+ /** Acquire an update lock */
+ inline void u_lock();
+ inline void u_lock(const char *file, unsigned line);
+ /** Acquire an exclusive lock */
+ inline void x_lock(bool for_io= false);
+ inline void x_lock(const char *file, unsigned line);
+ /** Acquire a recursive exclusive lock */
+ void x_lock_recursive() { writer_recurse<false>(); }
+ /** Upgrade an update lock */
+ inline void u_x_upgrade();
+ inline void u_x_upgrade(const char *file, unsigned line);
+ /** Downgrade a single exclusive lock to an update lock */
+ void x_u_downgrade()
+ {
+ ut_ad(have_u_or_x());
+ ut_ad(recursive <= RECURSIVE_MAX);
+ recursive*= RECURSIVE_U;
+ lock.wr_u_downgrade();
+ }
+
+ /** Acquire an exclusive lock or upgrade an update lock
+ @return whether U locks were upgraded to X */
+ inline bool x_lock_upgraded();
+
+ /** @return whether a shared lock was acquired */
+ bool s_lock_try()
+ {
+ bool acquired= lock.rd_lock_try();
+ ut_d(if (acquired) s_lock_register());
+ return acquired;
+ }
+
+ /** Try to acquire an update lock
+ @param for_io whether the lock will be released by another thread
+ @return whether the update lock was acquired */
+ inline bool u_lock_try(bool for_io);
+
+ /** Try to acquire an exclusive lock
+ @return whether an exclusive lock was acquired */
+ inline bool x_lock_try();
+
+ /** Release a shared lock */
+ void s_unlock()
+ {
+#ifdef UNIV_DEBUG
+ const pthread_t id= pthread_self();
+ auto r= readers.load(std::memory_order_relaxed);
+ ut_ad(r);
+ readers_lock.wr_lock();
+ auto i= r->find(id);
+ ut_ad(i != r->end());
+ r->erase(i);
+ readers_lock.wr_unlock();
+#endif
+ lock.rd_unlock();
+ }
+ /** Release an update or exclusive lock
+ @param allow_readers whether we are releasing a U lock
+ @param claim_ownership whether the lock was acquired by another thread */
+ void u_or_x_unlock(bool allow_readers, bool claim_ownership= false)
+ {
+ ut_d(auto owner= writer.load(std::memory_order_relaxed));
+ ut_ad(owner == pthread_self() ||
+ (owner == FOR_IO && claim_ownership &&
+ recursive == (allow_readers ? RECURSIVE_U : RECURSIVE_X)));
+ ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+ RECURSIVE_MAX);
+ ut_ad(rec);
+ if (!(recursive-= allow_readers ? RECURSIVE_U : RECURSIVE_X))
+ {
+ set_new_owner(0);
+ if (allow_readers)
+ lock.u_unlock();
+ else
+ lock.wr_unlock();
+ }
+ }
+ /** Release an update lock */
+ void u_unlock(bool claim_ownership= false)
+ { u_or_x_unlock(true, claim_ownership); }
+ /** Release an exclusive lock */
+ void x_unlock(bool claim_ownership= false)
+ { u_or_x_unlock(false, claim_ownership); }
+
+ /** @return whether any writer is waiting */
+ bool is_waiting() const { return lock.is_waiting(); }
+
+ bool is_write_locked() const { return lock.is_write_locked(); }
+
+ bool is_locked_or_waiting() const { return lock.is_locked_or_waiting(); }
+
+ inline void lock_shared();
+ inline void unlock_shared();
+};
+
+typedef sux_lock<ssux_lock_impl<true>> block_lock;
+
+#ifndef UNIV_PFS_RWLOCK
+typedef sux_lock<ssux_lock_impl<false>> index_lock;
+#else
+typedef sux_lock<ssux_lock> index_lock;
+
+template<> inline void sux_lock<ssux_lock_impl<true>>::init()
+{
+ lock.init();
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+ ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+ if (auto r= readers.load(std::memory_order_relaxed))
+ ut_ad(r->empty());
+#endif
+}
+
+template<>
+inline void sux_lock<ssux_lock>::s_lock(const char *file, unsigned line)
+{
+ ut_ad(!have_x());
+ ut_ad(!have_s());
+ lock.rd_lock(file, line);
+ ut_d(s_lock_register());
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_lock(const char *file, unsigned line)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<true>();
+ else
+ {
+ lock.u_lock(file, line);
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(id);
+ }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::x_lock(const char *file, unsigned line)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<false>();
+ else
+ {
+ lock.wr_lock(file, line);
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_x_upgrade(const char *file, unsigned line)
+{
+ ut_ad(have_u_not_x());
+ lock.u_wr_upgrade(file, line);
+ recursive/= RECURSIVE_U;
+}
+#endif
+
+/** needed for dict_index_t::clone() */
+template<> inline void index_lock::operator=(const sux_lock&)
+{
+ memset((void*) this, 0, sizeof *this);
+}
+
+template<typename ssux> inline void sux_lock<ssux>::s_lock()
+{
+ ut_ad(!have_x());
+ ut_ad(!have_s());
+ lock.rd_lock();
+ ut_d(s_lock_register());
+}
+
+template<typename ssux>
+inline void sux_lock<ssux>::lock_shared() { s_lock(); }
+template<typename ssux>
+inline void sux_lock<ssux>::unlock_shared() { s_unlock(); }
+
+template<typename ssux> inline void sux_lock<ssux>::u_lock()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<true>();
+ else
+ {
+ lock.u_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(id);
+ }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::x_lock(bool for_io)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ ut_ad(!for_io);
+ writer_recurse<false>();
+ }
+ else
+ {
+ lock.wr_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(for_io ? FOR_IO : id);
+ }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::u_x_upgrade()
+{
+ ut_ad(have_u_not_x());
+ lock.u_wr_upgrade();
+ recursive/= RECURSIVE_U;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_upgraded()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ ut_ad(recursive);
+ static_assert(RECURSIVE_X == 1, "compatibility");
+ if (recursive & RECURSIVE_MAX)
+ {
+ writer_recurse<false>();
+ return false;
+ }
+ /* Upgrade the lock. */
+ lock.u_wr_upgrade();
+ recursive/= RECURSIVE_U;
+ return true;
+ }
+ else
+ {
+ lock.wr_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ return false;
+ }
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::u_lock_try(bool for_io)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ if (for_io)
+ return false;
+ writer_recurse<true>();
+ return true;
+ }
+ if (lock.u_lock_try())
+ {
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(for_io ? FOR_IO : id);
+ return true;
+ }
+ return false;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_try()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ writer_recurse<false>();
+ return true;
+ }
+ if (lock.wr_lock_try())
+ {
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ return true;
+ }
+ return false;
+}
diff --git a/storage/innobase/include/transactional_lock_guard.h b/storage/innobase/include/transactional_lock_guard.h
new file mode 100644
index 00000000000..168a68977a7
--- /dev/null
+++ b/storage/innobase/include/transactional_lock_guard.h
@@ -0,0 +1,174 @@
+/*****************************************************************************
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+
+#if defined __powerpc64__
+#elif defined __s390__
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) && !defined(__clang__)
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# if __GNUC__ >= 8
+# elif defined __clang_major__ && __clang_major__ > 6
+# else
+# define NO_ELISION
+# endif
+#else /* Transactional memory has not been implemented for this ISA */
+# define NO_ELISION
+#endif
+
+#ifdef NO_ELISION
+constexpr bool have_transactional_memory= false;
+# ifdef UNIV_DEBUG
+static inline bool xtest() { return false; }
+# endif
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+#else
+# if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+
+# include <immintrin.h>
+# if defined __GNUC__ && !defined __INTEL_COMPILER
+# define TRANSACTIONAL_TARGET __attribute__((target("rtm"),hot))
+# define TRANSACTIONAL_INLINE __attribute__((target("rtm"),hot,always_inline))
+# else
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+# endif
+
+TRANSACTIONAL_INLINE static inline bool xbegin()
+{
+ return have_transactional_memory && _xbegin() == _XBEGIN_STARTED;
+}
+
+# ifdef UNIV_DEBUG
+# ifdef __GNUC__
+/** @return whether a memory transaction is active */
+bool xtest();
+# else
+static inline bool xtest() { return have_transactional_memory && _xtest(); }
+# endif
+# endif
+
+TRANSACTIONAL_INLINE static inline void xabort() { _xabort(0); }
+
+TRANSACTIONAL_INLINE static inline void xend() { _xend(); }
+# elif defined __powerpc64__ || defined __s390__
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+# define TRANSACTIONAL_TARGET __attribute__((hot))
+# define TRANSACTIONAL_INLINE __attribute__((hot,always_inline))
+
+/**
+ Newer gcc compilers only provide __builtin_{htm}
+ functions when the -mhtm CFLAG is actually provided. So
+ we've got the option of including it globally, or
+ pushing down the inclusion of htmxlintrin.h to one
+ file with -mhtm enabled and removing the inline
+ optimization.
+
+ Per FIXME in s390x's htmxlintrin.h, the __TM_simple_begin
+ isn't always_inline resulting in duplicate definitions if
+ it were included more than once. While xabort and xend
+ could be implemented here, we keep the implementation the
+ same as ppc64.
+ */
+TRANSACTIONAL_TARGET bool xbegin();
+TRANSACTIONAL_TARGET void xabort();
+TRANSACTIONAL_TARGET void xend();
+# ifdef UNIV_DEBUG
+bool xtest();
+# endif
+
+# endif
+#endif
+
+template<class mutex>
+class transactional_lock_guard
+{
+ mutex &m;
+
+public:
+ TRANSACTIONAL_INLINE transactional_lock_guard(mutex &m) : m(m)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ m.lock();
+ }
+ transactional_lock_guard(const transactional_lock_guard &)= delete;
+ TRANSACTIONAL_INLINE ~transactional_lock_guard()
+ {
+#ifndef NO_ELISION
+ if (was_elided()) xend(); else
+#endif
+ m.unlock();
+ }
+
+#ifndef NO_ELISION
+ bool was_elided() const noexcept { return !m.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+template<class mutex>
+class transactional_shared_lock_guard
+{
+ mutex &m;
+#ifndef NO_ELISION
+ bool elided;
+#else
+ static constexpr bool elided= false;
+#endif
+
+public:
+ TRANSACTIONAL_INLINE transactional_shared_lock_guard(mutex &m) : m(m)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (!m.is_write_locked())
+ {
+ elided= true;
+ return;
+ }
+ xabort();
+ }
+ elided= false;
+#endif
+ m.lock_shared();
+ }
+ transactional_shared_lock_guard(const transactional_shared_lock_guard &)=
+ delete;
+ TRANSACTIONAL_INLINE ~transactional_shared_lock_guard()
+ {
+#ifndef NO_ELISION
+ if (was_elided()) xend(); else
+#endif
+ m.unlock_shared();
+ }
+
+ bool was_elided() const noexcept { return elided; }
+};
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
index 40160ce4362..caacfa0972a 100644
--- a/storage/innobase/include/trx0i_s.h
+++ b/storage/innobase/include/trx0i_s.h
@@ -114,8 +114,7 @@ struct i_s_locks_row_t {
/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
struct i_s_trx_row_t {
trx_id_t trx_id; /*!< transaction identifier */
- const char* trx_state; /*!< transaction state from
- trx_get_que_state_str() */
+ const char* trx_state;
time_t trx_started; /*!< trx_t::start_time */
const i_s_locks_row_t* requested_lock_row;
/*!< pointer to a row
@@ -138,7 +137,7 @@ struct i_s_trx_row_t {
ulint trx_lock_memory_bytes;
/*!< mem_heap_get_size(
trx->lock_heap) */
- ulint trx_rows_locked;/*!< lock_number_of_rows_locked() */
+ ulint trx_rows_locked;/*!< trx_lock_t::n_rec_locks */
uintmax_t trx_rows_modified;/*!< trx_t::undo_no */
uint trx_isolation_level;
/*!< trx_t::isolation_level */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
index 14cf6a2958b..ac39d3ec45b 100644
--- a/storage/innobase/include/trx0purge.h
+++ b/storage/innobase/include/trx0purge.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,18 +24,14 @@ Purge old versions
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0purge_h
-#define trx0purge_h
+#pragma once
-#include "trx0rseg.h"
+#include "trx0sys.h"
#include "que0types.h"
+#include "srw_lock.h"
#include <queue>
-/** A dummy undo record used as a return value when we have a whole undo log
-which needs no purge */
-extern trx_undo_rec_t trx_purge_dummy_rec;
-
/** Prepend the history list with an undo log.
Remove the undo log segment from the rseg slot if it is too big for reuse.
@param[in] trx transaction
@@ -123,17 +119,26 @@ private:
class purge_sys_t
{
public:
- /** latch protecting view, m_enabled */
- MY_ALIGNED(CACHE_LINE_SIZE)
- mutable rw_lock_t latch;
+ /** latch protecting view, m_enabled */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch;
private:
- /** The purge will not remove undo logs which are >= this view */
- MY_ALIGNED(CACHE_LINE_SIZE)
- ReadViewBase view;
- /** whether purge is enabled; protected by latch and std::atomic */
- std::atomic<bool> m_enabled;
- /** number of pending stop() calls without resume() */
- Atomic_counter<int32_t> m_paused;
+ /** Read view at the start of a purge batch. Any encountered index records
+ that are older than view will be removed. */
+ ReadViewBase view;
+ /** whether purge is enabled; protected by latch and std::atomic */
+ std::atomic<bool> m_enabled;
+ /** number of pending stop() calls without resume() */
+ Atomic_counter<uint32_t> m_paused;
+ /** number of stop_SYS() calls without resume_SYS() */
+ Atomic_counter<uint32_t> m_SYS_paused;
+ /** number of stop_FTS() calls without resume_FTS() */
+ Atomic_counter<uint32_t> m_FTS_paused;
+
+ /** latch protecting end_view */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch;
+ /** Read view at the end of a purge batch (copied from view). Any undo pages
+ containing records older than end_view may be freed. */
+ ReadViewBase end_view;
public:
que_t* query; /*!< The query graph which will do the
parallelized purge operation */
@@ -184,7 +189,7 @@ public:
purge_pq_t purge_queue; /*!< Binary min-heap, ordered on
TrxUndoRsegs::trx_no. It is protected
by the pq_mutex */
- PQMutex pq_mutex; /*!< Mutex protecting purge_queue */
+ mysql_mutex_t pq_mutex; /*!< Mutex protecting purge_queue */
/** Undo tablespace file truncation (only accessed by the
srv_purge_coordinator_thread) */
@@ -235,34 +240,108 @@ public:
/** @return whether the purge tasks are active */
bool running() const;
- /** Stop purge during FLUSH TABLES FOR EXPORT */
+ /** Stop purge during FLUSH TABLES FOR EXPORT. */
void stop();
/** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
void resume();
- /** A wrapper around ReadView::changes_visible(). */
- bool changes_visible(trx_id_t id, const table_name_t &name) const
- {
- ut_ad(rw_lock_own(&latch, RW_LOCK_S));
- return view.changes_visible(id, name);
- }
+
+private:
+ void wait_SYS();
+ void wait_FTS();
+public:
+ /** Suspend purge in data dictionary tables */
+ void stop_SYS();
+ /** Resume purge in data dictionary tables */
+ static void resume_SYS(void *);
+ /** @return whether stop_SYS() is in effect */
+ bool must_wait_SYS() const { return m_SYS_paused; }
+ /** check stop_SYS() */
+ void check_stop_SYS() { if (must_wait_SYS()) wait_SYS(); }
+
+ /** Pause purge during a DDL operation that could drop FTS_ tables. */
+ void stop_FTS() { m_FTS_paused++; }
+ /** Resume purge after stop_FTS(). */
+ void resume_FTS() { ut_d(const auto p=) m_FTS_paused--; ut_ad(p); }
+ /** @return whether stop_FTS() is in effect */
+ bool must_wait_FTS() const { return m_FTS_paused; }
+ /** check stop_FTS() */
+ void check_stop_FTS() { if (must_wait_FTS()) wait_FTS(); }
+
+ /** Determine if the history of a transaction is purgeable.
+ @param trx_id transaction identifier
+ @return whether the history is purgeable */
+ TRANSACTIONAL_TARGET bool is_purgeable(trx_id_t trx_id) const;
+
/** A wrapper around ReadView::low_limit_no(). */
trx_id_t low_limit_no() const
{
-#if 0 /* Unfortunately we don't hold this assertion, see MDEV-22718. */
- ut_ad(rw_lock_own(&latch, RW_LOCK_S));
-#endif
+ /* This function may only be called by purge_coordinator_callback().
+
+ The purge coordinator task may call this without holding any latch,
+ because it is the only thread that may modify purge_sys.view.
+
+ Any other threads that access purge_sys.view must hold purge_sys.latch,
+ typically via purge_sys_t::view_guard. */
return view.low_limit_no();
}
/** A wrapper around trx_sys_t::clone_oldest_view(). */
+ template<bool also_end_view= false>
void clone_oldest_view()
{
- rw_lock_x_lock(&latch);
+ latch.wr_lock(SRW_LOCK_CALL);
trx_sys.clone_oldest_view(&view);
- rw_lock_x_unlock(&latch);
+ if (also_end_view)
+ (end_view= view).
+ clamp_low_limit_id(head.trx_no ? head.trx_no : tail.trx_no);
+ latch.wr_unlock();
}
+
+ /** Update end_view at the end of a purge batch. */
+ inline void clone_end_view();
+
+ struct view_guard
+ {
+ inline view_guard();
+ inline ~view_guard();
+
+ /** @return purge_sys.view */
+ inline const ReadViewBase &view() const;
+ };
+
+ struct end_view_guard
+ {
+ inline end_view_guard();
+ inline ~end_view_guard();
+
+ /** @return purge_sys.end_view */
+ inline const ReadViewBase &view() const;
+ };
+
+ /** Stop the purge thread and check n_ref_count of all auxiliary
+ and common table associated with the fts table.
+ @param table parent FTS table
+ @param already_stopped True indicates purge threads were
+ already stopped */
+ void stop_FTS(const dict_table_t &table, bool already_stopped=false);
};
/** The global data structure coordinating a purge */
extern purge_sys_t purge_sys;
-#endif /* trx0purge_h */
+purge_sys_t::view_guard::view_guard()
+{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); }
+
+purge_sys_t::view_guard::~view_guard()
+{ purge_sys.latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::view_guard::view() const
+{ return purge_sys.view; }
+
+purge_sys_t::end_view_guard::end_view_guard()
+{ purge_sys.end_latch.rd_lock(); }
+
+purge_sys_t::end_view_guard::~end_view_guard()
+{ purge_sys.end_latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::end_view_guard::view() const
+{ return purge_sys.end_view; }
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
index 66b2220a457..58ec5ab1707 100644
--- a/storage/innobase/include/trx0rec.h
+++ b/storage/innobase/include/trx0rec.h
@@ -24,8 +24,7 @@ Transaction undo log record
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0rec_h
-#define trx0rec_h
+#pragma once
#include "trx0types.h"
#include "row0types.h"
@@ -37,29 +36,31 @@ Created 3/26/1996 Heikki Tuuri
/***********************************************************************//**
Copies the undo record to the heap.
-@return own: copy of undo log record */
-UNIV_INLINE
-trx_undo_rec_t*
-trx_undo_rec_copy(
-/*==============*/
- const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
- mem_heap_t* heap); /*!< in: heap where copied */
-/**********************************************************************//**
-Reads the undo log record type.
-@return record type */
-UNIV_INLINE
-ulint
-trx_undo_rec_get_type(
-/*==================*/
- const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+@param undo_rec record in an undo log page
+@param heap memory heap
+@return copy of undo_rec
+@retval nullptr if the undo log record is corrupted */
+inline trx_undo_rec_t* trx_undo_rec_copy(const trx_undo_rec_t *undo_rec,
+ mem_heap_t *heap)
+{
+ const size_t offset= ut_align_offset(undo_rec, srv_page_size);
+ const size_t end= mach_read_from_2(undo_rec);
+ if (end <= offset || end >= srv_page_size - FIL_PAGE_DATA_END)
+ return nullptr;
+ const size_t len= end - offset;
+ trx_undo_rec_t *rec= static_cast<trx_undo_rec_t*>
+ (mem_heap_dup(heap, undo_rec, len));
+ mach_write_to_2(rec, len);
+ return rec;
+}
+
/**********************************************************************//**
Reads the undo log record number.
@return undo no */
-UNIV_INLINE
-undo_no_t
-trx_undo_rec_get_undo_no(
-/*=====================*/
- const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+inline undo_no_t trx_undo_rec_get_undo_no(const trx_undo_rec_t *undo_rec)
+{
+ return mach_u64_read_much_compressed(undo_rec + 3);
+}
/**********************************************************************//**
Returns the start of the undo record data area. */
@@ -69,10 +70,10 @@ Returns the start of the undo record data area. */
/**********************************************************************//**
Reads from an undo log record the general parameters.
@return remaining part of undo log record after reading these values */
-byte*
+const byte*
trx_undo_rec_get_pars(
/*==================*/
- trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
ulint* type, /*!< out: undo record type:
TRX_UNDO_INSERT_REC, ... */
ulint* cmpl_info, /*!< out: compiler info, relevant only
@@ -82,13 +83,14 @@ trx_undo_rec_get_pars(
undo_no_t* undo_no, /*!< out: undo log record number */
table_id_t* table_id) /*!< out: table id */
MY_ATTRIBUTE((nonnull));
+
/*******************************************************************//**
Builds a row reference from an undo log record.
@return pointer to remaining part of undo record */
-byte*
+const byte*
trx_undo_rec_get_row_ref(
/*=====================*/
- byte* ptr, /*!< in: remaining part of a copy of an undo log
+ const byte* ptr, /*!< in: remaining part of a copy of an undo log
record, at the start of the row reference;
NOTE that this copy of the undo log record must
be preserved as long as the row reference is
@@ -96,8 +98,9 @@ trx_undo_rec_get_row_ref(
record! */
dict_index_t* index, /*!< in: clustered index */
const dtuple_t**ref, /*!< out, own: row reference */
- mem_heap_t* heap); /*!< in: memory heap from which the memory
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
needed is allocated */
+ MY_ATTRIBUTE((nonnull));
/**********************************************************************//**
Reads from an undo log update record the system field values of the old
version.
@@ -178,53 +181,59 @@ trx_undo_report_row_operation(
is being called purge view and we would like to get the purge record
even it is in the purge view (in normal case, it will return without
fetching the purge record */
-#define TRX_UNDO_PREV_IN_PURGE 0x1
+static constexpr ulint TRX_UNDO_PREV_IN_PURGE = 1;
/** This tells trx_undo_prev_version_build() to fetch the old value in
the undo log (which is the after image for an update) */
-#define TRX_UNDO_GET_OLD_V_VALUE 0x2
+static constexpr ulint TRX_UNDO_GET_OLD_V_VALUE = 2;
-/*******************************************************************//**
-Build a previous version of a clustered index record. The caller must
-hold a latch on the index page of the clustered index record.
-@retval true if previous version was built, or if it was an insert
-or the table has been rebuilt
-@retval false if the previous version is earlier than purge_view,
-which means that it may have been removed */
-bool
+/** indicate a call from row_vers_old_has_index_entry() */
+static constexpr ulint TRX_UNDO_CHECK_PURGEABILITY = 4;
+
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param rec version of a clustered index record
+@param index clustered index
+@param offsets rec_get_offsets(rec, index)
+@param heap memory heap from which the memory needed is
+ allocated
+@param old_vers previous version or NULL if rec is the
+ first inserted version, or if history data
+ has been deleted (an error), or if the purge
+ could have removed the version
+ though it has not yet done so
+@param v_heap memory heap used to create vrow
+ dtuple if it is not yet created. This heap
+ diffs from "heap" above in that it could be
+ prebuilt->old_vers_heap for selection
+@param vrow virtual column info, if any
+@param v_status status determine if it is going into this
+ function by purge thread or not.
+ And if we read "after image" of undo log
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing */
+dberr_t
trx_undo_prev_version_build(
-/*========================*/
- const rec_t* index_rec,/*!< in: clustered index record in the
- index tree */
- mtr_t* index_mtr,/*!< in: mtr which contains the latch to
- index_rec page and purge_view */
- const rec_t* rec, /*!< in: version of a clustered index record */
- dict_index_t* index, /*!< in: clustered index */
- rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
- mem_heap_t* heap, /*!< in: memory heap from which the memory
- needed is allocated */
- rec_t** old_vers,/*!< out, own: previous version, or NULL if
- rec is the first inserted version, or if
- history data has been deleted */
- mem_heap_t* v_heap, /* !< in: memory heap used to create vrow
- dtuple if it is not yet created. This heap
- diffs from "heap" above in that it could be
- prebuilt->old_vers_heap for selection */
- dtuple_t** vrow, /*!< out: virtual column info, if any */
+ const rec_t *rec,
+ dict_index_t *index,
+ rec_offs *offsets,
+ mem_heap_t *heap,
+ rec_t **old_vers,
+ mem_heap_t *v_heap,
+ dtuple_t **vrow,
ulint v_status);
- /*!< in: status determine if it is going
- into this function by purge thread or not.
- And if we read "after image" of undo log */
/** Read from an undo log record a non-virtual column value.
-@param[in,out] ptr pointer to remaining part of the undo record
-@param[in,out] field stored field
-@param[in,out] len length of the field, or UNIV_SQL_NULL
-@param[in,out] orig_len original length of the locally stored part
+@param ptr pointer to remaining part of the undo record
+@param field stored field
+@param len length of the field, or UNIV_SQL_NULL
+@param orig_len original length of the locally stored part
of an externally stored column, or 0
@return remaining part of undo log record after reading these values */
-byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
- uint32_t *len, uint32_t *orig_len);
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+ uint32_t *len, uint32_t *orig_len);
/** Read virtual column value from undo log
@param[in] table the table
@@ -261,9 +270,22 @@ trx_undo_read_v_idx(
compilation info multiplied by 16 is ORed to this value in an undo log
record */
-#define TRX_UNDO_RENAME_TABLE 9 /*!< RENAME TABLE */
-#define TRX_UNDO_INSERT_METADATA 10 /*!< insert a metadata
- pseudo-record for instant ALTER */
+/** Undo log records for DDL operations
+
+Note: special rollback and purge triggers exist for SYS_INDEXES records:
+@see dict_drop_index_tree() */
+enum trx_undo_ddl_type
+{
+ /** RENAME TABLE (logging the old table name).
+
+ Because SYS_TABLES has PRIMARY KEY(NAME), the row-level undo log records
+ for SYS_TABLES cannot be distinguished from DROP TABLE, CREATE TABLE. */
+ TRX_UNDO_RENAME_TABLE= 9,
+ /** insert a metadata pseudo-record for instant ALTER TABLE */
+ TRX_UNDO_INSERT_METADATA= 10
+};
+
+/* DML operations */
#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
record */
@@ -272,6 +294,13 @@ record */
fields of the record can change */
#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields
do not change */
+/** Bulk insert operation. It is written only when the table is
+under exclusive lock and the clustered index root page latch is being held,
+and the clustered index is empty. Rollback will empty the table and
+free the leaf segment of all indexes, re-create the new
+leaf segment and re-initialize the root page alone. */
+#define TRX_UNDO_EMPTY 15
+
#define TRX_UNDO_CMPL_INFO_MULT 16U /* compilation info is multiplied by
this and ORed to the type above */
#define TRX_UNDO_UPD_EXTERN 128U /* This bit can be ORed to type_cmpl
@@ -291,7 +320,3 @@ inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec)
mach_read_next_much_compressed(&rec);
return mach_read_next_much_compressed(&rec);
}
-
-#include "trx0rec.inl"
-
-#endif /* trx0rec_h */
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
index 6a562dcb425..9ef9ebe93b2 100644
--- a/storage/innobase/include/trx0roll.h
+++ b/storage/innobase/include/trx0roll.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -34,14 +34,6 @@ Created 3/26/1996 Heikki Tuuri
extern bool trx_rollback_is_active;
extern const trx_t* trx_roll_crash_recv_trx;
-/*******************************************************************//**
-Returns a transaction savepoint taken at this point in time.
-@return savepoint */
-trx_savept_t
-trx_savept_take(
-/*============*/
- trx_t* trx); /*!< in: transaction */
-
/** Report progress when rolling back a row of a recovered transaction. */
void trx_roll_report_progress();
/*******************************************************************//**
@@ -58,11 +50,8 @@ Rollback or clean up any incomplete transactions which were
encountered in crash recovery. If the transaction already was
committed, then we clean up a possible insert undo log. If the
transaction was not yet committed, then we roll it back.
-Note: this is done in a background thread.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(trx_rollback_all_recovered)(void*);
+Note: this is done in a background thread. */
+void trx_rollback_all_recovered(void*);
/*********************************************************************//**
Creates a rollback command node struct.
@return own: rollback node struct */
@@ -141,15 +130,7 @@ trx_release_savepoint_for_mysql(
trx_t* trx, /*!< in: transaction handle */
const char* savepoint_name) /*!< in: savepoint name */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*******************************************************************//**
-Frees savepoint structs starting from savep. */
-void
-trx_roll_savepoints_free(
-/*=====================*/
- trx_t* trx, /*!< in: transaction handle */
- trx_named_savept_t* savep); /*!< in: free all savepoints > this one;
- if this is NULL, free all savepoints
- of trx */
+
/** Rollback node states */
enum roll_node_state {
ROLL_NODE_NONE = 0, /*!< Unknown state */
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index 96655c7020f..1d95b7d2e7a 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,67 +24,28 @@ Rollback segment
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0rseg_h
-#define trx0rseg_h
-
-#include "trx0sys.h"
+#pragma once
+#include "trx0types.h"
#include "fut0lst.h"
-/** Gets a rollback segment header.
-@param[in] space space where placed
-@param[in] page_no page number of the header
-@param[in,out] mtr mini-transaction
-@return rollback segment header, page x-latched */
-UNIV_INLINE
-buf_block_t*
-trx_rsegf_get(fil_space_t* space, uint32_t page_no, mtr_t* mtr);
-
-/** Gets a newly created rollback segment header.
-@param[in] space space where placed
-@param[in] page_no page number of the header
-@param[in,out] mtr mini-transaction
-@return rollback segment header, page x-latched */
-UNIV_INLINE
-buf_block_t*
-trx_rsegf_get_new(
- ulint space,
- uint32_t page_no,
- mtr_t* mtr);
-
/** Create a rollback segment header.
-@param[in,out] space system, undo, or temporary tablespace
-@param[in] rseg_id rollback segment identifier
-@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID
-@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg)
-@param[in,out] mtr mini-transaction
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out] mtr mini-transaction
+@param[out] err error code
@return the created rollback segment
-@retval NULL on failure */
-buf_block_t*
-trx_rseg_header_create(
- fil_space_t* space,
- ulint rseg_id,
- trx_id_t max_trx_id,
- buf_block_t* sys_header,
- mtr_t* mtr);
+@retval nullptr on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+ trx_id_t max_trx_id, mtr_t *mtr,
+ dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Initialize or recover the rollback segments at startup. */
dberr_t trx_rseg_array_init();
-/** Free a rollback segment in memory. */
-void
-trx_rseg_mem_free(trx_rseg_t* rseg);
-
-/** Create a persistent rollback segment.
-@param[in] space_id system or undo tablespace id
-@return pointer to new rollback segment
-@retval NULL on failure */
-trx_rseg_t*
-trx_rseg_create(ulint space_id)
- MY_ATTRIBUTE((warn_unused_result));
-
/** Create the temporary rollback segments. */
-void
-trx_temp_rseg_create();
+dberr_t trx_temp_rseg_create(mtr_t *mtr);
/* Number of undo log slots in a rollback segment file copy */
#define TRX_RSEG_N_SLOTS (srv_page_size / 16)
@@ -93,34 +54,117 @@ trx_temp_rseg_create();
#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
/** The rollback segment memory object */
-struct trx_rseg_t {
- /*--------------------------------------------------------*/
- /** rollback segment id == the index of its slot in the trx
- system file copy */
- ulint id;
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
+{
+ /** tablespace containing the rollback segment; constant after init() */
+ fil_space_t *space;
+ /** latch protecting everything except page_no, space */
+ srw_spin_lock latch;
+ /** rollback segment header page number; constant after init() */
+ uint32_t page_no;
+ /** length of the TRX_RSEG_HISTORY list (number of transactions) */
+ uint32_t history_size;
- /** mutex protecting the fields in this struct except id,space,page_no
- which are constant */
- RsegMutex mutex;
+ /** Last known transaction that has not been purged yet,
+ or 0 if everything has been purged. */
+ trx_id_t needs_purge;
- /** space where the rollback segment header is placed */
- fil_space_t* space;
+private:
+ /** Reference counter to track is_persistent() transactions,
+ with SKIP flag. */
+ std::atomic<uint32_t> ref;
- /** page number of the rollback segment header */
- uint32_t page_no;
+ /** Whether undo tablespace truncation is pending */
+ static constexpr uint32_t SKIP= 1;
+ /** Transaction reference count multiplier */
+ static constexpr uint32_t REF= 2;
- /** current size in pages */
- uint32_t curr_size;
+ uint32_t ref_load() const { return ref.load(std::memory_order_relaxed); }
- /*--------------------------------------------------------*/
- /* Fields for undo logs */
- /** List of undo logs */
- UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+ /** Set the SKIP bit */
+ void ref_set_skip()
+ {
+ static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__ __volatile__("lock btsl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+ ref.fetch_or(SKIP, std::memory_order_relaxed);
+#endif
+ }
+ /** Clear the SKIP bit in ref */
+ void ref_reset_skip()
+ {
+ static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__ __volatile__("lock btrl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+ ref.fetch_and(~SKIP, std::memory_order_relaxed);
+#endif
+ }
+
+public:
- /** List of undo log segments cached for fast reuse */
- UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
+ /** Initialize the fields that are not zero-initialized. */
+ void init(fil_space_t *space, uint32_t page);
+ /** Reinitialize the fields on undo tablespace truncation. */
+ void reinit(uint32_t page);
+ /** Clean up. */
+ void destroy();
- /*--------------------------------------------------------*/
+ /** Note that undo tablespace truncation was started. */
+ void set_skip_allocation() { ut_ad(is_persistent()); ref_set_skip(); }
+ /** Note that undo tablespace truncation was completed. */
+ void clear_skip_allocation()
+ {
+ ut_ad(is_persistent());
+#if defined DBUG_OFF
+ ref_reset_skip();
+#else
+ ut_d(auto r=) ref.fetch_and(~SKIP, std::memory_order_relaxed);
+ ut_ad(r == SKIP);
+#endif
+ }
+ /** @return whether the segment is marked for undo truncation */
+ bool skip_allocation() const
+ { return ref.load(std::memory_order_acquire) & SKIP; }
+ /** Increment the reference count */
+ void acquire()
+ { ut_d(auto r=) ref.fetch_add(REF); ut_ad(!(r & SKIP)); }
+ /** Increment the reference count if possible
+ @retval true if the reference count was incremented
+ @retval false if skip_allocation() holds */
+ bool acquire_if_available()
+ {
+ uint32_t r= 0;
+ while (!ref.compare_exchange_weak(r, r + REF,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed))
+ if (r & SKIP)
+ return false;
+ return true;
+ }
+
+ /** Decrement the reference count */
+ void release()
+ {
+ ut_d(const auto r=)
+ ref.fetch_sub(REF, std::memory_order_relaxed);
+ ut_ad(r >= REF);
+ }
+ /** @return whether references exist */
+ bool is_referenced() const { return ref_load() >= REF; }
+
+ /** current size in pages */
+ uint32_t curr_size;
+
+ /** List of undo logs (transactions) */
+ UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+ /** List of undo log segments cached for fast reuse */
+ UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
/** Last not yet purged undo log header; FIL_NULL if all purged */
uint32_t last_page_no;
@@ -128,20 +172,6 @@ struct trx_rseg_t {
/** trx_t::no | last_offset << 48 */
uint64_t last_commit_and_offset;
- /** Last known transaction that has not been purged yet,
- or 0 if everything has been purged. */
- trx_id_t needs_purge;
-
- /** Number of active (non-committed) transactions associated with a
- an is_persistent() rollback segment. Needed for protecting
- trx->rsegs.m_redo.rseg assignments
- before trx->rsegs.m_redo.undo has been assigned. */
- ulint trx_ref_count;
-
- /** whether undo log truncation was initiated, and transactions
- cannot be allocated in this is_persistent() rollback segment */
- bool skip_allocation;
-
/** @return the commit ID of the last committed transaction */
trx_id_t last_trx_no() const
{ return last_commit_and_offset & ((1ULL << 48) - 1); }
@@ -154,24 +184,27 @@ struct trx_rseg_t {
last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
}
- /** @return whether the rollback segment is persistent */
- bool is_persistent() const
- {
- ut_ad(space == fil_system.temp_space
- || space == fil_system.sys_space
- || (srv_undo_space_id_start > 0
- && space->id >= srv_undo_space_id_start
- && space->id <= srv_undo_space_id_start
- + TRX_SYS_MAX_UNDO_SPACES));
- ut_ad(space == fil_system.temp_space
- || space == fil_system.sys_space
- || (srv_undo_space_id_start > 0
- && space->id >= srv_undo_space_id_start
- && space->id <= srv_undo_space_id_start
- + srv_undo_tablespaces_open)
- || !srv_was_started);
- return(space->id != SRV_TMP_SPACE_ID);
- }
+ /** @return the page identifier */
+ page_id_t page_id() const { return page_id_t{space->id, page_no}; }
+
+ /** @return the rollback segment header page, exclusively latched */
+ buf_block_t *get(mtr_t *mtr, dberr_t *err) const;
+
+ /** @return whether the rollback segment is persistent */
+ bool is_persistent() const
+ {
+ ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+ (srv_undo_space_id_start > 0 &&
+ space->id >= srv_undo_space_id_start &&
+ space->id <= srv_undo_space_id_start + TRX_SYS_MAX_UNDO_SPACES));
+ ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+ !srv_was_started ||
+ (srv_undo_space_id_start > 0 &&
+ space->id >= srv_undo_space_id_start
+ && space->id <= srv_undo_space_id_start +
+ srv_undo_tablespaces_open));
+ return space->id != SRV_TMP_SPACE_ID;
+ }
};
/* Undo log segment slot in a rollback segment header */
@@ -212,32 +245,8 @@ If no binlog information is present, the first byte is NUL. */
#define TRX_RSEG_BINLOG_NAME_LEN 512
#ifdef WITH_WSREP
-/** The offset to WSREP XID headers */
-#define TRX_RSEG_WSREP_XID_INFO TRX_RSEG_MAX_TRX_ID + 16 + 512
-
-/** WSREP XID format (1 if present and valid, 0 if not present) */
-#define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO
-/** WSREP XID GTRID length */
-#define TRX_RSEG_WSREP_XID_GTRID_LEN TRX_RSEG_WSREP_XID_INFO + 4
-/** WSREP XID bqual length */
-#define TRX_RSEG_WSREP_XID_BQUAL_LEN TRX_RSEG_WSREP_XID_INFO + 8
-/** WSREP XID data (XIDDATASIZE bytes) */
-#define TRX_RSEG_WSREP_XID_DATA TRX_RSEG_WSREP_XID_INFO + 12
-#endif /* WITH_WSREP*/
-
-/*-------------------------------------------------------------*/
+# include "trx0xa.h"
-/** Read the page number of an undo log slot.
-@param[in] rseg_header rollback segment header
-@param[in] n slot number */
-inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n)
-{
- ut_ad(n < TRX_RSEG_N_SLOTS);
- return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
- n * TRX_RSEG_SLOT_SIZE + rseg_header->frame);
-}
-
-#ifdef WITH_WSREP
/** Update the WSREP XID information in rollback segment header.
@param[in,out] rseg_header rollback segment header
@param[in] xid WSREP XID
@@ -263,6 +272,16 @@ void trx_rseg_update_wsrep_checkpoint(const XID* xid);
bool trx_rseg_read_wsrep_checkpoint(XID& xid);
#endif /* WITH_WSREP */
+/** Read the page number of an undo log slot.
+@param[in] rseg_header rollback segment header
+@param[in] n slot number */
+inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n)
+{
+ ut_ad(n < TRX_RSEG_N_SLOTS);
+ return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ n * TRX_RSEG_SLOT_SIZE + rseg_header->page.frame);
+}
+
/** Upgrade a rollback segment header page to MariaDB 10.3 format.
@param[in,out] rseg_header rollback segment header page
@param[in,out] mtr mini-transaction */
@@ -277,7 +296,3 @@ up to which replication has proceeded.
@param[in,out] mtr mini-transaction */
void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
mtr_t *mtr);
-
-#include "trx0rseg.inl"
-
-#endif
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index e033a3e1fe4..245b981974b 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -24,24 +24,23 @@ Transaction system
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0sys_h
-#define trx0sys_h
-
+#pragma once
#include "buf0buf.h"
#include "fil0fil.h"
-#include "trx0types.h"
+#include "trx0rseg.h"
#include "mem0mem.h"
#include "mtr0mtr.h"
#include "ut0byte.h"
#include "ut0lst.h"
#include "read0types.h"
#include "page0types.h"
-#include "ut0mutex.h"
#include "trx0trx.h"
-#ifdef WITH_WSREP
-#include "trx0xa.h"
-#endif /* WITH_WSREP */
#include "ilist.h"
+#include "my_cpu.h"
+
+#ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t trx_sys_mutex_key;
+#endif
/** Checks if a page address is the trx sys header page.
@param[in] page_id page id
@@ -53,9 +52,8 @@ inline bool trx_sys_hdr_page(const page_id_t page_id)
/*****************************************************************//**
Creates and initializes the transaction system at the database creation. */
-void
-trx_sys_create_sys_pages(void);
-/*==========================*/
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr);
+
/** Find an available rollback segment.
@param[in] sys_header
@return an unallocated rollback segment slot in the TRX_SYS header
@@ -68,10 +66,8 @@ trx_sys_rseg_find_free(const buf_block_t* sys_header);
@retval NULL if the page cannot be read */
inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
{
- buf_block_t* block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
- 0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
- ut_d(if (block) buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);)
- return block;
+ return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+ 0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
}
#ifdef UNIV_DEBUG
@@ -134,9 +130,6 @@ trx_sys_print_mysql_binlog_offset();
bool
trx_sys_create_rsegs();
-/** The automatically created system rollback segment has this id */
-#define TRX_SYS_SYSTEM_RSEG_ID 0
-
/** The offset of the transaction system header on the page */
#define TRX_SYS FSEG_PAGE_DATA
@@ -156,13 +149,6 @@ from older MySQL or MariaDB versions. */
/*!< the start of the array of
rollback segment specification
slots */
-/*------------------------------------------------------------- @} */
-
-/** The number of rollback segments; rollback segment id must fit in
-the 7 bits reserved for it in DB_ROLL_PTR. */
-#define TRX_SYS_N_RSEGS 128
-/** Maximum number of undo tablespaces (not counting the system tablespace) */
-#define TRX_SYS_MAX_UNDO_SPACES (TRX_SYS_N_RSEGS - 1)
/* Rollback segment specification slot offsets */
@@ -185,7 +171,7 @@ trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
ut_ad(rseg_id < TRX_SYS_N_RSEGS);
return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
- + sys_header->frame);
+ + sys_header->page.frame);
}
/** Read the page number of a rollback segment slot.
@@ -198,7 +184,7 @@ trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
ut_ad(rseg_id < TRX_SYS_N_RSEGS);
return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
rseg_id * TRX_SYS_RSEG_SLOT_SIZE +
- sys_header->frame);
+ sys_header->page.frame);
}
/** Maximum length of MySQL binlog file name, in bytes.
@@ -344,16 +330,14 @@ trx_t* current_trx();
struct rw_trx_hash_element_t
{
- rw_trx_hash_element_t(): trx(0)
+ rw_trx_hash_element_t()
{
- mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex);
+ memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+ mutex.init();
}
- ~rw_trx_hash_element_t()
- {
- mutex_free(&mutex);
- }
+ ~rw_trx_hash_element_t() { mutex.destroy(); }
trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
@@ -366,7 +350,7 @@ struct rw_trx_hash_element_t
*/
Atomic_counter<trx_id_t> no;
trx_t *trx;
- ib_mutex_t mutex;
+ srw_mutex mutex;
};
@@ -515,12 +499,12 @@ class rw_trx_hash_t
ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
ut_ad(!trx->is_autocommit_non_locking());
/* trx->state can be anything except TRX_STATE_NOT_STARTED */
- mutex_enter(&trx->mutex);
+ ut_d(trx->mutex_lock());
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
trx_state_eq(trx, TRX_STATE_PREPARED));
- mutex_exit(&trx->mutex);
+ ut_d(trx->mutex_unlock());
}
@@ -535,10 +519,11 @@ class rw_trx_hash_t
static my_bool debug_iterator(rw_trx_hash_element_t *element,
debug_iterator_arg<T> *arg)
{
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (element->trx)
validate_element(element->trx);
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
+ ut_ad(element->id < element->no);
return arg->action(element, arg->argument);
}
#endif
@@ -591,10 +576,10 @@ public:
the transaction may get committed before this method returns.
With do_ref_count == false the caller may dereference returned trx pointer
- only if lock_sys.mutex was acquired before calling find().
+ only if lock_sys.latch was acquired before calling find().
With do_ref_count == true caller may dereference trx even if it is not
- holding lock_sys.mutex. Caller is responsible for calling
+ holding lock_sys.latch. Caller is responsible for calling
trx->release_reference() when it is done playing with trx.
Ideally this method should get caller rw_trx_hash_pins along with trx
@@ -640,7 +625,7 @@ public:
sizeof(trx_id_t)));
if (element)
{
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
lf_hash_search_unpin(pins);
if ((trx= element->trx)) {
DBUG_ASSERT(trx_id == trx->id);
@@ -655,16 +640,13 @@ public:
trx->mutex is released, and it will have to be rechecked
by the caller after reacquiring the mutex.
*/
- trx_mutex_enter(trx);
- const trx_state_t state= trx->state;
- trx_mutex_exit(trx);
- if (state == TRX_STATE_COMMITTED_IN_MEMORY)
- trx= NULL;
+ if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
+ trx= nullptr;
else
trx->reference();
}
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
}
if (!caller_trx)
lf_hash_put_pins(pins);
@@ -698,9 +680,9 @@ public:
void erase(trx_t *trx)
{
ut_d(validate_element(trx));
- mutex_enter(&trx->rw_trx_hash_element->mutex);
- trx->rw_trx_hash_element->trx= 0;
- mutex_exit(&trx->rw_trx_hash_element->mutex);
+ trx->rw_trx_hash_element->mutex.wr_lock();
+ trx->rw_trx_hash_element->trx= nullptr;
+ trx->rw_trx_hash_element->mutex.wr_unlock();
int res= lf_hash_delete(&hash, get_pins(trx),
reinterpret_cast<const void*>(&trx->id),
sizeof(trx_id_t));
@@ -734,12 +716,12 @@ public:
May return element with committed transaction. If caller doesn't like to
see committed transactions, it has to skip those under element mutex:
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (trx_t trx= element->trx)
{
// trx is protected against commit in this branch
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
May miss concurrently inserted transactions.
@@ -800,53 +782,53 @@ public:
class thread_safe_trx_ilist_t
{
public:
- void create() { mutex_create(LATCH_ID_TRX_SYS, &mutex); }
- void close() { mutex_free(&mutex); }
+ void create() { mysql_mutex_init(trx_sys_mutex_key, &mutex, nullptr); }
+ void close() { mysql_mutex_destroy(&mutex); }
bool empty() const
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
auto result= trx_list.empty();
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
return result;
}
void push_front(trx_t &trx)
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
trx_list.push_front(trx);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
void remove(trx_t &trx)
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
trx_list.remove(trx);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
template <typename Callable> void for_each(Callable &&callback) const
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
for (const auto &trx : trx_list)
callback(trx);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
template <typename Callable> void for_each(Callable &&callback)
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
for (auto &trx : trx_list)
callback(trx);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
- void freeze() const { mutex_enter(&mutex); }
- void unfreeze() const { mutex_exit(&mutex); }
+ void freeze() const { mysql_mutex_lock(&mutex); }
+ void unfreeze() const { mysql_mutex_unlock(&mutex); }
private:
- alignas(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;
- alignas(CACHE_LINE_SIZE) ilist<trx_t> trx_list;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable mysql_mutex_t mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) ilist<trx_t> trx_list;
};
/** The transaction system central memory data structure. */
@@ -856,7 +838,7 @@ class trx_sys_t
The smallest number not yet assigned as a transaction id or transaction
number. Accessed and updated with atomic operations.
*/
- MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<trx_id_t> m_max_trx_id;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_counter<trx_id_t> m_max_trx_id;
/**
@@ -867,39 +849,28 @@ class trx_sys_t
@sa assign_new_trx_no()
@sa snapshot_ids()
*/
- MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<trx_id_t> m_rw_trx_hash_version;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ std::atomic<trx_id_t> m_rw_trx_hash_version;
bool m_initialised;
public:
- /**
- TRX_RSEG_HISTORY list length (number of committed transactions to purge)
- */
- MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<size_t> rseg_history_len;
-
/** List of all transactions. */
thread_safe_trx_ilist_t trx_list;
- MY_ALIGNED(CACHE_LINE_SIZE)
- /** Temporary rollback segments */
- trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS];
+ /** Temporary rollback segments */
+ trx_rseg_t temp_rsegs[TRX_SYS_N_RSEGS];
- MY_ALIGNED(CACHE_LINE_SIZE)
- trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
- /*!< Pointer array to rollback
- segments; NULL if slot not in use;
- created and destroyed in
- single-threaded mode; not protected
- by any mutex, because it is read-only
- during multi-threaded operation */
+ /** Persistent rollback segments; space==nullptr if slot not in use */
+ trx_rseg_t rseg_array[TRX_SYS_N_RSEGS];
/**
Lock-free hash of in memory read-write transactions.
Works faster when it is on it's own cache line (tested).
*/
- MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) rw_trx_hash_t rw_trx_hash;
#ifdef WITH_WSREP
@@ -925,20 +896,47 @@ public:
/**
- Returns the minimum trx id in rw trx list.
+ @return TRX_RSEG_HISTORY length (number of committed transactions to purge)
+ */
+ size_t history_size();
+
+
+ /**
+ Check whether history_size() exceeds a specified number.
+ @param threshold number of committed transactions
+ @return whether TRX_RSEG_HISTORY length exceeds the threshold
+ */
+ bool history_exceeds(size_t threshold);
- This is the smallest id for which the trx can possibly be active. (But, you
- must look at the trx->state to find out if the minimum trx id transaction
- itself is active, or already committed.)
- @return the minimum trx id, or m_max_trx_id if the trx list is empty
+ /**
+ @return approximate history_size(), without latch protection
*/
+ TPOOL_SUPPRESS_TSAN size_t history_size_approx() const;
- trx_id_t get_min_trx_id()
+
+ /**
+ @return whether history_size() is nonzero (with some race condition)
+ */
+ TPOOL_SUPPRESS_TSAN bool history_exists();
+
+
+ /**
+ Determine if the specified transaction or any older one might be active.
+
+ @param trx current transaction
+ @param id transaction identifier
+ @return whether any transaction not newer than id might be active
+ */
+
+ bool find_same_or_older(trx_t *trx, trx_id_t id)
{
- trx_id_t id= get_max_trx_id();
- rw_trx_hash.iterate(get_min_trx_id_callback, &id);
- return id;
+ if (trx->max_inactive_id >= id)
+ return false;
+ bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id);
+ if (!found)
+ trx->max_inactive_id= id;
+ return found;
}
@@ -1045,7 +1043,7 @@ public:
}
- bool is_initialised() { return m_initialised; }
+ bool is_initialised() const { return m_initialised; }
/** Initialise the transaction subsystem. */
@@ -1059,6 +1057,22 @@ public:
/**
+ Determine the rollback segment identifier.
+
+ @param rseg rollback segment
+ @param persistent whether the rollback segment is persistent
+ @return the rollback segment identifier
+ */
+ unsigned rseg_id(const trx_rseg_t *rseg, bool persistent) const
+ {
+ const trx_rseg_t *array= persistent ? rseg_array : temp_rsegs;
+ ut_ad(rseg >= array);
+ ut_ad(rseg < &array[TRX_SYS_N_RSEGS]);
+ return static_cast<unsigned>(rseg - array);
+ }
+
+
+ /**
Registers read-write transaction.
Transaction becomes visible to MVCC.
@@ -1157,18 +1171,10 @@ public:
}
private:
- static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element,
- trx_id_t *id)
+ static my_bool find_same_or_older_callback(rw_trx_hash_element_t *element,
+ trx_id_t *id)
{
- if (element->id < *id)
- {
- mutex_enter(&element->mutex);
- /* We don't care about read-only transactions here. */
- if (element->trx && element->trx->rsegs.m_redo.rseg)
- *id= element->id;
- mutex_exit(&element->mutex);
- }
- return 0;
+ return element->id <= *id;
}
@@ -1231,5 +1237,3 @@ private:
/** The transaction system */
extern trx_sys_t trx_sys;
-
-#endif
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index ce3eca7593f..5b2b2264a46 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,7 +38,6 @@ Created 3/26/1996 Heikki Tuuri
#include "ilist.h"
#include <vector>
-#include <set>
// Forward declaration
struct mtr_t;
@@ -96,18 +95,11 @@ trx_start_if_not_started_low(
trx_t* trx, /*!< in/out: transaction */
bool read_write); /*!< in: true if read write transaction */
-/*************************************************************//**
-Starts a transaction for internal processing. */
-void
-trx_start_internal_low(
-/*===================*/
- trx_t* trx); /*!< in/out: transaction */
-
-/** Starts a read-only transaction for internal processing.
-@param[in,out] trx transaction to be started */
-void
-trx_start_internal_read_only_low(
- trx_t* trx);
+/**
+Start a transaction for internal processing.
+@param trx transaction
+@param read_write whether writes may be performed */
+void trx_start_internal_low(trx_t *trx, bool read_write);
#ifdef UNIV_DEBUG
#define trx_start_if_not_started_xa(t, rw) \
@@ -128,48 +120,39 @@ trx_start_internal_read_only_low(
do { \
(t)->start_line = __LINE__; \
(t)->start_file = __FILE__; \
- trx_start_internal_low((t)); \
+ trx_start_internal_low(t, true); \
} while (false)
-
#define trx_start_internal_read_only(t) \
do { \
(t)->start_line = __LINE__; \
(t)->start_file = __FILE__; \
- trx_start_internal_read_only_low(t); \
+ trx_start_internal_low(t, false); \
} while (false)
#else
#define trx_start_if_not_started(t, rw) \
trx_start_if_not_started_low((t), rw)
-#define trx_start_internal(t) \
- trx_start_internal_low((t))
-
-#define trx_start_internal_read_only(t) \
- trx_start_internal_read_only_low(t)
+#define trx_start_internal(t) trx_start_internal_low(t, true)
+#define trx_start_internal_read_only(t) trx_start_internal_low(t, false)
#define trx_start_if_not_started_xa(t, rw) \
trx_start_if_not_started_xa_low((t), (rw))
#endif /* UNIV_DEBUG */
-/*************************************************************//**
-Starts the transaction for a DDL operation. */
-void
-trx_start_for_ddl_low(
-/*==================*/
- trx_t* trx, /*!< in/out: transaction */
- trx_dict_op_t op); /*!< in: dictionary operation type */
+/** Start a transaction for a DDL operation.
+@param trx transaction */
+void trx_start_for_ddl_low(trx_t *trx);
#ifdef UNIV_DEBUG
-#define trx_start_for_ddl(t, o) \
+# define trx_start_for_ddl(t) \
do { \
ut_ad((t)->start_file == 0); \
(t)->start_line = __LINE__; \
(t)->start_file = __FILE__; \
- trx_start_for_ddl_low((t), (o)); \
+ trx_start_for_ddl_low(t); \
} while (0)
#else
-#define trx_start_for_ddl(t, o) \
- trx_start_for_ddl_low((t), (o))
+# define trx_start_for_ddl(t) trx_start_for_ddl_low(t)
#endif /* UNIV_DEBUG */
/**********************************************************************//**
@@ -245,7 +228,7 @@ trx_print_low(
/*!< in: max query length to print,
or 0 to use the default max length */
ulint n_rec_locks,
- /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ /*!< in: trx->lock.n_rec_locks */
ulint n_trx_locks,
/*!< in: length of trx->lock.trx_locks */
ulint heap_size);
@@ -264,7 +247,7 @@ trx_print_latched(
/**********************************************************************//**
Prints info about a transaction.
-Acquires and releases lock_sys.mutex. */
+Acquires and releases lock_sys.latch. */
void
trx_print(
/*======*/
@@ -274,25 +257,6 @@ trx_print(
or 0 to use the default max length */
/**********************************************************************//**
-Determine if a transaction is a dictionary operation.
-@return dictionary operation mode */
-UNIV_INLINE
-enum trx_dict_op_t
-trx_get_dict_operation(
-/*===================*/
- const trx_t* trx) /*!< in: transaction */
- MY_ATTRIBUTE((warn_unused_result));
-/**********************************************************************//**
-Flag a transaction a dictionary operation. */
-UNIV_INLINE
-void
-trx_set_dict_operation(
-/*===================*/
- trx_t* trx, /*!< in/out: transaction */
- enum trx_dict_op_t op); /*!< in: operation, not
- TRX_DICT_OP_NONE */
-
-/**********************************************************************//**
Determines if a transaction is in the given state.
The caller must hold trx->mutex, or it must be the thread
that is serving a running transaction.
@@ -328,43 +292,6 @@ is estimated as the number of altered rows + the number of locked rows.
@return transaction weight */
#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks))
-/*******************************************************************//**
-Compares the "weight" (or size) of two transactions. Transactions that
-have edited non-transactional tables are considered heavier than ones
-that have not.
-@return true if weight(a) >= weight(b) */
-bool
-trx_weight_ge(
-/*==========*/
- const trx_t* a, /*!< in: the transaction to be compared */
- const trx_t* b); /*!< in: the transaction to be compared */
-/* Maximum length of a string that can be returned by
-trx_get_que_state_str(). */
-#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */
-
-/*******************************************************************//**
-Retrieves transaction's que state in a human readable string. The string
-should not be free()'d or modified.
-@return string in the data segment */
-UNIV_INLINE
-const char*
-trx_get_que_state_str(
-/*==================*/
- const trx_t* trx); /*!< in: transaction */
-
-/** Retreieves the transaction ID.
-In a given point in time it is guaranteed that IDs of the running
-transactions are unique. The values returned by this function for readonly
-transactions may be reused, so a subsequent RO transaction may get the same ID
-as a RO transaction that existed in the past. The values returned by this
-function should be used for printing purposes only.
-@param[in] trx transaction whose id to retrieve
-@return transaction id */
-UNIV_INLINE
-trx_id_t
-trx_get_id_for_print(
- const trx_t* trx);
-
/** Create the trx_t pool */
void
trx_pool_init();
@@ -395,95 +322,82 @@ from innodb_lock_wait_timeout via trx_t::mysql_thd.
typedef std::vector<ib_lock_t*, ut_allocator<ib_lock_t*> > lock_list;
-/*******************************************************************//**
-Latching protocol for trx_lock_t::que_state. trx_lock_t::que_state
-captures the state of the query thread during the execution of a query.
-This is different from a transaction state. The query state of a transaction
-can be updated asynchronously by other threads. The other threads can be
-system threads, like the timeout monitor thread or user threads executing
-other queries. Another thing to be mindful of is that there is a delay between
-when a query thread is put into LOCK_WAIT state and before it actually starts
-waiting. Between these two events it is possible that the query thread is
-granted the lock it was waiting for, which implies that the state can be changed
-asynchronously.
-
-All these operations take place within the context of locking. Therefore state
-changes within the locking code must acquire both the lock mutex and the
-trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or
-trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient
-to only acquire the trx->mutex.
-To query the state either of the mutexes is sufficient within the locking
-code and no mutex is required when the query thread is no longer waiting. */
-
/** The locks and state of an active transaction. Protected by
-lock_sys.mutex, trx->mutex or both. */
-struct trx_lock_t {
-#ifdef UNIV_DEBUG
- /** number of active query threads; at most 1, except for the
- dummy transaction in trx_purge() */
- ulint n_active_thrs;
-#endif
- trx_que_t que_state; /*!< valid when trx->state
- == TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
- TRX_QUE_LOCK_WAIT, ... */
-
- lock_t* wait_lock; /*!< if trx execution state is
- TRX_QUE_LOCK_WAIT, this points to
- the lock request, otherwise this is
- NULL; set to non-NULL when holding
- both trx->mutex and lock_sys.mutex;
- set to NULL when holding
- lock_sys.mutex; readers should
- hold lock_sys.mutex, except when
- they are holding trx->mutex and
- wait_lock==NULL */
- ib_uint64_t deadlock_mark; /*!< A mark field that is initialized
- to and checked against lock_mark_counter
- by lock_deadlock_recursive(). */
- bool was_chosen_as_deadlock_victim;
- /*!< when the transaction decides to
- wait for a lock, it sets this to false;
- if another transaction chooses this
- transaction as a victim in deadlock
- resolution, it sets this to true.
- Protected by trx->mutex. */
- time_t wait_started; /*!< lock wait started at this time,
- protected only by lock_sys.mutex */
+lock_sys.latch, trx->mutex or both. */
+struct trx_lock_t
+{
+ /** Lock request being waited for.
+ Set to nonnull when holding lock_sys.latch, lock_sys.wait_mutex and
+ trx->mutex, by the thread that is executing the transaction.
+ Set to nullptr when holding lock_sys.wait_mutex. */
+ Atomic_relaxed<lock_t*> wait_lock;
+ /** Transaction being waited for; protected by lock_sys.wait_mutex */
+ trx_t *wait_trx;
+ /** condition variable for !wait_lock; used with lock_sys.wait_mutex */
+ pthread_cond_t cond;
+ /** lock wait start time */
+ Atomic_relaxed<my_hrtime_t> suspend_time;
+
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+ /** 2=high priority WSREP thread has marked this trx to abort;
+ 1=another transaction chose this as a victim in deadlock resolution. */
+ Atomic_relaxed<byte> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim()
+ {
+# if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ /* There is no 8-bit version of the 80386 BTS instruction.
+ Technically, this is the wrong addressing mode (16-bit), but
+ there are other data members stored after the byte. */
+ __asm__ __volatile__("lock btsw $1, %0"
+ : "+m" (was_chosen_as_deadlock_victim));
+# else
+ was_chosen_as_deadlock_victim.fetch_or(2);
+# endif
+ }
+#else /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** High priority WSREP thread has marked this trx to abort or
+ another transaction chose this as a victim in deadlock resolution. */
+ Atomic_relaxed<bool> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim() {
+ was_chosen_as_deadlock_victim= true;
+ }
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** Next available rec_pool[] entry */
+ byte rec_cached;
+ /** Next available table_pool[] entry */
+ byte table_cached;
que_thr_t* wait_thr; /*!< query thread belonging to this
- trx that is in QUE_THR_LOCK_WAIT
+ trx that is in waiting
state. For threads suspended in a
lock wait, this is protected by
- lock_sys.mutex. Otherwise, this may
+ lock_sys.latch. Otherwise, this may
only be modified by the thread that is
serving the running transaction. */
-#ifdef WITH_WSREP
- bool was_chosen_as_wsrep_victim;
- /*!< high priority wsrep thread has
- marked this trx to abort */
-#endif /* WITH_WSREP */
-
- /** Pre-allocated record locks */
- struct {
- ib_lock_t lock; byte pad[256];
- } rec_pool[8];
- /** Pre-allocated table locks */
- ib_lock_t table_pool[8];
+ /** Pre-allocated record locks */
+ struct {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) ib_lock_t lock;
+ } rec_pool[8];
- /** Next available rec_pool[] entry */
- unsigned rec_cached;
+ /** Pre-allocated table locks */
+ ib_lock_t table_pool[8];
- /** Next available table_pool[] entry */
- unsigned table_cached;
+ /** Memory heap for trx_locks. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner(). */
+ mem_heap_t *lock_heap;
- mem_heap_t* lock_heap; /*!< memory heap for trx_locks;
- protected by lock_sys.mutex */
-
- trx_lock_list_t trx_locks; /*!< locks requested by the transaction;
- insertions are protected by trx->mutex
- and lock_sys.mutex; removals are
- protected by lock_sys.mutex */
+ /** Locks held by the transaction. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner().
+ (If lock_sys.latch is only held in shared mode, then the modification
+ must be protected by trx->mutex.) */
+ trx_lock_list_t trx_locks;
lock_list table_locks; /*!< All table locks requested by this
transaction, including AUTOINC locks */
@@ -491,75 +405,94 @@ struct trx_lock_t {
/** List of pending trx_t::evict_table() */
UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables;
- bool cancel; /*!< true if the transaction is being
- rolled back either via deadlock
- detection or due to lock timeout. The
- caller has to acquire the trx_t::mutex
- in order to cancel the locks. In
- lock_trx_table_locks_remove() we
- check for this cancel of a transaction's
- locks and avoid reacquiring the trx
- mutex to prevent recursive deadlocks.
- Protected by both the lock sys mutex
- and the trx_t::mutex. */
- ulint n_rec_locks; /*!< number of rec locks in this trx */
+ /** number of record locks; protected by lock_sys.assert_locked(page_id) */
+ ulint n_rec_locks;
};
/** Logical first modification time of a table in a transaction */
class trx_mod_table_time_t
{
- /** First modification of the table */
- undo_no_t first;
- /** First modification of a system versioned column */
- undo_no_t first_versioned;
-
- /** Magic value signifying that a system versioned column of a
- table was never modified in a transaction. */
- static const undo_no_t UNVERSIONED = IB_ID_MAX;
-
+ /** Impossible value for trx_t::undo_no */
+ static constexpr undo_no_t NONE= ~undo_no_t{0};
+ /** Theoretical maximum value for trx_t::undo_no.
+ DB_ROLL_PTR is only 7 bytes, so it cannot point to more than
+ this many undo log records. */
+ static constexpr undo_no_t LIMIT= (undo_no_t{1} << (7 * 8)) - 1;
+
+ /** Flag in 'first' to indicate that subsequent operations are
+ covered by a TRX_UNDO_EMPTY record (for the first statement to
+ insert into an empty table) */
+ static constexpr undo_no_t BULK= 1ULL << 63;
+
+ /** First modification of the table, possibly ORed with BULK */
+ undo_no_t first;
+ /** First modification of a system versioned column
+ (NONE= no versioning, BULK= the table was dropped) */
+ undo_no_t first_versioned= NONE;
+#ifdef UNIV_DEBUG
+ /** Whether the modified table is a FTS auxiliary table */
+ bool fts_aux_table= false;
+#endif /* UNIV_DEBUG */
public:
- /** Constructor
- @param[in] rows number of modified rows so far */
- trx_mod_table_time_t(undo_no_t rows)
- : first(rows), first_versioned(UNVERSIONED) {}
+ /** Constructor
+ @param rows number of modified rows so far */
+ trx_mod_table_time_t(undo_no_t rows) : first(rows) { ut_ad(rows < LIMIT); }
#ifdef UNIV_DEBUG
- /** Validation
- @param[in] rows number of modified rows so far
- @return whether the object is valid */
- bool valid(undo_no_t rows = UNVERSIONED) const
- {
- return first <= first_versioned && first <= rows;
- }
+ /** Validation
+ @param rows number of modified rows so far
+ @return whether the object is valid */
+ bool valid(undo_no_t rows= NONE) const
+ { auto f= first & LIMIT; return f <= first_versioned && f <= rows; }
#endif /* UNIV_DEBUG */
- /** @return if versioned columns were modified */
- bool is_versioned() const { return first_versioned != UNVERSIONED; }
+ /** @return if versioned columns were modified */
+ bool is_versioned() const { return (~first_versioned & LIMIT) != 0; }
+ /** @return if the table was dropped */
+ bool is_dropped() const { return first_versioned == BULK; }
+
+ /** After writing an undo log record, set is_versioned() if needed
+ @param rows number of modified rows so far */
+ void set_versioned(undo_no_t rows)
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= rows;
+ ut_ad(valid(rows));
+ }
- /** After writing an undo log record, set is_versioned() if needed
- @param[in] rows number of modified rows so far */
- void set_versioned(undo_no_t rows)
- {
- ut_ad(!is_versioned());
- first_versioned = rows;
- ut_ad(valid());
- }
+ /** After writing an undo log record, note that the table will be dropped */
+ void set_dropped()
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= BULK;
+ }
- /** Invoked after partial rollback
- @param[in] limit number of surviving modified rows
- @return whether this should be erased from trx_t::mod_tables */
- bool rollback(undo_no_t limit)
- {
- ut_ad(valid());
- if (first >= limit) {
- return true;
- }
+ /** Notify the start of a bulk insert operation */
+ void start_bulk_insert() { first|= BULK; }
- if (first_versioned < limit && is_versioned()) {
- first_versioned = UNVERSIONED;
- }
+ /** Notify the end of a bulk insert operation */
+ void end_bulk_insert() { first&= ~BULK; }
- return false;
- }
+ /** @return whether an insert is covered by TRX_UNDO_EMPTY record */
+ bool is_bulk_insert() const { return first & BULK; }
+
+ /** Invoked after partial rollback
+ @param limit number of surviving modified rows (trx_t::undo_no)
+ @return whether this should be erased from trx_t::mod_tables */
+ bool rollback(undo_no_t limit)
+ {
+ ut_ad(valid());
+ if ((LIMIT & first) >= limit)
+ return true;
+ if (first_versioned < limit)
+ first_versioned= NONE;
+ return false;
+ }
+
+#ifdef UNIV_DEBUG
+ void set_aux_table() { fts_aux_table= true; }
+
+ bool is_aux_table() const { return fts_aux_table; }
+#endif /* UNIV_DEBUG */
};
/** Collection of persistent tables and their first modification
@@ -593,7 +526,7 @@ no longer be associated with a session when the server is restarted.
A session may be served by at most one thread at a time. The serving
thread of a session might change in some MySQL implementations.
-Therefore we do not have os_thread_get_curr_id() assertions in the code.
+Therefore we do not have pthread_self() assertions in the code.
Normally, only the thread that is currently associated with a running
transaction may access (read and modify) the trx object, and it may do
@@ -604,7 +537,7 @@ transactions (state == TRX_STATE_ACTIVE && is_recovered)
while the system is already processing new user transactions (!is_recovered).
* trx_print_low() may access transactions not associated with the current
-thread. The caller must be holding lock_sys.mutex.
+thread. The caller must be holding lock_sys.latch.
* When a transaction handle is in the trx_sys.trx_list, some of its fields
must not be modified without holding trx->mutex.
@@ -612,7 +545,7 @@ must not be modified without holding trx->mutex.
* The locking code (in particular, lock_deadlock_recursive() and
lock_rec_convert_impl_to_expl()) will access transactions associated
to other connections. The locks of transactions are protected by
-lock_sys.mutex (insertions also by trx->mutex). */
+lock_sys.latch (insertions also by trx->mutex). */
/** Represents an instance of rollback segment along with its state variables.*/
struct trx_undo_ptr_t {
@@ -643,7 +576,8 @@ struct trx_rsegs_t {
trx_temp_undo_t m_noredo;
};
-struct trx_t : ilist_node<> {
+struct trx_t : ilist_node<>
+{
private:
/**
Least significant 31 bits is count of references.
@@ -658,96 +592,139 @@ private:
we don't want to get blocked on GAP locks taken for protecting
concurrent unique insert or replace operation.
*/
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
Atomic_relaxed<uint32_t> skip_lock_inheritance_and_n_ref;
public:
- TrxMutex mutex; /*!< Mutex protecting the fields
- state and lock (except some fields
- of lock, which are protected by
- lock_sys.mutex) */
+ /** Transaction identifier (0 if no locks were acquired).
+ Set by trx_sys_t::register_rw() or trx_resurrect() before
+ the transaction is added to trx_sys.rw_trx_hash.
+ Cleared in commit_in_memory() after commit_state(),
+ trx_sys_t::deregister_rw(), release_locks(). */
+ trx_id_t id;
+ /** The largest encountered transaction identifier for which no
+ transaction was observed to be active. This is a cache to speed up
+ trx_sys_t::find_same_or_older(). */
+ trx_id_t max_inactive_id;
+
+private:
+ /** mutex protecting state and some of lock
+ (some are protected by lock_sys.latch) */
+ srw_spin_mutex mutex;
+#ifdef UNIV_DEBUG
+ /** The owner of mutex (0 if none); protected by mutex */
+ std::atomic<pthread_t> mutex_owner{0};
+#endif /* UNIV_DEBUG */
+public:
+ void mutex_init() { mutex.init(); }
+ void mutex_destroy() { mutex.destroy(); }
+
+ /** Acquire the mutex */
+ void mutex_lock()
+ {
+ ut_ad(!mutex_is_owner());
+ mutex.wr_lock();
+ ut_ad(!mutex_owner.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release the mutex */
+ void mutex_unlock()
+ {
+ ut_ad(mutex_owner.exchange(0, std::memory_order_relaxed)
+ == pthread_self());
+ mutex.wr_unlock();
+ }
+#ifndef SUX_LOCK_GENERIC
+ bool mutex_is_locked() const noexcept { return mutex.is_locked(); }
+#endif
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread holds the mutex */
+ bool mutex_is_owner() const
+ {
+ return mutex_owner.load(std::memory_order_relaxed) ==
+ pthread_self();
+ }
+#endif /* UNIV_DEBUG */
+
+ /** State of the trx from the point of view of concurrency control
+ and the valid state transitions.
- trx_id_t id; /*!< transaction id */
+ Possible states:
- /** State of the trx from the point of view of concurrency control
- and the valid state transitions.
+ TRX_STATE_NOT_STARTED
+ TRX_STATE_ACTIVE
+ TRX_STATE_PREPARED
+ TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
+ TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
- Possible states:
+ Valid state transitions are:
- TRX_STATE_NOT_STARTED
- TRX_STATE_ACTIVE
- TRX_STATE_PREPARED
- TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
- TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+ Regular transactions:
+ * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
- Valid state transitions are:
+ Auto-commit non-locking read-only:
+ * NOT_STARTED -> ACTIVE -> NOT_STARTED
- Regular transactions:
- * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+ XA (2PC):
+ * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
- Auto-commit non-locking read-only:
- * NOT_STARTED -> ACTIVE -> NOT_STARTED
+ Recovered XA:
+ * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
- XA (2PC):
- * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+ Recovered XA followed by XA ROLLBACK:
+ * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
- Recovered XA:
- * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+ XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
+ * NOT_STARTED -> PREPARED -> (freed)
- Recovered XA followed by XA ROLLBACK:
- * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
+ Disconnected XA PREPARE transaction can become recovered:
+ * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
- XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
- * NOT_STARTED -> PREPARED -> (freed)
+ Latching and various transaction lists membership rules:
- Disconnected XA can become recovered:
- * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
- Disconnected means from mysql e.g due to the mysql client disconnection.
- Latching and various transaction lists membership rules:
+ XA (2PC) transactions are always treated as non-autocommit.
- XA (2PC) transactions are always treated as non-autocommit.
+ Transitions to ACTIVE or NOT_STARTED occur when transaction
+ is not in rw_trx_hash.
- Transitions to ACTIVE or NOT_STARTED occur when transaction
- is not in rw_trx_hash.
+ Autocommit non-locking read-only transactions move between states
+ without holding any mutex. They are not in rw_trx_hash.
- Autocommit non-locking read-only transactions move between states
- without holding any mutex. They are not in rw_trx_hash.
+ All transactions, unless they are determined to be ac-nl-ro,
+ explicitly tagged as read-only or read-write, will first be put
+ on the read-only transaction list. Only when a !read-only transaction
+ in the read-only list tries to acquire an X or IX lock on a table
+ do we remove it from the read-only list and put it on the read-write
+ list. During this switch we assign it a rollback segment.
- All transactions, unless they are determined to be ac-nl-ro,
- explicitly tagged as read-only or read-write, will first be put
- on the read-only transaction list. Only when a !read-only transaction
- in the read-only list tries to acquire an X or IX lock on a table
- do we remove it from the read-only list and put it on the read-write
- list. During this switch we assign it a rollback segment.
+ When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
+ in rw_trx_hash.
- When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
- in rw_trx_hash.
+ ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
+ The transition ACTIVE->PREPARED is protected by trx->mutex.
- ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
- The transition ACTIVE->PREPARED is protected by trx->mutex.
+ ACTIVE->COMMITTED is possible when the transaction is in
+ rw_trx_hash.
- ACTIVE->COMMITTED is possible when the transaction is in
- rw_trx_hash.
+ Transitions to COMMITTED are protected by trx_t::mutex. */
+ Atomic_relaxed<trx_state_t> state;
+
+ /** The locks of the transaction. Protected by lock_sys.latch
+ (insertions also by trx_t::mutex). */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_lock_t lock;
- Transitions to COMMITTED are protected by trx_t::mutex. */
- trx_state_t state;
#ifdef WITH_WSREP
- /** whether wsrep_on(mysql_thd) held at the start of transaction */
- bool wsrep;
- bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
- /** true, if BF thread is performing unique secondary index scanning */
- bool wsrep_UK_scan;
- bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep_UK_scan); }
+ /** whether wsrep_on(mysql_thd) held at the start of transaction */
+ byte wsrep;
+ bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
+ bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep & 2); }
#else /* WITH_WSREP */
- bool is_wsrep() const { return false; }
+ bool is_wsrep() const { return false; }
#endif /* WITH_WSREP */
- ReadView read_view; /*!< consistent read view used in the
- transaction, or NULL if not yet set */
- trx_lock_t lock; /*!< Information about the transaction
- locks and state. Protected by
- lock_sys.mutex (insertions also
- by trx_t::mutex). */
+ /** Consistent read view of the transaction */
+ ReadView read_view;
/* These fields are not protected by any mutex. */
@@ -767,6 +744,8 @@ public:
wants to suppress foreign key checks,
(in table imports, for example) we
set this FALSE */
+ /** whether an insert into an empty table is active */
+ bool bulk_insert;
/*------------------------------*/
/* MySQL has a transaction coordinator to coordinate two phase
commit between multiple storage engines and the binary log. When
@@ -800,13 +779,15 @@ public:
flush the log in
trx_commit_complete_for_mysql() */
ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
- trx_dict_op_t dict_operation; /**< @see enum trx_dict_op_t */
-
- ib_uint32_t dict_operation_lock_mode;
- /*!< 0, RW_S_LATCH, or RW_X_LATCH:
- the latch mode trx currently holds
- on dict_sys.latch. Protected
- by dict_sys.latch. */
+ /** whether this modifies InnoDB dictionary tables */
+ bool dict_operation;
+#ifdef UNIV_DEBUG
+ /** copy of dict_operation during commit() */
+ bool was_dict_operation;
+#endif
+ /** whether dict_sys.latch is held exclusively; protected by
+ dict_sys.latch */
+ bool dict_operation_lock_mode;
/** wall-clock time of the latest transition to TRX_STATE_ACTIVE;
used for diagnostic purposes only */
@@ -814,8 +795,6 @@ public:
/** microsecond_interval_timer() of transaction start */
ulonglong start_time_micro;
lsn_t commit_lsn; /*!< lsn at the time of the commit */
- table_id_t table_id; /*!< Table to drop iff dict_operation
- == TRX_DICT_OP_TABLE, or 0. */
/*------------------------------*/
THD* mysql_thd; /*!< MySQL thread handle corresponding
to this trx, or NULL */
@@ -886,7 +865,7 @@ public:
also in the lock list trx_locks. This
vector needs to be freed explicitly
when the trx instance is destroyed.
- Protected by lock_sys.mutex. */
+ Protected by lock_sys.latch. */
/*------------------------------*/
bool read_only; /*!< true if transaction is flagged
as a READ-ONLY transaction.
@@ -899,6 +878,10 @@ public:
bool auto_commit; /*!< true if it is an autocommit */
bool will_lock; /*!< set to inform trx_start_low() that
the transaction may acquire locks */
+	/* true if the transaction has to read the undo log and
+	log the DML changes for tables undergoing online DDL */
+ bool apply_online_log = false;
+
/*------------------------------*/
fts_trx_t* fts_trx; /*!< FTS information, or NULL if
transaction hasn't modified tables
@@ -909,20 +892,12 @@ public:
count of tables being flushed. */
/*------------------------------*/
- bool ddl; /*!< true if it is an internal
- transaction for DDL */
- bool internal; /*!< true if it is a system/internal
- transaction background task. This
- includes DDL transactions too. Such
- transactions are always treated as
- read-write. */
- /*------------------------------*/
#ifdef UNIV_DEBUG
unsigned start_line; /*!< Track where it was started from */
const char* start_file; /*!< Filename where it was started */
#endif /* UNIV_DEBUG */
- XID* xid; /*!< X/Open XA transaction
+ XID xid; /*!< X/Open XA transaction
identification to identify a
transaction branch */
trx_mod_tables_t mod_tables; /*!< List of tables that were modified
@@ -964,8 +939,9 @@ public:
inline void release_locks();
/** Evict a table definition due to the rollback of ALTER TABLE.
- @param[in] table_id table identifier */
- void evict_table(table_id_t table_id);
+ @param table_id table identifier
+ @param reset_only whether to only reset dict_table_t::def_trx_id */
+ void evict_table(table_id_t table_id, bool reset_only= false);
/** Initiate rollback.
@param savept savepoint to which to roll back
@@ -979,8 +955,17 @@ public:
@retval false if the rollback was aborted by shutdown */
inline bool rollback_finish();
private:
- /** Mark a transaction committed in the main memory data structures. */
+ /** Apply any changes to tables for which online DDL is in progress. */
+ ATTRIBUTE_COLD void apply_log();
+ /** Process tables that were modified by the committing transaction. */
+ inline void commit_tables();
+ /** Mark a transaction committed in the main memory data structures.
+ @param mtr mini-transaction (if there are any persistent modifications) */
inline void commit_in_memory(const mtr_t *mtr);
+ /** Write log for committing the transaction. */
+ void commit_persist();
+ /** Clean up the transaction after commit_in_memory() */
+ void commit_cleanup();
/** Commit the transaction in a mini-transaction.
@param mtr mini-transaction (if there are any persistent modifications) */
void commit_low(mtr_t *mtr= nullptr);
@@ -988,11 +973,41 @@ public:
/** Commit the transaction. */
void commit();
+
+ /** Try to drop a persistent table.
+ @param table persistent table
+  @see drop_table_foreign() for dropping the FOREIGN KEY metadata
+ @return error code */
+ dberr_t drop_table(const dict_table_t &table);
+ /** Try to drop the foreign key constraints for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_foreign(const table_name_t &name);
+ /** Try to drop the statistics for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_statistics(const table_name_t &name);
+ /** Commit the transaction, possibly after drop_table().
+ @param deleted handles of data files that were deleted */
+ void commit(std::vector<pfs_os_file_t> &deleted);
+
+
+ /** Discard all savepoints */
+ void savepoints_discard()
+ { savepoints_discard(UT_LIST_GET_FIRST(trx_savepoints)); }
+
+
+ /** Discard all savepoints starting from a particular savepoint.
+ @param savept first savepoint to discard */
+ void savepoints_discard(trx_named_savept_t *savept);
+
+
bool is_referenced() const
{
return (skip_lock_inheritance_and_n_ref & ~(1U << 31)) > 0;
}
+
void reference()
{
ut_d(auto old_n_ref =)
@@ -1032,7 +1047,7 @@ public:
}
/** @return whether the table has lock on
- mysql.innodb_table_stats and mysql.innodb_index_stats */
+ mysql.innodb_table_stats or mysql.innodb_index_stats */
bool has_stats_table_lock() const;
/** Free the memory to trx_pools */
@@ -1043,25 +1058,64 @@ public:
{
ut_ad(state == TRX_STATE_NOT_STARTED);
ut_ad(!id);
+ ut_ad(!mutex_is_owner());
ut_ad(!has_logged());
ut_ad(!is_referenced());
ut_ad(!is_wsrep());
-#ifdef WITH_WSREP
- ut_ad(!lock.was_chosen_as_wsrep_victim);
-#endif
+ ut_ad(!lock.was_chosen_as_deadlock_victim);
+ ut_ad(mod_tables.empty());
ut_ad(!read_view.is_open());
ut_ad(!lock.wait_thr);
+ ut_ad(!lock.wait_lock);
ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
ut_ad(lock.table_locks.empty());
ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks));
ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
- ut_ad(dict_operation == TRX_DICT_OP_NONE);
+ ut_ad(!dict_operation);
+ ut_ad(!apply_online_log);
ut_ad(!is_not_inheriting_locks());
+ ut_ad(check_foreigns);
+ ut_ad(check_unique_secondary);
+ }
+
+ /** This has to be invoked on SAVEPOINT or at the end of a statement.
+ Even if a TRX_UNDO_EMPTY record was written for this table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work.
+ @param table table on which any preceding bulk insert ended */
+ void end_bulk_insert(const dict_table_t &table)
+ {
+ auto it= mod_tables.find(const_cast<dict_table_t*>(&table));
+ if (it != mod_tables.end())
+ it->second.end_bulk_insert();
}
/** @return whether this is a non-locking autocommit transaction */
bool is_autocommit_non_locking() const { return auto_commit && !will_lock; }
+ /** This has to be invoked on SAVEPOINT or at the start of a statement.
+ Even if TRX_UNDO_EMPTY records were written for any table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work. */
+ void end_bulk_insert()
+ {
+ for (auto& t : mod_tables)
+ t.second.end_bulk_insert();
+ }
+
+ /** @return whether a bulk insert into empty table is in progress */
+ bool is_bulk_insert() const
+ {
+ if (!bulk_insert || check_unique_secondary || check_foreigns)
+ return false;
+ for (const auto& t : mod_tables)
+ if (t.second.is_bulk_insert())
+ return true;
+ return false;
+ }
+
private:
/** Assign a rollback segment for modifying temporary tables.
@return the assigned rollback segment */
@@ -1134,19 +1188,6 @@ struct commit_node_t{
};
-/** Test if trx->mutex is owned. */
-#define trx_mutex_own(t) mutex_own(&t->mutex)
-
-/** Acquire the trx->mutex. */
-#define trx_mutex_enter(t) do { \
- mutex_enter(&t->mutex); \
-} while (0)
-
-/** Release the trx->mutex. */
-#define trx_mutex_exit(t) do { \
- mutex_exit(&t->mutex); \
-} while (0)
-
#include "trx0trx.inl"
#endif
diff --git a/storage/innobase/include/trx0trx.inl b/storage/innobase/include/trx0trx.inl
index 93c9591e0c2..b063c920e2f 100644
--- a/storage/innobase/include/trx0trx.inl
+++ b/storage/innobase/include/trx0trx.inl
@@ -84,123 +84,3 @@ trx_get_error_info(
{
return(trx->error_info);
}
-
-/*******************************************************************//**
-Retrieves transaction's que state in a human readable string. The string
-should not be free()'d or modified.
-@return string in the data segment */
-UNIV_INLINE
-const char*
-trx_get_que_state_str(
-/*==================*/
- const trx_t* trx) /*!< in: transaction */
-{
- /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
- switch (trx->lock.que_state) {
- case TRX_QUE_RUNNING:
- return("RUNNING");
- case TRX_QUE_LOCK_WAIT:
- return("LOCK WAIT");
- case TRX_QUE_ROLLING_BACK:
- return("ROLLING BACK");
- case TRX_QUE_COMMITTING:
- return("COMMITTING");
- default:
- return("UNKNOWN");
- }
-}
-
-/** Retreieves the transaction ID.
-In a given point in time it is guaranteed that IDs of the running
-transactions are unique. The values returned by this function for readonly
-transactions may be reused, so a subsequent RO transaction may get the same ID
-as a RO transaction that existed in the past. The values returned by this
-function should be used for printing purposes only.
-@param[in] trx transaction whose id to retrieve
-@return transaction id */
-UNIV_INLINE
-trx_id_t
-trx_get_id_for_print(
- const trx_t* trx)
-{
- /* Readonly and transactions whose intentions are unknown (whether
- they will eventually do a WRITE) don't have trx_t::id assigned (it is
- 0 for those transactions). Transaction IDs in
- innodb_trx.trx_id,
- innodb_locks.lock_id,
- innodb_locks.lock_trx_id,
- innodb_lock_waits.requesting_trx_id,
- innodb_lock_waits.blocking_trx_id should match because those tables
- could be used in an SQL JOIN on those columns. Also trx_t::id is
- printed by SHOW ENGINE INNODB STATUS, and in logs, so we must have the
- same value printed everywhere consistently. */
-
- /* DATA_TRX_ID_LEN is the storage size in bytes. */
- static const trx_id_t max_trx_id
- = (1ULL << (DATA_TRX_ID_LEN * CHAR_BIT)) - 1;
-
- ut_ad(trx->id <= max_trx_id);
-
- return(trx->id != 0
- ? trx->id
- : reinterpret_cast<trx_id_t>(trx) | (max_trx_id + 1));
-}
-
-/**********************************************************************//**
-Determine if a transaction is a dictionary operation.
-@return dictionary operation mode */
-UNIV_INLINE
-enum trx_dict_op_t
-trx_get_dict_operation(
-/*===================*/
- const trx_t* trx) /*!< in: transaction */
-{
- trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation);
-
-#ifdef UNIV_DEBUG
- switch (op) {
- case TRX_DICT_OP_NONE:
- case TRX_DICT_OP_TABLE:
- case TRX_DICT_OP_INDEX:
- return(op);
- }
- ut_error;
-#endif /* UNIV_DEBUG */
- return(op);
-}
-/**********************************************************************//**
-Flag a transaction a dictionary operation. */
-UNIV_INLINE
-void
-trx_set_dict_operation(
-/*===================*/
- trx_t* trx, /*!< in/out: transaction */
- enum trx_dict_op_t op) /*!< in: operation, not
- TRX_DICT_OP_NONE */
-{
-#ifdef UNIV_DEBUG
- enum trx_dict_op_t old_op = trx_get_dict_operation(trx);
-
- switch (op) {
- case TRX_DICT_OP_NONE:
- ut_error;
- break;
- case TRX_DICT_OP_TABLE:
- switch (old_op) {
- case TRX_DICT_OP_NONE:
- case TRX_DICT_OP_INDEX:
- case TRX_DICT_OP_TABLE:
- goto ok;
- }
- ut_error;
- break;
- case TRX_DICT_OP_INDEX:
- ut_ad(old_op == TRX_DICT_OP_NONE);
- break;
- }
-ok:
-#endif /* UNIV_DEBUG */
-
- trx->ddl = true;
- trx->dict_operation = op;
-}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
index 99a9c66c839..07c1c6a756b 100644
--- a/storage/innobase/include/trx0types.h
+++ b/storage/innobase/include/trx0types.h
@@ -24,11 +24,9 @@ Transaction system global type definitions
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0types_h
-#define trx0types_h
-
-#include "ut0byte.h"
-#include "ut0mutex.h"
+#pragma once
+#include "univ.i"
+#include "ut0new.h"
#include <vector>
@@ -50,15 +48,6 @@ static const ulint TRX_MAGIC_N = 91118598;
constexpr uint innodb_purge_threads_MAX= 32;
-/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */
-enum trx_que_t {
- TRX_QUE_RUNNING, /*!< transaction is running */
- TRX_QUE_LOCK_WAIT, /*!< transaction is waiting for
- a lock */
- TRX_QUE_ROLLING_BACK, /*!< transaction is rolling back */
- TRX_QUE_COMMITTING /*!< transaction is committing */
-};
-
/** Transaction states (trx_t::state) */
enum trx_state_t {
TRX_STATE_NOT_STARTED,
@@ -72,21 +61,6 @@ enum trx_state_t {
TRX_STATE_COMMITTED_IN_MEMORY
};
-/** Type of data dictionary operation */
-enum trx_dict_op_t {
- /** The transaction is not modifying the data dictionary. */
- TRX_DICT_OP_NONE = 0,
- /** The transaction is creating a table or an index, or
- dropping a table. The table must be dropped in crash
- recovery. This and TRX_DICT_OP_NONE are the only possible
- operation modes in crash recovery. */
- TRX_DICT_OP_TABLE = 1,
- /** The transaction is creating or dropping an index in an
- existing table. In crash recovery, the data dictionary
- must be locked, but the table must not be dropped. */
- TRX_DICT_OP_INDEX = 2
-};
-
/** Memory objects */
/* @{ */
/** Transaction */
@@ -133,10 +107,10 @@ typedef byte trx_undo_rec_t;
/* @} */
-typedef ib_mutex_t RsegMutex;
-typedef ib_mutex_t TrxMutex;
-typedef ib_mutex_t PQMutex;
-typedef ib_mutex_t TrxSysMutex;
-
typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t;
-#endif /* trx0types_h */
+
+/** The number of rollback segments; rollback segment id must fit in
+the 7 bits reserved for it in DB_ROLL_PTR. */
+static constexpr unsigned TRX_SYS_N_RSEGS= 128;
+/** Maximum number of undo tablespaces (not counting the system tablespace) */
+static constexpr unsigned TRX_SYS_MAX_UNDO_SPACES= TRX_SYS_N_RSEGS - 1;
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
index a4578d61fe2..3474a903f6c 100644
--- a/storage/innobase/include/trx0undo.h
+++ b/storage/innobase/include/trx0undo.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -96,22 +96,6 @@ inline roll_ptr_t trx_read_roll_ptr(const byte* ptr)
return mach_read_from_7(ptr);
}
-/** Gets an undo log page and x-latches it.
-@param[in] page_id page id
-@param[in,out] mtr mini-transaction
-@return pointer to page x-latched */
-UNIV_INLINE
-buf_block_t*
-trx_undo_page_get(const page_id_t page_id, mtr_t* mtr);
-
-/** Gets an undo log page and s-latches it.
-@param[in] page_id page id
-@param[in,out] mtr mini-transaction
-@return pointer to page s-latched */
-UNIV_INLINE
-buf_block_t*
-trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr);
-
/** Get the next record in an undo log.
@param[in] undo_page undo log page
@param[in] rec undo record offset in the page
@@ -140,8 +124,8 @@ trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
@param[in,out] mtr mini-transaction
@return undo log record, the page latched, NULL if none */
trx_undo_rec_t*
-trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
- uint16_t offset, mtr_t *mtr);
+trx_undo_get_next_rec(const buf_block_t *&block, uint16_t rec,
+ uint32_t page_no, uint16_t offset, mtr_t *mtr);
/** Get the first record in an undo log.
@param[in] space undo log header space
@@ -150,11 +134,13 @@ trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
@param[out] block undo log page
@param[in,out] mtr mini-transaction
-@return undo log record, the page latched, NULL if none */
+@param[out] err error code
+@return undo log record, the page latched
+@retval nullptr if none */
trx_undo_rec_t*
trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
- uint16_t offset, ulint mode, buf_block_t*& block,
- mtr_t *mtr);
+ uint16_t offset, ulint mode, const buf_block_t*& block,
+ mtr_t *mtr, dberr_t *err);
/** Initialize an undo log page.
NOTE: This corresponds to a redo log record and must not be changed!
@@ -165,24 +151,24 @@ void trx_undo_page_init(const buf_block_t &block);
/** Allocate an undo log page.
@param[in,out] undo undo log
@param[in,out] mtr mini-transaction that does not hold any page latch
+@param[out] err error code
@return X-latched block if success
-@retval NULL on failure */
-buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+@retval nullptr on failure */
+buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Free the last undo log page. The caller must hold the rseg mutex.
@param[in,out] undo undo log
@param[in,out] mtr mini-transaction that does not hold any undo log page
- or that has allocated the undo log page */
-void
-trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull));
+ or that has allocated the undo log page
+@return error code */
+dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
-/** Truncate the tail of an undo log during rollback.
-@param[in,out] undo undo log
-@param[in] limit all undo logs after this limit will be discarded
-@param[in] is_temp whether this is temporary undo log */
-void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp);
+/** Try to truncate the undo logs.
+@param trx transaction
+@return error code */
+dberr_t trx_undo_try_truncate(const trx_t &trx);
/** Truncate the head of an undo log.
NOTE that only whole pages are freed; the header page is not
@@ -191,13 +177,15 @@ freed, but emptied, if all the records there are below the limit.
@param[in] hdr_page_no header page number
@param[in] hdr_offset header offset on the page
@param[in] limit first undo number to preserve
-(everything below the limit will be truncated) */
-void
+(everything below the limit will be truncated)
+@return error code */
+dberr_t
trx_undo_truncate_start(
trx_rseg_t* rseg,
uint32_t hdr_page_no,
uint16_t hdr_offset,
- undo_no_t limit);
+ undo_no_t limit)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Mark that an undo log header belongs to a data dictionary transaction.
@param[in] trx dictionary transaction
@param[in,out] undo undo log
@@ -292,9 +280,7 @@ struct trx_undo_t {
log */
XID xid; /*!< X/Open XA transaction
identification */
- ibool dict_operation; /*!< TRUE if a dict operation trx */
- table_id_t table_id; /*!< if a dict operation, then the table
- id */
+	bool		dict_operation;	/*!< true if a dictionary operation transaction */
trx_rseg_t* rseg; /*!< rseg where the undo log belongs */
/*-----------------------------*/
uint32_t hdr_page_no; /*!< page number of the header page in
@@ -326,6 +312,106 @@ struct trx_undo_t {
/*!< undo log objects in the rollback
segment are chained into lists */
};
+
+/** Cache a pointer to an undo record in a latched buffer pool page,
+parse the undo log record and store the record type, update vector
+and compiler information */
+class UndorecApplier
+{
+ /** Undo log block page id */
+ page_id_t page_id;
+ /** Undo log record pointer */
+ const trx_undo_rec_t *undo_rec;
+ /** Offset of the undo log record within the block */
+ uint16_t offset;
+ /** Transaction id of the undo log */
+ const trx_id_t trx_id;
+ /** Undo log record type */
+ ulint type;
+ /** compiler information */
+ ulint cmpl_info;
+ /** Update vector */
+ upd_t *update;
+ /** memory heap which can be used to build previous version of
+ the index record and its offsets */
+ mem_heap_t *heap;
+ /** mini-transaction for accessing B-tree pages */
+ mtr_t mtr;
+
+public:
+ UndorecApplier(page_id_t page_id, trx_id_t trx_id) :
+ page_id(page_id), trx_id(trx_id), heap(mem_heap_create(100))
+ {
+ }
+
+ /** Assign the next page id */
+ void assign_next(const page_id_t next_page_id)
+ {
+ page_id= next_page_id;
+ }
+
+ /** Assign the undo log record and offset */
+ inline void assign_rec(const buf_block_t &block, uint16_t offset);
+
+ uint16_t get_offset() const { return offset; }
+
+ page_id_t get_page_id() const { return page_id; }
+
+ /** Handle the DML undo log and apply it on online indexes */
+ inline void apply_undo_rec();
+
+ ~UndorecApplier()
+ {
+ mem_heap_free(heap);
+ }
+
+private:
+ /** Handle the insert undo log and apply it on online indexes
+ @param tuple row reference from undo log record
+ @param clust_index clustered index */
+ void log_insert(const dtuple_t &tuple, dict_index_t *clust_index);
+
+ /** Handle the update, delete undo log and apply it on online
+ indexes.
+ @param tuple row reference from undo log record
+ @param clust_index clustered index */
+ void log_update(const dtuple_t &tuple, dict_index_t *clust_index);
+
+ /** Check whether the given roll pointer is generated by
+ the current undo log record information stored.
+ @return true if roll pointer matches with current undo log info */
+ bool is_same(roll_ptr_t roll_ptr) const
+ {
+ uint16_t offset= static_cast<uint16_t>(roll_ptr);
+ uint32_t page_no= static_cast<uint32_t>(roll_ptr >> 16);
+ return page_no == page_id.page_no() && offset == this->offset;
+ }
+
+ /** Clear the undo log record information */
+ void clear_undo_rec()
+ {
+ undo_rec= nullptr;
+ cmpl_info= 0;
+ type= 0;
+ update= nullptr;
+ mem_heap_empty(heap);
+ }
+
+ /** Get the correct version of the clustered index record that
+ was modified by the current undo log record. Because there could
+  be multiple successive updates of the same record within the
+ same transaction.
+ @param tuple tuple contains primary key value
+ @param index clustered index
+ @param[out] clust_rec current clustered index record
+ @param offsets offsets points to the record
+ @return clustered index record which was changed by
+ the undo log record or nullptr when there is no clustered
+ index record changed by undo log record */
+ const rec_t* get_old_rec(const dtuple_t &tuple, dict_index_t *index,
+ const rec_t **clust_rec, rec_offs **offsets);
+};
+
#endif /* !UNIV_INNOCHECKSUM */
/** The offset of the undo log page header on pages of the undo log */
diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl
index 43af932708e..9f05989f634 100644
--- a/storage/innobase/include/trx0undo.inl
+++ b/storage/innobase/include/trx0undo.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -95,35 +95,6 @@ trx_undo_trx_id_is_insert(
return bool(trx_id[DATA_TRX_ID_LEN] >> 7);
}
-/** Gets an undo log page and x-latches it.
-@param[in] page_id page id
-@param[in,out] mtr mini-transaction
-@return pointer to page x-latched */
-UNIV_INLINE
-buf_block_t*
-trx_undo_page_get(const page_id_t page_id, mtr_t* mtr)
-{
- buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, mtr);
-
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
- return block;
-}
-
-/** Gets an undo log page and s-latches it.
-@param[in] page_id page id
-@param[in,out] mtr mini-transaction
-@return pointer to page s-latched */
-UNIV_INLINE
-buf_block_t*
-trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr)
-{
- buf_block_t* block = buf_page_get(page_id, 0, RW_S_LATCH, mtr);
-
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
-
- return block;
-}
-
/** Determine the end offset of undo log records of an undo log page.
@param[in] undo_page undo log page
@param[in] page_no undo log header page number
@@ -135,11 +106,11 @@ uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no,
{
if (page_no == undo_page->page.id().page_no())
if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + offset +
- undo_page->frame))
+ undo_page->page.frame))
return end;
return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
- undo_page->frame);
+ undo_page->page.frame);
}
/** Get the next record in an undo log.
@@ -153,6 +124,6 @@ trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
uint32_t page_no, uint16_t offset)
{
uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset);
- uint16_t next= mach_read_from_2(undo_page->frame + rec);
- return next == end ? nullptr : undo_page->frame + next;
+ uint16_t next= mach_read_from_2(undo_page->page.frame + rec);
+ return next == end ? nullptr : undo_page->page.frame + next;
}
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index 7c1af230eaf..23eee89f857 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -31,8 +31,8 @@ Version control for database, common definitions, and include files
Created 1/20/1994 Heikki Tuuri
****************************************************************************/
-#ifndef univ_i
-#define univ_i
+#pragma once
+#define my_test_if_thinly_provisioned(f) 0
/* aux macros to convert M into "123" (string) if M is defined like
#define M 123 */
@@ -57,16 +57,6 @@ component, i.e. we show M.N.P as M.N */
(time in seconds) */
#define INNODB_EXTEND_TIMEOUT_INTERVAL 30
-#ifdef MYSQL_DYNAMIC_PLUGIN
-/* In the dynamic plugin, redefine some externally visible symbols
-in order not to conflict with the symbols of a builtin InnoDB. */
-
-/* Rename all C++ classes that contain virtual functions, because we
-have not figured out how to apply the visibility=hidden attribute to
-the virtual method table (vtable) in GCC 3. */
-# define ha_innobase ha_innodb
-#endif /* MYSQL_DYNAMIC_PLUGIN */
-
#if defined(_WIN32)
# include <windows.h>
#endif /* _WIN32 */
@@ -78,16 +68,9 @@ support cross-platform development and expose comonly used SQL names. */
#include <my_global.h>
#include "my_counter.h"
+#include "aligned.h"
#include <m_string.h>
-
-/* JAN: TODO: missing 5.7 header */
-#ifdef HAVE_MY_THREAD_H
-//# include <my_thread.h>
-#endif
-
-#ifndef UNIV_INNOCHECKSUM
-# include <mysqld_error.h>
-#endif /* !UNIV_INNOCHECKSUM */
+#include <mysqld_error.h>
/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
#include <sys/stat.h>
@@ -119,15 +102,6 @@ HAVE_PSI_INTERFACE is defined. */
# define UNIV_PFS_MEMORY
# endif /* HAVE_PSI_MEMORY_INTERFACE */
-/* There are mutexes/rwlocks that we want to exclude from
-instrumentation even if their corresponding performance schema
-define is set. And this PFS_NOT_INSTRUMENTED is used
-as the key value to identify those objects that would
-be excluded from instrumentation. */
-# define PFS_NOT_INSTRUMENTED ULINT32_UNDEFINED
-
-# define PFS_IS_INSTRUMENTED(key) ((key) != PFS_NOT_INSTRUMENTED)
-
#ifdef HAVE_PFS_THREAD_PROVIDER_H
/* For PSI_MUTEX_CALL() and similar. */
#include "pfs_thread_provider.h"
@@ -194,8 +168,6 @@ using the call command. */
related stuff. */
#define UNIV_SEARCH_PERF_STAT /* statistics for the
adaptive hash index */
-#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output
- in sync0sync.cc */
#define UNIV_BTR_PRINT /* enable functions for
printing B-trees */
#define UNIV_ZIP_DEBUG /* extensive consistency checks
@@ -212,27 +184,8 @@ using the call command. */
info output */
#endif
-#define UNIV_BTR_DEBUG /* check B-tree links */
-#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */
-
// #define UNIV_SQL_DEBUG
-/* Linkage specifier for non-static InnoDB symbols (variables and functions)
-that are only referenced from within InnoDB, not from MySQL. We disable the
-GCC visibility directive on all Sun operating systems because there is no
-easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */
-#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(sun) || defined(__INTEL_COMPILER)
-# define UNIV_INTERN __attribute__((visibility ("hidden")))
-#else
-# define UNIV_INTERN
-#endif
-
-#if defined(__GNUC__) && (__GNUC__ >= 11)
-# define ATTRIBUTE_ACCESS(X) __attribute__((access X))
-#else
-# define ATTRIBUTE_ACCESS(X)
-#endif
-
#ifndef MY_ATTRIBUTE
#if defined(__GNUC__)
# define MY_ATTRIBUTE(A) __attribute__(A)
@@ -421,12 +374,6 @@ in both 32-bit and 64-bit environments. */
# define UINT64PFx "%016" PRIx64
#endif
-#ifdef UNIV_INNOCHECKSUM
-extern bool strict_verify;
-extern FILE* log_file;
-extern uint32_t cur_page_num;
-#endif /* UNIV_INNOCHECKSUM */
-
typedef int64_t ib_int64_t;
typedef uint64_t ib_uint64_t;
typedef uint32_t ib_uint32_t;
@@ -522,14 +469,21 @@ it is read or written. */
# define UNIV_PREFETCH_R(addr) ((void) 0)
# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
-# elif defined __WIN__
-# include <xmmintrin.h>
+# elif defined _MSC_VER
# define UNIV_EXPECT(expr,value) (expr)
# define UNIV_LIKELY_NULL(expr) (expr)
-// __MM_HINT_T0 - (temporal data)
-// prefetch data into all levels of the cache hierarchy.
-# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
-# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# if defined _M_IX86 || defined _M_X64
+ // __MM_HINT_T0 - (temporal data)
+ // prefetch data into all levels of the cache hierarchy.
+# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# elif defined _M_ARM64
+# define UNIV_PREFETCH_R(addr) __prefetch(addr)
+# define UNIV_PREFETCH_RW(addr) __prefetch(addr)
+# else
+#   define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+# endif
#else
/* Dummy versions of the macros */
# define UNIV_EXPECT(expr,value) (expr)
@@ -546,28 +500,11 @@ it is read or written. */
/* Compile-time constant of the given array's size. */
#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
-/* The return type from a thread's start function differs between Unix and
-Windows, so define a typedef for it and a macro to use at the end of such
-functions. */
-
-#ifdef _WIN32
-typedef DWORD os_thread_ret_t;
-# define OS_THREAD_DUMMY_RETURN return(0)
-# define OS_PATH_SEPARATOR '\\'
-# define OS_PATH_SEPARATOR_ALT '/'
-#else
-typedef void* os_thread_ret_t;
-# define OS_THREAD_DUMMY_RETURN return(NULL)
-# define OS_PATH_SEPARATOR '/'
-# define OS_PATH_SEPARATOR_ALT '\\'
-#endif
-
#include <stdio.h>
#include "db0err.h"
#include "ut0dbg.h"
#include "ut0lst.h"
#include "ut0ut.h"
-#include "sync0types.h"
extern ulong srv_page_size_shift;
extern ulong srv_page_size;
@@ -576,4 +513,49 @@ extern ulong srv_page_size;
myisam/sp_defs.h. We only support 2 dimension data */
#define SPDIMS 2
-#endif
+#ifdef HAVE_PSI_INTERFACE
+typedef unsigned int mysql_pfs_key_t;
+
+# ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t buf_pool_mutex_key;
+extern mysql_pfs_key_t dict_foreign_err_mutex_key;
+extern mysql_pfs_key_t fil_system_mutex_key;
+extern mysql_pfs_key_t flush_list_mutex_key;
+extern mysql_pfs_key_t fts_cache_mutex_key;
+extern mysql_pfs_key_t fts_cache_init_mutex_key;
+extern mysql_pfs_key_t fts_delete_mutex_key;
+extern mysql_pfs_key_t fts_doc_id_mutex_key;
+extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
+extern mysql_pfs_key_t ibuf_mutex_key;
+extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+extern mysql_pfs_key_t log_sys_mutex_key;
+extern mysql_pfs_key_t log_flush_order_mutex_key;
+extern mysql_pfs_key_t recalc_pool_mutex_key;
+extern mysql_pfs_key_t purge_sys_pq_mutex_key;
+extern mysql_pfs_key_t recv_sys_mutex_key;
+extern mysql_pfs_key_t rtr_active_mutex_key;
+extern mysql_pfs_key_t rtr_match_mutex_key;
+extern mysql_pfs_key_t rtr_path_mutex_key;
+extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+extern mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_monitor_file_mutex_key;
+extern mysql_pfs_key_t buf_dblwr_mutex_key;
+extern mysql_pfs_key_t trx_pool_mutex_key;
+extern mysql_pfs_key_t trx_pool_manager_mutex_key;
+extern mysql_pfs_key_t lock_wait_mutex_key;
+extern mysql_pfs_key_t srv_threads_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+extern mysql_pfs_key_t dict_operation_lock_key;
+extern mysql_pfs_key_t fil_space_latch_key;
+extern mysql_pfs_key_t trx_i_s_cache_lock_key;
+extern mysql_pfs_key_t trx_purge_latch_key;
+extern mysql_pfs_key_t index_tree_rw_lock_key;
+extern mysql_pfs_key_t index_online_log_key;
+extern mysql_pfs_key_t trx_sys_rw_lock_key;
+extern mysql_pfs_key_t lock_latch_key;
+extern mysql_pfs_key_t trx_rseg_latch_key;
+# endif /* UNIV_PFS_RWLOCK */
+#endif /* HAVE_PSI_INTERFACE */
diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h
index 646a5f367c2..d6589cc4fd3 100644
--- a/storage/innobase/include/ut0counter.h
+++ b/storage/innobase/include/ut0counter.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,19 +28,9 @@ Created 2012/04/12 by Sunny Bains
#ifndef ut0counter_h
#define ut0counter_h
-#include "os0thread.h"
+#include "univ.i"
#include "my_rdtsc.h"
-/** CPU cache line size */
-#ifdef CPU_LEVEL1_DCACHE_LINESIZE
-# define CACHE_LINE_SIZE CPU_LEVEL1_DCACHE_LINESIZE
-#else
-# error CPU_LEVEL1_DCACHE_LINESIZE is undefined
-#endif /* CPU_LEVEL1_DCACHE_LINESIZE */
-
-/** Default number of slots to use in ib_counter_t */
-#define IB_N_SLOTS 64
-
/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles
as a random value. See the comments for my_timer_cycles() */
/** @return result from RDTSC or similar functions. */
@@ -56,7 +46,7 @@ get_rnd_value()
/* We may go here if my_timer_cycles() returns 0,
so we have to have the plan B for the counter. */
#if !defined(_WIN32)
- return (size_t)os_thread_get_curr_id();
+ return (size_t)pthread_self();
#else
LARGE_INTEGER cnt;
QueryPerformanceCounter(&cnt);
@@ -65,14 +55,34 @@ get_rnd_value()
#endif /* !_WIN32 */
}
+/** Atomic which occupies whole CPU cache line.
+Note: We rely on the default constructor of std::atomic and
+do not explicitly initialize the contents. This works for us,
+because ib_counter_t is only intended for usage with global
+memory that is allocated from the .bss and thus guaranteed to
+be zero-initialized by the run-time environment.
+@see srv_stats */
+template <typename Type>
+struct ib_atomic_counter_element_t {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_relaxed<Type> value;
+};
+
+template <typename Type>
+struct ib_counter_element_t {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Type value;
+};
+
+
/** Class for using fuzzy counters. The counter is multi-instance relaxed atomic
so the results are not guaranteed to be 100% accurate but close
-enough. Creates an array of counters and separates each element by the
-CACHE_LINE_SIZE bytes */
-template <typename Type, int N = IB_N_SLOTS>
+enough. */
+template <typename Type,
+ template <typename T> class Element = ib_atomic_counter_element_t,
+ int N = 128 >
struct ib_counter_t {
/** Increment the counter by 1. */
void inc() { add(1); }
+ ib_counter_t& operator++() { inc(); return *this; }
/** Increment the counter by 1.
@param[in] index a reasonably thread-unique identifier */
@@ -85,12 +95,12 @@ struct ib_counter_t {
/** Add to the counter.
@param[in] index a reasonably thread-unique identifier
@param[in] n amount to be added */
- void add(size_t index, Type n) {
+ TPOOL_SUPPRESS_TSAN void add(size_t index, Type n) {
index = index % N;
ut_ad(index < UT_ARR_SIZE(m_counter));
- m_counter[index].value.fetch_add(n, std::memory_order_relaxed);
+ m_counter[index].value += n;
}
/* @return total value - not 100% accurate, since it is relaxed atomic*/
@@ -98,28 +108,16 @@ struct ib_counter_t {
Type total = 0;
for (const auto &counter : m_counter) {
- total += counter.value.load(std::memory_order_relaxed);
+ total += counter.value;
}
return(total);
}
private:
- /** Atomic which occupies whole CPU cache line.
- Note: We rely on the default constructor of std::atomic and
- do not explicitly initialize the contents. This works for us,
- because ib_counter_t is only intended for usage with global
- memory that is allocated from the .bss and thus guaranteed to
- be zero-initialized by the run-time environment.
- @see srv_stats
- @see rw_lock_stats */
- struct ib_counter_element_t {
- MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<Type> value;
- };
- static_assert(sizeof(ib_counter_element_t) == CACHE_LINE_SIZE, "");
-
+ static_assert(sizeof(Element<Type>) == CPU_LEVEL1_DCACHE_LINESIZE, "");
/** Array of counter elements */
- MY_ALIGNED(CACHE_LINE_SIZE) ib_counter_element_t m_counter[N];
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Element<Type> m_counter[N];
};
#endif /* ut0counter_h */
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
index 4c8d2cf7a61..f4183e4c61a 100644
--- a/storage/innobase/include/ut0new.h
+++ b/storage/innobase/include/ut0new.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -120,9 +120,8 @@ InnoDB:
#ifndef ut0new_h
#define ut0new_h
-#include <algorithm> /* std::min() */
#include <limits> /* std::numeric_limits */
-#include <map> /* std::map */
+#include <thread>
#include <stddef.h>
#include <stdlib.h> /* malloc() */
@@ -136,8 +135,7 @@ InnoDB:
#include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */
-#include "os0thread.h" /* os_thread_sleep() */
-#include "ut0ut.h" /* ut_strcmp_functor, ut_basename_noext() */
+#include "ut0ut.h" /* ut_strcmp_functor */
#define OUT_OF_MEMORY_MSG \
"Check if you should increase the swap file or ulimits of your" \
@@ -381,7 +379,7 @@ public:
break;
}
- os_thread_sleep(1000000 /* 1 second */);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
}
if (ptr == NULL) {
@@ -516,7 +514,7 @@ public:
break;
}
- os_thread_sleep(1000000 /* 1 second */);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
}
if (pfx_new == NULL) {
@@ -843,6 +841,8 @@ constexpr const char* const auto_event_names[] =
"buf0buf",
"buf0dblwr",
"buf0dump",
+ "buf0lru",
+ "buf0rea",
"dict0dict",
"dict0mem",
"dict0stats",
@@ -868,7 +868,6 @@ constexpr const char* const auto_event_names[] =
"lexyy",
"lock0lock",
"mem0mem",
- "os0event",
"os0file",
"pars0lex",
"rem0rec",
@@ -879,11 +878,6 @@ constexpr const char* const auto_event_names[] =
"row0mysql",
"row0sel",
"srv0start",
- "sync0arr",
- "sync0debug",
- "sync0rw",
- "sync0start",
- "sync0types",
"trx0i_s",
"trx0i_s",
"trx0roll",
diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h
index e0a1f7c04ca..63628cc169f 100644
--- a/storage/innobase/include/ut0pool.h
+++ b/storage/innobase/include/ut0pool.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -31,7 +31,7 @@ Created 2012-Feb-26 Sunny Bains
#include <queue>
#include <functional>
-#include "ut0new.h"
+#include <my_global.h>
/** Allocate the memory for the object in blocks. We keep the objects sorted
on pointer so that they are closer together in case they have to be iterated
@@ -41,8 +41,6 @@ struct Pool {
typedef Type value_type;
- // FIXME: Add an assertion to check alignment and offset is
- // as we expect it. Also, sizeof(void*) can be 8, can we impove on this.
struct Element {
Pool* m_pool;
value_type m_type;
@@ -57,17 +55,23 @@ struct Pool {
m_size(size),
m_last()
{
+ ut_ad(ut_is_2pow(size));
ut_a(size >= sizeof(Element));
+ static_assert(!(sizeof(Element) % CPU_LEVEL1_DCACHE_LINESIZE),
+ "alignment");
m_lock_strategy.create();
ut_a(m_start == 0);
- m_start = reinterpret_cast<Element*>(ut_zalloc_nokey(m_size));
+ m_start = static_cast<Element*>(
+ aligned_malloc(m_size, CPU_LEVEL1_DCACHE_LINESIZE));
+ memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(
+ m_start, 0, m_size);
m_last = m_start;
- m_end = &m_start[m_size / sizeof(*m_start)];
+ m_end = &m_start[m_size / sizeof *m_start];
/* Note: Initialise only a small subset, even though we have
allocated all the memory. This is required only because PFS
@@ -90,7 +94,7 @@ struct Pool {
Factory::destroy(&elem->m_type);
}
- ut_free(m_start);
+ IF_WIN(_aligned_free,free)(m_start);
m_end = m_last = m_start = 0;
m_size = 0;
}
@@ -254,7 +258,8 @@ struct PoolManager {
except crash and burn, however lets
be a little optimistic and wait for
a resource to be freed. */
- os_thread_sleep(delay * 1000000);
+ std::this_thread::sleep_for(
+ std::chrono::seconds(delay));
if (delay < 32) {
delay <<= 1;
diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
index dba8d3f1a06..511eb21fd11 100644
--- a/storage/innobase/include/ut0rnd.h
+++ b/storage/innobase/include/ut0rnd.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -90,15 +90,6 @@ ut_fold_ull(
/*========*/
ib_uint64_t d) /*!< in: 64-bit integer */
MY_ATTRIBUTE((const));
-/*************************************************************//**
-Folds a character string ending in the null character.
-@return folded value */
-UNIV_INLINE
-ulint
-ut_fold_string(
-/*===========*/
- const char* str) /*!< in: null-terminated string */
- MY_ATTRIBUTE((warn_unused_result));
/***********************************************************//**
Looks for a prime number slightly greater than the given argument.
The prime is chosen so that it is not near any power of 2.
diff --git a/storage/innobase/include/ut0rnd.inl b/storage/innobase/include/ut0rnd.inl
index c0105160a42..37da323f8f3 100644
--- a/storage/innobase/include/ut0rnd.inl
+++ b/storage/innobase/include/ut0rnd.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -59,28 +59,6 @@ ut_fold_ull(
return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK,
(ulint) (d >> 32)));
}
-
-/*************************************************************//**
-Folds a character string ending in the null character.
-@return folded value */
-UNIV_INLINE
-ulint
-ut_fold_string(
-/*===========*/
- const char* str) /*!< in: null-terminated string */
-{
- ulint fold = 0;
-
- ut_ad(str);
-
- while (*str != '\0') {
- fold = ut_fold_ulint_pair(fold, (ulint)(*str));
- str++;
- }
-
- return(fold);
-}
-
#endif /* !UNIV_INNOCHECKSUM */
/*************************************************************//**
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
index 369f3f8c5d3..b7625b512a2 100644
--- a/storage/innobase/include/ut0ut.h
+++ b/storage/innobase/include/ut0ut.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -276,25 +276,6 @@ ut_strerr(
#endif /* !UNIV_INNOCHECKSUM */
-#ifdef UNIV_PFS_MEMORY
-
-/** Extract the basename of a file without its extension.
-For example, extract "foo0bar" out of "/path/to/foo0bar.cc".
-@param[in] file file path, e.g. "/path/to/foo0bar.cc"
-@param[out] base result, e.g. "foo0bar"
-@param[in] base_size size of the output buffer 'base', if there
-is not enough space, then the result will be truncated, but always
-'\0'-terminated
-@return number of characters that would have been printed if the size
-were unlimited (not including the final ‘\0’) */
-size_t
-ut_basename_noext(
- const char* file,
- char* base,
- size_t base_size);
-
-#endif /* UNIV_PFS_MEMORY */
-
namespace ib {
/** This is a wrapper class, used to print any unsigned integer type
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
index 26838c95443..95c7a248f7a 100644
--- a/storage/innobase/include/ut0wqueue.h
+++ b/storage/innobase/include/ut0wqueue.h
@@ -30,8 +30,7 @@ wait for work items to be available and take them off the queue for
processing.
************************************************************************/
-#ifndef IB_WORK_QUEUE_H
-#define IB_WORK_QUEUE_H
+#pragma once
#include "ut0list.h"
#include "mem0mem.h"
@@ -42,12 +41,12 @@ struct ib_list_t;
/** Work queue */
struct ib_wqueue_t
{
- /** Mutex protecting everything */
- ib_mutex_t mutex;
- /** Work item list */
- ib_list_t* items;
- /** ib_list_len(*items) */
- size_t length;
+ /** Mutex protecting everything */
+ mysql_mutex_t mutex;
+ /** Work item list */
+ ib_list_t *items;
+ /** ib_list_len(*items) */
+ size_t length;
};
/****************************************************************//**
@@ -85,5 +84,3 @@ void*
ib_wqueue_nowait(
/*=============*/
ib_wqueue_t* wq); /*<! in: work queue */
-
-#endif /* IB_WORK_QUEUE_H */